Diffstat
-rw-r--r--lib/cache/api.c889
-rw-r--r--lib/cache/api.h201
-rw-r--r--lib/cache/cdb_api.h68
-rw-r--r--lib/cache/cdb_lmdb.c710
-rw-r--r--lib/cache/cdb_lmdb.h23
-rw-r--r--lib/cache/entry_list.c293
-rw-r--r--lib/cache/entry_pkt.c224
-rw-r--r--lib/cache/entry_rr.c146
-rw-r--r--lib/cache/impl.h412
-rw-r--r--lib/cache/knot_pkt.c106
-rw-r--r--lib/cache/nsec1.c511
-rw-r--r--lib/cache/nsec3.c501
-rw-r--r--lib/cache/peek.c724
13 files changed, 4808 insertions, 0 deletions
diff --git a/lib/cache/api.c b/lib/cache/api.c
new file mode 100644
index 0000000..c0591d6
--- /dev/null
+++ b/lib/cache/api.c
@@ -0,0 +1,889 @@
+/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <libknot/descriptor.h>
+#include <libknot/dname.h>
+#include <libknot/errcode.h>
+#include <libknot/rrtype/rrsig.h>
+
+#include "contrib/cleanup.h"
+#include "contrib/ucw/lib.h"
+#include "lib/cache/api.h"
+#include "lib/cache/cdb_lmdb.h"
+#include "lib/defines.h"
+#include "lib/generic/trie.h"
+#include "lib/resolve.h"
+#include "lib/rplan.h"
+#include "lib/utils.h"
+
+#include "lib/cache/impl.h"
+
+/* TODO:
+ * - Reconsider when RRSIGs are put in and retrieved from the cache.
+ * Currently it's always done, which _might_ be spurious, depending
+ * on how kresd will use the returned result.
+ * There's also the "problem" that kresd ATM does _not_ ask upstream
+ * with DO bit in some cases.
+ */
+
+
+/** Cache version */
+static const uint16_t CACHE_VERSION = 5;
+/** Key size */
+#define KEY_HSIZE (sizeof(uint8_t) + sizeof(uint16_t))
+#define KEY_SIZE (KEY_HSIZE + KNOT_DNAME_MAXLEN)
+
+
+/** @internal Forward declarations of the implementation details
+ * \param has_optout[out] Set *has_optout = true; when encountering an opt-out NSEC3 (optional). */
+static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp,
+ uint8_t rank, trie_t *nsec_pmap, bool *has_optout);
+/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */
+static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/);
+
+/** @internal Removes all records from cache. */
+static inline int cache_clear(struct kr_cache *cache)
+{
+ cache->stats.delete += 1;
+ return cache_op(cache, clear);
+}
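For reference, the cache_op() helper used above (and throughout this file) lives in ./impl.h, which is not part of this diff; a minimal sketch of that dispatch macro, assuming the usual definition:

	/* Sketch: dispatch one kr_cdb_api operation on the cache's backend. */
	#define cache_op(cache, op, ...) \
		(cache)->api->op((cache)->db, ## __VA_ARGS__)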
+
+/** @internal Open cache db transaction and check internal data version. */
+static int assert_right_version(struct kr_cache *cache)
+{
+ /* Check cache ABI version. */
+ /* CACHE_KEY_DEF: to avoid collisions with kr_cache_match(). */
+ uint8_t key_str[4] = "VERS";
+ knot_db_val_t key = { .data = key_str, .len = sizeof(key_str) };
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read, &key, &val, 1);
+ if (ret == 0 && val.len == sizeof(CACHE_VERSION)
+ && memcmp(val.data, &CACHE_VERSION, sizeof(CACHE_VERSION)) == 0) {
+ ret = kr_error(EEXIST);
+ } else {
+ int oldret = ret;
+ /* Version doesn't match. Recreate cache and write version key. */
+ ret = cache_op(cache, count);
+ if (ret != 0) { /* Non-empty cache, purge it. */
+ kr_log_info("[ ][cach] incompatible cache database detected, purging\n");
+ if (oldret) {
+ kr_log_verbose("bad ret: %d\n", oldret);
+ } else if (val.len != sizeof(CACHE_VERSION)) {
+ kr_log_verbose("bad length: %d\n", (int)val.len);
+ } else {
+ uint16_t ver;
+ memcpy(&ver, val.data, sizeof(ver));
+ kr_log_verbose("bad version: %d\n", (int)ver);
+ }
+ ret = cache_clear(cache);
+ }
+ /* Either purged or empty. */
+ if (ret == 0) {
+ /* Key/Val is invalidated by cache purge, recreate it */
+ val.data = /*const-cast*/(void *)&CACHE_VERSION;
+ val.len = sizeof(CACHE_VERSION);
+ ret = cache_op(cache, write, &key, &val, 1);
+ }
+ }
+ kr_cache_sync(cache);
+ return ret;
+}
+
+int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm)
+{
+ if (!cache) {
+ return kr_error(EINVAL);
+ }
+ /* Open cache */
+ if (!api) {
+ api = kr_cdb_lmdb();
+ }
+ cache->api = api;
+ int ret = cache->api->open(&cache->db, opts, mm);
+ if (ret != 0) {
+ return ret;
+ }
+ memset(&cache->stats, 0, sizeof(cache->stats));
+ cache->ttl_min = KR_CACHE_DEFAULT_TTL_MIN;
+ cache->ttl_max = KR_CACHE_DEFAULT_TTL_MAX;
+ /* Check cache ABI version */
+ kr_cache_make_checkpoint(cache);
+ (void)assert_right_version(cache);
+
+ char *fpath;
+ ret = asprintf(&fpath, "%s/data.mdb", opts->path);
+ if (ret > 0) {
+ kr_cache_emergency_file_to_remove = fpath;
+ } else {
+ assert(false); /* non-critical, but still */
+ }
+ return 0;
+}
+
+const char *kr_cache_emergency_file_to_remove = NULL;
+
+
+#define cache_isvalid(cache) ((cache) && (cache)->api && (cache)->db)
+
+void kr_cache_close(struct kr_cache *cache)
+{
+ if (cache_isvalid(cache)) {
+ cache_op(cache, close);
+ cache->db = NULL;
+ }
+ free(/*const-cast*/(char*)kr_cache_emergency_file_to_remove);
+ kr_cache_emergency_file_to_remove = NULL;
+}
+
+int kr_cache_sync(struct kr_cache *cache)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ if (cache->api->sync) {
+ return cache_op(cache, sync);
+ }
+ return kr_ok();
+}
+
+int kr_cache_insert_rr(struct kr_cache *cache, const knot_rrset_t *rr, const knot_rrset_t *rrsig, uint8_t rank, uint32_t timestamp)
+{
+ int err = stash_rrset_precond(rr, NULL);
+ if (err <= 0) {
+ return kr_ok();
+ }
+ ssize_t written = stash_rrset(cache, NULL, rr, rrsig, timestamp, rank, NULL, NULL);
+	/* Zone's NSEC* params aren't updated, but that's probably OK
+ * for kr_cache_insert_rr() */
+ if (written >= 0) {
+ return kr_ok();
+ }
+
+ return (int) written;
+}
+
+int kr_cache_clear(struct kr_cache *cache)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ int ret = cache_clear(cache);
+ if (ret == 0) {
+ kr_cache_make_checkpoint(cache);
+ ret = assert_right_version(cache);
+ }
+ return ret;
+}
+
+/* When going stricter, BEWARE of breaking entry_h_consistent_NSEC() */
+struct entry_h * entry_h_consistent(knot_db_val_t data, uint16_t type)
+{
+ (void) type; /* unused, for now */
+ if (!data.data) return NULL;
+ /* Length checks. */
+ if (data.len < offsetof(struct entry_h, data))
+ return NULL;
+ const struct entry_h *eh = data.data;
+ if (eh->is_packet) {
+ uint16_t pkt_len;
+ if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len)) {
+ return NULL;
+ }
+ memcpy(&pkt_len, eh->data, sizeof(pkt_len));
+ if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len)
+ + pkt_len) {
+ return NULL;
+ }
+ }
+
+ bool ok = true;
+ ok = ok && kr_rank_check(eh->rank);
+ ok = ok && (!kr_rank_test(eh->rank, KR_RANK_BOGUS)
+ || eh->is_packet);
+ ok = ok && (eh->is_packet || !eh->has_optout);
+
+ return ok ? /*const-cast*/(struct entry_h *)eh : NULL;
+}
+
+int32_t get_new_ttl(const struct entry_h *entry, const struct kr_query *qry,
+ const knot_dname_t *owner, uint16_t type, uint32_t now)
+{
+ int32_t diff = now - entry->time;
+ if (diff < 0) {
+ /* We may have obtained the record *after* the request started. */
+ diff = 0;
+ }
+ int32_t res = entry->ttl - diff;
+ if (res < 0 && owner && qry && qry->stale_cb) {
+ /* Stale-serving decision, delegated to a callback. */
+ int res_stale = qry->stale_cb(res, owner, type, qry);
+ if (res_stale >= 0)
+ return res_stale;
+ }
+ return res;
+}
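A small worked example of the computation above: an entry stored with time = 1000 and ttl = 300 yields 300 - (1200 - 1000) = 100 seconds of remaining TTL when queried at now = 1200; at now = 1400 the result is -100, and only then may qry->stale_cb replace the negative value with a stale-serving TTL.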
+
+int32_t kr_cache_ttl(const struct kr_cache_p *peek, const struct kr_query *qry,
+ const knot_dname_t *name, uint16_t type)
+{
+ const struct entry_h *eh = peek->raw_data;
+ return get_new_ttl(eh, qry, name, type, qry->timestamp.tv_sec);
+}
+
+/** Check that no label contains a zero character, incl. a log trace.
+ *
+ * We refuse to work with those, as LF and our cache keys might become ambiguous.
+ * Assuming uncompressed name, as usual.
+ * CACHE_KEY_DEF
+ */
+static bool check_dname_for_lf(const knot_dname_t *n, const struct kr_query *qry/*logging*/)
+{
+ const bool ret = knot_dname_size(n) == strlen((const char *)n) + 1;
+ if (!ret) { WITH_VERBOSE(qry) {
+ auto_free char *n_str = kr_dname_text(n);
+ VERBOSE_MSG(qry, "=> skipping zero-containing name %s\n", n_str);
+ } }
+ return ret;
+}
+
+/** Return false on types to be ignored. Meant both for sname and direct cache requests. */
+static bool check_rrtype(uint16_t type, const struct kr_query *qry/*logging*/)
+{
+ const bool ret = !knot_rrtype_is_metatype(type)
+ && type != KNOT_RRTYPE_RRSIG;
+ if (!ret) { WITH_VERBOSE(qry) {
+ auto_free char *type_str = kr_rrtype_text(type);
+ VERBOSE_MSG(qry, "=> skipping RR type %s\n", type_str);
+ } }
+ return ret;
+}
+
+/** Like key_exact_type() but omits a couple checks not holding for pkt cache. */
+knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type)
+{
+ assert(check_rrtype(type, NULL));
+ switch (type) {
+ case KNOT_RRTYPE_RRSIG: /* no RRSIG query caching, at least for now */
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ /* xNAME lumped into NS. */
+ case KNOT_RRTYPE_CNAME:
+ case KNOT_RRTYPE_DNAME:
+ type = KNOT_RRTYPE_NS;
+ default:
+ break;
+ }
+
+ int name_len = k->buf[0];
+ k->buf[name_len + 1] = 0; /* make sure different names can never match */
+ k->buf[name_len + 2] = 'E'; /* tag for exact name+type matches */
+ memcpy(k->buf + name_len + 3, &type, 2);
+ k->type = type;
+ /* CACHE_KEY_DEF: key == dname_lf + '\0' + 'E' + RRTYPE */
+ return (knot_db_val_t){ k->buf + 1, name_len + 4 };
+}
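For orientation, a sketch of the key layout built above (derived from this code; the lookup-format (LF) name itself is written into k->buf earlier by kr_dname_lf()):

	/* CACHE_KEY_DEF sketch, with name_len = length of the name in LF:
	 *   k->buf[0]                  name_len (not part of the key itself)
	 *   k->buf[1 .. name_len]      the name in LF (reversed labels)
	 *   k->buf[name_len + 1]       '\0'  so different names can never match
	 *   k->buf[name_len + 2]       'E'   tag for exact name+type matches
	 *   k->buf[name_len + 3, +4]   RRTYPE, two bytes memcpy'd
	 * The returned knot_db_val_t starts at k->buf + 1 and spans name_len + 4 bytes. */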
+
+
+/** The inside for cache_peek(); implementation separated to ./peek.c */
+int peek_nosync(kr_layer_t *ctx, knot_pkt_t *pkt);
+/** function for .produce phase */
+int cache_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+ /* We first check various exit-conditions and then call the _real function. */
+
+ if (!kr_cache_is_open(&req->ctx->cache)
+ || ctx->state & (KR_STATE_FAIL|KR_STATE_DONE) || qry->flags.NO_CACHE
+ || (qry->flags.CACHE_TRIED && !qry->stale_cb)
+ || !check_rrtype(qry->stype, qry) /* LATER: some other behavior for some of these? */
+ || qry->sclass != KNOT_CLASS_IN) {
+ return ctx->state; /* Already resolved/failed or already tried, etc. */
+ }
+ /* ATM cache only peeks for qry->sname and that would be useless
+ * to repeat on every iteration, so disable it from now on.
+ * LATER(optim.): assist with more precise QNAME minimization. */
+ qry->flags.CACHE_TRIED = true;
+
+ if (qry->stype == KNOT_RRTYPE_NSEC) {
+ VERBOSE_MSG(qry, "=> skipping stype NSEC\n");
+ return ctx->state;
+ }
+ if (!check_dname_for_lf(qry->sname, qry)) {
+ return ctx->state;
+ }
+
+ int ret = peek_nosync(ctx, pkt);
+ kr_cache_sync(&req->ctx->cache);
+ return ret;
+}
+
+
+
+/** The body of the stashing loop in cache_stash(), moved out to reduce indentation. \return error code. */
+static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i,
+ const struct kr_query *qry, struct kr_cache *cache,
+ int *unauth_cnt, trie_t *nsec_pmap, bool *has_optout);
+/** Stash a single nsec_p. \return 0 (errors are ignored). */
+static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v,
+ struct kr_request *req);
+
+/** The whole .consume phase for the cache module. */
+int cache_stash(kr_layer_t *ctx, knot_pkt_t *pkt)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+ struct kr_cache *cache = &req->ctx->cache;
+
+ /* Note: we cache even in KR_STATE_FAIL. For example,
+ * BOGUS answer can go to +cd cache even without +cd request. */
+ if (!kr_cache_is_open(cache) || !qry
+ || qry->flags.CACHED || !check_rrtype(knot_pkt_qtype(pkt), qry)
+ || qry->sclass != KNOT_CLASS_IN) {
+ return ctx->state;
+ }
+ /* Do not cache truncated answers, at least for now. LATER */
+ if (knot_wire_get_tc(pkt->wire)) {
+ return ctx->state;
+ }
+ /* Stash individual records. */
+ ranked_rr_array_t *selected[] = kr_request_selected(req);
+ int unauth_cnt = 0;
+ trie_t *nsec_pmap = trie_create(&req->pool);
+ if (!nsec_pmap) {
+ assert(!ENOMEM);
+ goto finally;
+ }
+ bool has_optout = false;
+ /* ^^ DNSSEC_OPTOUT is not fired in cases like `com. A`,
+ * but currently we don't stash separate NSEC3 proving that. */
+ for (int psec = KNOT_ANSWER; psec <= KNOT_ADDITIONAL; ++psec) {
+ ranked_rr_array_t *arr = selected[psec];
+ /* uncached entries are located at the end */
+ for (ssize_t i = arr->len - 1; i >= 0; --i) {
+ ranked_rr_array_entry_t *entry = arr->at[i];
+ if (entry->qry_uid != qry->uid) {
+ continue;
+ /* TODO: probably safe to break but maybe not worth it */
+ }
+ int ret = stash_rrarray_entry(arr, i, qry, cache, &unauth_cnt,
+ nsec_pmap, &has_optout);
+ if (ret) {
+ VERBOSE_MSG(qry, "=> stashing RRs errored out\n");
+ goto finally;
+ }
+ /* LATER(optim.): maybe filter out some type-rank combinations
+ * that won't be useful as separate RRsets. */
+ }
+ }
+
+ trie_it_t *it;
+ for (it = trie_it_begin(nsec_pmap); !trie_it_finished(it); trie_it_next(it)) {
+ stash_nsec_p((const knot_dname_t *)trie_it_key(it, NULL),
+ (const char *)*trie_it_val(it), req);
+ }
+ trie_it_free(it);
+ /* LATER(optim.): typically we also have corresponding NS record in the list,
+ * so we might save a cache operation. */
+
+ stash_pkt(pkt, qry, req, has_optout);
+
+finally:
+ if (unauth_cnt) {
+ VERBOSE_MSG(qry, "=> stashed also %d nonauth RRsets\n", unauth_cnt);
+	}
+ kr_cache_sync(cache);
+ return ctx->state; /* we ignore cache-stashing errors */
+}
+
+/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */
+static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/)
+{
+ if (!rr || rr->rclass != KNOT_CLASS_IN) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ if (!check_rrtype(rr->type, qry)) {
+ return kr_ok();
+ }
+ if (!check_dname_for_lf(rr->owner, qry)) {
+ return kr_ok();
+ }
+ return 1/*proceed*/;
+}
+
+static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp,
+ uint8_t rank, trie_t *nsec_pmap, bool *has_optout)
+{
+ assert(stash_rrset_precond(rr, qry) > 0);
+ if (!cache) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ const int wild_labels = rr_sigs == NULL ? 0 :
+ knot_dname_labels(rr->owner, NULL) - knot_rrsig_labels(rr_sigs->rrs.rdata);
+ if (wild_labels < 0) {
+ return kr_ok();
+ }
+ const knot_dname_t *encloser = rr->owner; /**< the closest encloser name */
+ for (int i = 0; i < wild_labels; ++i) {
+ encloser = knot_wire_next_label(encloser, NULL);
+ }
+ int ret = 0;
+
+ /* Construct the key under which RRs will be stored,
+ * and add corresponding nsec_pmap item (if necessary). */
+ struct key k_storage, *k = &k_storage;
+ knot_db_val_t key;
+ switch (rr->type) {
+ case KNOT_RRTYPE_NSEC3:
+ /* Skip "suspicious" or opt-out NSEC3 sets. */
+ if (rr->rrs.count != 1) return kr_ok();
+ if (KNOT_NSEC3_FLAG_OPT_OUT & knot_nsec3_flags(rr->rrs.rdata)) {
+ if (has_optout) *has_optout = true;
+ return kr_ok();
+ }
+ /* fall through */
+ case KNOT_RRTYPE_NSEC:
+ if (!kr_rank_test(rank, KR_RANK_SECURE)) {
+ /* Skip any NSEC*s that aren't validated. */
+ return kr_ok();
+ }
+ if (!rr_sigs || !rr_sigs->rrs.count || !rr_sigs->rrs.rdata) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ const knot_dname_t *signer = knot_rrsig_signer_name(rr_sigs->rrs.rdata);
+ const int signer_size = knot_dname_size(signer);
+ k->zlf_len = signer_size - 1;
+
+ void **npp = nsec_pmap == NULL ? NULL
+ : trie_get_ins(nsec_pmap, (const char *)signer, signer_size);
+ assert(!nsec_pmap || (npp && ENOMEM));
+ if (rr->type == KNOT_RRTYPE_NSEC) {
+ key = key_NSEC1(k, encloser, wild_labels);
+ break;
+ }
+
+ assert(rr->type == KNOT_RRTYPE_NSEC3);
+ const knot_rdata_t * const rdata = rr->rrs.rdata;
+ if (rdata->len <= 4) return kr_error(EILSEQ); /*< data from outside; less trust */
+ const int np_dlen = nsec_p_rdlen(rdata->data);
+ if (np_dlen > rdata->len) return kr_error(EILSEQ);
+ key = key_NSEC3(k, encloser, nsec_p_mkHash(rdata->data));
+ if (npp && !*npp) {
+ *npp = mm_alloc(&qry->request->pool, np_dlen);
+ if (!*npp) {
+ assert(!ENOMEM);
+ break;
+ }
+ memcpy(*npp, rdata->data, np_dlen);
+ }
+ break;
+ default:
+ ret = kr_dname_lf(k->buf, encloser, wild_labels);
+ if (ret) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ key = key_exact_type(k, rr->type);
+ }
+
+ /* Compute materialized sizes of the new data. */
+ const knot_rdataset_t *rds_sigs = rr_sigs ? &rr_sigs->rrs : NULL;
+ const int rr_ssize = rdataset_dematerialize_size(&rr->rrs);
+ assert(rr_ssize == to_even(rr_ssize));
+ knot_db_val_t val_new_entry = {
+ .data = NULL,
+ .len = offsetof(struct entry_h, data) + rr_ssize
+ + rdataset_dematerialize_size(rds_sigs),
+ };
+
+ /* Prepare raw memory for the new entry. */
+ ret = entry_h_splice(&val_new_entry, rank, key, k->type, rr->type,
+ rr->owner, qry, cache, timestamp);
+ if (ret) return kr_ok(); /* some aren't really errors */
+ assert(val_new_entry.data);
+
+ const uint32_t ttl = rr->ttl;
+ /* FIXME: consider TTLs and expirations of RRSIGs as well, just in case. */
+
+ /* Write the entry itself. */
+ struct entry_h *eh = val_new_entry.data;
+ memset(eh, 0, offsetof(struct entry_h, data));
+ eh->time = timestamp;
+ eh->ttl = MAX(MIN(ttl, cache->ttl_max), cache->ttl_min);
+ eh->rank = rank;
+ if (rdataset_dematerialize(&rr->rrs, eh->data)
+ || rdataset_dematerialize(rds_sigs, eh->data + rr_ssize)) {
+ /* minimize the damage from incomplete write; TODO: better */
+ eh->time = 0;
+ eh->ttl = 0;
+ eh->rank = 0;
+ assert(false);
+ }
+ assert(entry_h_consistent(val_new_entry, rr->type));
+
+ #if 0 /* Occasionally useful when debugging some kinds of changes. */
+ {
+ kr_cache_sync(cache);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (ret != kr_error(ENOENT)) { // ENOENT might happen in some edge case, I guess
+ assert(!ret);
+ entry_list_t el;
+ entry_list_parse(val, el);
+ }
+ }
+ #endif
+
+ /* Update metrics */
+ cache->stats.insert += 1;
+
+ /* Verbose-log some not-too-common cases. */
+ WITH_VERBOSE(qry) { if (kr_rank_test(rank, KR_RANK_AUTH)
+ || rr->type == KNOT_RRTYPE_NS) {
+ auto_free char *type_str = kr_rrtype_text(rr->type),
+ *encl_str = kr_dname_text(encloser);
+ VERBOSE_MSG(qry, "=> stashed %s%s %s, rank 0%.2o, "
+ "%d B total, incl. %d RRSIGs\n",
+ (wild_labels ? "*." : ""), encl_str, type_str, rank,
+ (int)val_new_entry.len, (rr_sigs ? rr_sigs->rrs.count : 0)
+ );
+ } }
+
+ return (ssize_t) val_new_entry.len;
+}
+
+static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i,
+ const struct kr_query *qry, struct kr_cache *cache,
+ int *unauth_cnt, trie_t *nsec_pmap, bool *has_optout)
+{
+ ranked_rr_array_entry_t *entry = arr->at[arr_i];
+ if (entry->cached) {
+ return kr_ok();
+ }
+ const knot_rrset_t *rr = entry->rr;
+ if (rr->type == KNOT_RRTYPE_RRSIG) {
+ return kr_ok(); /* reduce verbose logging from the following call */
+ }
+ int ret = stash_rrset_precond(rr, qry);
+ if (ret <= 0) {
+ return ret;
+ }
+
+ /* Try to find corresponding signatures, always. LATER(optim.): speed. */
+ ranked_rr_array_entry_t *entry_rrsigs = NULL;
+ const knot_rrset_t *rr_sigs = NULL;
+ for (ssize_t j = arr->len - 1; j >= 0; --j) {
+ /* TODO: ATM we assume that some properties are the same
+ * for all RRSIGs in the set (esp. label count). */
+ ranked_rr_array_entry_t *e = arr->at[j];
+ bool ok = e->qry_uid == qry->uid && !e->cached
+ && e->rr->type == KNOT_RRTYPE_RRSIG
+ && knot_rrsig_type_covered(e->rr->rrs.rdata) == rr->type
+ && knot_dname_is_equal(rr->owner, e->rr->owner);
+ if (!ok) continue;
+ entry_rrsigs = e;
+ rr_sigs = e->rr;
+ break;
+ }
+
+ ssize_t written = stash_rrset(cache, qry, rr, rr_sigs, qry->timestamp.tv_sec,
+ entry->rank, nsec_pmap, has_optout);
+ if (written < 0) {
+ kr_log_error("[%05u.%02u][cach] stash failed, ret = %d\n", qry->request->uid,
+			     qry->uid, (int)written);
+ return (int) written;
+ }
+
+ if (written > 0) {
+ /* Mark entry as cached for the rest of the query processing */
+ entry->cached = true;
+ if (entry_rrsigs) {
+ entry_rrsigs->cached = true;
+ }
+ if (!kr_rank_test(entry->rank, KR_RANK_AUTH) && rr->type != KNOT_RRTYPE_NS) {
+ *unauth_cnt += 1;
+ }
+ }
+
+ return kr_ok();
+}
+
+static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v,
+ struct kr_request *req)
+{
+ const struct kr_query *qry = req->current_query;
+ struct kr_cache *cache = &req->ctx->cache;
+ uint32_t valid_until = qry->timestamp.tv_sec + cache->ttl_max;
+ /* LATER(optim.): be more precise here ^^ and reduce calls. */
+ static const int32_t ttl_margin = 3600;
+ const uint8_t *nsec_p = (const uint8_t *)nsec_p_v;
+ int data_stride = sizeof(valid_until) + nsec_p_rdlen(nsec_p);
+
+ unsigned int log_hash = 0xFeeeFeee; /* this type is simpler for printf args */
+ auto_free char *log_dname = NULL;
+ WITH_VERBOSE(qry) {
+ log_hash = nsec_p_v ? nsec_p_mkHash((const uint8_t *)nsec_p_v) : 0;
+ log_dname = kr_dname_text(dname);
+ }
+ /* Find what's in the cache. */
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, dname, false);
+ if (ret) return kr_error(ret);
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_NS);
+ knot_db_val_t val_orig = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val_orig, 1);
+ if (ret && ret != -ABS(ENOENT)) {
+ VERBOSE_MSG(qry, "=> EL read failed (ret: %d)\n", ret);
+ return kr_ok();
+ }
+ /* Prepare new entry_list_t so we can just write at el[0]. */
+ entry_list_t el;
+ int log_refresh_by = 0;
+ if (ret == -ABS(ENOENT)) {
+ memset(el, 0, sizeof(el));
+ } else {
+ ret = entry_list_parse(val_orig, el);
+ if (ret) {
+ VERBOSE_MSG(qry, "=> EL parse failed (ret: %d)\n", ret);
+ return kr_error(0);
+ }
+ /* Find the index to replace. */
+ int i_replace = ENTRY_APEX_NSECS_CNT - 1;
+ for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
+ if (el[i].len != data_stride) continue;
+ if (nsec_p && memcmp(nsec_p, (uint8_t *)el[i].data + sizeof(uint32_t),
+ data_stride - sizeof(uint32_t)) != 0) {
+ continue;
+ }
+ /* Save a cache operation if TTL extended only a little. */
+ uint32_t valid_orig;
+ memcpy(&valid_orig, el[i].data, sizeof(valid_orig));
+ const int32_t ttl_extended_by = valid_until - valid_orig;
+ if (ttl_extended_by < ttl_margin) {
+ VERBOSE_MSG(qry,
+ "=> nsec_p stash for %s skipped (extra TTL: %d, hash: %x)\n",
+ log_dname, ttl_extended_by, log_hash);
+ return kr_ok();
+ }
+ i_replace = i;
+ log_refresh_by = ttl_extended_by;
+ break;
+ }
+ /* Shift the other indices: move the first `i_replace` blocks
+ * by one position. */
+ if (i_replace) {
+ memmove(&el[1], &el[0], sizeof(el[0]) * i_replace);
+ }
+ }
+ /* Prepare old data into a buffer. See entry_h_splice() for why. LATER(optim.) */
+ el[0].len = data_stride;
+ el[0].data = NULL;
+ knot_db_val_t val;
+	val.len = entry_list_serial_size(el);
+	val.data = mm_alloc(&req->pool, val.len);
+	entry_list_memcpy(val.data, el);
+ /* Prepare the new data chunk */
+ memcpy(el[0].data, &valid_until, sizeof(valid_until));
+ if (nsec_p) {
+ memcpy((uint8_t *)el[0].data + sizeof(valid_until), nsec_p,
+ data_stride - sizeof(valid_until));
+ }
+ /* Write it all to the cache */
+ ret = cache_op(cache, write, &key, &val, 1);
+ if (ret || !val.data) {
+ VERBOSE_MSG(qry, "=> EL write failed (ret: %d)\n", ret);
+ return kr_ok();
+ }
+ if (log_refresh_by) {
+ VERBOSE_MSG(qry, "=> nsec_p stashed for %s (refresh by %d, hash: %x)\n",
+ log_dname, log_refresh_by, log_hash);
+ } else {
+ VERBOSE_MSG(qry, "=> nsec_p stashed for %s (new, hash: %x)\n",
+ log_dname, log_hash);
+ }
+ return kr_ok();
+}
+
+
+static int peek_exact_real(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
+ struct kr_cache_p *peek)
+{
+ if (!check_rrtype(type, NULL) || !check_dname_for_lf(name, NULL)) {
+ return kr_error(ENOTSUP);
+ }
+ struct key k_storage, *k = &k_storage;
+
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret) return kr_error(ret);
+
+ knot_db_val_t key = key_exact_type(k, type);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (!ret) ret = entry_h_seek(&val, type);
+ if (ret) return kr_error(ret);
+
+ const struct entry_h *eh = entry_h_consistent(val, type);
+ if (!eh || eh->is_packet) {
+ // TODO: no packets, but better get rid of whole kr_cache_peek_exact().
+ return kr_error(ENOENT);
+ }
+ *peek = (struct kr_cache_p){
+ .time = eh->time,
+ .ttl = eh->ttl,
+ .rank = eh->rank,
+ .raw_data = val.data,
+ .raw_bound = knot_db_val_bound(val),
+ };
+ return kr_ok();
+}
+int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
+ struct kr_cache_p *peek)
+{ /* Just wrap with extra verbose logging. */
+ const int ret = peek_exact_real(cache, name, type, peek);
+ if (false && VERBOSE_STATUS) { /* too noisy for usual --verbose */
+ auto_free char *type_str = kr_rrtype_text(type),
+ *name_str = kr_dname_text(name);
+ const char *result_str = (ret == kr_ok() ? "hit" :
+ (ret == kr_error(ENOENT) ? "miss" : "error"));
+ VERBOSE_MSG(NULL, "_peek_exact: %s %s %s (ret: %d)",
+ type_str, name_str, result_str, ret);
+ }
+ return ret;
+}
+
+int kr_cache_remove(struct kr_cache *cache, const knot_dname_t *name, uint16_t type)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ if (!cache->api->remove) {
+ return kr_error(ENOSYS);
+ }
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret) return kr_error(ret);
+
+ knot_db_val_t key = key_exact_type(k, type);
+ return cache_op(cache, remove, &key, 1);
+}
+
+int kr_cache_match(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, knot_db_val_t keyval[][2], int maxcount)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ if (!cache->api->match) {
+ return kr_error(ENOSYS);
+ }
+
+ struct key k_storage, *k = &k_storage;
+
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret) return kr_error(ret);
+
+ // use a mock type
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_A);
+ /* CACHE_KEY_DEF */
+ key.len -= sizeof(uint16_t); /* the type */
+ if (!exact_name) {
+ key.len -= 2; /* '\0' 'E' */
+ if (name[0] == '\0') ++key.len; /* the root name is special ATM */
+ }
+ return cache_op(cache, match, &key, keyval, maxcount);
+}
+
+int kr_unpack_cache_key(knot_db_val_t key, knot_dname_t *buf, uint16_t *type)
+{
+ if (key.data == NULL || buf == NULL || type == NULL) {
+ return kr_error(EINVAL);
+ }
+
+ int len = -1;
+ const char *tag, *key_data = key.data;
+ for (tag = key_data + 1; tag < key_data + key.len; ++tag) {
+ /* CACHE_KEY_DEF */
+ if (tag[-1] == '\0' && (tag == key_data + 1 || tag[-2] == '\0')) {
+ if (tag[0] != 'E') return kr_error(EINVAL);
+ len = tag - 1 - key_data;
+ break;
+ }
+ }
+
+ if (len == -1 || len > KNOT_DNAME_MAXLEN) {
+ return kr_error(EINVAL);
+ }
+
+ int ret = knot_dname_lf2wire(buf, len, key.data);
+ if (ret < 0) {
+ return kr_error(ret);
+ }
+
+ /* CACHE_KEY_DEF: jump over "\0 E/1" */
+ memcpy(type, tag + 1, sizeof(uint16_t));
+
+ return kr_ok();
+}
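A minimal caller-side sketch combining kr_cache_match() with kr_unpack_cache_key(); the function name and the fixed array size are illustrative, only the API calls are from this file:

	static void list_subtree_sketch(struct kr_cache *cache, const knot_dname_t *name)
	{
		knot_db_val_t keyval[16][2];
		int count = kr_cache_match(cache, name, /*exact_name*/ false, keyval, 16);
		for (int i = 0; i < count; ++i) {
			knot_dname_t dname[KNOT_DNAME_MAXLEN];
			uint16_t type;
			if (kr_unpack_cache_key(keyval[i][0], dname, &type) == kr_ok()) {
				/* dname + type now identify one cached entry */
			}
		}
		kr_cache_sync(cache); /* release the read transaction taken by the match */
	}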
+
+
+int kr_cache_remove_subtree(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, int maxcount)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+
+ knot_db_val_t keyval[maxcount][2], keys[maxcount];
+ int ret = kr_cache_match(cache, name, exact_name, keyval, maxcount);
+ if (ret <= 0) { /* ENOENT -> nothing to remove */
+ return (ret == KNOT_ENOENT) ? 0 : ret;
+ }
+ const int count = ret;
+ /* Duplicate the key strings, as deletion may invalidate the pointers. */
+ int i;
+ for (i = 0; i < count; ++i) {
+ keys[i].len = keyval[i][0].len;
+ keys[i].data = malloc(keys[i].len);
+ if (!keys[i].data) {
+ ret = kr_error(ENOMEM);
+ goto cleanup;
+ }
+ memcpy(keys[i].data, keyval[i][0].data, keys[i].len);
+ }
+ ret = cache->api->remove(cache->db, keys, count);
+cleanup:
+ kr_cache_sync(cache); /* Sync even after just kr_cache_match(). */
+ /* Free keys */
+ while (--i >= 0) {
+ free(keys[i].data);
+ }
+ return ret;
+}
+
diff --git a/lib/cache/api.h b/lib/cache/api.h
new file mode 100644
index 0000000..61bf796
--- /dev/null
+++ b/lib/cache/api.h
@@ -0,0 +1,201 @@
+/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <libknot/consts.h>
+#include <libknot/rrset.h>
+#include <sys/time.h>
+#include "lib/cache/cdb_api.h"
+#include "lib/defines.h"
+#include "contrib/ucw/config.h" /*uint*/
+
+/** When knot_pkt is passed from cache without ->wire, this is the ->size. */
+static const size_t PKT_SIZE_NOWIRE = -1;
+
+
+#include "lib/module.h"
+/* Prototypes for the 'cache' module implementation. */
+int cache_peek(kr_layer_t *ctx, knot_pkt_t *pkt);
+int cache_stash(kr_layer_t *ctx, knot_pkt_t *pkt);
+
+
+/**
+ * Cache structure, keeps API, instance and metadata.
+ */
+struct kr_cache
+{
+ knot_db_t *db; /**< Storage instance */
+ const struct kr_cdb_api *api; /**< Storage engine */
+ struct {
+ uint32_t hit; /**< Number of cache hits */
+ uint32_t miss; /**< Number of cache misses */
+ uint32_t insert; /**< Number of insertions */
+ uint32_t delete; /**< Number of deletions */
+ } stats;
+
+ uint32_t ttl_min, ttl_max; /**< TTL limits */
+
+ /* A pair of stamps for detection of real-time shifts during runtime. */
+ struct timeval checkpoint_walltime; /**< Wall time on the last check-point. */
+ uint64_t checkpoint_monotime; /**< Monotonic milliseconds on the last check-point. */
+};
+
+/**
+ * Open/create cache with provided storage options.
+ * @param cache cache structure to be initialized
+ * @param api storage engine API
+ * @param opts storage-specific options (may be NULL for default)
+ * @param mm memory context.
+ * @return 0 or an error code
+ */
+KR_EXPORT
+int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm);
+
+/**
+ * Path to cache file to remove on critical out-of-space error. (do NOT modify it)
+ */
+KR_EXPORT extern
+const char *kr_cache_emergency_file_to_remove;
+
+/**
+ * Close persistent cache.
+ * @note This doesn't clear the data, just closes the connection to the database.
+ * @param cache structure
+ */
+KR_EXPORT
+void kr_cache_close(struct kr_cache *cache);
+
+/** Run after a row of operations to release transaction/lock if needed. */
+KR_EXPORT
+int kr_cache_sync(struct kr_cache *cache);
+
+/**
+ * Return true if cache is open and enabled.
+ */
+static inline bool kr_cache_is_open(struct kr_cache *cache)
+{
+ return cache->db != NULL;
+}
+
+/** (Re)set the time pair to the current values. */
+static inline void kr_cache_make_checkpoint(struct kr_cache *cache)
+{
+ cache->checkpoint_monotime = kr_now();
+ gettimeofday(&cache->checkpoint_walltime, NULL);
+}
+
+/**
+ * Insert RRSet into cache, replacing any existing data.
+ * @param cache cache structure
+ * @param rr inserted RRSet
+ * @param rrsig RRSIG for inserted RRSet (optional)
+ * @param rank rank of the data
+ * @param timestamp current time
+ * @return 0 or an errcode
+ */
+KR_EXPORT
+int kr_cache_insert_rr(struct kr_cache *cache, const knot_rrset_t *rr, const knot_rrset_t *rrsig, uint8_t rank, uint32_t timestamp);
+
+/**
+ * Clear all items from the cache.
+ * @param cache cache structure
+ * @return 0 or an errcode
+ */
+KR_EXPORT
+int kr_cache_clear(struct kr_cache *cache);
+
+
+/* ** This interface is temporary. ** */
+
+struct kr_cache_p {
+ uint32_t time; /**< The time of inception. */
+ uint32_t ttl; /**< TTL at inception moment. Assuming it fits into int32_t ATM. */
+ uint8_t rank; /**< See enum kr_rank */
+ struct {
+ /* internal: pointer to eh struct */
+ void *raw_data, *raw_bound;
+ };
+};
+KR_EXPORT
+int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
+ struct kr_cache_p *peek);
+/* Parameters (qry, name, type) are used for timestamp and stale-serving decisions. */
+KR_EXPORT
+int32_t kr_cache_ttl(const struct kr_cache_p *peek, const struct kr_query *qry,
+ const knot_dname_t *name, uint16_t type);
+
+KR_EXPORT
+int kr_cache_materialize(knot_rdataset_t *dst, const struct kr_cache_p *ref,
+ knot_mm_t *pool);
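A minimal sketch of how the peek interface fits together (assumed caller code; the helper name and the ESTALE choice are illustrative):

	static int peek_and_materialize_sketch(struct kr_cache *cache, const struct kr_query *qry,
					       const knot_dname_t *name, uint16_t type,
					       knot_rdataset_t *out, knot_mm_t *pool)
	{
		struct kr_cache_p peek;
		int ret = kr_cache_peek_exact(cache, name, type, &peek);
		if (ret) return ret;	/* typically kr_error(ENOENT) on a cache miss */
		if (kr_cache_ttl(&peek, qry, name, type) < 0)
			return kr_error(ESTALE);	/* expired, unless stale-serving applied */
		return kr_cache_materialize(out, &peek, pool);
	}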
+
+
+/**
+ * Remove an entry from cache.
+ * @param cache cache structure
+ * @param name dname
+ * @param type rr type
+ * @return number of deleted records, or negative error code
+ * @note only "exact hits" are considered ATM, and
+ * some other information may be removed alongside.
+ */
+KR_EXPORT
+int kr_cache_remove(struct kr_cache *cache, const knot_dname_t *name, uint16_t type);
+
+/**
+ * Get keys matching a dname lf prefix
+ * @param cache cache structure
+ * @param name dname
+ * @param exact_name whether to only consider exact name matches
+ * @param keyval matched key-value pairs
+ * @param maxcount limit on the number of returned key-value pairs
+ * @return result count or an errcode
+ * @note the cache keys are matched by prefix, i.e. it very much depends
+ * on their structure; CACHE_KEY_DEF.
+ */
+KR_EXPORT
+int kr_cache_match(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, knot_db_val_t keyval[][2], int maxcount);
+
+/**
+ * Remove a subtree in cache. It works like kr_cache_match() but removes the matched entries instead of returning them.
+ * @return number of deleted entries or an errcode
+ */
+KR_EXPORT
+int kr_cache_remove_subtree(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, int maxcount);
+
+/**
+ * Find the closest cached zone apex for a name (in cache).
+ * @param is_DS start searching one name higher
+ * @return the number of labels to remove from the name, or negative error code
+ * @note timestamp is found by a syscall, and stale-serving is not considered
+ */
+KR_EXPORT
+int kr_cache_closest_apex(struct kr_cache *cache, const knot_dname_t *name, bool is_DS,
+ knot_dname_t **apex);
+
+/**
+ * Unpack dname and type from db key
+ * @param key db key representation
+ * @param buf output buffer of domain name in dname format
+ * @param type output for type
+ * @return length of dname or an errcode
+ * @note only "exact hits" are considered ATM, moreover xNAME records
+ * are "hidden" as NS. (see comments in struct entry_h)
+ */
+KR_EXPORT
+int kr_unpack_cache_key(knot_db_val_t key, knot_dname_t *buf, uint16_t *type);
diff --git a/lib/cache/cdb_api.h b/lib/cache/cdb_api.h
new file mode 100644
index 0000000..814a5f5
--- /dev/null
+++ b/lib/cache/cdb_api.h
@@ -0,0 +1,68 @@
+/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <libknot/db/db.h>
+
+/* Cache options. */
+struct kr_cdb_opts {
+ const char *path; /*!< Cache URI path. */
+ size_t maxsize; /*!< Suggested cache size in bytes. */
+};
+
+/*! Cache database API.
+ * This is a simplified version of generic DB API from libknot,
+ * that is tailored to caching purposes.
+ */
+struct kr_cdb_api {
+ const char *name;
+
+ /* Context operations */
+
+ int (*open)(knot_db_t **db, struct kr_cdb_opts *opts, knot_mm_t *mm);
+ void (*close)(knot_db_t *db);
+ int (*count)(knot_db_t *db);
+ int (*clear)(knot_db_t *db);
+
+ /** Run after a row of operations to release transaction/lock if needed. */
+ int (*sync)(knot_db_t *db);
+
+ /* Data access */
+
+ int (*read)(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount);
+ int (*write)(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount);
+
+ /** Remove maxcount keys.
+	 * \returns the number of successfully removed keys or the first error code
+ * It returns on first error, but ENOENT is not considered an error. */
+ int (*remove)(knot_db_t *db, knot_db_val_t keys[], int maxcount);
+
+ /* Specialised operations */
+
+ /** Find key-value pairs that are prefixed by the given key, limited by maxcount.
+ * \return the number of pairs or negative error. */
+ int (*match)(knot_db_t *db, knot_db_val_t *key, knot_db_val_t keyval[][2], int maxcount);
+ /** Not implemented ATM. */
+ int (*prune)(knot_db_t *db, int maxcount);
+
+ /** Less-or-equal search (lexicographic ordering).
+ * On successful return, key->data and val->data point to DB-owned data.
+ * return: 0 for equality, > 0 for less, < 0 kr_error */
+ int (*read_leq)(knot_db_t *db, knot_db_val_t *key, knot_db_val_t *val);
+};
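A self-contained usage sketch of this API with the LMDB backend from cdb_lmdb.h below; the path, sizes and value contents are illustrative and error handling is mostly omitted:

	#include "lib/cache/cdb_api.h"
	#include "lib/cache/cdb_lmdb.h"

	static void cdb_roundtrip_sketch(void)
	{
		const struct kr_cdb_api *api = kr_cdb_lmdb();
		struct kr_cdb_opts opts = { .path = "/tmp/kr_cache", .maxsize = 10 * 1024 * 1024 };
		knot_db_t *db = NULL;
		if (api->open(&db, &opts, NULL) != 0)
			return;

		knot_db_val_t key = { .data = "K", .len = 1 };
		knot_db_val_t val = { .data = "V", .len = 1 };
		api->write(db, &key, &val, 1);	/* runs inside the cached RW transaction */
		api->sync(db);			/* commits it */

		knot_db_val_t out = { NULL, 0 };
		api->read(db, &key, &out, 1);	/* out.data then points into the DB mapping */
		api->sync(db);			/* releases the RO transaction */

		api->close(db);
	}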
diff --git a/lib/cache/cdb_lmdb.c b/lib/cache/cdb_lmdb.c
new file mode 100644
index 0000000..621d2e8
--- /dev/null
+++ b/lib/cache/cdb_lmdb.c
@@ -0,0 +1,710 @@
+/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <lmdb.h>
+
+#include "contrib/cleanup.h"
+#include "lib/cache/cdb_lmdb.h"
+#include "lib/cache/api.h"
+#include "lib/utils.h"
+
+
+/* Defines */
+#define LMDB_DIR_MODE 0770
+#define LMDB_FILE_MODE 0660
+
+struct lmdb_env
+{
+ size_t mapsize;
+ MDB_dbi dbi;
+ MDB_env *env;
+
+ /** Cached transactions
+ *
+ * - only one of (ro,rw) may be active at once
+ * - non-NULL .ro may be active or reset
+ * - non-NULL .rw is always active
+ */
+ struct {
+ bool ro_active, ro_curs_active;
+ MDB_txn *ro, *rw;
+ MDB_cursor *ro_curs;
+ } txn;
+};
+
+/** @brief Convert LMDB error code. */
+static int lmdb_error(int error)
+{
+ /* _BAD_TXN may happen with overfull DB,
+ * even during mdb_get with a single fork :-/ */
+ if (error == MDB_BAD_TXN) {
+ kr_log_info("[cache] MDB_BAD_TXN, probably overfull\n");
+ error = ENOSPC;
+ }
+ switch (error) {
+ case MDB_SUCCESS:
+ return kr_ok();
+ case MDB_NOTFOUND:
+ return kr_error(ENOENT);
+ case ENOSPC:
+ case MDB_MAP_FULL:
+ case MDB_TXN_FULL:
+ return kr_error(ENOSPC);
+ default:
+ kr_log_error("[cache] LMDB error: %s\n", mdb_strerror(error));
+ return kr_error(error);
+ }
+}
+
+/** Conversion between knot and lmdb structs for values. */
+static inline knot_db_val_t val_mdb2knot(MDB_val v)
+{
+ return (knot_db_val_t){ .len = v.mv_size, .data = v.mv_data };
+}
+static inline MDB_val val_knot2mdb(knot_db_val_t v)
+{
+ return (MDB_val){ .mv_size = v.len, .mv_data = v.data };
+}
+
+
+/*! \brief Set the environment map size.
+ * \note This also sets the maximum database size, see \fn mdb_env_set_mapsize
+ */
+static int set_mapsize(MDB_env *env, size_t map_size)
+{
+ long page_size = sysconf(_SC_PAGESIZE);
+ if (page_size <= 0) {
+ return KNOT_ERROR;
+ }
+
+ /* Round to page size. */
+ map_size = (map_size / page_size) * page_size;
+ int ret = mdb_env_set_mapsize(env, map_size);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ return 0;
+}
+
+#define FLAG_RENEW (2*MDB_RDONLY)
+/** mdb_txn_begin or _renew + handle MDB_MAP_RESIZED.
+ *
+ * The retrying logic for MDB_MAP_RESIZED is so ugly that it has its own function.
+ * \note this assumes no transactions are active
+ * \return MDB_ errcode, not usual kr_error(...)
+ */
+static int txn_get_noresize(struct lmdb_env *env, unsigned int flag, MDB_txn **txn)
+{
+ assert(!env->txn.rw && (!env->txn.ro || !env->txn.ro_active));
+ int ret;
+ if (flag == FLAG_RENEW) {
+ ret = mdb_txn_renew(*txn);
+ } else {
+ ret = mdb_txn_begin(env->env, NULL, flag, txn);
+ }
+ if (ret != MDB_MAP_RESIZED) {
+ return ret;
+ }
+ //:unlikely
+ /* Another process increased the size; let's try to recover. */
+ kr_log_info("[cache] detected size increased by another process\n");
+ ret = mdb_env_set_mapsize(env->env, 0);
+ if (ret != MDB_SUCCESS) {
+ return ret;
+ }
+ if (flag == FLAG_RENEW) {
+ ret = mdb_txn_renew(*txn);
+ } else {
+ ret = mdb_txn_begin(env->env, NULL, flag, txn);
+ }
+ return ret;
+}
+
+/** Obtain a transaction. (they're cached in env->txn) */
+static int txn_get(struct lmdb_env *env, MDB_txn **txn, bool rdonly)
+{
+ assert(env && txn);
+ if (env->txn.rw) {
+ /* Reuse the *open* RW txn even if only reading is requested.
+ * We leave the management of this to the cdb_sync command.
+ * The user may e.g. want to do some reads between the writes. */
+ *txn = env->txn.rw;
+ return kr_ok();
+ }
+
+ if (!rdonly) {
+ /* avoid two active transactions */
+ if (env->txn.ro && env->txn.ro_active) {
+ mdb_txn_reset(env->txn.ro);
+ env->txn.ro_active = false;
+ env->txn.ro_curs_active = false;
+ }
+ int ret = txn_get_noresize(env, 0/*RW*/, &env->txn.rw);
+ if (ret == MDB_SUCCESS) {
+ *txn = env->txn.rw;
+ assert(*txn);
+ }
+ return lmdb_error(ret);
+ }
+
+ /* Get an active RO txn and return it. */
+ int ret = MDB_SUCCESS;
+ if (!env->txn.ro) { //:unlikely
+ ret = txn_get_noresize(env, MDB_RDONLY, &env->txn.ro);
+ } else if (!env->txn.ro_active) {
+ ret = txn_get_noresize(env, FLAG_RENEW, &env->txn.ro);
+ }
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+ env->txn.ro_active = true;
+ *txn = env->txn.ro;
+ assert(*txn);
+ return kr_ok();
+}
+
+static int cdb_sync(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ int ret = kr_ok();
+ if (env->txn.rw) {
+ ret = lmdb_error(mdb_txn_commit(env->txn.rw));
+ env->txn.rw = NULL; /* the transaction got freed even in case of errors */
+ } else if (env->txn.ro && env->txn.ro_active) {
+ mdb_txn_reset(env->txn.ro);
+ env->txn.ro_active = false;
+ env->txn.ro_curs_active = false;
+ }
+ return ret;
+}
+
+/** Obtain a read-only cursor (and a read-only transaction). */
+static int txn_curs_get(struct lmdb_env *env, MDB_cursor **curs)
+{
+ assert(env && curs);
+ if (env->txn.ro_curs_active) {
+ goto success;
+ }
+ /* Only in a read-only txn; TODO: it's a bit messy/coupled */
+ if (env->txn.rw) {
+ int ret = cdb_sync(env);
+ if (ret) return ret;
+ }
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret) return ret;
+
+ if (env->txn.ro_curs) {
+ ret = mdb_cursor_renew(txn, env->txn.ro_curs);
+ } else {
+ ret = mdb_cursor_open(txn, env->dbi, &env->txn.ro_curs);
+ }
+	if (ret) return lmdb_error(ret);
+ env->txn.ro_curs_active = true;
+success:
+ assert(env->txn.ro_curs_active && env->txn.ro && env->txn.ro_active
+ && !env->txn.rw);
+ *curs = env->txn.ro_curs;
+ assert(*curs);
+ return kr_ok();
+}
+
+static void free_txn_ro(struct lmdb_env *env)
+{
+ if (env->txn.ro) {
+ mdb_txn_abort(env->txn.ro);
+ env->txn.ro = NULL;
+ }
+ if (env->txn.ro_curs) {
+ mdb_cursor_close(env->txn.ro_curs);
+ env->txn.ro_curs = NULL;
+ }
+}
+
+/*! \brief Close the database. */
+static void cdb_close_env(struct lmdb_env *env)
+{
+ assert(env && env->env);
+
+ /* Get rid of any transactions. */
+ cdb_sync(env);
+ free_txn_ro(env);
+
+ mdb_env_sync(env->env, 1);
+ mdb_dbi_close(env->env, env->dbi);
+ mdb_env_close(env->env);
+ memset(env, 0, sizeof(*env));
+}
+
+/*! \brief Open database environment. */
+static int cdb_open_env(struct lmdb_env *env, unsigned flags, const char *path, size_t mapsize)
+{
+ int ret = mkdir(path, LMDB_DIR_MODE);
+ if (ret == -1 && errno != EEXIST) {
+ return kr_error(errno);
+ }
+
+ MDB_env *mdb_env = NULL;
+ ret = mdb_env_create(&mdb_env);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ ret = set_mapsize(mdb_env, mapsize);
+ if (ret != 0) {
+ mdb_env_close(mdb_env);
+ return ret;
+ }
+
+ ret = mdb_env_open(mdb_env, path, flags, LMDB_FILE_MODE);
+ if (ret != MDB_SUCCESS) {
+ mdb_env_close(mdb_env);
+ return lmdb_error(ret);
+ }
+
+ /* Keep the environment pointer. */
+ env->env = mdb_env;
+ env->mapsize = mapsize;
+ return 0;
+}
+
+static int cdb_open(struct lmdb_env *env, const char *path, size_t mapsize)
+{
+	/* Cache doesn't require durability, so we can be
+	 * loose with the requirements as a tradeoff for speed. */
+ const unsigned flags = MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOTLS;
+ int ret = cdb_open_env(env, flags, path, mapsize);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* Open the database. */
+ MDB_txn *txn = NULL;
+ ret = mdb_txn_begin(env->env, NULL, 0, &txn);
+ if (ret != MDB_SUCCESS) {
+ mdb_env_close(env->env);
+ return lmdb_error(ret);
+ }
+
+ ret = mdb_dbi_open(txn, NULL, 0, &env->dbi);
+ if (ret != MDB_SUCCESS) {
+ mdb_txn_abort(txn);
+ mdb_env_close(env->env);
+ return lmdb_error(ret);
+ }
+
+ ret = mdb_txn_commit(txn);
+ if (ret != MDB_SUCCESS) {
+ mdb_env_close(env->env);
+ return lmdb_error(ret);
+ }
+
+ return 0;
+}
+
+static int cdb_init(knot_db_t **db, struct kr_cdb_opts *opts, knot_mm_t *pool)
+{
+ if (!db || !opts) {
+ return kr_error(EINVAL);
+ }
+
+ struct lmdb_env *env = malloc(sizeof(*env));
+ if (!env) {
+ return kr_error(ENOMEM);
+ }
+ memset(env, 0, sizeof(struct lmdb_env));
+
+ /* Clear stale lockfiles. */
+ auto_free char *lockfile = kr_strcatdup(2, opts->path, "/.cachelock");
+ if (lockfile) {
+ if (unlink(lockfile) == 0) {
+ kr_log_info("[cache] cleared stale lockfile '%s'\n", lockfile);
+ } else if (errno != ENOENT) {
+ kr_log_info("[cache] failed to clear stale lockfile '%s': %s\n", lockfile,
+ strerror(errno));
+ }
+ }
+
+ /* Open the database. */
+ int ret = cdb_open(env, opts->path, opts->maxsize);
+ if (ret != 0) {
+ free(env);
+ return ret;
+ }
+
+ *db = env;
+ return 0;
+}
+
+static void cdb_deinit(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ cdb_close_env(env);
+ free(env);
+}
+
+static int cdb_count(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret != 0) {
+ return ret;
+ }
+
+ MDB_stat stat;
+ ret = mdb_stat(txn, env->dbi, &stat);
+
+ return (ret == MDB_SUCCESS) ? stat.ms_entries : lmdb_error(ret);
+}
+
+static int cdb_clear(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ /* First try mdb_drop() to clear the DB; this may fail with ENOSPC. */
+ /* If we didn't do this, explicit cache.clear() ran on an instance
+ * would lead to the instance detaching from the cache of others,
+ * until they reopened cache explicitly or cleared it for some reason.
+ */
+ {
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+ if (ret == kr_ok()) {
+ ret = lmdb_error(mdb_drop(txn, env->dbi, 0));
+ if (ret == kr_ok()) {
+ ret = cdb_sync(db);
+ }
+ if (ret == kr_ok()) {
+ return ret;
+ }
+ }
+ kr_log_info("[cache] clearing error, falling back\n");
+ }
+
+ /* We are about to switch to a different file, so end all txns, to be sure. */
+ (void) cdb_sync(db);
+ free_txn_ro(db);
+
+	/* Since there is no guarantee that there will be enough free
+	 * pages to hold the whole dirtied DB for a transaction-safe clear,
+	 * we simply remove the database files and reopen.
+	 * We can afford this since other readers will continue to read
+	 * from the removed file, but will reopen when encountering the next
+	 * error. */
+ mdb_filehandle_t fd = -1;
+ int ret = mdb_env_get_fd(env->env, &fd);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+ const char *path = NULL;
+ ret = mdb_env_get_path(env->env, &path);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ auto_free char *mdb_datafile = kr_strcatdup(2, path, "/data.mdb");
+ auto_free char *mdb_lockfile = kr_strcatdup(2, path, "/lock.mdb");
+ auto_free char *lockfile = kr_strcatdup(2, path, "/.cachelock");
+ if (!mdb_datafile || !mdb_lockfile || !lockfile) {
+ return kr_error(ENOMEM);
+ }
+ /* Find if we get a lock on lockfile. */
+ ret = open(lockfile, O_CREAT|O_EXCL|O_RDONLY, S_IRUSR);
+ if (ret == -1) {
+ kr_log_error("[cache] clearing failed to get ./.cachelock; retry later\n");
+ /* As we're out of space (almost certainly - mdb_drop didn't work),
+ * we will retry on the next failing write operation. */
+ return kr_error(errno);
+ }
+ close(ret);
+ /* We acquired lockfile. Now find whether *.mdb are what we have open now. */
+ struct stat old_stat, new_stat;
+ if (fstat(fd, &new_stat) || stat(mdb_datafile, &old_stat)) {
+ ret = errno;
+ unlink(lockfile);
+ return kr_error(ret);
+ }
+ /* Remove underlying files only if current open environment
+ * points to file on the disk. Otherwise just reopen as someone
+ * else has already removed the files.
+ */
+ if (old_stat.st_dev == new_stat.st_dev && old_stat.st_ino == new_stat.st_ino) {
+ kr_log_verbose("[cache] clear: identical files, unlinking\n");
+ // coverity[toctou]
+ unlink(mdb_datafile);
+ unlink(mdb_lockfile);
+ } else
+ kr_log_verbose("[cache] clear: not identical files, reopening\n");
+ /* Keep copy as it points to current handle internals. */
+ auto_free char *path_copy = strdup(path);
+ size_t mapsize = env->mapsize;
+ cdb_close_env(env);
+ ret = cdb_open(env, path_copy, mapsize);
+ /* Environment updated, release lockfile. */
+ unlink(lockfile);
+ return ret;
+}
+
+static int cdb_readv(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret) {
+ return ret;
+ }
+
+ for (int i = 0; i < maxcount; ++i) {
+ /* Convert key structs */
+ MDB_val _key = val_knot2mdb(key[i]);
+ MDB_val _val = val_knot2mdb(val[i]);
+ ret = mdb_get(txn, env->dbi, &_key, &_val);
+ if (ret != MDB_SUCCESS) {
+ ret = lmdb_error(ret);
+ if (ret == kr_error(ENOSPC)) {
+ /* we're likely to be forced to cache clear anyway */
+ ret = kr_error(ENOENT);
+ }
+ return ret;
+ }
+ /* Update the result. */
+ val[i] = val_mdb2knot(_val);
+ }
+ return kr_ok();
+}
+
+static int cdb_write(struct lmdb_env *env, MDB_txn **txn, const knot_db_val_t *key,
+ knot_db_val_t *val, unsigned flags)
+{
+ /* Convert key structs and write */
+ MDB_val _key = val_knot2mdb(*key);
+ MDB_val _val = val_knot2mdb(*val);
+ int ret = mdb_put(*txn, env->dbi, &_key, &_val, flags);
+
+ /* Try to recover from doing too much writing in a single transaction. */
+	if (ret == MDB_TXN_FULL) {
+		ret = cdb_sync(env);
+		if (ret == kr_ok()) {
+			ret = txn_get(env, txn, false);
+		}
+		if (ret == kr_ok()) {
+			ret = mdb_put(*txn, env->dbi, &_key, &_val, flags);
+		}
+	}
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ /* Update the result. */
+ val->data = _val.mv_data;
+ val->len = _val.mv_size;
+ return kr_ok();
+}
+
+static int cdb_writev(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+
+ for (int i = 0; ret == kr_ok() && i < maxcount; ++i) {
+		/* This is an LMDB-specific optimisation: if the caller specifies
+		 * a value with NULL data and non-zero length, LMDB preallocates
+		 * the entry for the caller and leaves the write transaction open;
+		 * the caller is then responsible for syncing, i.e. committing the transaction.
+		 */
+ unsigned mdb_flags = 0;
+ if (val[i].len > 0 && val[i].data == NULL) {
+ mdb_flags |= MDB_RESERVE;
+ }
+ ret = cdb_write(env, &txn, &key[i], &val[i], mdb_flags);
+ }
+
+ return ret;
+}
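A sketch of the MDB_RESERVE path from the caller's side (this mirrors how entry_h_splice() and the stashing code in api.c allocate entries in place; the names needed_len/payload are illustrative):

	/*	knot_db_val_t val = { .data = NULL, .len = needed_len };
	 *	ret = cache_op(cache, write, &key, &val, 1);
	 *	if (ret == 0) {
	 *		// val.data now points into the open LMDB write transaction,
	 *		// so the entry is filled in place and committed later:
	 *		memcpy(val.data, payload, needed_len);
	 *		kr_cache_sync(cache);
	 *	}
	 */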
+
+static int cdb_remove(knot_db_t *db, knot_db_val_t keys[], int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+ int deleted = 0;
+
+ for (int i = 0; ret == kr_ok() && i < maxcount; ++i) {
+ MDB_val _key = val_knot2mdb(keys[i]);
+ MDB_val val = { 0, NULL };
+ ret = lmdb_error(mdb_del(txn, env->dbi, &_key, &val));
+ if (ret == kr_ok())
+ deleted++;
+ else if (ret == KNOT_ENOENT)
+ ret = kr_ok(); /* skip over non-existing entries */
+ }
+
+ return ret < 0 ? ret : deleted;
+}
+
+static int cdb_match(knot_db_t *db, knot_db_val_t *key, knot_db_val_t keyval[][2], int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* LATER(optim.): use txn_curs_get() instead, to save resources. */
+ MDB_cursor *cur = NULL;
+ ret = mdb_cursor_open(txn, env->dbi, &cur);
+ if (ret != 0) {
+ return lmdb_error(ret);
+ }
+
+ MDB_val cur_key = val_knot2mdb(*key);
+ MDB_val cur_val = { 0, NULL };
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_SET_RANGE);
+ if (ret != MDB_SUCCESS) {
+ mdb_cursor_close(cur);
+ return lmdb_error(ret);
+ }
+
+ int results = 0;
+ while (ret == MDB_SUCCESS) {
+ /* Retrieve current key and compare with prefix */
+ if (cur_key.mv_size < key->len || memcmp(cur_key.mv_data, key->data, key->len) != 0) {
+ break;
+ }
+ /* Add to result set */
+ if (results < maxcount) {
+ keyval[results][0] = val_mdb2knot(cur_key);
+ keyval[results][1] = val_mdb2knot(cur_val);
+ ++results;
+ } else {
+ break;
+ }
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ }
+
+ mdb_cursor_close(cur);
+ return results;
+}
+
+
+static int cdb_prune(knot_db_t *db, int limit)
+{
+ return -1;
+#if 0
+ /* Sync in-flight transactions */
+ cdb_sync(db);
+
+ /* Prune old records */
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+ if (ret != 0) {
+ return ret;
+ }
+
+ MDB_cursor *cur = NULL;
+ ret = mdb_cursor_open(txn, env->dbi, &cur);
+ if (ret != 0) {
+ return lmdb_error(ret);
+ }
+
+ MDB_val cur_key, cur_val;
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_FIRST);
+ if (ret != 0) {
+ mdb_cursor_close(cur);
+ return lmdb_error(ret);
+ }
+
+ int results = 0;
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ while (ret == 0 && results < limit) {
+ /* Ignore special namespaces. */
+ if (cur_key.mv_size < 2 || ((const char *)cur_key.mv_data)[0] == 'V') {
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ continue;
+ }
+ /* Check entry age. */
+ struct kr_cache_entry *entry = cur_val.mv_data;
+ if (entry->timestamp > now.tv_sec ||
+ (now.tv_sec - entry->timestamp) < entry->ttl) {
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ continue;
+ }
+ /* Remove entry */
+ mdb_cursor_del(cur, 0);
+ ++results;
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ }
+ mdb_cursor_close(cur);
+ return ret < 0 ? ret : results;
+#endif
+}
+
+static int cdb_read_leq(knot_db_t *env, knot_db_val_t *key, knot_db_val_t *val)
+{
+ assert(env && key && key->data && val);
+ MDB_cursor *curs = NULL;
+ int ret = txn_curs_get(env, &curs);
+ if (ret) return ret;
+
+ MDB_val key2_m = val_knot2mdb(*key);
+ MDB_val val2_m = { 0, NULL };
+ ret = mdb_cursor_get(curs, &key2_m, &val2_m, MDB_SET_RANGE);
+ if (ret) return lmdb_error(ret);
+ /* test for equality //:unlikely */
+ if (key2_m.mv_size == key->len
+ && memcmp(key2_m.mv_data, key->data, key->len) == 0) {
+ ret = 0; /* equality */
+ goto success;
+ }
+ /* we must be greater than key; do one step to smaller */
+ ret = mdb_cursor_get(curs, &key2_m, &val2_m, MDB_PREV);
+ if (ret) return lmdb_error(ret);
+ ret = 1;
+success:
+ /* finalize the output */
+ *key = val_mdb2knot(key2_m);
+ *val = val_mdb2knot(val2_m);
+ return ret;
+}
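How callers are expected to interpret the return value of read_leq() (a sketch; variable names are illustrative):

	/*	knot_db_val_t key = ..., val = { NULL, 0 };
	 *	int ret = api->read_leq(db, &key, &val);
	 *	if (ret < 0)       { }	// kr_error code: the lookup failed
	 *	else if (ret == 0) { }	// exact match; key/val point to DB-owned data
	 *	else               { }	// ret > 0: key was rewritten to the nearest smaller key
	 */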
+
+
+const struct kr_cdb_api *kr_cdb_lmdb(void)
+{
+ static const struct kr_cdb_api api = {
+ "lmdb",
+ cdb_init, cdb_deinit, cdb_count, cdb_clear, cdb_sync,
+ cdb_readv, cdb_writev, cdb_remove,
+ cdb_match, cdb_prune,
+ cdb_read_leq
+ };
+
+ return &api;
+}
diff --git a/lib/cache/cdb_lmdb.h b/lib/cache/cdb_lmdb.h
new file mode 100644
index 0000000..bc2c7d6
--- /dev/null
+++ b/lib/cache/cdb_lmdb.h
@@ -0,0 +1,23 @@
+/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "lib/cache/cdb_api.h"
+#include "lib/defines.h"
+
+KR_EXPORT KR_CONST
+const struct kr_cdb_api *kr_cdb_lmdb(void);
diff --git a/lib/cache/entry_list.c b/lib/cache/entry_list.c
new file mode 100644
index 0000000..6a5001c
--- /dev/null
+++ b/lib/cache/entry_list.c
@@ -0,0 +1,293 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of chaining in struct entry_h. Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+#include "lib/utils.h"
+
+
+static int entry_h_len(knot_db_val_t val);
+
+
+void entry_list_memcpy(struct entry_apex *ea, entry_list_t list)
+{
+ assert(ea);
+ memset(ea, 0, offsetof(struct entry_apex, data));
+ ea->has_ns = list[EL_NS ].len;
+ ea->has_cname = list[EL_CNAME ].len;
+ ea->has_dname = list[EL_DNAME ].len;
+ for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
+ ea->nsecs[i] = list[i].len == 0 ? 0 :
+ (list[i].len == 4 ? 1 : 3);
+ }
+ uint8_t *it = ea->data;
+ for (int i = 0; i < EL_LENGTH; ++i) {
+ if (list[i].data) {
+ memcpy(it, list[i].data, list[i].len);
+ /* LATER(optim.): coalesce consecutive writes? */
+ } else {
+ list[i].data = it;
+ }
+ it += to_even(list[i].len);
+ }
+}
+
+int entry_list_parse(const knot_db_val_t val, entry_list_t list)
+{
+ const bool ok = val.data && val.len && list;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ /* Parse the apex itself (nsec parameters). */
+ const struct entry_apex *ea = entry_apex_consistent(val);
+ if (!ea) {
+ return kr_error(EILSEQ);
+ }
+ const uint8_t *it = ea->data,
+ *it_bound = knot_db_val_bound(val);
+ for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
+ if (it > it_bound) {
+ return kr_error(EILSEQ);
+ }
+ list[i].data = (void *)it;
+ switch (ea->nsecs[i]) {
+ case 0:
+ list[i].len = 0;
+ break;
+ case 1:
+ list[i].len = sizeof(uint32_t); /* just timestamp */
+ break;
+ case 3: { /* timestamp + NSEC3PARAM wire */
+ if (it + sizeof(uint32_t) + 4 > it_bound) {
+ return kr_error(EILSEQ);
+ }
+ list[i].len = sizeof(uint32_t)
+ + nsec_p_rdlen(it + sizeof(uint32_t));
+ break;
+ }
+ default:
+ return kr_error(EILSEQ);
+ };
+ it += to_even(list[i].len);
+ }
+ /* Parse every entry_h. */
+ for (int i = ENTRY_APEX_NSECS_CNT; i < EL_LENGTH; ++i) {
+ list[i].data = (void *)it;
+ bool has_type;
+ switch (i) {
+ case EL_NS: has_type = ea->has_ns; break;
+ case EL_CNAME: has_type = ea->has_cname; break;
+ case EL_DNAME: has_type = ea->has_dname; break;
+ default: assert(false); return kr_error(EINVAL); /* something very bad */
+ }
+ if (!has_type) {
+ list[i].len = 0;
+ continue;
+ }
+ if (it >= it_bound) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ const int len = entry_h_len(
+ (knot_db_val_t){ .data = (void *)it, .len = it_bound - it });
+ if (len < 0) {
+ assert(false);
+ return kr_error(len);
+ }
+ list[i].len = len;
+ it += to_even(len);
+ }
+ assert(it == it_bound);
+ return kr_ok();
+}
+
+/** Given a valid entry header, find its length (i.e. offset of the next entry).
+ * \param val The beginning of the data and the bound (read only).
+ */
+static int entry_h_len(const knot_db_val_t val)
+{
+ const bool ok = val.data && ((ssize_t)val.len) > 0;
+ if (!ok) return kr_error(EINVAL);
+ const struct entry_h *eh = val.data;
+ const uint8_t *d = eh->data; /* iterates over the data in entry */
+ const uint8_t *data_bound = knot_db_val_bound(val);
+ if (d >= data_bound) return kr_error(EILSEQ);
+ if (!eh->is_packet) { /* Positive RRset + its RRsig set (may be empty). */
+ int sets = 2;
+ while (sets-- > 0) {
+ d += rdataset_dematerialized_size(d);
+ if (d > data_bound) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ }
+ } else { /* A "packet" (opaque ATM). */
+ uint16_t len;
+ if (d + sizeof(len) > data_bound) return kr_error(EILSEQ);
+ memcpy(&len, d, sizeof(len));
+ d += 2 + to_even(len);
+ }
+ if (d > data_bound) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ return d - (uint8_t *)val.data;
+}
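+/* Layout of entry_h::data walked above (a sketch, inferred from this file):
+ *   is_packet == false: serialized RRset followed by its serialized RRSIG set
+ *                       (each is uint16_t count + rdata; count may be 0);
+ *   is_packet == true:  uint16_t length + packet wire, padded to even length. */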
+
+struct entry_apex * entry_apex_consistent(knot_db_val_t val)
+{
+ //XXX: check lengths, etc.
+ return val.data;
+}
+
+/* See the header file. */
+int entry_h_seek(knot_db_val_t *val, uint16_t type)
+{
+ int i = -1;
+ switch (type) {
+ case KNOT_RRTYPE_NS: i = EL_NS; break;
+ case KNOT_RRTYPE_CNAME: i = EL_CNAME; break;
+ case KNOT_RRTYPE_DNAME: i = EL_DNAME; break;
+ default: return kr_ok();
+ }
+
+ entry_list_t el;
+ int ret = entry_list_parse(*val, el);
+ if (ret) return ret;
+ *val = el[i];
+ return val->len ? kr_ok() : kr_error(ENOENT);
+}
+
+static int cache_write_or_clear(struct kr_cache *cache, const knot_db_val_t *key,
+ knot_db_val_t *val, const struct kr_query *qry)
+{
+ int ret = cache_op(cache, write, key, val, 1);
+ if (!ret) return kr_ok();
+ /* Clear cache if overfull. It's nontrivial to do better with LMDB.
+ * LATER: some garbage-collection mechanism. */
+ if (ret == kr_error(ENOSPC)) {
+ ret = kr_cache_clear(cache);
+ const char *msg = "[cache] clearing because overfull, ret = %d\n";
+ if (ret) {
+ kr_log_error(msg, ret);
+ } else {
+ kr_log_info(msg, ret);
+ ret = kr_error(ENOSPC);
+ }
+ return ret;
+ }
+ VERBOSE_MSG(qry, "=> failed backend write, ret = %d\n", ret);
+ return kr_error(ret ? ret : ENOSPC);
+}
+
+
+/* See the header file. */
+int entry_h_splice(
+ knot_db_val_t *val_new_entry, uint8_t rank,
+ const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
+ const knot_dname_t *owner/*log only*/,
+ const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp)
+{
+	//TODO: another review, perhaps including the API
+ const bool ok = val_new_entry && val_new_entry->len > 0;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ int i_type;
+ switch (type) {
+ case KNOT_RRTYPE_NS: i_type = EL_NS; break;
+ case KNOT_RRTYPE_CNAME: i_type = EL_CNAME; break;
+ case KNOT_RRTYPE_DNAME: i_type = EL_DNAME; break;
+ default: i_type = 0;
+ }
+
+ /* Get eh_orig (original entry), and also el list if multi-entry case. */
+ const struct entry_h *eh_orig = NULL;
+ entry_list_t el;
+ int ret = -1;
+ if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) {
+ knot_db_val_t val;
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (i_type) {
+ if (!ret) ret = entry_list_parse(val, el);
+ if (ret) memset(el, 0, sizeof(el));
+ val = el[i_type];
+ }
+ /* val is on the entry, in either case (or error) */
+ if (!ret) {
+ eh_orig = entry_h_consistent(val, type);
+ }
+ } else {
+ /* We want to fully overwrite the entry, so don't even read it. */
+ memset(el, 0, sizeof(el));
+ }
+
+ if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) {
+ /* If equal rank was accepted, spoofing a *single* answer would be
+ * enough to e.g. override NS record in AUTHORITY section.
+ * This way they would have to hit the first answer
+ * (whenever TTL nears expiration).
+ * Stale-serving is NOT considered, but TTL 1 would be considered
+ * as expiring anyway, ... */
+ int32_t old_ttl = get_new_ttl(eh_orig, qry, NULL, 0, timestamp);
+ if (old_ttl > 0 && !is_expiring(eh_orig->ttl, old_ttl)
+ && rank <= eh_orig->rank) {
+ WITH_VERBOSE(qry) {
+ auto_free char *type_str = kr_rrtype_text(type),
+ *owner_str = kr_dname_text(owner);
+ VERBOSE_MSG(qry, "=> not overwriting %s %s\n",
+ type_str, owner_str);
+ }
+ return kr_error(EEXIST);
+ }
+ }
+
+ if (!i_type) {
+ /* The non-list types are trivial now. */
+ return cache_write_or_clear(cache, &key, val_new_entry, qry);
+ }
+	/* Now we're in trouble. In some cases, part of the data to be written
+	 * is an lmdb entry that may be invalidated by our write request.
+	 * (lmdb does even in-place updates!) Therefore we copy it all into a buffer.
+	 * (We don't bother deallocating from the mempool.)
+	 * LATER(optim.): do this only when necessary, or perhaps another approach.
+ * This is also complicated by the fact that the val_new_entry part
+ * is to be written *afterwards* by the caller.
+ */
+ el[i_type] = (knot_db_val_t){
+ .len = val_new_entry->len,
+ .data = NULL, /* perhaps unclear in the entry_h_splice() API */
+ };
+ knot_db_val_t val = {
+ .len = entry_list_serial_size(el),
+ .data = NULL,
+ };
+ void *buf = mm_alloc(&qry->request->pool, val.len);
+ entry_list_memcpy(buf, el);
+ ret = cache_write_or_clear(cache, &key, &val, qry);
+ if (ret) return kr_error(ret);
+ memcpy(val.data, buf, val.len); /* we also copy the "empty" space, but well... */
+ val_new_entry->data = (uint8_t *)val.data
+ + ((uint8_t *)el[i_type].data - (uint8_t *)buf);
+ return kr_ok();
+}
+
diff --git a/lib/cache/entry_pkt.c b/lib/cache/entry_pkt.c
new file mode 100644
index 0000000..00e73d4
--- /dev/null
+++ b/lib/cache/entry_pkt.c
@@ -0,0 +1,224 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of packet-caching. Prototypes in ./impl.h
+ *
+ * The packet is stashed in entry_h::data as uint16_t length + full packet wire format.
+ */
+
+#include "lib/utils.h"
+#include "lib/layer/iterate.h" /* kr_response_classify */
+#include "lib/cache/impl.h"
+
+
+/** Compute TTL for a packet. Generally it's minimum TTL, with extra conditions. */
+static uint32_t packet_ttl(const knot_pkt_t *pkt, bool is_negative)
+{
+ bool has_ttl = false;
+ uint32_t ttl = UINT32_MAX;
+ /* Find minimum entry TTL in the packet or SOA minimum TTL. */
+ for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
+ const knot_pktsection_t *sec = knot_pkt_section(pkt, i);
+ for (unsigned k = 0; k < sec->count; ++k) {
+ const knot_rrset_t *rr = knot_pkt_rr(sec, k);
+ if (is_negative) {
+ /* Use SOA minimum TTL for negative answers. */
+ if (rr->type == KNOT_RRTYPE_SOA) {
+ return MIN(rr->ttl, knot_soa_minimum(rr->rrs.rdata));
+ } else {
+ continue; /* Use SOA only for negative answers. */
+ }
+ }
+ if (knot_rrtype_is_metatype(rr->type)) {
+ continue; /* Skip metatypes. */
+ }
+			ttl = MIN(ttl, rr->ttl);
+			has_ttl = true;
+ }
+ }
+ /* If no valid TTL present, go with zero (will get clamped to minimum). */
+ return has_ttl ? ttl : 0;
+}
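+/* Illustration (not from the sources): for an NXDOMAIN answer carrying a SOA
+ * with TTL 300 and SOA minimum 60, packet_ttl() returns MIN(300, 60) == 60;
+ * for a positive answer it is the minimum TTL over all non-meta records. */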
+
+
+void stash_pkt(const knot_pkt_t *pkt, const struct kr_query *qry,
+ const struct kr_request *req, const bool has_optout)
+{
+ /* In some cases, stash also the packet. */
+ const bool is_negative = kr_response_classify(pkt)
+ & (PKT_NODATA|PKT_NXDOMAIN);
+ const struct kr_qflags * const qf = &qry->flags;
+ const bool want_negative = qf->DNSSEC_INSECURE || !qf->DNSSEC_WANT || has_optout;
+ const bool want_pkt = qf->DNSSEC_BOGUS /*< useful for +cd answers */
+ || (is_negative && want_negative);
+
+ if (!want_pkt || !knot_wire_get_aa(pkt->wire)
+ || pkt->parsed != pkt->size /*< malformed packet; still can't detect KNOT_EFEWDATA */
+ ) {
+ return;
+ }
+
+ /* Compute rank. If cd bit is set or we got answer via non-validated
+ * forwarding, make the rank bad; otherwise it depends on flags.
+ * TODO: probably make validator attempt validation even with +cd. */
+ uint8_t rank = KR_RANK_AUTH;
+ const bool risky_vldr = is_negative && qf->FORWARD && qf->CNAME;
+ /* ^^ CNAME'ed NXDOMAIN answer in forwarding mode can contain
+ * unvalidated records; original commit: d6e22f476. */
+ if (knot_wire_get_cd(req->qsource.packet->wire) || qf->STUB || risky_vldr) {
+ kr_rank_set(&rank, KR_RANK_OMIT);
+ } else {
+ if (qf->DNSSEC_BOGUS) {
+ kr_rank_set(&rank, KR_RANK_BOGUS);
+ } else if (qf->DNSSEC_INSECURE) {
+ kr_rank_set(&rank, KR_RANK_INSECURE);
+ } else if (!qf->DNSSEC_WANT) {
+ /* no TAs at all, leave _RANK_AUTH */
+ } else if (has_optout) {
+ /* All bad cases should be filtered above,
+ * at least the same way as pktcache in kresd 1.5.x. */
+ kr_rank_set(&rank, KR_RANK_SECURE);
+ } else assert(false);
+ }
+
+ const uint16_t pkt_type = knot_pkt_qtype(pkt);
+ const knot_dname_t *owner = knot_pkt_qname(pkt); /* qname can't be compressed */
+
+ // LATER: nothing exists under NXDOMAIN. Implement that (optionally)?
+#if 0
+ if (knot_wire_get_rcode(pkt->wire) == KNOT_RCODE_NXDOMAIN
+ /* && !qf->DNSSEC_INSECURE */ ) {
+ pkt_type = KNOT_RRTYPE_NS;
+ }
+#endif
+
+ /* Construct the key under which the pkt will be stored. */
+ struct key k_storage, *k = &k_storage;
+ knot_db_val_t key;
+ int ret = kr_dname_lf(k->buf, owner, false);
+ if (ret) {
+ /* A server might (incorrectly) reply with QDCOUNT=0. */
+ assert(owner == NULL);
+ return;
+ }
+ key = key_exact_type_maypkt(k, pkt_type);
+
+ /* For now we stash the full packet byte-exactly as it came from upstream. */
+ const uint16_t pkt_size = pkt->size;
+ knot_db_val_t val_new_entry = {
+ .data = NULL,
+ .len = offsetof(struct entry_h, data) + sizeof(pkt_size) + pkt->size,
+ };
+ /* Prepare raw memory for the new entry and fill it. */
+ struct kr_cache *cache = &req->ctx->cache;
+ ret = entry_h_splice(&val_new_entry, rank, key, k->type, pkt_type,
+ owner, qry, cache, qry->timestamp.tv_sec);
+ if (ret) return; /* some aren't really errors */
+ assert(val_new_entry.data);
+ struct entry_h *eh = val_new_entry.data;
+ memset(eh, 0, offsetof(struct entry_h, data));
+ eh->time = qry->timestamp.tv_sec;
+ eh->ttl = MAX(MIN(packet_ttl(pkt, is_negative), cache->ttl_max), cache->ttl_min);
+ eh->rank = rank;
+ eh->is_packet = true;
+ eh->has_optout = qf->DNSSEC_OPTOUT;
+ memcpy(eh->data, &pkt_size, sizeof(pkt_size));
+ memcpy(eh->data + sizeof(pkt_size), pkt->wire, pkt_size);
+
+ WITH_VERBOSE(qry) {
+ auto_free char *type_str = kr_rrtype_text(pkt_type),
+ *owner_str = kr_dname_text(owner);
+ VERBOSE_MSG(qry, "=> stashed packet: rank 0%.2o, TTL %d, "
+ "%s %s (%d B)\n",
+ eh->rank, eh->ttl,
+ type_str, owner_str, (int)val_new_entry.len);
+ }
+}
+
+
+int answer_from_pkt(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+
+ uint16_t pkt_len;
+ memcpy(&pkt_len, eh->data, sizeof(pkt_len));
+ if (pkt_len > pkt->max_size) {
+ return kr_error(ENOENT);
+ }
+
+ /* Copy answer and reparse it, but keep the original message id. */
+ uint16_t msgid = knot_wire_get_id(pkt->wire);
+ knot_pkt_clear(pkt);
+ memcpy(pkt->wire, eh->data + 2, pkt_len);
+ pkt->size = pkt_len;
+ int ret = knot_pkt_parse(pkt, 0);
+ if (ret == KNOT_EFEWDATA || ret == KNOT_EMALF) {
+ return kr_error(ENOENT);
+ /* LATER(opt): try harder to avoid stashing such packets */
+ }
+ if (ret != KNOT_EOK) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ knot_wire_set_id(pkt->wire, msgid);
+
+ /* Add rank into the additional field. */
+ for (size_t i = 0; i < pkt->rrset_count; ++i) {
+ assert(!pkt->rr[i].additional);
+ uint8_t *rr_rank = mm_alloc(&pkt->mm, sizeof(*rr_rank));
+ if (!rr_rank) {
+ return kr_error(ENOMEM);
+ }
+ *rr_rank = eh->rank;
+ pkt->rr[i].additional = rr_rank;
+ }
+
+ /* Adjust TTL in each record. */
+ const uint32_t drift = eh->ttl - new_ttl;
+ for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
+ const knot_pktsection_t *sec = knot_pkt_section(pkt, i);
+ for (unsigned k = 0; k < sec->count; ++k) {
+ knot_rrset_t *rrs = // vv FIXME??
+ /*const-cast*/(knot_rrset_t *)knot_pkt_rr(sec, k);
+ /* We need to be careful: due to enforcing minimum TTL
+ * on packet, some records may be below that value.
+ * We keep those records at TTL 0. */
+ if (rrs->ttl >= drift) {
+ rrs->ttl -= drift;
+ } else {
+ rrs->ttl = 0;
+ }
+ }
+ }
+
+ /* Finishing touches. TODO: perhaps factor out */
+ struct kr_qflags * const qf = &qry->flags;
+ qf->EXPIRING = is_expiring(eh->ttl, new_ttl);
+ qf->CACHED = true;
+ qf->NO_MINIMIZE = true;
+ qf->DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE);
+ qf->DNSSEC_BOGUS = kr_rank_test(eh->rank, KR_RANK_BOGUS);
+ if (qf->DNSSEC_INSECURE || qf->DNSSEC_BOGUS) {
+ qf->DNSSEC_WANT = false;
+ }
+ qf->DNSSEC_OPTOUT = eh->has_optout;
+ VERBOSE_MSG(qry, "=> satisfied by exact packet: rank 0%.2o, new TTL %d\n",
+ eh->rank, new_ttl);
+ return kr_ok();
+}
+
diff --git a/lib/cache/entry_rr.c b/lib/cache/entry_rr.c
new file mode 100644
index 0000000..94dd0e8
--- /dev/null
+++ b/lib/cache/entry_rr.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of RRset (de)materialization, i.e. (de)serialization to storage
+ * format used in cache (some repeated fields are omitted). Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+
+int rdataset_dematerialize(const knot_rdataset_t *rds, uint8_t * restrict data)
+{
+ /* FIXME: either give up on even alignment and thus direct usability
+ * of rdatasets as they are in lmdb, or align inside cdb_* functions
+ * (request sizes one byte longer and shift iff on an odd address). */
+ //if ((size_t)data & 1) VERBOSE_MSG(NULL, "dematerialize: odd address\n");
+ //const uint8_t *data0 = data;
+ if (!data) {
+ assert(data);
+ return kr_error(EINVAL);
+ }
+ const uint16_t rr_count = rds ? rds->count : 0;
+ memcpy(data, &rr_count, sizeof(rr_count));
+ data += sizeof(rr_count);
+ if (rr_count) {
+ size_t size = knot_rdataset_size(rds);
+ memcpy(data, rds->rdata, size);
+ data += size;
+ }
+ //VERBOSE_MSG(NULL, "dematerialized to %d B\n", (int)(data - data0));
+ (void)data;
+ return kr_ok();
+}
+
+/** Materialize a knot_rdataset_t from cache with given TTL.
+ * Return the number of bytes consumed or an error code.
+ */
+static int rdataset_materialize(knot_rdataset_t * restrict rds, const uint8_t * const data,
+ const uint8_t *data_bound, knot_mm_t *pool)
+{
+ assert(rds && data && data_bound && data_bound > data && !rds->rdata
+ /*&& !((size_t)data & 1)*/);
+ assert(pool); /* not required, but that's our current usage; guard leaks */
+ const uint8_t *d = data; /* iterates over the cache data */
+ {
+ uint16_t rr_count;
+ memcpy(&rr_count, d, sizeof(rr_count));
+ d += sizeof(rr_count);
+ rds->count = rr_count;
+ if (!rr_count) { /* avoid mm_alloc(pool, 0); etc. */
+ return d - data;
+ }
+ }
+ /* First sum up the sizes for wire format length. */
+ const knot_rdataset_t rds_tmp = {
+ .count = rds->count,
+ .rdata = (knot_rdata_t *)d,
+ };
+ size_t rds_size = knot_rdataset_size(&rds_tmp); /* TODO: we might overrun here already,
+ but we need to trust cache anyway...*/
+ if (d + rds_size > data_bound) {
+ VERBOSE_MSG(NULL, "materialize: EILSEQ!\n");
+ return kr_error(EILSEQ);
+ }
+ rds->rdata = mm_alloc(pool, rds_size);
+ if (!rds->rdata) {
+ return kr_error(ENOMEM);
+ }
+ memcpy(rds->rdata, d, rds_size);
+ d += rds_size;
+ //VERBOSE_MSG(NULL, "materialized from %d B\n", (int)(d - data));
+ return d - data;
+}
+
+int kr_cache_materialize(knot_rdataset_t *dst, const struct kr_cache_p *ref,
+ knot_mm_t *pool)
+{
+ struct entry_h *eh = ref->raw_data;
+ return rdataset_materialize(dst, eh->data, ref->raw_bound, pool);
+}
+
+
+int entry2answer(struct answer *ans, int id,
+ const struct entry_h *eh, const uint8_t *eh_bound,
+ const knot_dname_t *owner, uint16_t type, uint32_t new_ttl)
+{
+ /* We assume it's zeroed. Do basic sanity check. */
+ if (ans->rrsets[id].set.rr || ans->rrsets[id].sig_rds.rdata
+ || (type == KNOT_RRTYPE_NSEC && ans->nsec_p.raw)
+ || (type == KNOT_RRTYPE_NSEC3 && !ans->nsec_p.raw)
+ )
+ {
+ assert(false);
+ return kr_error(EINVAL);
+ }
+ /* Materialize the base RRset. */
+ knot_rrset_t *rr = ans->rrsets[id].set.rr
+ = knot_rrset_new(owner, type, KNOT_CLASS_IN, new_ttl, ans->mm);
+ if (!rr) {
+ assert(!ENOMEM);
+ return kr_error(ENOMEM);
+ }
+ int ret = rdataset_materialize(&rr->rrs, eh->data, eh_bound, ans->mm);
+ if (ret < 0) goto fail;
+ size_t data_off = ret;
+ ans->rrsets[id].set.rank = eh->rank;
+ ans->rrsets[id].set.expiring = is_expiring(eh->ttl, new_ttl);
+ /* Materialize the RRSIG RRset for the answer in (pseudo-)packet. */
+ bool want_rrsigs = true; /* LATER(optim.): might be omitted in some cases. */
+ if (want_rrsigs) {
+ ret = rdataset_materialize(&ans->rrsets[id].sig_rds, eh->data + data_off,
+ eh_bound, ans->mm);
+ if (ret < 0) goto fail;
+ /* Sanity check: we consumed exactly all data. */
+ int unused_bytes = eh_bound - (uint8_t *)eh->data - data_off - ret;
+ if (unused_bytes) {
+ kr_log_error("[cach] entry2answer ERROR: unused bytes: %d\n",
+ unused_bytes);
+ assert(!EILSEQ);
+ ret = kr_error(EILSEQ);
+ goto fail; /* to be on the safe side */
+ }
+ }
+ return kr_ok();
+fail:
+ assert(/*false*/!ret);
+ /* Cleanup the item that we might've (partially) written to. */
+ knot_rrset_free(ans->rrsets[id].set.rr, ans->mm);
+ knot_rdataset_clear(&ans->rrsets[id].sig_rds, ans->mm);
+ memset(&ans->rrsets[id], 0, sizeof(ans->rrsets[id]));
+ return kr_error(ret);
+}
+
diff --git a/lib/cache/impl.h b/lib/cache/impl.h
new file mode 100644
index 0000000..a08f355
--- /dev/null
+++ b/lib/cache/impl.h
@@ -0,0 +1,412 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Header internal for cache implementation(s).
+ * Only LMDB works for now.
+ */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <libdnssec/error.h>
+#include <libdnssec/nsec.h>
+#include <libknot/consts.h>
+#include <libknot/db/db.h>
+#include <libknot/dname.h>
+
+#include "contrib/cleanup.h"
+#include "contrib/murmurhash3/murmurhash3.h" /* hash() for nsec_p_hash() */
+#include "lib/cache/cdb_api.h"
+#include "lib/resolve.h"
+
+/* Cache entry values - binary layout.
+ *
+ * It depends on type which is recognizable by the key.
+ * Code depending on the contents of the key is marked by CACHE_KEY_DEF.
+ *
+ * 'E' entry (exact hit):
+ * - ktype == NS: struct entry_apex - multiple types inside (NS and xNAME);
+ * - ktype != NS: struct entry_h
+ * * is_packet: uint16_t length, the rest is opaque and handled by ./entry_pkt.c
+ * * otherwise RRset + its RRSIG set (possibly empty).
+ * '1' or '3' entry (NSEC or NSEC3)
+ * - struct entry_h, contents is the same as for exact hit
+ * - flags don't make sense there
+ */
+
+struct entry_h {
+ uint32_t time; /**< The time of inception. */
+ uint32_t ttl; /**< TTL at inception moment. Assuming it fits into int32_t ATM. */
+ uint8_t rank : 6; /**< See enum kr_rank */
+ bool is_packet : 1; /**< Negative-answer packet for insecure/bogus name. */
+ bool has_optout : 1; /**< Only for packets; persisted DNSSEC_OPTOUT. */
+ uint8_t _pad; /**< We need even alignment for data now. */
+ uint8_t data[];
+};
+struct entry_apex;
+
+/** Check basic consistency of entry_h for 'E' entries, not looking into ->data.
+ * (for is_packet the length of data is checked)
+ */
+struct entry_h * entry_h_consistent(knot_db_val_t data, uint16_t type);
+
+struct entry_apex * entry_apex_consistent(knot_db_val_t val);
+
+/** Consistency check, ATM common for NSEC and NSEC3. */
+static inline struct entry_h * entry_h_consistent_NSEC(knot_db_val_t data)
+{
+ /* ATM it's enough to just extend the checks for exact entries. */
+ const struct entry_h *eh = entry_h_consistent(data, KNOT_RRTYPE_NSEC);
+ bool ok = eh != NULL;
+ ok = ok && !eh->is_packet && !eh->has_optout;
+ return ok ? /*const-cast*/(struct entry_h *)eh : NULL;
+}
+
+
+/* nsec_p* - NSEC* chain parameters */
+
+static inline int nsec_p_rdlen(const uint8_t *rdata)
+{
+ //TODO: do we really need the zero case?
+ return rdata ? 5 + rdata[4] : 0; /* rfc5155 4.2 and 3.2. */
+}
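+/* NSEC3PARAM rdata is alg(1) + flags(1) + iterations(2) + salt_len(1) + salt,
+ * so its total length is 5 + rdata[4] (the salt-length octet); see RFC 5155. */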
+static const int NSEC_P_MAXLEN = sizeof(uint32_t) + 5 + 255; // TODO: remove??
+
+/** Hash of NSEC3 parameters, used as a tag to separate different chains for same zone. */
+typedef uint32_t nsec_p_hash_t;
+static inline nsec_p_hash_t nsec_p_mkHash(const uint8_t *nsec_p)
+{
+ assert(nsec_p && !(KNOT_NSEC3_FLAG_OPT_OUT & nsec_p[1]));
+ return hash((const char *)nsec_p, nsec_p_rdlen(nsec_p));
+}
+
+/** NSEC* parameters for the chain. */
+struct nsec_p {
+ const uint8_t *raw; /**< Pointer to raw NSEC3 parameters; NULL for NSEC. */
+ nsec_p_hash_t hash; /**< Hash of `raw`, used for cache keys. */
+ dnssec_nsec3_params_t libknot; /**< Format for libknot; owns malloced memory! */
+};
+
+
+
+/** LATER(optim.): this is overshot, but struct key usage should be cheap ATM. */
+#define KR_CACHE_KEY_MAXLEN (KNOT_DNAME_MAXLEN + 100) /* CACHE_KEY_DEF */
+
+struct key {
+ const knot_dname_t *zname; /**< current zone name (points within qry->sname) */
+ uint8_t zlf_len; /**< length of current zone's lookup format */
+
+ /** Corresponding key type; e.g. NS for CNAME.
+ * Note: NSEC type is ambiguous (exact and range key). */
+ uint16_t type;
+ /** The key data start at buf+1, and buf[0] contains some length.
+ * For details see key_exact* and key_NSEC* functions. */
+ uint8_t buf[KR_CACHE_KEY_MAXLEN];
+ /* LATER(opt.): ^^ probably change the anchoring, so that kr_dname_lf()
+ * doesn't need to move data after knot_dname_lf(). */
+};
+
+static inline size_t key_nwz_off(const struct key *k)
+{
+ /* CACHE_KEY_DEF: zone name lf + 0 ('1' or '3').
+ * NSEC '1' case continues just with the name within zone. */
+ return k->zlf_len + 2;
+}
+static inline size_t key_nsec3_hash_off(const struct key *k)
+{
+	/* CACHE_KEY_DEF NSEC3: tag (nsec_p_hash_t) + 20 bytes of NSEC3 name hash */
+ return key_nwz_off(k) + sizeof(nsec_p_hash_t);
+}
+/** Hash is always SHA1; I see no plans to standardize anything else.
+ * https://www.iana.org/assignments/dnssec-nsec3-parameters/dnssec-nsec3-parameters.xhtml#dnssec-nsec3-parameters-3
+ */
+static const int NSEC3_HASH_LEN = 20,
+ NSEC3_HASH_TXT_LEN = 32;
+
+/** Finish constructing a string key for exact search.
+ * It's assumed that kr_dname_lf(k->buf, owner, *) has been run beforehand.
+ */
+knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type);
+
+/** Like key_exact_type_maypkt but with extra checks if used for RRs only. */
+static inline knot_db_val_t key_exact_type(struct key *k, uint16_t type)
+{
+ switch (type) {
+ /* Sanity check: forbidden types represented in other way(s). */
+ case KNOT_RRTYPE_NSEC:
+ case KNOT_RRTYPE_NSEC3:
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ }
+ return key_exact_type_maypkt(k, type);
+}
+
+
+/* entry_h chaining; implementation in ./entry_list.c */
+
+enum { ENTRY_APEX_NSECS_CNT = 2 };
+
+/** Header of 'E' entry with ktype == NS. Inside is private to ./entry_list.c
+ *
+ * We store xNAME at NS type to lower the number of searches in closest_NS().
+ * CNAME is only considered for equal name, of course.
+ * We also store NSEC* parameters at NS type.
+ */
+struct entry_apex {
+ /* ENTRY_H_FLAGS */
+ bool has_ns : 1;
+ bool has_cname : 1;
+ bool has_dname : 1;
+
+	uint8_t pad_; /**< Padding so data[] starts at an even offset: 1+2+x bytes would be odd, so use 2+2+x. */
+ int8_t nsecs[ENTRY_APEX_NSECS_CNT]; /**< values: 0: none, 1: NSEC, 3: NSEC3 */
+ uint8_t data[];
+ /* XXX: if not first, stamp of last being the first?
+ * Purpose: save cache operations if rolled the algo/params long ago. */
+};
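+/* Serialized apex data[] layout (a sketch, inferred from entry_list_memcpy/parse),
+ * in entry_list_t order:
+ *  - ENTRY_APEX_NSECS_CNT items with NSEC* chain parameters (per nsecs[i]:
+ *    nothing, just a timestamp, or timestamp + NSEC3PARAM wire),
+ *  - then the NS, CNAME and DNAME entry_h blobs, each present iff the
+ *    corresponding has_* flag is set; every item is padded to an even length. */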
+
+/** Indices for decompressed entry_list_t. */
+enum EL {
+ EL_NS = ENTRY_APEX_NSECS_CNT,
+ EL_CNAME,
+ EL_DNAME,
+ EL_LENGTH
+};
+/** Decompressed entry_apex. It's an array of unparsed entry_h references.
+ * Note: arrays are passed "by reference" to functions (in C99). */
+typedef knot_db_val_t entry_list_t[EL_LENGTH];
+
+static inline uint16_t EL2RRTYPE(enum EL i)
+{
+ switch (i) {
+ case EL_NS: return KNOT_RRTYPE_NS;
+ case EL_CNAME: return KNOT_RRTYPE_CNAME;
+ case EL_DNAME: return KNOT_RRTYPE_DNAME;
+ default: assert(false); return 0;
+ }
+}
+
+/** There may be multiple entries within, so rewind `val` to the one we want.
+ *
+ * ATM there are multiple types only for the NS ktype - it also accommodates xNAMEs.
+ * \note `val->len` represents the bound of the whole list, not of a single entry.
+ * \note in case of ENOENT, `val` is still rewound to the beginning of the next entry.
+ * \return error code
+ * TODO: maybe get rid of this API?
+ */
+int entry_h_seek(knot_db_val_t *val, uint16_t type);
+
+/** Prepare space to insert an entry.
+ *
+ * Some checks are performed (rank, TTL), the current entry in cache is copied
+ * with a hole ready for the new entry (old one of the same type is cut out).
+ *
+ * \param val_new_entry The only changing parameter; ->len is read, ->data written.
+ * \return error code
+ */
+int entry_h_splice(
+ knot_db_val_t *val_new_entry, uint8_t rank,
+ const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
+ const knot_dname_t *owner/*log only*/,
+ const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp);
+
+/** Parse an entry_apex into individual items. @return error code. */
+int entry_list_parse(const knot_db_val_t val, entry_list_t list);
+
+static inline size_t to_even(size_t n)
+{
+ return n + (n & 1);
+}
+
+static inline int entry_list_serial_size(const entry_list_t list)
+{
+ int size = offsetof(struct entry_apex, data);
+ for (int i = 0; i < EL_LENGTH; ++i) {
+ size += to_even(list[i].len);
+ }
+ return size;
+}
+
+/** Fill contents of an entry_apex.
+ *
+ * @note NULL pointers are overwritten - caller may like to fill the space later.
+ */
+void entry_list_memcpy(struct entry_apex *ea, entry_list_t list);
+
+
+
+/* Packet caching; implementation in ./entry_pkt.c */
+
+/** Stash the packet into cache (if suitable, etc.)
+ * \param has_optout whether the packet contains an opt-out NSEC3 */
+void stash_pkt(const knot_pkt_t *pkt, const struct kr_query *qry,
+ const struct kr_request *req, bool has_optout);
+
+/** Try answering from packet cache, given an entry_h.
+ *
+ * This assumes the TTL is OK and entry_h_consistent, but it may still return error.
+ * On success it handles all the rest, incl. qry->flags.
+ */
+int answer_from_pkt(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl);
+
+
+/** Record is expiring if its remaining TTL is below 1% of the original (plus a 5 s margin). */
+static inline bool is_expiring(uint32_t orig_ttl, uint32_t new_ttl)
+{
+ int64_t nttl = new_ttl; /* avoid potential over/under-flow */
+ return 100 * (nttl - 5) < orig_ttl;
+}
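+/* Illustration: with orig_ttl = 3600 the record counts as expiring once
+ * new_ttl <= 40, since 100 * (40 - 5) = 3500 < 3600, while 100 * (41 - 5) = 3600 is not. */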
+
+/** Returns a signed result so you can inspect how stale the RR is.
+ *
+ * @param owner name for stale-serving decisions. You may pass NULL to disable stale.
+ * @note: NSEC* uses zone name ATM; for NSEC3 the owner may not even be knowable.
+ * @param type for stale-serving.
+ */
+int32_t get_new_ttl(const struct entry_h *entry, const struct kr_query *qry,
+ const knot_dname_t *owner, uint16_t type, uint32_t now);
+
+
+/* RRset (de)materialization; implementation in ./entry_rr.c */
+
+/** Size of the RR count field */
+#define KR_CACHE_RR_COUNT_SIZE sizeof(uint16_t)
+
+/** Compute size of serialized rdataset. NULL is accepted as empty set. */
+static inline int rdataset_dematerialize_size(const knot_rdataset_t *rds)
+{
+ return KR_CACHE_RR_COUNT_SIZE + (rds == NULL ? 0 : knot_rdataset_size(rds));
+}
+
+static inline int rdataset_dematerialized_size(const uint8_t *data)
+{
+ knot_rdataset_t rds;
+ memcpy(&rds.count, data, sizeof(rds.count));
+ rds.rdata = (knot_rdata_t *)(data + sizeof(rds.count));
+ return sizeof(rds.count) + knot_rdataset_size(&rds);
+}
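+/* Serialized rdataset layout (sketch):
+ *   uint16_t rr_count;               // KR_CACHE_RR_COUNT_SIZE bytes
+ *   knot_rdata_t rdata[rr_count];    // verbatim copy, knot_rdataset_size() bytes
+ * An empty set is stored as just rr_count == 0. */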
+
+/** Serialize an rdataset. */
+int rdataset_dematerialize(const knot_rdataset_t *rds, uint8_t * restrict data);
+
+
+/** Partially constructed answer when gathering RRsets from cache. */
+struct answer {
+ int rcode; /**< PKT_NODATA, etc. */
+ struct nsec_p nsec_p; /**< Don't mix different NSEC* parameters in one answer. */
+ knot_mm_t *mm; /**< Allocator for rrsets */
+ struct answer_rrset {
+ ranked_rr_array_entry_t set; /**< set+rank for the main data */
+ knot_rdataset_t sig_rds; /**< RRSIG data, if any */
+ } rrsets[1+1+3]; /**< see AR_ANSWER and friends; only required records are filled */
+};
+enum {
+ AR_ANSWER = 0, /**< Positive answer record. It might be wildcard-expanded. */
+ AR_SOA, /**< SOA record. */
+ AR_NSEC, /**< NSEC* covering or matching the SNAME (next closer name in NSEC3 case). */
+ AR_WILD, /**< NSEC* covering or matching the source of synthesis. */
+ AR_CPE, /**< NSEC3 matching the closest provable encloser. */
+};
+
+/** Materialize RRset + RRSIGs into ans->rrsets[id].
+ * LATER(optim.): it's slightly wasteful that we allocate knot_rrset_t for the packet
+ *
+ * \return error code. They are all bad conditions and "guarded" by assert.
+ */
+int entry2answer(struct answer *ans, int id,
+ const struct entry_h *eh, const uint8_t *eh_bound,
+ const knot_dname_t *owner, uint16_t type, uint32_t new_ttl);
+
+
+/* Preparing knot_pkt_t for cache answer from RRs; implementation in ./knot_pkt.c */
+
+/** Prepare answer packet to be filled by RRs (without RR data in wire). */
+int pkt_renew(knot_pkt_t *pkt, const knot_dname_t *name, uint16_t type);
+
+/** Append RRset + its RRSIGs into the current section (*shallow* copy), with given rank.
+ * \note it works with empty set as well (skipped)
+ * \note pkt->wire is not updated in any way
+ * \note KNOT_CLASS_IN is assumed
+ */
+int pkt_append(knot_pkt_t *pkt, const struct answer_rrset *rrset, uint8_t rank);
+
+
+
+/* NSEC (1) stuff. Implementation in ./nsec1.c */
+
+/** Construct a string key for NSEC (1) predecessor-search.
+ * \param add_wildcard Act as if the name was extended by "*."
+ * \note k->zlf_len is assumed to have been correctly set */
+knot_db_val_t key_NSEC1(struct key *k, const knot_dname_t *name, bool add_wildcard);
+
+/** Closest encloser check for NSEC (1).
+ * To understand the interface, see the call point.
+ * \param k space to store key + input: zname and zlf_len
+ * \return 0: success; >0: try other (NSEC3); <0: exit cache immediately. */
+int nsec1_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ knot_db_val_t *cover_low_kwz, knot_db_val_t *cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+/** Source of synthesis (SS) check for NSEC (1).
+ * To understand the interface, see the call point.
+ * \return 0: continue; <0: exit cache immediately;
+ * AR_SOA: skip to adding SOA (SS was covered or matched for NODATA). */
+int nsec1_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ knot_db_val_t cover_low_kwz, knot_db_val_t cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+
+/* NSEC3 stuff. Implementation in ./nsec3.c */
+
+/** Construct a string key for NSEC3 predecessor-search, from an NSEC3 name.
+ * \note k->zlf_len is assumed to have been correctly set */
+knot_db_val_t key_NSEC3(struct key *k, const knot_dname_t *nsec3_name,
+ const nsec_p_hash_t nsec_p_hash);
+
+/** TODO. See nsec1_encloser(...) */
+int nsec3_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+/** TODO. See nsec1_src_synth(...) */
+int nsec3_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "cach", ## __VA_ARGS__)
+
+/** Shorthand for operations on cache backend */
+#define cache_op(cache, op, ...) (cache)->api->op((cache)->db, ## __VA_ARGS__)
+
+
+static inline uint16_t get_uint16(const void *address)
+{
+ uint16_t tmp;
+ memcpy(&tmp, address, sizeof(tmp));
+ return tmp;
+}
+
+/** Useful pattern, especially as void-pointer arithmetic isn't standard-compliant. */
+static inline uint8_t * knot_db_val_bound(knot_db_val_t val)
+{
+ return (uint8_t *)val.data + val.len;
+}
+
diff --git a/lib/cache/knot_pkt.c b/lib/cache/knot_pkt.c
new file mode 100644
index 0000000..56836a6
--- /dev/null
+++ b/lib/cache/knot_pkt.c
@@ -0,0 +1,106 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of preparing knot_pkt_t for filling with RRs.
+ * Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+
+int pkt_renew(knot_pkt_t *pkt, const knot_dname_t *name, uint16_t type)
+{
+ /* Update packet question if needed. */
+ if (!knot_dname_is_equal(knot_pkt_qname(pkt), name)
+ || knot_pkt_qtype(pkt) != type || knot_pkt_qclass(pkt) != KNOT_CLASS_IN) {
+ int ret = kr_pkt_recycle(pkt);
+ if (ret) return kr_error(ret);
+ ret = knot_pkt_put_question(pkt, name, KNOT_CLASS_IN, type);
+ if (ret) return kr_error(ret);
+ }
+
+ pkt->parsed = pkt->size = PKT_SIZE_NOWIRE;
+ knot_wire_set_qr(pkt->wire);
+ knot_wire_set_aa(pkt->wire);
+ return kr_ok();
+}
+
+/** Reserve space for additional `count` RRsets.
+ * \note pkt->rr_info gets correct length but is always zeroed
+ */
+static int pkt_alloc_space(knot_pkt_t *pkt, int count)
+{
+ size_t allocd_orig = pkt->rrset_allocd;
+ if (pkt->rrset_count + count <= allocd_orig) {
+ return kr_ok();
+ }
+ /* A simple growth strategy, amortized O(count). */
+ pkt->rrset_allocd = MAX(
+ pkt->rrset_count + count,
+ pkt->rrset_count + allocd_orig);
+
+ pkt->rr = mm_realloc(&pkt->mm, pkt->rr,
+ sizeof(pkt->rr[0]) * pkt->rrset_allocd,
+ sizeof(pkt->rr[0]) * allocd_orig);
+ if (!pkt->rr) {
+ return kr_error(ENOMEM);
+ }
+ /* Allocate pkt->rr_info to be certain, but just leave it zeroed. */
+ mm_free(&pkt->mm, pkt->rr_info);
+ pkt->rr_info = mm_alloc(&pkt->mm, sizeof(pkt->rr_info[0]) * pkt->rrset_allocd);
+ if (!pkt->rr_info) {
+ return kr_error(ENOMEM);
+ }
+ memset(pkt->rr_info, 0, sizeof(pkt->rr_info[0]) * pkt->rrset_allocd);
+ return kr_ok();
+}
+
+int pkt_append(knot_pkt_t *pkt, const struct answer_rrset *rrset, uint8_t rank)
+{
+ /* allocate space, to be sure */
+ int rrset_cnt = (rrset->set.rr->rrs.count > 0) + (rrset->sig_rds.count > 0);
+ int ret = pkt_alloc_space(pkt, rrset_cnt);
+ if (ret) return kr_error(ret);
+ /* write both sets */
+ const knot_rdataset_t *rdss[2] = { &rrset->set.rr->rrs, &rrset->sig_rds };
+ for (int i = 0; i < rrset_cnt; ++i) {
+ assert(rdss[i]->count);
+ /* allocate rank */
+ uint8_t *rr_rank = mm_alloc(&pkt->mm, sizeof(*rr_rank));
+ if (!rr_rank) return kr_error(ENOMEM);
+ *rr_rank = (i == 0) ? rank : (KR_RANK_OMIT | KR_RANK_AUTH);
+ /* rank for RRSIGs isn't really useful: ^^ */
+ if (i == 0) {
+ pkt->rr[pkt->rrset_count] = *rrset->set.rr;
+ pkt->rr[pkt->rrset_count].additional = rr_rank;
+ } else {
+ /* append the RR array */
+ pkt->rr[pkt->rrset_count] = (knot_rrset_t){
+ .owner = knot_dname_copy(rrset->set.rr->owner, &pkt->mm),
+ /* ^^ well, another copy isn't really needed */
+ .ttl = rrset->set.rr->ttl,
+ .type = KNOT_RRTYPE_RRSIG,
+ .rclass = KNOT_CLASS_IN,
+ .rrs = *rdss[i],
+ .additional = rr_rank,
+ };
+ }
+ ++pkt->rrset_count;
+ ++(pkt->sections[pkt->current].count);
+ }
+ return kr_ok();
+}
+
diff --git a/lib/cache/nsec1.c b/lib/cache/nsec1.c
new file mode 100644
index 0000000..74b3d34
--- /dev/null
+++ b/lib/cache/nsec1.c
@@ -0,0 +1,511 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of NSEC (1) handling. Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+#include "lib/dnssec/nsec.h"
+#include "lib/layer/iterate.h"
+
+
+/** Reconstruct a name into a buffer (assuming length at least KNOT_DNAME_MAXLEN).
+ * \return kr_ok() or error code (<0). */
+static int dname_wire_reconstruct(knot_dname_t *buf, const struct key *k,
+ knot_db_val_t kwz)
+{
+ /* Reconstruct from key: first the ending, then zone name. */
+ int ret = knot_dname_lf2wire(buf, kwz.len, kwz.data);
+ if (ret < 0) {
+ VERBOSE_MSG(NULL, "=> NSEC: LF2wire ret = %d\n", ret);
+ assert(false);
+ return ret;
+ }
+ /* The last written byte is the zero label for root -> overwrite. */
+ knot_dname_t *zone_start = buf + ret - 1;
+ assert(*zone_start == '\0');
+ ret = knot_dname_to_wire(zone_start, k->zname, KNOT_DNAME_MAXLEN - kwz.len);
+ if (ret != k->zlf_len + 1) {
+ assert(false);
+ return ret < 0 ? ret : kr_error(EILSEQ);
+ }
+ return kr_ok();
+}
+
+
+knot_db_val_t key_NSEC1(struct key *k, const knot_dname_t *name, bool add_wildcard)
+{
+	/* We basically need dname_lf with two bytes added
+	 * in the right place within the name (at the zone cut). */
+ int ret;
+ const bool ok = k && name
+ && !(ret = kr_dname_lf(k->buf, name, add_wildcard));
+ if (!ok) {
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ }
+
+ uint8_t *begin = k->buf + 1 + k->zlf_len; /* one byte after zone's zero */
+ uint8_t *end = k->buf + 1 + k->buf[0]; /* we don't use the final zero in key,
+ * but move it anyway */
+ if (end < begin) {
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ }
+ int key_len;
+ if (end > begin) {
+ memmove(begin + 2, begin, end - begin);
+ key_len = k->buf[0] + 1;
+ } else {
+ key_len = k->buf[0] + 2;
+ }
+ /* CACHE_KEY_DEF: key == zone's dname_lf + '\0' + '1' + dname_lf
+ * of the name within the zone without the final 0. Iff the latter is empty,
+ * there's no zero to cut and thus the key_len difference.
+ */
+ begin[0] = 0;
+ begin[1] = '1'; /* tag for NSEC1 */
+ k->type = KNOT_RRTYPE_NSEC;
+
+ /*
+ VERBOSE_MSG(NULL, "<> key_NSEC1; name: ");
+ kr_dname_print(name, add_wildcard ? "*." : "" , " ");
+ kr_log_verbose("(zone name LF length: %d; total key length: %d)\n",
+ k->zlf_len, key_len);
+ */
+
+ return (knot_db_val_t){ k->buf + 1, key_len };
+}
+
+
+/** Assuming that k1 < k4, find where k2 is. (Considers DNS wrap-around.)
+ *
+ * \return Intuition: position of k2 among kX.
+ * 0: k2 < k1; 1: k1 == k2; 2: k1 is a prefix of k2 < k4;
+ * 3: k1 < k2 < k4 (and not 2); 4: k2 == k4; 5: k2 > k4
+ * \note k1.data may be NULL, meaning assumption that k1 < k2 and not a prefix
+ * (i.e. return code will be > 2)
+ */
+static int kwz_between(knot_db_val_t k1, knot_db_val_t k2, knot_db_val_t k4)
+{
+ assert(k2.data && k4.data);
+ /* CACHE_KEY_DEF; we need to beware of one key being a prefix of another */
+ int ret_maybe; /**< result, assuming we confirm k2 < k4 */
+ if (k1.data) {
+ const int cmp12 = memcmp(k1.data, k2.data, MIN(k1.len, k2.len));
+ if (cmp12 == 0 && k1.len == k2.len) /* iff k1 == k2 */
+ return 1;
+ if (cmp12 > 0 || (cmp12 == 0 && k1.len > k2.len)) /* iff k1 > k2 */
+ return 0;
+ ret_maybe = cmp12 == 0 ? 2 : 3;
+ } else {
+ ret_maybe = 3;
+ }
+ if (k4.len == 0) { /* wrap-around */
+ return k2.len > 0 ? ret_maybe : 4;
+ } else {
+ const int cmp24 = memcmp(k2.data, k4.data, MIN(k2.len, k4.len));
+ if (cmp24 == 0 && k2.len == k4.len) /* iff k2 == k4 */
+ return 4;
+ if (cmp24 > 0 || (cmp24 == 0 && k2.len > k4.len)) /* iff k2 > k4 */
+ return 5;
+ return ret_maybe;
+ }
+}
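+/* Illustration with plain byte-string keys (not from the sources), k1 = "a", k4 = "c":
+ *   k2 = "a"  -> 1 (equal to k1),
+ *   k2 = "ab" -> 2 (k1 is a prefix of k2),
+ *   k2 = "b"  -> 3 (strictly between),
+ *   k2 = "c"  -> 4 (equal to k4),
+ *   k2 = "d"  -> 5 (past k4). */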
+
+
+/** NSEC1 range search.
+ *
+ * \param key Pass output of key_NSEC1(k, ...)
+ * \param value[out] The raw data of the NSEC cache record (optional; consistency checked).
+ * \param exact_match[out] Whether the key was matched exactly or just covered (optional).
+ * \param kwz_low[out] Output the low end of covering NSEC, pointing within DB (optional).
+ * \param kwz_high[in,out] Storage for the high end of covering NSEC (optional).
+ * It's only set if !exact_match.
+ * \param new_ttl[out] New TTL of the NSEC (optional).
+ * \return Error message or NULL.
+ * \note The function itself does *no* bitmap checks, e.g. RFC 6840 sec. 4.
+ */
+static const char * find_leq_NSEC1(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_db_val_t key, const struct key *k, knot_db_val_t *value,
+ bool *exact_match, knot_db_val_t *kwz_low, knot_db_val_t *kwz_high,
+ uint32_t *new_ttl)
+{
+ /* Do the cache operation. */
+ const size_t nwz_off = key_nwz_off(k);
+ if (!key.data || key.len < nwz_off) {
+ assert(false);
+ return "range search ERROR";
+ }
+ knot_db_val_t key_nsec = key;
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read_leq, &key_nsec, &val);
+ if (ret < 0) {
+ if (ret == kr_error(ENOENT)) {
+ return "range search miss";
+ } else {
+ assert(false);
+ return "range search ERROR";
+ }
+ }
+ if (value) {
+ *value = val;
+ }
+ /* Check consistency, TTL, rank. */
+ const bool is_exact = (ret == 0);
+ if (exact_match) {
+ *exact_match = is_exact;
+ }
+ const struct entry_h *eh = entry_h_consistent_NSEC(val);
+ if (!eh) {
+ /* This might be just finding something else than NSEC1 entry,
+ * in case we searched before the very first one in the zone. */
+ return "range search found inconsistent entry";
+ }
+ /* Passing just zone name instead of owner, as we don't
+ * have it reconstructed at this point. */
+ int32_t new_ttl_ = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_NSEC,
+ qry->timestamp.tv_sec);
+ if (new_ttl_ < 0 || !kr_rank_test(eh->rank, KR_RANK_SECURE)) {
+ return "range search found stale or insecure entry";
+ /* TODO: remove the stale record *and* retry,
+ * in case we haven't run off. Perhaps start by in_zone check. */
+ }
+ if (new_ttl) {
+ *new_ttl = new_ttl_;
+ }
+ if (kwz_low) {
+ *kwz_low = (knot_db_val_t){
+ .data = (uint8_t *)key_nsec.data + nwz_off,
+ .len = key_nsec.len - nwz_off,
+ }; /* CACHE_KEY_DEF */
+ }
+ if (is_exact) {
+ /* Nothing else to do. */
+ return NULL;
+ }
+ /* The NSEC starts strictly before our target name;
+ * now check that it still belongs into that zone. */
+ const bool nsec_in_zone = key_nsec.len >= nwz_off
+ /* CACHE_KEY_DEF */
+ && memcmp(key.data, key_nsec.data, nwz_off) == 0;
+ if (!nsec_in_zone) {
+ return "range search miss (!nsec_in_zone)";
+ }
+ /* We know it starts before sname, so let's check the other end.
+ * 1. construct the key for the next name - kwz_hi. */
+ /* it's *full* name ATM */
+ const knot_rdata_t *next = (const knot_rdata_t *)
+ (eh->data + KR_CACHE_RR_COUNT_SIZE);
+ if (KR_CACHE_RR_COUNT_SIZE != 2 || get_uint16(eh->data) == 0) {
+ assert(false);
+ return "ERROR";
+ /* TODO: more checks? */
+ }
+ /*
+ WITH_VERBOSE {
+ VERBOSE_MSG(qry, "=> NSEC: next name: ");
+ kr_dname_print(next, "", "\n");
+ }
+ */
+ knot_dname_t ch_buf[KNOT_DNAME_MAXLEN];
+ knot_dname_t *chs = kwz_high ? kwz_high->data : ch_buf;
+ if (!chs) {
+ assert(false);
+ return "EINVAL";
+ }
+ {
+ /* Lower-case chs; see also RFC 6840 5.1.
+ * LATER(optim.): we do lots of copying etc. */
+ knot_dname_t lower_buf[KNOT_DNAME_MAXLEN];
+ ret = knot_dname_to_wire(lower_buf, next->data,
+ MIN(next->len, KNOT_DNAME_MAXLEN));
+ if (ret < 0) { /* _ESPACE */
+ return "range search found record with incorrect contents";
+ }
+ knot_dname_to_lower(lower_buf);
+ ret = kr_dname_lf(chs, lower_buf, false);
+ }
+ if (ret) {
+ assert(false);
+ return "ERROR";
+ }
+ knot_db_val_t kwz_hi = { /* skip the zone name */
+ .data = chs + 1 + k->zlf_len,
+ .len = chs[0] - k->zlf_len,
+ };
+ assert((ssize_t)(kwz_hi.len) >= 0);
+ /* 2. do the actual range check. */
+ const knot_db_val_t kwz_sname = {
+ .data = (void *)/*const-cast*/(k->buf + 1 + nwz_off),
+ .len = k->buf[0] - k->zlf_len,
+ };
+ assert((ssize_t)(kwz_sname.len) >= 0);
+ bool covers = /* we know for sure that the low end is before kwz_sname */
+ 3 == kwz_between((knot_db_val_t){ NULL, 0 }, kwz_sname, kwz_hi);
+ if (!covers) {
+ return "range search miss (!covers)";
+ }
+ if (kwz_high) {
+ *kwz_high = kwz_hi;
+ }
+ return NULL;
+}
+
+
+int nsec1_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ knot_db_val_t *cover_low_kwz, knot_db_val_t *cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ static const int ESKIP = ABS(ENOENT);
+ /* Basic sanity check. */
+ const bool ok = k && ans && clencl_labels && cover_low_kwz && cover_hi_kwz
+ && qry && cache;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ /* Find a previous-or-equal name+NSEC in cache covering the QNAME,
+ * checking TTL etc. */
+ knot_db_val_t key = key_NSEC1(k, qry->sname, false);
+ knot_db_val_t val = { NULL, 0 };
+ bool exact_match;
+ uint32_t new_ttl;
+ const char *err = find_leq_NSEC1(cache, qry, key, k, &val,
+ &exact_match, cover_low_kwz, cover_hi_kwz, &new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry, "=> NSEC sname: %s\n", err);
+ return ESKIP;
+ }
+
+ /* Get owner name of the record. */
+ const knot_dname_t *owner;
+ knot_dname_t owner_buf[KNOT_DNAME_MAXLEN];
+ if (exact_match) {
+ owner = qry->sname;
+ } else {
+ int ret = dname_wire_reconstruct(owner_buf, k, *cover_low_kwz);
+ if (unlikely(ret)) return ESKIP;
+ owner = owner_buf;
+ }
+ /* Basic checks OK -> materialize data. */
+ {
+ const struct entry_h *nsec_eh = val.data;
+ int ret = entry2answer(ans, AR_NSEC, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC, new_ttl);
+ if (ret) return kr_error(ret);
+ }
+
+ /* Final checks, split for matching vs. covering our sname. */
+ const knot_rrset_t *nsec_rr = ans->rrsets[AR_NSEC].set.rr;
+ const uint8_t *bm = knot_nsec_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec_bitmap_len(nsec_rr->rrs.rdata);
+ assert(bm);
+
+ if (exact_match) {
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC sname: match but failed type check\n");
+ return ESKIP;
+ }
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ VERBOSE_MSG(qry, "=> NSEC sname: match proved NODATA, new TTL %d\n",
+ new_ttl);
+ ans->rcode = PKT_NODATA;
+ return kr_ok();
+ } /* else */
+
+ /* Inexact match. First check if sname is delegated by that NSEC. */
+ const int nsec_matched = knot_dname_matched_labels(nsec_rr->owner, qry->sname);
+ const bool is_sub = nsec_matched == knot_dname_labels(nsec_rr->owner, NULL);
+ if (is_sub && kr_nsec_children_in_zone_check(bm, bm_size) != 0) {
+ VERBOSE_MSG(qry, "=> NSEC sname: covered but delegated (or error)\n");
+ return ESKIP;
+ }
+ /* NXDOMAIN proven *except* for wildcards. */
+ WITH_VERBOSE(qry) {
+ auto_free char *owner_str = kr_dname_text(nsec_rr->owner),
+ *next_str = kr_dname_text(knot_nsec_next(nsec_rr->rrs.rdata));
+ VERBOSE_MSG(qry, "=> NSEC sname: covered by: %s -> %s, new TTL %d\n",
+ owner_str, next_str, new_ttl);
+ }
+
+ /* Find label count of the closest encloser.
+ * Both endpoints in an NSEC do exist (though possibly in a child zone)
+ * and any prefixes of those names as well (empty non-terminals),
+ * but nothing else exists inside this "triangle".
+ *
+ * Note that we have to lower-case the next name for comparison,
+ * even though we have canonicalized NSEC already; see RFC 6840 5.1.
+ * LATER(optim.): it might be faster to use the LFs we already have.
+ */
+ knot_dname_t next[KNOT_DNAME_MAXLEN];
+ int ret = knot_dname_to_wire(next, knot_nsec_next(nsec_rr->rrs.rdata), sizeof(next));
+ if (ret < 0) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ knot_dname_to_lower(next);
+ *clencl_labels = MAX(
+ nsec_matched,
+ knot_dname_matched_labels(qry->sname, next)
+ );
+
+ /* Empty non-terminals don't need to have
+ * a matching NSEC record. */
+ if (sname_labels == *clencl_labels) {
+ ans->rcode = PKT_NODATA;
+ VERBOSE_MSG(qry,
+ "=> NSEC sname: empty non-terminal by the same RR\n");
+ } else {
+ ans->rcode = PKT_NXDOMAIN;
+ }
+ return kr_ok();
+}
+
+/** Verify non-existence after kwz_between() call. */
+static bool nonexistence_ok(int cmp, const knot_rrset_t *rrs)
+{
+ if (cmp == 3) {
+ return true;
+ }
+ if (cmp != 2) {
+ return false;
+ }
+ const uint8_t *bm = knot_nsec_bitmap(rrs->rrs.rdata);
+ uint16_t bm_size = knot_nsec_bitmap_len(rrs->rrs.rdata);
+ return kr_nsec_children_in_zone_check(bm, bm_size) != 0;
+}
+
+int nsec1_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ knot_db_val_t cover_low_kwz, knot_db_val_t cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ /* Construct key for the source of synthesis. */
+ knot_db_val_t key = key_NSEC1(k, clencl_name, true);
+ const size_t nwz_off = key_nwz_off(k);
+ if (!key.data || key.len < nwz_off) {
+ assert(false);
+ return kr_error(1);
+ }
+ /* Check if our sname-covering NSEC also covers/matches SS. */
+ knot_db_val_t kwz = {
+ .data = (uint8_t *)key.data + nwz_off,
+ .len = key.len - nwz_off,
+ };
+ assert((ssize_t)(kwz.len) >= 0);
+ const int cmp = kwz_between(cover_low_kwz, kwz, cover_hi_kwz);
+ if (nonexistence_ok(cmp, ans->rrsets[AR_NSEC].set.rr)) {
+ VERBOSE_MSG(qry, "=> NSEC wildcard: covered by the same RR\n");
+ return AR_SOA;
+ }
+ const knot_rrset_t *nsec_rr = NULL; /**< the wildcard proof NSEC */
+ bool exact_match; /**< whether it matches the source of synthesis */
+ if (cmp == 1) {
+ exact_match = true;
+ nsec_rr = ans->rrsets[AR_NSEC].set.rr;
+ } else {
+ /* Try to find the NSEC for SS. */
+ knot_db_val_t val = { NULL, 0 };
+ knot_db_val_t wild_low_kwz = { NULL, 0 };
+ uint32_t new_ttl;
+ const char *err = find_leq_NSEC1(cache, qry, key, k, &val,
+ &exact_match, &wild_low_kwz, NULL, &new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry, "=> NSEC wildcard: %s\n", err);
+ return kr_ok();
+ }
+ /* Materialize the record into answer (speculatively). */
+ knot_dname_t owner[KNOT_DNAME_MAXLEN];
+ int ret = dname_wire_reconstruct(owner, k, wild_low_kwz);
+ if (ret) return kr_error(ret);
+ const struct entry_h *nsec_eh = val.data;
+ ret = entry2answer(ans, AR_WILD, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC, new_ttl);
+ if (ret) return kr_error(ret);
+ nsec_rr = ans->rrsets[AR_WILD].set.rr;
+ }
+
+ assert(nsec_rr);
+ const uint32_t new_ttl_log =
+ kr_verbose_status ? nsec_rr->ttl : -1;
+ const uint8_t *bm = knot_nsec_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec_bitmap_len(nsec_rr->rrs.rdata);
+ int ret;
+ struct answer_rrset * const arw = &ans->rrsets[AR_WILD];
+ if (!bm) {
+ assert(false);
+ ret = kr_error(1);
+ goto clean_wild;
+ }
+ if (!exact_match) {
+ /* Finish verification that the source of synthesis doesn't exist. */
+ const int nsec_matched =
+ knot_dname_matched_labels(nsec_rr->owner, clencl_name);
+ /* we don't need to use the full source of synthesis ^ */
+ const bool is_sub =
+ nsec_matched == knot_dname_labels(nsec_rr->owner, NULL);
+ if (is_sub && kr_nsec_children_in_zone_check(bm, bm_size) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC wildcard: covered but delegated (or error)\n");
+ ret = kr_ok();
+ goto clean_wild;
+ }
+ /* We have a record proving wildcard non-existence. */
+ WITH_VERBOSE(qry) {
+ auto_free char *owner_str = kr_dname_text(nsec_rr->owner),
+ *next_str = kr_dname_text(knot_nsec_next(nsec_rr->rrs.rdata));
+ VERBOSE_MSG(qry, "=> NSEC wildcard: covered by: %s -> %s, new TTL %d\n",
+ owner_str, next_str, new_ttl_log);
+ }
+ return AR_SOA;
+ }
+
+ /* The wildcard exists. Find if it's NODATA - check type bitmap. */
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) == 0) {
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ WITH_VERBOSE(qry) {
+ const char *msg_start = "=> NSEC wildcard: match proved NODATA";
+ if (arw->set.rr) {
+ auto_free char *owner_str = kr_dname_text(nsec_rr->owner);
+ VERBOSE_MSG(qry, "%s: %s, new TTL %d\n",
+ msg_start, owner_str, new_ttl_log);
+ } else {
+ /* don't repeat the RR if it's the same */
+ VERBOSE_MSG(qry, "%s, by the same RR\n", msg_start);
+ }
+ }
+ ans->rcode = PKT_NODATA;
+ return AR_SOA;
+
+ } /* else */
+ /* The data probably exists -> don't add this NSEC
+ * and (later) try to find the real wildcard data */
+ VERBOSE_MSG(qry, "=> NSEC wildcard: should exist (or error)\n");
+ ans->rcode = PKT_NOERROR;
+ ret = kr_ok();
+clean_wild:
+ if (arw->set.rr) { /* we may have matched AR_NSEC */
+ knot_rrset_free(arw->set.rr, ans->mm);
+ arw->set.rr = NULL;
+ knot_rdataset_clear(&arw->sig_rds, ans->mm);
+ }
+ return ret;
+}
+
diff --git a/lib/cache/nsec3.c b/lib/cache/nsec3.c
new file mode 100644
index 0000000..d2ae72a
--- /dev/null
+++ b/lib/cache/nsec3.c
@@ -0,0 +1,501 @@
+/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of NSEC3 handling. Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+
+#include "contrib/base32hex.h"
+#include "lib/dnssec/nsec.h"
+#include "lib/layer/iterate.h"
+
+#include <libknot/rrtype/nsec3.h>
+
+static const knot_db_val_t VAL_EMPTY = { NULL, 0 };
+
+/** Common part: write all but the NSEC3 hash. */
+static knot_db_val_t key_NSEC3_common(struct key *k, const knot_dname_t *zname,
+ const nsec_p_hash_t nsec_p_hash)
+{
+ int ret;
+ const bool ok = k && zname
+ && !(ret = kr_dname_lf(k->buf, zname, false));
+ if (!ok) {
+ assert(false);
+ return VAL_EMPTY;
+ }
+
+ /* CACHE_KEY_DEF: key == zone's dname_lf + '\0' + '3' + nsec_p hash (4B)
+ * + NSEC3 hash (20B == NSEC3_HASH_LEN binary!)
+ * LATER(optim.) nsec_p hash: perhaps 2B would give a sufficient probability
+ * of avoiding collisions.
+ */
+ uint8_t *begin = k->buf + 1 + k->zlf_len; /* one byte after zone's zero */
+ begin[0] = 0;
+ begin[1] = '3'; /* tag for NSEC3 */
+ k->type = KNOT_RRTYPE_NSEC3;
+ memcpy(begin + 2, &nsec_p_hash, sizeof(nsec_p_hash));
+ return (knot_db_val_t){
+ .data = k->buf + 1,
+ .len = begin + 2 + sizeof(nsec_p_hash) - (k->buf + 1),
+ };
+}
+
+knot_db_val_t key_NSEC3(struct key *k, const knot_dname_t *nsec3_name,
+ const nsec_p_hash_t nsec_p_hash)
+{
+ knot_db_val_t val = key_NSEC3_common(k, nsec3_name /*only zname required*/,
+ nsec_p_hash);
+ if (!val.data) return val;
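+	/* The first label of an NSEC3 owner is the base32hex-encoded hash;
+	 * decode it and append the raw hash bytes to the key. */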
+ int len = base32hex_decode(nsec3_name + 1, nsec3_name[0],
+ knot_db_val_bound(val), KR_CACHE_KEY_MAXLEN - val.len);
+ if (len != NSEC3_HASH_LEN) {
+ return VAL_EMPTY;
+ }
+ val.len += len;
+ return val;
+}
+
+/** Construct a string key for NSEC3 predecessor-search, from a non-NSEC3 name.
+ * \note k->zlf_len and k->zname are assumed to have been correctly set */
+static knot_db_val_t key_NSEC3_name(struct key *k, const knot_dname_t *name,
+ const bool add_wildcard, const struct nsec_p *nsec_p)
+{
+ bool ok = k && name && nsec_p && nsec_p->raw;
+ if (!ok) return VAL_EMPTY;
+ knot_db_val_t val = key_NSEC3_common(k, k->zname, nsec_p->hash);
+ if (!val.data) return val;
+
+ /* Make `name` point to correctly wildcarded owner name. */
+ uint8_t buf[KNOT_DNAME_MAXLEN];
+ int name_len;
+ if (add_wildcard) {
+ buf[0] = '\1';
+ buf[1] = '*';
+ name_len = knot_dname_to_wire(buf + 2, name, sizeof(buf) - 2);
+ if (name_len < 0) return VAL_EMPTY; /* wants wildcard but doesn't fit */
+ name = buf;
+ name_len += 2;
+ } else {
+ name_len = knot_dname_size(name);
+ }
+ /* Append the NSEC3 hash. */
+ const dnssec_binary_t dname = {
+ .size = name_len,
+ .data = (uint8_t *)/*const-cast*/name,
+ };
+
+ #if 0 // LATER(optim.): this requires a patched libdnssec - tries to realloc()
+ dnssec_binary_t hash = {
+ .size = KR_CACHE_KEY_MAXLEN - val.len,
+ .data = val.data + val.len,
+ };
+ int ret = dnssec_nsec3_hash(&dname, &nsec_p->libknot, &hash);
+ if (ret != DNSSEC_EOK) return VAL_EMPTY;
+ assert(hash.size == NSEC3_HASH_LEN);
+
+ #else
+ dnssec_binary_t hash = { .size = 0, .data = NULL };
+ int ret = dnssec_nsec3_hash(&dname, &nsec_p->libknot, &hash);
+ if (ret != DNSSEC_EOK) return VAL_EMPTY;
+ if (hash.size != NSEC3_HASH_LEN || !hash.data) {
+ assert(false);
+ return VAL_EMPTY;
+ }
+ memcpy(knot_db_val_bound(val), hash.data, NSEC3_HASH_LEN);
+ free(hash.data);
+ #endif
+
+ val.len += hash.size;
+ return val;
+}
+
+/** Return h1 < h2, semantically on NSEC3 hashes. */
+static inline bool nsec3_hash_ordered(const uint8_t *h1, const uint8_t *h2)
+{
+ return memcmp(h1, h2, NSEC3_HASH_LEN) < 0;
+}
+
+/** NSEC3 range search.
+ *
+ * \param key Pass output of key_NSEC3(k, ...)
+ * \param nsec_p Restrict to this NSEC3 parameter-set.
+ * \param value[out] The raw data of the NSEC3 cache record (optional; consistency checked).
+ * \param exact_match[out] Whether the key was matched exactly or just covered (optional).
+ * \param hash_low[out] Output the low end hash of covering NSEC3, pointing within DB (optional).
+ * \param new_ttl[out] New TTL of the NSEC3 (optional).
+ * \return Error message or NULL.
+ * \note The function itself does *no* bitmap checks, e.g. RFC 6840 sec. 4.
+ */
+static const char * find_leq_NSEC3(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_db_val_t key, const struct key *k, const struct nsec_p *nsec_p,
+ knot_db_val_t *value, bool *exact_match, const uint8_t **hash_low,
+ uint32_t *new_ttl)
+{
+ /* Do the cache operation. */
+ const size_t hash_off = key_nsec3_hash_off(k);
+ if (!key.data || key.len < hash_off) {
+ assert(false);
+ return "range search ERROR";
+ }
+ knot_db_val_t key_found = key;
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read_leq, &key_found, &val);
+ /* ^^ LATER(optim.): incrementing key and doing less-than search
+ * would probably be slightly more efficient with LMDB,
+ * but the code complexity would grow considerably. */
+ if (ret < 0) {
+ if (ret == kr_error(ENOENT)) {
+ return "range search miss";
+ } else {
+ assert(false);
+ return "range search ERROR";
+ }
+ }
+ if (value) {
+ *value = val;
+ }
+ /* Check consistency, TTL, rank. */
+ const bool is_exact = (ret == 0);
+ if (exact_match) {
+ *exact_match = is_exact;
+ }
+ const struct entry_h *eh = entry_h_consistent_NSEC(val);
+ if (!eh) {
+		/* This might just mean we found something other than an NSEC3 entry,
+		 * in case we searched before the very first one in the zone. */
+ return "range search found inconsistent entry";
+ }
+ /* Passing just zone name instead of owner. */
+ int32_t new_ttl_ = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_NSEC3,
+ qry->timestamp.tv_sec);
+ if (new_ttl_ < 0 || !kr_rank_test(eh->rank, KR_RANK_SECURE)) {
+ return "range search found stale or insecure entry";
+ /* TODO: remove the stale record *and* retry,
+ * in case we haven't run off. Perhaps start by in_zone check. */
+ }
+ if (new_ttl) {
+ *new_ttl = new_ttl_;
+ }
+ if (hash_low) {
+ *hash_low = (uint8_t *)key_found.data + hash_off;
+ }
+ if (is_exact) {
+ /* Nothing else to do. */
+ return NULL;
+ }
+ /* The NSEC3 starts strictly before our target name;
+ * now check that it still belongs into that zone and chain. */
+ const uint8_t *nsec_p_raw = eh->data + KR_CACHE_RR_COUNT_SIZE
+ + 2 /* RDLENGTH from rfc1034 */;
+ const int nsec_p_len = nsec_p_rdlen(nsec_p_raw);
+ const bool same_chain = key_found.len == hash_off + NSEC3_HASH_LEN
+ /* CACHE_KEY_DEF */
+ && memcmp(key.data, key_found.data, hash_off) == 0
+ /* exact comparison of NSEC3 parameters */
+ && nsec_p_len == nsec_p_rdlen(nsec_p->raw)
+ && memcmp(nsec_p_raw, nsec_p->raw, nsec_p_len) == 0;
+ if (!same_chain) {
+ return "range search miss (!same_chain)";
+ }
+ /* We know it starts before sname, so let's check the other end.
+ * A. find the next hash and check its length. */
+ if (KR_CACHE_RR_COUNT_SIZE != 2 || get_uint16(eh->data) == 0) {
+ assert(false);
+ return "ERROR";
+ /* TODO: more checks? Also, `next` computation is kinda messy. */
+ }
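+	/* NSEC3 RDATA layout (RFC 5155): hash algorithm (1B), flags (1B),
+	 * iterations (2B), salt length (1B), salt, hash length (1B),
+	 * next hashed owner name, type bitmap; nsec_p_len covers the
+	 * parameter fields up to and including the salt. */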
+ const uint8_t *hash_next = nsec_p_raw + nsec_p_len
+ + sizeof(uint8_t) /* hash length from rfc5155 */;
+ if (hash_next[-1] != NSEC3_HASH_LEN) {
+ return "unexpected next hash length";
+ }
+ /* B. do the actual range check. */
+ const uint8_t * const hash_searched = (uint8_t *)key.data + hash_off;
+ bool covers = /* we know for sure that the low end is before the searched name */
+ nsec3_hash_ordered(hash_searched, hash_next)
+ /* and the wrap-around case */
+ || nsec3_hash_ordered(hash_next, (const uint8_t *)key_found.data + hash_off);
+ if (!covers) {
+ return "range search miss (!covers)";
+ }
+ return NULL;
+}
+
+/** Extract textual representation of NSEC3 hash from a cache key.
+ * \param text must have length at least NSEC3_HASH_TXT_LEN+1 (will get 0-terminated). */
+static void key_NSEC3_hash2text(const knot_db_val_t key, char *text)
+{
+ assert(key.data && key.len > NSEC3_HASH_LEN);
+ const uint8_t *hash_raw = knot_db_val_bound(key) - NSEC3_HASH_LEN;
+ /* CACHE_KEY_DEF ^^ */
+ int len = base32hex_encode(hash_raw, NSEC3_HASH_LEN, (uint8_t *)text,
+ NSEC3_HASH_TXT_LEN);
+ assert(len == NSEC3_HASH_TXT_LEN); (void)len;
+ text[NSEC3_HASH_TXT_LEN] = '\0';
+}
+
+/** Reconstruct a name into a buffer (assumed to be at least KNOT_DNAME_MAXLEN long).
+ * \return kr_ok() or error code (<0). */
+static int dname_wire_reconstruct(knot_dname_t *buf, const knot_dname_t *zname,
+ const uint8_t *hash_raw)
+{
+ int len = base32hex_encode(hash_raw, NSEC3_HASH_LEN, buf + 1, NSEC3_HASH_TXT_LEN);
+ if (len != NSEC3_HASH_TXT_LEN) {
+ assert(false);
+ return kr_error(EINVAL);
+ }
+ buf[0] = len;
+ int ret = knot_dname_to_wire(buf + 1 + len, zname, KNOT_DNAME_MAXLEN - 1 - len);
+ return ret < 0 ? kr_error(ret) : kr_ok();
+}
+
+static void nsec3_hash2text(const knot_dname_t *owner, char *text)
+{
+ assert(owner[0] == NSEC3_HASH_TXT_LEN);
+ memcpy(text, owner + 1, MIN(owner[0], NSEC3_HASH_TXT_LEN));
+ text[NSEC3_HASH_TXT_LEN] = '\0';
+}
+
+int nsec3_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ static const int ESKIP = ABS(ENOENT);
+ /* Basic sanity check. */
+ const bool ok = k && k->zname && ans && clencl_labels
+ && qry && cache;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ /*** Find the closest encloser - cycle: name starting at sname,
+ * proceeding while longer than zname, shortening by one label on step.
+ * We need a pair where a name doesn't exist *and* its parent does. */
+ /* LATER(optim.): perhaps iterate in the other order - that
+ * should help significantly against deep queries where we have
+ * a shallow proof in the cache. We can also optimize by using
+ * only exact search unless we had a match in the previous iteration. */
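+	/* E.g. for sname a.b.example. in zone example., the loop hashes and
+	 * looks up a.b.example., then b.example., and finally example. itself. */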
+ const int zname_labels = knot_dname_labels(k->zname, NULL);
+ int last_nxproven_labels = -1;
+ const knot_dname_t *name = qry->sname;
+ for (int name_labels = sname_labels; name_labels >= zname_labels;
+ --name_labels, name += 1 + name[0]) {
+ /* Find a previous-or-equal NSEC3 in cache covering the name,
+ * checking TTL etc. */
+ const knot_db_val_t key = key_NSEC3_name(k, name, false, &ans->nsec_p);
+ if (!key.data) continue;
+ WITH_VERBOSE(qry) {
+ char hash_txt[NSEC3_HASH_TXT_LEN + 1];
+ key_NSEC3_hash2text(key, hash_txt);
+ VERBOSE_MSG(qry, "=> NSEC3 depth %d: hash %s\n",
+ name_labels - zname_labels, hash_txt);
+ }
+ knot_db_val_t val = { NULL, 0 };
+ bool exact_match;
+ uint32_t new_ttl;
+ const uint8_t *hash_low;
+ const char *err = find_leq_NSEC3(cache, qry, key, k, &ans->nsec_p, &val,
+ &exact_match, &hash_low, &new_ttl);
+ if (err) {
+ WITH_VERBOSE(qry) {
+ auto_free char *name_str = kr_dname_text(name);
+ VERBOSE_MSG(qry, "=> NSEC3 encloser error for %s: %s\n",
+ name_str, err);
+ }
+ continue;
+ }
+ if (exact_match && name_labels != sname_labels
+ && name_labels + 1 != last_nxproven_labels) {
+ /* This name exists (checked rank and TTL), and it's
+ * neither of the two interesting cases, so we do not
+ * keep searching for non-existence above this name. */
+ VERBOSE_MSG(qry,
+ "=> NSEC3 encloser: only found existence of an ancestor\n");
+ return ESKIP;
+ }
+ /* Optimization: avoid the rest of the last iteration if pointless. */
+ if (!exact_match && name_labels == zname_labels
+ && last_nxproven_labels != name_labels + 1) {
+ break;
+ }
+
+ /* Basic checks OK -> materialize data, cleaning any previous
+ * records on that answer index (unsuccessful attempts). */
+ knot_dname_t owner[KNOT_DNAME_MAXLEN];
+ {
+ int ret = dname_wire_reconstruct(owner, k->zname, hash_low);
+ if (unlikely(ret)) continue;
+ }
+ const int ans_id = (exact_match && name_labels + 1 == last_nxproven_labels)
+ ? AR_CPE : AR_NSEC;
+ {
+ const struct entry_h *nsec_eh = val.data;
+ memset(&ans->rrsets[ans_id], 0, sizeof(ans->rrsets[ans_id]));
+ int ret = entry2answer(ans, ans_id, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC3, new_ttl);
+ if (ret) return kr_error(ret);
+ }
+
+ if (!exact_match) {
+ /* Non-existence proven, but we don't know if `name`
+ * is the next closer name.
+ * Note: we don't need to check for the sname being
+ * delegated away by this record, as with NSEC3 only
+ * *exact* match on an ancestor could do that. */
+ last_nxproven_labels = name_labels;
+ WITH_VERBOSE(qry) {
+ char hash_low_txt[NSEC3_HASH_TXT_LEN + 1];
+ nsec3_hash2text(owner, hash_low_txt);
+ VERBOSE_MSG(qry,
+ "=> NSEC3 depth %d: covered by %s -> TODO, new TTL %d\n",
+ name_labels - zname_labels, hash_low_txt, new_ttl);
+ }
+ continue;
+ }
+
+ /* Exactly matched NSEC3: two cases, one after another. */
+ const knot_rrset_t *nsec_rr = ans->rrsets[ans_id].set.rr;
+ const uint8_t *bm = knot_nsec3_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec3_bitmap_len(nsec_rr->rrs.rdata);
+ assert(bm);
+ if (name_labels == sname_labels) {
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype,
+ nsec_rr->owner) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC3 sname: match but failed type check\n");
+ return ESKIP;
+ }
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ VERBOSE_MSG(qry,
+ "=> NSEC3 sname: match proved NODATA, new TTL %d\n",
+ new_ttl);
+ ans->rcode = PKT_NODATA;
+ return kr_ok();
+
+ } /* else */
+
+ assert(name_labels + 1 == last_nxproven_labels);
+ if (kr_nsec_children_in_zone_check(bm, bm_size) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC3 encloser: found but delegated (or error)\n");
+ return ESKIP;
+ }
+ /* NXDOMAIN proven *except* for wildcards. */
+ WITH_VERBOSE(qry) {
+ auto_free char *name_str = kr_dname_text(name);
+ VERBOSE_MSG(qry,
+ "=> NSEC3 encloser: confirmed as %s, new TTL %d\n",
+ name_str, new_ttl);
+ }
+ *clencl_labels = name_labels;
+ ans->rcode = PKT_NXDOMAIN;
+		/* Avoid a repeated NSEC3 - remove one copy if the hashes match.
+ * This is very unlikely in larger zones: 1/size (per attempt).
+ * Well, deduplication would happen anyway when the answer
+ * from cache is read by kresd (internally). */
+ if (unlikely(0 == memcmp(ans->rrsets[AR_NSEC].set.rr->owner + 1,
+ ans->rrsets[AR_CPE ].set.rr->owner + 1,
+ NSEC3_HASH_LEN))) {
+ memset(&ans->rrsets[AR_CPE], 0, sizeof(ans->rrsets[AR_CPE]));
+ /* LATER(optim.): perhaps check this earlier and avoid some work? */
+ }
+ return kr_ok();
+ }
+
+	/* We've run out of options. */
+ if (last_nxproven_labels > 0) {
+ /* We didn't manage to prove existence of the closest encloser,
+ * meaning the only chance left is a *positive* wildcard record. */
+ *clencl_labels = last_nxproven_labels - 1;
+ ans->rcode = PKT_NXDOMAIN;
+ /* FIXME: review */
+ }
+ return ESKIP;
+}
+
+int nsec3_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ /* Find a previous-or-equal NSEC3 in cache covering or matching
+ * the source of synthesis, checking TTL etc. */
+ const knot_db_val_t key = key_NSEC3_name(k, clencl_name, true, &ans->nsec_p);
+ if (!key.data) return kr_error(1);
+ WITH_VERBOSE(qry) {
+ char hash_txt[NSEC3_HASH_TXT_LEN + 1];
+ key_NSEC3_hash2text(key, hash_txt);
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: hash %s\n", hash_txt);
+ }
+ knot_db_val_t val = { NULL, 0 };
+ bool exact_match;
+ uint32_t new_ttl;
+ const uint8_t *hash_low;
+ const char *err = find_leq_NSEC3(cache, qry, key, k, &ans->nsec_p, &val,
+ &exact_match, &hash_low, &new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: %s\n", err);
+ return kr_ok();
+ }
+
+ /* LATER(optim.): avoid duplicities in answer. */
+
+ /* Basic checks OK -> materialize the data (speculatively). */
+ knot_dname_t owner[KNOT_DNAME_MAXLEN];
+ {
+ int ret = dname_wire_reconstruct(owner, k->zname, hash_low);
+ if (unlikely(ret)) return kr_ok();
+ const struct entry_h *nsec_eh = val.data;
+ ret = entry2answer(ans, AR_WILD, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC3, new_ttl);
+ if (ret) return kr_error(ret);
+ }
+ const knot_rrset_t *nsec_rr = ans->rrsets[AR_WILD].set.rr;
+
+ if (!exact_match) {
+ /* The record proves wildcard non-existence. */
+ WITH_VERBOSE(qry) {
+ char hash_low_txt[NSEC3_HASH_TXT_LEN + 1];
+ nsec3_hash2text(owner, hash_low_txt);
+ VERBOSE_MSG(qry,
+ "=> NSEC3 wildcard: covered by %s -> TODO, new TTL %d\n",
+ hash_low_txt, new_ttl);
+ }
+ return AR_SOA;
+ }
+
+ /* The wildcard exists. Find if it's NODATA - check type bitmap. */
+ const uint8_t *bm = knot_nsec3_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec3_bitmap_len(nsec_rr->rrs.rdata);
+ assert(bm);
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) == 0) {
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: match proved NODATA, new TTL %d\n",
+ new_ttl);
+ ans->rcode = PKT_NODATA;
+ return AR_SOA;
+
+ } /* else */
+ /* The data probably exists -> don't add this NSEC3
+ * and (later) try to find the real wildcard data */
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: should exist (or error)\n");
+ ans->rcode = PKT_NOERROR;
+ memset(&ans->rrsets[AR_WILD], 0, sizeof(ans->rrsets[AR_WILD]));
+ return kr_ok();
+}
+
diff --git a/lib/cache/peek.c b/lib/cache/peek.c
new file mode 100644
index 0000000..1af1627
--- /dev/null
+++ b/lib/cache/peek.c
@@ -0,0 +1,724 @@
+/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "lib/cache/impl.h"
+
+#include "lib/dnssec/ta.h"
+#include "lib/layer/iterate.h"
+
+/* The whole file only exports peek_nosync().
+ * Forward declarations for larger chunks of code: */
+
+static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val,
+ uint8_t lowest_rank);
+static int closest_NS(struct kr_cache *cache, struct key *k, entry_list_t el,
+ struct kr_query *qry, bool only_NS, bool is_DS);
+static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl);
+static int try_wild(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ uint16_t type, uint8_t lowest_rank,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+static int peek_encloser(
+ struct key *k, struct answer *ans, int sname_labels,
+ uint8_t lowest_rank, const struct kr_query *qry, struct kr_cache *cache);
+
+
+static int nsec_p_init(struct nsec_p *nsec_p, knot_db_val_t nsec_p_entry, bool with_knot)
+{
+ const size_t stamp_len = sizeof(uint32_t);
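+	/* Entry layout: 4B timestamp, then the raw NSEC3 parameters;
+	 * an entry with nothing after the timestamp means plain NSEC. */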
+ if (nsec_p_entry.len <= stamp_len) { /* plain NSEC if equal */
+ nsec_p->raw = NULL;
+ nsec_p->hash = 0;
+ return kr_ok();
+ }
+ nsec_p->raw = (uint8_t *)nsec_p_entry.data + stamp_len;
+ nsec_p->hash = nsec_p_mkHash(nsec_p->raw);
+ if (!with_knot) return kr_ok();
+ /* Convert NSEC3 params to another format. */
+ const dnssec_binary_t rdata = {
+ .size = nsec_p_rdlen(nsec_p->raw),
+ .data = (uint8_t *)/*const-cast*/nsec_p->raw,
+ };
+ int ret = dnssec_nsec3_params_from_rdata(&nsec_p->libknot, &rdata);
+ return ret == DNSSEC_EOK ? kr_ok() : kr_error(ret);
+}
+
+static void nsec_p_cleanup(struct nsec_p *nsec_p)
+{
+ dnssec_binary_free(&nsec_p->libknot.salt);
+ /* We don't really need to clear it, but it's not large. (`salt` zeroed above) */
+ memset(nsec_p, 0, sizeof(*nsec_p));
+}
+
+/** Compute new TTL for nsec_p entry, using SOA serial arith.
+ * \param new_ttl (optionally) write the new TTL (even if negative)
+ * \return error code, e.g. kr_error(ESTALE) */
+static int nsec_p_ttl(knot_db_val_t entry, const uint32_t timestamp, int32_t *new_ttl)
+{
+ if (!entry.data) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ uint32_t stamp;
+ if (!entry.len) {
+ return kr_error(ENOENT);
+ }
+ if (entry.len < sizeof(stamp)) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ memcpy(&stamp, entry.data, sizeof(stamp));
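+	/* Serial-number-style arithmetic: the wrapping uint32_t difference
+	 * is reinterpreted as a signed remaining TTL. */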
+ int32_t newttl = stamp - timestamp;
+ if (new_ttl) *new_ttl = newttl;
+ return newttl < 0 ? kr_error(ESTALE) : kr_ok();
+}
+
+static uint8_t get_lowest_rank(const struct kr_request *req, const struct kr_query *qry)
+{
+ /* TODO: move rank handling into the iterator (DNSSEC_* flags)? */
+ const bool allow_unverified =
+ knot_wire_get_cd(req->qsource.packet->wire) || qry->flags.STUB;
+ /* in stub mode we don't trust RRs anyway ^^ */
+ if (qry->flags.NONAUTH) {
+ return KR_RANK_INITIAL;
+ /* Note: there's little sense in validation status for non-auth records.
+		 * When using NONAUTH to get NS IPs, knowing that you ask the correct
+		 * IP doesn't matter much for security; what matters is whether you can
+ * validate the answers from the NS.
+ */
+ } else if (!allow_unverified) {
+ /* Records not present under any TA don't have their security
+ * verified at all, so we also accept low ranks in that case. */
+ const bool ta_covers = kr_ta_covers_qry(req->ctx, qry->sname, qry->stype);
+ /* ^ TODO: performance? TODO: stype - call sites */
+ if (ta_covers) {
+ return KR_RANK_INSECURE | KR_RANK_AUTH;
+		} /* else fallthrough */
+ }
+ return KR_RANK_INITIAL | KR_RANK_AUTH;
+}
+
+
+/** Almost whole .produce phase for the cache module.
+ * \note we don't transition to KR_STATE_FAIL even in case of "unexpected errors".
+ */
+int peek_nosync(kr_layer_t *ctx, knot_pkt_t *pkt)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+ struct kr_cache *cache = &req->ctx->cache;
+
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, qry->sname, false);
+ if (unlikely(ret)) {
+ assert(false);
+ return ctx->state;
+ }
+
+ const uint8_t lowest_rank = get_lowest_rank(req, qry);
+
+ /**** 1. find the name or the closest (available) zone, not considering wildcards
+ **** 1a. exact name+type match (can be negative answer in insecure zones) */
+ {
+ knot_db_val_t key = key_exact_type_maypkt(k, qry->stype);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (!ret) {
+ /* found an entry: test conditions, materialize into pkt, etc. */
+ ret = found_exact_hit(ctx, pkt, val, lowest_rank);
+ }
+ }
+ if (ret && ret != -abs(ENOENT)) {
+ VERBOSE_MSG(qry, "=> exact hit error: %d %s\n", ret, kr_strerror(ret));
+ assert(false);
+ return ctx->state;
+ } else if (!ret) {
+ return KR_STATE_DONE;
+ }
+
+ /**** 1b. otherwise, find the longest prefix zone/xNAME (with OK time+rank). [...] */
+ k->zname = qry->sname;
+ ret = kr_dname_lf(k->buf, k->zname, false); /* LATER(optim.): probably remove */
+ if (unlikely(ret)) {
+ assert(false);
+ return ctx->state;
+ }
+ entry_list_t el;
+ ret = closest_NS(cache, k, el, qry, false, qry->stype == KNOT_RRTYPE_DS);
+ if (ret) {
+ assert(ret == kr_error(ENOENT));
+ if (ret != kr_error(ENOENT) || !el[0].len) {
+ return ctx->state;
+ }
+ }
+ switch (k->type) {
+ case KNOT_RRTYPE_CNAME: {
+ const knot_db_val_t v = el[EL_CNAME];
+ assert(v.data && v.len);
+ const int32_t new_ttl = get_new_ttl(v.data, qry, qry->sname,
+ KNOT_RRTYPE_CNAME, qry->timestamp.tv_sec);
+ ret = answer_simple_hit(ctx, pkt, KNOT_RRTYPE_CNAME, v.data,
+ knot_db_val_bound(v), new_ttl);
+ /* TODO: ^^ cumbersome code; we also recompute the TTL */
+ return ret == kr_ok() ? KR_STATE_DONE : ctx->state;
+ }
+ case KNOT_RRTYPE_DNAME:
+ VERBOSE_MSG(qry, "=> DNAME not supported yet\n"); // LATER
+ return ctx->state;
+ }
+
+ /* We have to try proving from NSEC*. */
+ auto_free char *log_zname = NULL;
+ WITH_VERBOSE(qry) {
+ log_zname = kr_dname_text(k->zname);
+ if (!el[0].len) {
+ VERBOSE_MSG(qry, "=> no NSEC* cached for zone: %s\n", log_zname);
+ }
+ }
+
+#if 0
+ if (!eh) { /* fall back to root hints? */
+ ret = kr_zonecut_set_sbelt(req->ctx, &qry->zone_cut);
+ if (ret) return ctx->state;
+ assert(!qry->zone_cut.parent);
+
+ //VERBOSE_MSG(qry, "=> using root hints\n");
+ //qry->flags.AWAIT_CUT = false;
+ return ctx->state;
+ }
+
+ /* Now `eh` points to the closest NS record that we've found,
+ * and that's the only place to start - we may either find
+ * a negative proof or we may query upstream from that point. */
+ kr_zonecut_set(&qry->zone_cut, k->zname);
+ ret = kr_make_query(qry, pkt); // TODO: probably not yet - qname minimization
+ if (ret) return ctx->state;
+#endif
+
+ /** Structure for collecting multiple NSEC* + RRSIG records,
+ * in preparation for the answer, and for tracking the progress. */
+ struct answer ans;
+ memset(&ans, 0, sizeof(ans));
+ ans.mm = &pkt->mm;
+ const int sname_labels = knot_dname_labels(qry->sname, NULL);
+
+ /* Try the NSEC* parameters in order, until success.
+ * Let's not mix different parameters for NSEC* RRs in a single proof. */
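+	/* The first ENTRY_APEX_NSECS_CNT slots of el hold the nsec_p entries
+	 * parsed from the zone-apex record; try each in turn. */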
+ for (int i = 0; ;) {
+ int32_t log_new_ttl = -123456789; /* visually recognizable value */
+ ret = nsec_p_ttl(el[i], qry->timestamp.tv_sec, &log_new_ttl);
+ if (!ret || VERBOSE_STATUS) {
+ nsec_p_init(&ans.nsec_p, el[i], !ret);
+ }
+ if (ret) {
+			VERBOSE_MSG(qry, "=> skipping zone: %s, %s, hash %x; "
+ "new TTL %d, ret %d\n",
+ log_zname, (ans.nsec_p.raw ? "NSEC3" : "NSEC"),
+ (unsigned)ans.nsec_p.hash, (int)log_new_ttl, ret);
+ /* no need for nsec_p_cleanup() in this case */
+ goto cont;
+ }
+ VERBOSE_MSG(qry, "=> trying zone: %s, %s, hash %x\n",
+ log_zname, (ans.nsec_p.raw ? "NSEC3" : "NSEC"),
+ (unsigned)ans.nsec_p.hash);
+ /**** 2. and 3. inside */
+ ret = peek_encloser(k, &ans, sname_labels,
+ lowest_rank, qry, cache);
+ nsec_p_cleanup(&ans.nsec_p);
+ if (!ret) break;
+ if (ret < 0) return ctx->state;
+ cont:
+ /* Otherwise we try another nsec_p, if available. */
+ if (++i == ENTRY_APEX_NSECS_CNT) return ctx->state;
+ /* clear possible partial answers in `ans` (no need to deallocate) */
+ ans.rcode = 0;
+ memset(&ans.rrsets, 0, sizeof(ans.rrsets));
+ }
+
+ /**** 4. add SOA iff needed */
+ if (ans.rcode != PKT_NOERROR) {
+ /* Assuming k->buf still starts with zone's prefix,
+ * look up the SOA in cache. */
+ k->buf[0] = k->zlf_len;
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_SOA);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ const struct entry_h *eh;
+ if (ret || !(eh = entry_h_consistent(val, KNOT_RRTYPE_SOA))) {
+ assert(ret); /* only want to catch `eh` failures */
+ VERBOSE_MSG(qry, "=> SOA missed\n");
+ return ctx->state;
+ }
+ /* Check if the record is OK. */
+ int32_t new_ttl = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_SOA,
+ qry->timestamp.tv_sec);
+ if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_packet) {
+ VERBOSE_MSG(qry, "=> SOA unfit %s: rank 0%.2o, new TTL %d\n",
+ (eh->is_packet ? "packet" : "RR"),
+ eh->rank, new_ttl);
+ return ctx->state;
+ }
+ /* Add the SOA into the answer. */
+ ret = entry2answer(&ans, AR_SOA, eh, knot_db_val_bound(val),
+ k->zname, KNOT_RRTYPE_SOA, new_ttl);
+ if (ret) return ctx->state;
+ }
+
+ /* Find our target RCODE. */
+ int real_rcode;
+ switch (ans.rcode) {
+ case PKT_NODATA:
+ case PKT_NOERROR: /* positive wildcarded response */
+ real_rcode = KNOT_RCODE_NOERROR;
+ break;
+ case PKT_NXDOMAIN:
+ real_rcode = KNOT_RCODE_NXDOMAIN;
+ break;
+ default:
+ assert(false);
+ case 0: /* i.e. nothing was found */
+ /* LATER(optim.): zone cut? */
+ VERBOSE_MSG(qry, "=> cache miss\n");
+ return ctx->state;
+ }
+
+ if (pkt_renew(pkt, qry->sname, qry->stype)
+ || knot_pkt_begin(pkt, KNOT_ANSWER)
+ ) {
+ assert(false);
+ return ctx->state;
+ }
+ knot_wire_set_rcode(pkt->wire, real_rcode);
+
+ bool expiring = false; // TODO
+ VERBOSE_MSG(qry, "=> writing RRsets: ");
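+	/* Index 0 lands in the ANSWER section; the remaining slots are written
+	 * into AUTHORITY (the section switch at i == 1 below). */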
+ for (int i = 0; i < sizeof(ans.rrsets) / sizeof(ans.rrsets[0]); ++i) {
+ if (i == 1) knot_pkt_begin(pkt, KNOT_AUTHORITY);
+ if (!ans.rrsets[i].set.rr) continue;
+ expiring = expiring || ans.rrsets[i].set.expiring;
+ ret = pkt_append(pkt, &ans.rrsets[i], ans.rrsets[i].set.rank);
+ if (ret) {
+ assert(false);
+ return ctx->state;
+ }
+ kr_log_verbose(kr_rank_test(ans.rrsets[i].set.rank, KR_RANK_SECURE)
+ ? "+" : "-");
+ }
+ kr_log_verbose("\n");
+
+ /* Finishing touches. */
+ struct kr_qflags * const qf = &qry->flags;
+ qf->EXPIRING = expiring;
+ qf->CACHED = true;
+ qf->NO_MINIMIZE = true;
+
+ return KR_STATE_DONE;
+}
+
+/**
+ * This is where the high-level "business logic" of the aggressive cache lives.
+ * \return 0: success (may need SOA); >0: try other nsec_p; <0: exit cache immediately.
+ */
+static int peek_encloser(
+ struct key *k, struct answer *ans, const int sname_labels,
+ uint8_t lowest_rank, const struct kr_query *qry, struct kr_cache *cache)
+{
+ /** Start of NSEC* covering the sname;
+ * it's part of key - the one within zone (read only) */
+ knot_db_val_t cover_low_kwz = { NULL, 0 };
+ knot_dname_t cover_hi_storage[KNOT_DNAME_MAXLEN];
+ /** End of NSEC* covering the sname. */
+ knot_db_val_t cover_hi_kwz = {
+ .data = cover_hi_storage,
+ .len = sizeof(cover_hi_storage),
+ };
+
+ /**** 2. Find a closest (provable) encloser (of sname). */
+ int clencl_labels = -1;
+ bool clencl_is_tentative = false;
+ if (!ans->nsec_p.raw) { /* NSEC */
+ int ret = nsec1_encloser(k, ans, sname_labels, &clencl_labels,
+ &cover_low_kwz, &cover_hi_kwz, qry, cache);
+ if (ret) return ret;
+ } else {
+ int ret = nsec3_encloser(k, ans, sname_labels, &clencl_labels,
+ qry, cache);
+ clencl_is_tentative = ret == ABS(ENOENT) && clencl_labels >= 0;
+ /* ^^ Last chance: *positive* wildcard record under this clencl. */
+ if (ret && !clencl_is_tentative) return ret;
+ }
+
+ /* We should have either a match or a cover at this point. */
+ if (ans->rcode != PKT_NODATA && ans->rcode != PKT_NXDOMAIN) {
+ assert(false);
+ return kr_error(EINVAL);
+ }
+ const bool ncloser_covered = ans->rcode == PKT_NXDOMAIN;
+
+ /** Name of the closest (provable) encloser. */
+ const knot_dname_t *clencl_name = qry->sname;
+ for (int l = sname_labels; l > clencl_labels; --l)
+ clencl_name = knot_wire_next_label(clencl_name, NULL);
+
+ /**** 3. source of synthesis checks, in case the next closer name was covered.
+ **** 3a. We want to query for NSEC* of source of synthesis (SS) or its
+ * predecessor, providing us with a proof of its existence or non-existence. */
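+	/* Terminology (RFC 4592/5155): if the closest encloser is example. and
+	 * sname is a.b.example., the next closer name is b.example. and the
+	 * source of synthesis is *.example. */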
+ if (ncloser_covered && !ans->nsec_p.raw) {
+ int ret = nsec1_src_synth(k, ans, clencl_name,
+ cover_low_kwz, cover_hi_kwz, qry, cache);
+ if (ret == AR_SOA) return 0;
+ assert(ret <= 0);
+ if (ret) return ret;
+
+ } else if (ncloser_covered && ans->nsec_p.raw && !clencl_is_tentative) {
+ int ret = nsec3_src_synth(k, ans, clencl_name, qry, cache);
+ if (ret == AR_SOA) return 0;
+ assert(ret <= 0);
+ if (ret) return ret;
+
+ } /* else (!ncloser_covered) so no wildcard checks needed,
+ * as we proved that sname exists. */
+
+ /**** 3b. find wildcarded answer, if next closer name was covered
+ * and we don't have a full proof yet. (common for NSEC*) */
+ if (!ncloser_covered)
+ return kr_ok(); /* decrease indentation */
+ /* Construct key for exact qry->stype + source of synthesis. */
+ int ret = kr_dname_lf(k->buf, clencl_name, true);
+ if (ret) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ const uint16_t types[] = { qry->stype, KNOT_RRTYPE_CNAME };
+ for (int i = 0; i < (2 - (qry->stype == KNOT_RRTYPE_CNAME)); ++i) {
+ ret = try_wild(k, ans, clencl_name, types[i],
+ lowest_rank, qry, cache);
+ if (ret == kr_ok()) {
+ return kr_ok();
+ } else if (ret != -ABS(ENOENT) && ret != -ABS(ESTALE)) {
+ assert(false);
+ return kr_error(ret);
+ }
+ /* else continue */
+ }
+ /* Neither attempt succeeded, but the NSEC* proofs were found,
+	 * so skip trying other parameters, as repeating the same wildcard
+	 * search would very likely turn out the same way. */
+ return -ABS(ENOENT);
+}
+
+
+static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl)
+#define CHECK_RET(ret) do { \
+ if ((ret) < 0) { assert(false); return kr_error((ret)); } \
+} while (false)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+
+ /* All OK, so start constructing the (pseudo-)packet. */
+ int ret = pkt_renew(pkt, qry->sname, qry->stype);
+ CHECK_RET(ret);
+
+ /* Materialize the sets for the answer in (pseudo-)packet. */
+ struct answer ans;
+ memset(&ans, 0, sizeof(ans));
+ ans.mm = &pkt->mm;
+ ret = entry2answer(&ans, AR_ANSWER, eh, eh_bound,
+ qry->sname, type, new_ttl);
+ CHECK_RET(ret);
+ /* Put links to the materialized data into the pkt. */
+ ret = pkt_append(pkt, &ans.rrsets[AR_ANSWER], eh->rank);
+ CHECK_RET(ret);
+
+ /* Finishing touches. */
+ struct kr_qflags * const qf = &qry->flags;
+ qf->EXPIRING = is_expiring(eh->ttl, new_ttl);
+ qf->CACHED = true;
+ qf->NO_MINIMIZE = true;
+ qf->DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE);
+ if (qf->DNSSEC_INSECURE) {
+ qf->DNSSEC_WANT = false;
+ }
+ VERBOSE_MSG(qry, "=> satisfied by exact %s: rank 0%.2o, new TTL %d\n",
+ (type == KNOT_RRTYPE_CNAME ? "CNAME" : "RRset"),
+ eh->rank, new_ttl);
+ return kr_ok();
+}
+#undef CHECK_RET
+
+
+/** TODO: description; see the single call site for now. */
+static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val,
+ uint8_t lowest_rank)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+
+ int ret = entry_h_seek(&val, qry->stype);
+ if (ret) return ret;
+ const struct entry_h *eh = entry_h_consistent(val, qry->stype);
+ if (!eh) {
+ assert(false);
+ return kr_error(ENOENT);
+ // LATER: recovery in case of error, perhaps via removing the entry?
+		// LATER(optim): perhaps optimize the zone cut search
+ }
+
+ int32_t new_ttl = get_new_ttl(eh, qry, qry->sname, qry->stype,
+ qry->timestamp.tv_sec);
+ if (new_ttl < 0 || eh->rank < lowest_rank) {
+ /* Positive record with stale TTL or bad rank.
+ * LATER(optim.): It's unlikely that we find a negative one,
+ * so we might theoretically skip all the cache code. */
+
+ VERBOSE_MSG(qry, "=> skipping exact %s: rank 0%.2o (min. 0%.2o), new TTL %d\n",
+ eh->is_packet ? "packet" : "RR", eh->rank, lowest_rank, new_ttl);
+ return kr_error(ENOENT);
+ }
+
+ const uint8_t *eh_bound = knot_db_val_bound(val);
+ if (eh->is_packet) {
+ /* Note: we answer here immediately, even if it's (theoretically)
+ * possible that we could generate a higher-security negative proof.
+ * Rank is high-enough so we take it to save time searching. */
+ return answer_from_pkt (ctx, pkt, qry->stype, eh, eh_bound, new_ttl);
+ } else {
+ return answer_simple_hit(ctx, pkt, qry->stype, eh, eh_bound, new_ttl);
+ }
+}
+
+
+/** Try to satisfy via wildcard (positively). See the single call site. */
+static int try_wild(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ const uint16_t type, const uint8_t lowest_rank,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ knot_db_val_t key = key_exact_type(k, type);
+ /* Find the record. */
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read, &key, &val, 1);
+ if (!ret) {
+ ret = entry_h_seek(&val, type);
+ }
+ if (ret) {
+ if (ret != -ABS(ENOENT)) {
+ VERBOSE_MSG(qry, "=> wildcard: hit error %d %s\n",
+ ret, strerror(abs(ret)));
+ assert(false);
+ }
+ WITH_VERBOSE(qry) {
+ auto_free char *clencl_str = kr_dname_text(clencl_name),
+ *type_str = kr_rrtype_text(type);
+ VERBOSE_MSG(qry, "=> wildcard: not found: *.%s %s\n",
+ clencl_str, type_str);
+ }
+ return ret;
+ }
+ /* Check if the record is OK. */
+ const struct entry_h *eh = entry_h_consistent(val, type);
+ if (!eh) {
+ assert(false);
+ return kr_error(ret);
+ // LATER: recovery in case of error, perhaps via removing the entry?
+ }
+ int32_t new_ttl = get_new_ttl(eh, qry, qry->sname, type, qry->timestamp.tv_sec);
+ /* ^^ here we use the *expanded* wildcard name */
+ if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_packet) {
+ /* Wildcard record with stale TTL, bad rank or packet. */
+ VERBOSE_MSG(qry, "=> wildcard: skipping %s, rank 0%.2o, new TTL %d\n",
+ eh->is_packet ? "packet" : "RR", eh->rank, new_ttl);
+ return -ABS(ESTALE);
+ }
+ /* Add the RR into the answer. */
+ ret = entry2answer(ans, AR_ANSWER, eh, knot_db_val_bound(val),
+ qry->sname, type, new_ttl);
+ VERBOSE_MSG(qry, "=> wildcard: answer expanded, ret = %d, new TTL %d\n",
+ ret, (int)new_ttl);
+ if (ret) return kr_error(ret);
+ ans->rcode = PKT_NOERROR;
+ return kr_ok();
+}
+
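+/* Illustrative usage sketch (hypothetical caller): the out-parameter must be
+ * NULL on entry, and the returned copy comes from knot_dname_copy(..., NULL),
+ * i.e. the default allocator, so plain free() releases it.
+ *
+ *	knot_dname_t *apex = NULL;
+ *	if (kr_cache_closest_apex(cache, qname, false, &apex) == kr_ok()) {
+ *		// ... use apex ...
+ *		free(apex);
+ *	}
+ */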
+int kr_cache_closest_apex(struct kr_cache *cache, const knot_dname_t *name, bool is_DS,
+ knot_dname_t ** apex)
+{
+ if (!cache || !cache->db || !name || !apex || *apex) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret)
+ return kr_error(ret);
+ entry_list_t el_;
+ k->zname = name;
+ ret = closest_NS(cache, k, el_, NULL, true, is_DS);
+ if (ret && ret != -abs(ENOENT))
+ return ret;
+ *apex = knot_dname_copy(k->zname, NULL);
+ if (!*apex)
+ return kr_error(ENOMEM);
+ return kr_ok();
+}
+
+/** \internal for closest_NS. Check suitability of a single entry, setting k->type if OK.
+ * \return error code, negative iff whole list should be skipped.
+ */
+static int check_NS_entry(struct key *k, knot_db_val_t entry, int i,
+ bool exact_match, bool is_DS,
+ const struct kr_query *qry, uint32_t timestamp);
+
+/**
+ * Find the longest prefix zone/xNAME (with OK time+rank), starting at k->*.
+ *
+ * The found type is returned via k->type; the values are returned in el.
+ * \note we use k->type = KNOT_RRTYPE_NS also for the nsec_p result.
+ * \param qry can be NULL (-> gettimeofday(), but you lose the stale-serve hook)
+ * \param only_NS don't consider xNAMEs
+ * \return error code
+ */
+static int closest_NS(struct kr_cache *cache, struct key *k, entry_list_t el,
+ struct kr_query *qry, const bool only_NS, const bool is_DS)
+{
+ /* get the current timestamp */
+ uint32_t timestamp;
+ if (qry) {
+ timestamp = qry->timestamp.tv_sec;
+ } else {
+ struct timeval tv;
+ if (gettimeofday(&tv, NULL)) return kr_error(errno);
+ timestamp = tv.tv_sec;
+ }
+
+ int zlf_len = k->buf[0];
+
+ // LATER(optim): if stype is NS, we check the same value again
+ bool exact_match = true;
+ bool need_zero = true;
+ /* Inspect the NS/xNAME entries, shortening by a label on each iteration. */
+ do {
+ k->buf[0] = zlf_len;
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_NS);
+ knot_db_val_t val;
+ int ret = cache_op(cache, read, &key, &val, 1);
+ if (ret == -abs(ENOENT)) goto next_label;
+ if (ret) {
+ assert(!ret);
+ if (need_zero) memset(el, 0, sizeof(entry_list_t));
+ return kr_error(ret);
+ }
+
+ /* Check consistency, find any type;
+ * using `goto` for shortening by another label. */
+ ret = entry_list_parse(val, el);
+ if (ret) {
+ assert(!ret); // do something about it?
+ goto next_label;
+ }
+ need_zero = false;
+ /* More types are possible; try in order.
+ * For non-fatal failures just "continue;" to try the next type. */
+ const int el_count = only_NS ? EL_NS + 1 : EL_LENGTH;
+ for (int i = 0; i < el_count; ++i) {
+ ret = check_NS_entry(k, el[i], i, exact_match, is_DS,
+ qry, timestamp);
+ if (ret < 0) goto next_label; else
+ if (!ret) {
+ /* We found our match. */
+ k->zlf_len = zlf_len;
+ return kr_ok();
+ }
+ }
+
+ next_label:
+ /* remove one more label */
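+		/* kr_dname_lf() stores labels in reversed order, so dropping the
+		 * leftmost label of zname corresponds to truncating the tail of
+		 * the lookup-format name held in k->buf. */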
+ exact_match = false;
+ if (k->zname[0] == 0) {
+			/* The root NS is missing from the cache, but let's at least assume it exists. */
+ k->type = KNOT_RRTYPE_NS;
+ k->zlf_len = zlf_len;
+ assert(zlf_len == 0);
+ if (need_zero) memset(el, 0, sizeof(entry_list_t));
+ return kr_error(ENOENT);
+ }
+ zlf_len -= (k->zname[0] + 1);
+ k->zname += (k->zname[0] + 1);
+ k->buf[zlf_len + 1] = 0;
+ } while (true);
+}
+
+static int check_NS_entry(struct key *k, const knot_db_val_t entry, const int i,
+ const bool exact_match, const bool is_DS,
+ const struct kr_query *qry, uint32_t timestamp)
+{
+ const int ESKIP = ABS(ENOENT);
+ if (!entry.len
+ /* On a zone cut we want DS from the parent zone. */
+ || (i <= EL_NS && exact_match && is_DS)
+ /* CNAME is interesting only if we
+ * directly hit the name that was asked.
+ * Note that we want it even in the DS case. */
+ || (i == EL_CNAME && !exact_match)
+ /* DNAME is interesting only if we did NOT
+ * directly hit the name that was asked. */
+ || (i == EL_DNAME && exact_match)
+ ) {
+ return ESKIP;
+ }
+
+ uint16_t type;
+ if (i < ENTRY_APEX_NSECS_CNT) {
+ type = KNOT_RRTYPE_NS;
+ int32_t log_new_ttl = -123456789; /* visually recognizable value */
+ const int err = nsec_p_ttl(entry, timestamp, &log_new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry,
+ "=> skipping unfit nsec_p: new TTL %d, error %d\n",
+ (int)log_new_ttl, err);
+ return ESKIP;
+ }
+ } else {
+ type = EL2RRTYPE(i);
+ /* Find the entry for the type, check positivity, TTL */
+ const struct entry_h *eh = entry_h_consistent(entry, type);
+ if (!eh) {
+ VERBOSE_MSG(qry, "=> EH not consistent\n");
+ assert(false);
+ return kr_error(EILSEQ);
+ }
+ const int32_t log_new_ttl = get_new_ttl(eh, qry, k->zname, type, timestamp);
+ const uint8_t rank_min = KR_RANK_INSECURE | KR_RANK_AUTH;
+ const bool ok = /* For NS any kr_rank is accepted,
+ * as insecure or even nonauth is OK */
+ (type == KNOT_RRTYPE_NS || eh->rank >= rank_min)
+ /* Not interested in negative bogus or outdated RRs. */
+ && !eh->is_packet && log_new_ttl >= 0;
+ WITH_VERBOSE(qry) { if (!ok) {
+ auto_free char *type_str = kr_rrtype_text(type);
+ const char *packet_str = eh->is_packet ? "packet" : "RR";
+ VERBOSE_MSG(qry,
+ "=> skipping unfit %s %s: rank 0%.2o, new TTL %d\n",
+ type_str, packet_str, eh->rank, (int)log_new_ttl);
+ } }
+ if (!ok) return ESKIP;
+ }
+ k->type = type;
+ return kr_ok();
+}
+