Diffstat
-rw-r--r--lib/cache/api.c889
-rw-r--r--lib/cache/api.h201
-rw-r--r--lib/cache/cdb_api.h68
-rw-r--r--lib/cache/cdb_lmdb.c710
-rw-r--r--lib/cache/cdb_lmdb.h23
-rw-r--r--lib/cache/entry_list.c293
-rw-r--r--lib/cache/entry_pkt.c224
-rw-r--r--lib/cache/entry_rr.c146
-rw-r--r--lib/cache/impl.h412
-rw-r--r--lib/cache/knot_pkt.c106
-rw-r--r--lib/cache/nsec1.c511
-rw-r--r--lib/cache/nsec3.c501
-rw-r--r--lib/cache/peek.c724
13 files changed, 4808 insertions, 0 deletions
diff --git a/lib/cache/api.c b/lib/cache/api.c
new file mode 100644
index 0000000..c0591d6
--- /dev/null
+++ b/lib/cache/api.c
@@ -0,0 +1,889 @@
+/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <libknot/descriptor.h>
+#include <libknot/dname.h>
+#include <libknot/errcode.h>
+#include <libknot/rrtype/rrsig.h>
+
+#include "contrib/cleanup.h"
+#include "contrib/ucw/lib.h"
+#include "lib/cache/api.h"
+#include "lib/cache/cdb_lmdb.h"
+#include "lib/defines.h"
+#include "lib/generic/trie.h"
+#include "lib/resolve.h"
+#include "lib/rplan.h"
+#include "lib/utils.h"
+
+#include "lib/cache/impl.h"
+
+/* TODO:
+ * - Reconsider when RRSIGs are put in and retrieved from the cache.
+ * Currently it's always done, which _might_ be spurious, depending
+ * on how kresd will use the returned result.
+ * There's also the "problem" that kresd ATM does _not_ ask upstream
+ * with DO bit in some cases.
+ */
+
+
+/** Cache version */
+static const uint16_t CACHE_VERSION = 5;
+/** Key size */
+#define KEY_HSIZE (sizeof(uint8_t) + sizeof(uint16_t))
+#define KEY_SIZE (KEY_HSIZE + KNOT_DNAME_MAXLEN)
+
+
+/** @internal Forward declarations of the implementation details
+ * \param has_optout[out] Set *has_optout = true; when encountering an opt-out NSEC3 (optional). */
+static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp,
+ uint8_t rank, trie_t *nsec_pmap, bool *has_optout);
+/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */
+static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/);
+
+/** @internal Removes all records from cache. */
+static inline int cache_clear(struct kr_cache *cache)
+{
+ cache->stats.delete += 1;
+ return cache_op(cache, clear);
+}
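For reference, the cache_op() helper used above (and throughout this file) lives in ./impl.h, which is not part of this diff; a minimal sketch of that dispatch macro, assuming the usual definition:

	/* Sketch: dispatch one kr_cdb_api operation on the cache's backend. */
	#define cache_op(cache, op, ...) \
		(cache)->api->op((cache)->db, ## __VA_ARGS__)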
+
+/** @internal Open cache db transaction and check internal data version. */
+static int assert_right_version(struct kr_cache *cache)
+{
+ /* Check cache ABI version. */
+ /* CACHE_KEY_DEF: to avoid collisions with kr_cache_match(). */
+ uint8_t key_str[4] = "VERS";
+ knot_db_val_t key = { .data = key_str, .len = sizeof(key_str) };
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read, &key, &val, 1);
+ if (ret == 0 && val.len == sizeof(CACHE_VERSION)
+ && memcmp(val.data, &CACHE_VERSION, sizeof(CACHE_VERSION)) == 0) {
+ ret = kr_error(EEXIST);
+ } else {
+ int oldret = ret;
+ /* Version doesn't match. Recreate cache and write version key. */
+ ret = cache_op(cache, count);
+ if (ret != 0) { /* Non-empty cache, purge it. */
+ kr_log_info("[ ][cach] incompatible cache database detected, purging\n");
+ if (oldret) {
+ kr_log_verbose("bad ret: %d\n", oldret);
+ } else if (val.len != sizeof(CACHE_VERSION)) {
+ kr_log_verbose("bad length: %d\n", (int)val.len);
+ } else {
+ uint16_t ver;
+ memcpy(&ver, val.data, sizeof(ver));
+ kr_log_verbose("bad version: %d\n", (int)ver);
+ }
+ ret = cache_clear(cache);
+ }
+ /* Either purged or empty. */
+ if (ret == 0) {
+ /* Key/Val is invalidated by cache purge, recreate it */
+ val.data = /*const-cast*/(void *)&CACHE_VERSION;
+ val.len = sizeof(CACHE_VERSION);
+ ret = cache_op(cache, write, &key, &val, 1);
+ }
+ }
+ kr_cache_sync(cache);
+ return ret;
+}
+
+int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm)
+{
+ if (!cache) {
+ return kr_error(EINVAL);
+ }
+ /* Open cache */
+ if (!api) {
+ api = kr_cdb_lmdb();
+ }
+ cache->api = api;
+ int ret = cache->api->open(&cache->db, opts, mm);
+ if (ret != 0) {
+ return ret;
+ }
+ memset(&cache->stats, 0, sizeof(cache->stats));
+ cache->ttl_min = KR_CACHE_DEFAULT_TTL_MIN;
+ cache->ttl_max = KR_CACHE_DEFAULT_TTL_MAX;
+ /* Check cache ABI version */
+ kr_cache_make_checkpoint(cache);
+ (void)assert_right_version(cache);
+
+ char *fpath;
+ ret = asprintf(&fpath, "%s/data.mdb", opts->path);
+ if (ret > 0) {
+ kr_cache_emergency_file_to_remove = fpath;
+ } else {
+ assert(false); /* non-critical, but still */
+ }
+ return 0;
+}
+
+const char *kr_cache_emergency_file_to_remove = NULL;
+
+
+#define cache_isvalid(cache) ((cache) && (cache)->api && (cache)->db)
+
+void kr_cache_close(struct kr_cache *cache)
+{
+ if (cache_isvalid(cache)) {
+ cache_op(cache, close);
+ cache->db = NULL;
+ }
+ free(/*const-cast*/(char*)kr_cache_emergency_file_to_remove);
+ kr_cache_emergency_file_to_remove = NULL;
+}
+
+int kr_cache_sync(struct kr_cache *cache)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ if (cache->api->sync) {
+ return cache_op(cache, sync);
+ }
+ return kr_ok();
+}
+
+int kr_cache_insert_rr(struct kr_cache *cache, const knot_rrset_t *rr, const knot_rrset_t *rrsig, uint8_t rank, uint32_t timestamp)
+{
+ int err = stash_rrset_precond(rr, NULL);
+ if (err <= 0) {
+ return kr_ok();
+ }
+ ssize_t written = stash_rrset(cache, NULL, rr, rrsig, timestamp, rank, NULL, NULL);
+	/* Zone's NSEC* params aren't updated, but that's probably OK
+ * for kr_cache_insert_rr() */
+ if (written >= 0) {
+ return kr_ok();
+ }
+
+ return (int) written;
+}
+
+int kr_cache_clear(struct kr_cache *cache)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ int ret = cache_clear(cache);
+ if (ret == 0) {
+ kr_cache_make_checkpoint(cache);
+ ret = assert_right_version(cache);
+ }
+ return ret;
+}
+
+/* When going stricter, BEWARE of breaking entry_h_consistent_NSEC() */
+struct entry_h * entry_h_consistent(knot_db_val_t data, uint16_t type)
+{
+ (void) type; /* unused, for now */
+ if (!data.data) return NULL;
+ /* Length checks. */
+ if (data.len < offsetof(struct entry_h, data))
+ return NULL;
+ const struct entry_h *eh = data.data;
+ if (eh->is_packet) {
+ uint16_t pkt_len;
+ if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len)) {
+ return NULL;
+ }
+ memcpy(&pkt_len, eh->data, sizeof(pkt_len));
+ if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len)
+ + pkt_len) {
+ return NULL;
+ }
+ }
+
+ bool ok = true;
+ ok = ok && kr_rank_check(eh->rank);
+ ok = ok && (!kr_rank_test(eh->rank, KR_RANK_BOGUS)
+ || eh->is_packet);
+ ok = ok && (eh->is_packet || !eh->has_optout);
+
+ return ok ? /*const-cast*/(struct entry_h *)eh : NULL;
+}
+
+int32_t get_new_ttl(const struct entry_h *entry, const struct kr_query *qry,
+ const knot_dname_t *owner, uint16_t type, uint32_t now)
+{
+ int32_t diff = now - entry->time;
+ if (diff < 0) {
+ /* We may have obtained the record *after* the request started. */
+ diff = 0;
+ }
+ int32_t res = entry->ttl - diff;
+ if (res < 0 && owner && qry && qry->stale_cb) {
+ /* Stale-serving decision, delegated to a callback. */
+ int res_stale = qry->stale_cb(res, owner, type, qry);
+ if (res_stale >= 0)
+ return res_stale;
+ }
+ return res;
+}
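A small worked example of the computation above: an entry stored with time = 1000 and ttl = 300 yields 300 - (1200 - 1000) = 100 seconds of remaining TTL when queried at now = 1200; at now = 1400 the result is -100, and only then may qry->stale_cb replace the negative value with a stale-serving TTL.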
+
+int32_t kr_cache_ttl(const struct kr_cache_p *peek, const struct kr_query *qry,
+ const knot_dname_t *name, uint16_t type)
+{
+ const struct entry_h *eh = peek->raw_data;
+ return get_new_ttl(eh, qry, name, type, qry->timestamp.tv_sec);
+}
+
+/** Check that no label contains a zero character, incl. a log trace.
+ *
+ * We refuse to work with those, as LF and our cache keys might become ambiguous.
+ * Assuming uncompressed name, as usual.
+ * CACHE_KEY_DEF
+ */
+static bool check_dname_for_lf(const knot_dname_t *n, const struct kr_query *qry/*logging*/)
+{
+ const bool ret = knot_dname_size(n) == strlen((const char *)n) + 1;
+ if (!ret) { WITH_VERBOSE(qry) {
+ auto_free char *n_str = kr_dname_text(n);
+ VERBOSE_MSG(qry, "=> skipping zero-containing name %s\n", n_str);
+ } }
+ return ret;
+}
+
+/** Return false on types to be ignored. Meant both for sname and direct cache requests. */
+static bool check_rrtype(uint16_t type, const struct kr_query *qry/*logging*/)
+{
+ const bool ret = !knot_rrtype_is_metatype(type)
+ && type != KNOT_RRTYPE_RRSIG;
+ if (!ret) { WITH_VERBOSE(qry) {
+ auto_free char *type_str = kr_rrtype_text(type);
+ VERBOSE_MSG(qry, "=> skipping RR type %s\n", type_str);
+ } }
+ return ret;
+}
+
+/** Like key_exact_type() but omits a couple checks not holding for pkt cache. */
+knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type)
+{
+ assert(check_rrtype(type, NULL));
+ switch (type) {
+ case KNOT_RRTYPE_RRSIG: /* no RRSIG query caching, at least for now */
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ /* xNAME lumped into NS. */
+ case KNOT_RRTYPE_CNAME:
+ case KNOT_RRTYPE_DNAME:
+ type = KNOT_RRTYPE_NS;
+ default:
+ break;
+ }
+
+ int name_len = k->buf[0];
+ k->buf[name_len + 1] = 0; /* make sure different names can never match */
+ k->buf[name_len + 2] = 'E'; /* tag for exact name+type matches */
+ memcpy(k->buf + name_len + 3, &type, 2);
+ k->type = type;
+ /* CACHE_KEY_DEF: key == dname_lf + '\0' + 'E' + RRTYPE */
+ return (knot_db_val_t){ k->buf + 1, name_len + 4 };
+}
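For orientation, a sketch of the key layout built above (derived from this code; the lookup-format (LF) name itself is written into k->buf earlier by kr_dname_lf()):

	/* CACHE_KEY_DEF sketch, with name_len = length of the name in LF:
	 *   k->buf[0]                  name_len (not part of the key itself)
	 *   k->buf[1 .. name_len]      the name in LF (reversed labels)
	 *   k->buf[name_len + 1]       '\0'  so different names can never match
	 *   k->buf[name_len + 2]       'E'   tag for exact name+type matches
	 *   k->buf[name_len + 3, +4]   RRTYPE, two bytes memcpy'd
	 * The returned knot_db_val_t starts at k->buf + 1 and spans name_len + 4 bytes. */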
+
+
+/** The inside for cache_peek(); implementation separated to ./peek.c */
+int peek_nosync(kr_layer_t *ctx, knot_pkt_t *pkt);
+/** function for .produce phase */
+int cache_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+ /* We first check various exit-conditions and then call the _real function. */
+
+ if (!kr_cache_is_open(&req->ctx->cache)
+ || ctx->state & (KR_STATE_FAIL|KR_STATE_DONE) || qry->flags.NO_CACHE
+ || (qry->flags.CACHE_TRIED && !qry->stale_cb)
+ || !check_rrtype(qry->stype, qry) /* LATER: some other behavior for some of these? */
+ || qry->sclass != KNOT_CLASS_IN) {
+ return ctx->state; /* Already resolved/failed or already tried, etc. */
+ }
+ /* ATM cache only peeks for qry->sname and that would be useless
+ * to repeat on every iteration, so disable it from now on.
+ * LATER(optim.): assist with more precise QNAME minimization. */
+ qry->flags.CACHE_TRIED = true;
+
+ if (qry->stype == KNOT_RRTYPE_NSEC) {
+ VERBOSE_MSG(qry, "=> skipping stype NSEC\n");
+ return ctx->state;
+ }
+ if (!check_dname_for_lf(qry->sname, qry)) {
+ return ctx->state;
+ }
+
+ int ret = peek_nosync(ctx, pkt);
+ kr_cache_sync(&req->ctx->cache);
+ return ret;
+}
+
+
+
+/** The body of the stashing loop in cache_stash(), moved out to reduce indentation. \return error code. */
+static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i,
+ const struct kr_query *qry, struct kr_cache *cache,
+ int *unauth_cnt, trie_t *nsec_pmap, bool *has_optout);
+/** Stash a single nsec_p. \return 0 (errors are ignored). */
+static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v,
+ struct kr_request *req);
+
+/** The whole .consume phase for the cache module. */
+int cache_stash(kr_layer_t *ctx, knot_pkt_t *pkt)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+ struct kr_cache *cache = &req->ctx->cache;
+
+ /* Note: we cache even in KR_STATE_FAIL. For example,
+ * BOGUS answer can go to +cd cache even without +cd request. */
+ if (!kr_cache_is_open(cache) || !qry
+ || qry->flags.CACHED || !check_rrtype(knot_pkt_qtype(pkt), qry)
+ || qry->sclass != KNOT_CLASS_IN) {
+ return ctx->state;
+ }
+ /* Do not cache truncated answers, at least for now. LATER */
+ if (knot_wire_get_tc(pkt->wire)) {
+ return ctx->state;
+ }
+ /* Stash individual records. */
+ ranked_rr_array_t *selected[] = kr_request_selected(req);
+ int unauth_cnt = 0;
+ trie_t *nsec_pmap = trie_create(&req->pool);
+ if (!nsec_pmap) {
+ assert(!ENOMEM);
+ goto finally;
+ }
+ bool has_optout = false;
+ /* ^^ DNSSEC_OPTOUT is not fired in cases like `com. A`,
+ * but currently we don't stash separate NSEC3 proving that. */
+ for (int psec = KNOT_ANSWER; psec <= KNOT_ADDITIONAL; ++psec) {
+ ranked_rr_array_t *arr = selected[psec];
+ /* uncached entries are located at the end */
+ for (ssize_t i = arr->len - 1; i >= 0; --i) {
+ ranked_rr_array_entry_t *entry = arr->at[i];
+ if (entry->qry_uid != qry->uid) {
+ continue;
+ /* TODO: probably safe to break but maybe not worth it */
+ }
+ int ret = stash_rrarray_entry(arr, i, qry, cache, &unauth_cnt,
+ nsec_pmap, &has_optout);
+ if (ret) {
+ VERBOSE_MSG(qry, "=> stashing RRs errored out\n");
+ goto finally;
+ }
+ /* LATER(optim.): maybe filter out some type-rank combinations
+ * that won't be useful as separate RRsets. */
+ }
+ }
+
+ trie_it_t *it;
+ for (it = trie_it_begin(nsec_pmap); !trie_it_finished(it); trie_it_next(it)) {
+ stash_nsec_p((const knot_dname_t *)trie_it_key(it, NULL),
+ (const char *)*trie_it_val(it), req);
+ }
+ trie_it_free(it);
+ /* LATER(optim.): typically we also have corresponding NS record in the list,
+ * so we might save a cache operation. */
+
+ stash_pkt(pkt, qry, req, has_optout);
+
+finally:
+ if (unauth_cnt) {
+ VERBOSE_MSG(qry, "=> stashed also %d nonauth RRsets\n", unauth_cnt);
+	}
+ kr_cache_sync(cache);
+ return ctx->state; /* we ignore cache-stashing errors */
+}
+
+/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */
+static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/)
+{
+ if (!rr || rr->rclass != KNOT_CLASS_IN) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ if (!check_rrtype(rr->type, qry)) {
+ return kr_ok();
+ }
+ if (!check_dname_for_lf(rr->owner, qry)) {
+ return kr_ok();
+ }
+ return 1/*proceed*/;
+}
+
+static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp,
+ uint8_t rank, trie_t *nsec_pmap, bool *has_optout)
+{
+ assert(stash_rrset_precond(rr, qry) > 0);
+ if (!cache) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ const int wild_labels = rr_sigs == NULL ? 0 :
+ knot_dname_labels(rr->owner, NULL) - knot_rrsig_labels(rr_sigs->rrs.rdata);
+ if (wild_labels < 0) {
+ return kr_ok();
+ }
+ const knot_dname_t *encloser = rr->owner; /**< the closest encloser name */
+ for (int i = 0; i < wild_labels; ++i) {
+ encloser = knot_wire_next_label(encloser, NULL);
+ }
+ int ret = 0;
+
+ /* Construct the key under which RRs will be stored,
+ * and add corresponding nsec_pmap item (if necessary). */
+ struct key k_storage, *k = &k_storage;
+ knot_db_val_t key;
+ switch (rr->type) {
+ case KNOT_RRTYPE_NSEC3:
+ /* Skip "suspicious" or opt-out NSEC3 sets. */
+ if (rr->rrs.count != 1) return kr_ok();
+ if (KNOT_NSEC3_FLAG_OPT_OUT & knot_nsec3_flags(rr->rrs.rdata)) {
+ if (has_optout) *has_optout = true;
+ return kr_ok();
+ }
+ /* fall through */
+ case KNOT_RRTYPE_NSEC:
+ if (!kr_rank_test(rank, KR_RANK_SECURE)) {
+ /* Skip any NSEC*s that aren't validated. */
+ return kr_ok();
+ }
+ if (!rr_sigs || !rr_sigs->rrs.count || !rr_sigs->rrs.rdata) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ const knot_dname_t *signer = knot_rrsig_signer_name(rr_sigs->rrs.rdata);
+ const int signer_size = knot_dname_size(signer);
+ k->zlf_len = signer_size - 1;
+
+ void **npp = nsec_pmap == NULL ? NULL
+ : trie_get_ins(nsec_pmap, (const char *)signer, signer_size);
+ assert(!nsec_pmap || (npp && ENOMEM));
+ if (rr->type == KNOT_RRTYPE_NSEC) {
+ key = key_NSEC1(k, encloser, wild_labels);
+ break;
+ }
+
+ assert(rr->type == KNOT_RRTYPE_NSEC3);
+ const knot_rdata_t * const rdata = rr->rrs.rdata;
+ if (rdata->len <= 4) return kr_error(EILSEQ); /*< data from outside; less trust */
+ const int np_dlen = nsec_p_rdlen(rdata->data);
+ if (np_dlen > rdata->len) return kr_error(EILSEQ);
+ key = key_NSEC3(k, encloser, nsec_p_mkHash(rdata->data));
+ if (npp && !*npp) {
+ *npp = mm_alloc(&qry->request->pool, np_dlen);
+ if (!*npp) {
+ assert(!ENOMEM);
+ break;
+ }
+ memcpy(*npp, rdata->data, np_dlen);
+ }
+ break;
+ default:
+ ret = kr_dname_lf(k->buf, encloser, wild_labels);
+ if (ret) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ key = key_exact_type(k, rr->type);
+ }
+
+ /* Compute materialized sizes of the new data. */
+ const knot_rdataset_t *rds_sigs = rr_sigs ? &rr_sigs->rrs : NULL;
+ const int rr_ssize = rdataset_dematerialize_size(&rr->rrs);
+ assert(rr_ssize == to_even(rr_ssize));
+ knot_db_val_t val_new_entry = {
+ .data = NULL,
+ .len = offsetof(struct entry_h, data) + rr_ssize
+ + rdataset_dematerialize_size(rds_sigs),
+ };
+
+ /* Prepare raw memory for the new entry. */
+ ret = entry_h_splice(&val_new_entry, rank, key, k->type, rr->type,
+ rr->owner, qry, cache, timestamp);
+ if (ret) return kr_ok(); /* some aren't really errors */
+ assert(val_new_entry.data);
+
+ const uint32_t ttl = rr->ttl;
+ /* FIXME: consider TTLs and expirations of RRSIGs as well, just in case. */
+
+ /* Write the entry itself. */
+ struct entry_h *eh = val_new_entry.data;
+ memset(eh, 0, offsetof(struct entry_h, data));
+ eh->time = timestamp;
+ eh->ttl = MAX(MIN(ttl, cache->ttl_max), cache->ttl_min);
+ eh->rank = rank;
+ if (rdataset_dematerialize(&rr->rrs, eh->data)
+ || rdataset_dematerialize(rds_sigs, eh->data + rr_ssize)) {
+ /* minimize the damage from incomplete write; TODO: better */
+ eh->time = 0;
+ eh->ttl = 0;
+ eh->rank = 0;
+ assert(false);
+ }
+ assert(entry_h_consistent(val_new_entry, rr->type));
+
+ #if 0 /* Occasionally useful when debugging some kinds of changes. */
+ {
+ kr_cache_sync(cache);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (ret != kr_error(ENOENT)) { // ENOENT might happen in some edge case, I guess
+ assert(!ret);
+ entry_list_t el;
+ entry_list_parse(val, el);
+ }
+ }
+ #endif
+
+ /* Update metrics */
+ cache->stats.insert += 1;
+
+ /* Verbose-log some not-too-common cases. */
+ WITH_VERBOSE(qry) { if (kr_rank_test(rank, KR_RANK_AUTH)
+ || rr->type == KNOT_RRTYPE_NS) {
+ auto_free char *type_str = kr_rrtype_text(rr->type),
+ *encl_str = kr_dname_text(encloser);
+ VERBOSE_MSG(qry, "=> stashed %s%s %s, rank 0%.2o, "
+ "%d B total, incl. %d RRSIGs\n",
+ (wild_labels ? "*." : ""), encl_str, type_str, rank,
+ (int)val_new_entry.len, (rr_sigs ? rr_sigs->rrs.count : 0)
+ );
+ } }
+
+ return (ssize_t) val_new_entry.len;
+}
+
+static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i,
+ const struct kr_query *qry, struct kr_cache *cache,
+ int *unauth_cnt, trie_t *nsec_pmap, bool *has_optout)
+{
+ ranked_rr_array_entry_t *entry = arr->at[arr_i];
+ if (entry->cached) {
+ return kr_ok();
+ }
+ const knot_rrset_t *rr = entry->rr;
+ if (rr->type == KNOT_RRTYPE_RRSIG) {
+ return kr_ok(); /* reduce verbose logging from the following call */
+ }
+ int ret = stash_rrset_precond(rr, qry);
+ if (ret <= 0) {
+ return ret;
+ }
+
+ /* Try to find corresponding signatures, always. LATER(optim.): speed. */
+ ranked_rr_array_entry_t *entry_rrsigs = NULL;
+ const knot_rrset_t *rr_sigs = NULL;
+ for (ssize_t j = arr->len - 1; j >= 0; --j) {
+ /* TODO: ATM we assume that some properties are the same
+ * for all RRSIGs in the set (esp. label count). */
+ ranked_rr_array_entry_t *e = arr->at[j];
+ bool ok = e->qry_uid == qry->uid && !e->cached
+ && e->rr->type == KNOT_RRTYPE_RRSIG
+ && knot_rrsig_type_covered(e->rr->rrs.rdata) == rr->type
+ && knot_dname_is_equal(rr->owner, e->rr->owner);
+ if (!ok) continue;
+ entry_rrsigs = e;
+ rr_sigs = e->rr;
+ break;
+ }
+
+ ssize_t written = stash_rrset(cache, qry, rr, rr_sigs, qry->timestamp.tv_sec,
+ entry->rank, nsec_pmap, has_optout);
+ if (written < 0) {
+ kr_log_error("[%05u.%02u][cach] stash failed, ret = %d\n", qry->request->uid,
+			     qry->uid, (int)written);
+ return (int) written;
+ }
+
+ if (written > 0) {
+ /* Mark entry as cached for the rest of the query processing */
+ entry->cached = true;
+ if (entry_rrsigs) {
+ entry_rrsigs->cached = true;
+ }
+ if (!kr_rank_test(entry->rank, KR_RANK_AUTH) && rr->type != KNOT_RRTYPE_NS) {
+ *unauth_cnt += 1;
+ }
+ }
+
+ return kr_ok();
+}
+
+static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v,
+ struct kr_request *req)
+{
+ const struct kr_query *qry = req->current_query;
+ struct kr_cache *cache = &req->ctx->cache;
+ uint32_t valid_until = qry->timestamp.tv_sec + cache->ttl_max;
+ /* LATER(optim.): be more precise here ^^ and reduce calls. */
+ static const int32_t ttl_margin = 3600;
+ const uint8_t *nsec_p = (const uint8_t *)nsec_p_v;
+ int data_stride = sizeof(valid_until) + nsec_p_rdlen(nsec_p);
+
+ unsigned int log_hash = 0xFeeeFeee; /* this type is simpler for printf args */
+ auto_free char *log_dname = NULL;
+ WITH_VERBOSE(qry) {
+ log_hash = nsec_p_v ? nsec_p_mkHash((const uint8_t *)nsec_p_v) : 0;
+ log_dname = kr_dname_text(dname);
+ }
+ /* Find what's in the cache. */
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, dname, false);
+ if (ret) return kr_error(ret);
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_NS);
+ knot_db_val_t val_orig = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val_orig, 1);
+ if (ret && ret != -ABS(ENOENT)) {
+ VERBOSE_MSG(qry, "=> EL read failed (ret: %d)\n", ret);
+ return kr_ok();
+ }
+ /* Prepare new entry_list_t so we can just write at el[0]. */
+ entry_list_t el;
+ int log_refresh_by = 0;
+ if (ret == -ABS(ENOENT)) {
+ memset(el, 0, sizeof(el));
+ } else {
+ ret = entry_list_parse(val_orig, el);
+ if (ret) {
+ VERBOSE_MSG(qry, "=> EL parse failed (ret: %d)\n", ret);
+ return kr_error(0);
+ }
+ /* Find the index to replace. */
+ int i_replace = ENTRY_APEX_NSECS_CNT - 1;
+ for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
+ if (el[i].len != data_stride) continue;
+ if (nsec_p && memcmp(nsec_p, (uint8_t *)el[i].data + sizeof(uint32_t),
+ data_stride - sizeof(uint32_t)) != 0) {
+ continue;
+ }
+ /* Save a cache operation if TTL extended only a little. */
+ uint32_t valid_orig;
+ memcpy(&valid_orig, el[i].data, sizeof(valid_orig));
+ const int32_t ttl_extended_by = valid_until - valid_orig;
+ if (ttl_extended_by < ttl_margin) {
+ VERBOSE_MSG(qry,
+ "=> nsec_p stash for %s skipped (extra TTL: %d, hash: %x)\n",
+ log_dname, ttl_extended_by, log_hash);
+ return kr_ok();
+ }
+ i_replace = i;
+ log_refresh_by = ttl_extended_by;
+ break;
+ }
+ /* Shift the other indices: move the first `i_replace` blocks
+ * by one position. */
+ if (i_replace) {
+ memmove(&el[1], &el[0], sizeof(el[0]) * i_replace);
+ }
+ }
+ /* Prepare old data into a buffer. See entry_h_splice() for why. LATER(optim.) */
+ el[0].len = data_stride;
+ el[0].data = NULL;
+ knot_db_val_t val;
+	val.len = entry_list_serial_size(el);
+	val.data = mm_alloc(&req->pool, val.len);
+	entry_list_memcpy(val.data, el);
+ /* Prepare the new data chunk */
+ memcpy(el[0].data, &valid_until, sizeof(valid_until));
+ if (nsec_p) {
+ memcpy((uint8_t *)el[0].data + sizeof(valid_until), nsec_p,
+ data_stride - sizeof(valid_until));
+ }
+ /* Write it all to the cache */
+ ret = cache_op(cache, write, &key, &val, 1);
+ if (ret || !val.data) {
+ VERBOSE_MSG(qry, "=> EL write failed (ret: %d)\n", ret);
+ return kr_ok();
+ }
+ if (log_refresh_by) {
+ VERBOSE_MSG(qry, "=> nsec_p stashed for %s (refresh by %d, hash: %x)\n",
+ log_dname, log_refresh_by, log_hash);
+ } else {
+ VERBOSE_MSG(qry, "=> nsec_p stashed for %s (new, hash: %x)\n",
+ log_dname, log_hash);
+ }
+ return kr_ok();
+}
+
+
+static int peek_exact_real(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
+ struct kr_cache_p *peek)
+{
+ if (!check_rrtype(type, NULL) || !check_dname_for_lf(name, NULL)) {
+ return kr_error(ENOTSUP);
+ }
+ struct key k_storage, *k = &k_storage;
+
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret) return kr_error(ret);
+
+ knot_db_val_t key = key_exact_type(k, type);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (!ret) ret = entry_h_seek(&val, type);
+ if (ret) return kr_error(ret);
+
+ const struct entry_h *eh = entry_h_consistent(val, type);
+ if (!eh || eh->is_packet) {
+ // TODO: no packets, but better get rid of whole kr_cache_peek_exact().
+ return kr_error(ENOENT);
+ }
+ *peek = (struct kr_cache_p){
+ .time = eh->time,
+ .ttl = eh->ttl,
+ .rank = eh->rank,
+ .raw_data = val.data,
+ .raw_bound = knot_db_val_bound(val),
+ };
+ return kr_ok();
+}
+int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
+ struct kr_cache_p *peek)
+{ /* Just wrap with extra verbose logging. */
+ const int ret = peek_exact_real(cache, name, type, peek);
+ if (false && VERBOSE_STATUS) { /* too noisy for usual --verbose */
+ auto_free char *type_str = kr_rrtype_text(type),
+ *name_str = kr_dname_text(name);
+ const char *result_str = (ret == kr_ok() ? "hit" :
+ (ret == kr_error(ENOENT) ? "miss" : "error"));
+ VERBOSE_MSG(NULL, "_peek_exact: %s %s %s (ret: %d)",
+ type_str, name_str, result_str, ret);
+ }
+ return ret;
+}
+
+int kr_cache_remove(struct kr_cache *cache, const knot_dname_t *name, uint16_t type)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ if (!cache->api->remove) {
+ return kr_error(ENOSYS);
+ }
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret) return kr_error(ret);
+
+ knot_db_val_t key = key_exact_type(k, type);
+ return cache_op(cache, remove, &key, 1);
+}
+
+int kr_cache_match(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, knot_db_val_t keyval[][2], int maxcount)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+ if (!cache->api->match) {
+ return kr_error(ENOSYS);
+ }
+
+ struct key k_storage, *k = &k_storage;
+
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret) return kr_error(ret);
+
+ // use a mock type
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_A);
+ /* CACHE_KEY_DEF */
+ key.len -= sizeof(uint16_t); /* the type */
+ if (!exact_name) {
+ key.len -= 2; /* '\0' 'E' */
+ if (name[0] == '\0') ++key.len; /* the root name is special ATM */
+ }
+ return cache_op(cache, match, &key, keyval, maxcount);
+}
+
+int kr_unpack_cache_key(knot_db_val_t key, knot_dname_t *buf, uint16_t *type)
+{
+ if (key.data == NULL || buf == NULL || type == NULL) {
+ return kr_error(EINVAL);
+ }
+
+ int len = -1;
+ const char *tag, *key_data = key.data;
+ for (tag = key_data + 1; tag < key_data + key.len; ++tag) {
+ /* CACHE_KEY_DEF */
+ if (tag[-1] == '\0' && (tag == key_data + 1 || tag[-2] == '\0')) {
+ if (tag[0] != 'E') return kr_error(EINVAL);
+ len = tag - 1 - key_data;
+ break;
+ }
+ }
+
+ if (len == -1 || len > KNOT_DNAME_MAXLEN) {
+ return kr_error(EINVAL);
+ }
+
+ int ret = knot_dname_lf2wire(buf, len, key.data);
+ if (ret < 0) {
+ return kr_error(ret);
+ }
+
+ /* CACHE_KEY_DEF: jump over "\0 E/1" */
+ memcpy(type, tag + 1, sizeof(uint16_t));
+
+ return kr_ok();
+}
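A minimal caller-side sketch combining kr_cache_match() with kr_unpack_cache_key(); the function name and the fixed array size are illustrative, only the API calls are from this file:

	static void list_subtree_sketch(struct kr_cache *cache, const knot_dname_t *name)
	{
		knot_db_val_t keyval[16][2];
		int count = kr_cache_match(cache, name, /*exact_name*/ false, keyval, 16);
		for (int i = 0; i < count; ++i) {
			knot_dname_t dname[KNOT_DNAME_MAXLEN];
			uint16_t type;
			if (kr_unpack_cache_key(keyval[i][0], dname, &type) == kr_ok()) {
				/* dname + type now identify one cached entry */
			}
		}
		kr_cache_sync(cache); /* release the read transaction taken by the match */
	}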
+
+
+int kr_cache_remove_subtree(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, int maxcount)
+{
+ if (!cache_isvalid(cache)) {
+ return kr_error(EINVAL);
+ }
+
+ knot_db_val_t keyval[maxcount][2], keys[maxcount];
+ int ret = kr_cache_match(cache, name, exact_name, keyval, maxcount);
+ if (ret <= 0) { /* ENOENT -> nothing to remove */
+ return (ret == KNOT_ENOENT) ? 0 : ret;
+ }
+ const int count = ret;
+ /* Duplicate the key strings, as deletion may invalidate the pointers. */
+ int i;
+ for (i = 0; i < count; ++i) {
+ keys[i].len = keyval[i][0].len;
+ keys[i].data = malloc(keys[i].len);
+ if (!keys[i].data) {
+ ret = kr_error(ENOMEM);
+ goto cleanup;
+ }
+ memcpy(keys[i].data, keyval[i][0].data, keys[i].len);
+ }
+ ret = cache->api->remove(cache->db, keys, count);
+cleanup:
+ kr_cache_sync(cache); /* Sync even after just kr_cache_match(). */
+ /* Free keys */
+ while (--i >= 0) {
+ free(keys[i].data);
+ }
+ return ret;
+}
+
diff --git a/lib/cache/api.h b/lib/cache/api.h
new file mode 100644
index 0000000..61bf796
--- /dev/null
+++ b/lib/cache/api.h
@@ -0,0 +1,201 @@
+/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <libknot/consts.h>
+#include <libknot/rrset.h>
+#include <sys/time.h>
+#include "lib/cache/cdb_api.h"
+#include "lib/defines.h"
+#include "contrib/ucw/config.h" /*uint*/
+
+/** When knot_pkt is passed from cache without ->wire, this is the ->size. */
+static const size_t PKT_SIZE_NOWIRE = -1;
+
+
+#include "lib/module.h"
+/* Prototypes for the 'cache' module implementation. */
+int cache_peek(kr_layer_t *ctx, knot_pkt_t *pkt);
+int cache_stash(kr_layer_t *ctx, knot_pkt_t *pkt);
+
+
+/**
+ * Cache structure, keeps API, instance and metadata.
+ */
+struct kr_cache
+{
+ knot_db_t *db; /**< Storage instance */
+ const struct kr_cdb_api *api; /**< Storage engine */
+ struct {
+ uint32_t hit; /**< Number of cache hits */
+ uint32_t miss; /**< Number of cache misses */
+ uint32_t insert; /**< Number of insertions */
+ uint32_t delete; /**< Number of deletions */
+ } stats;
+
+ uint32_t ttl_min, ttl_max; /**< TTL limits */
+
+ /* A pair of stamps for detection of real-time shifts during runtime. */
+ struct timeval checkpoint_walltime; /**< Wall time on the last check-point. */
+ uint64_t checkpoint_monotime; /**< Monotonic milliseconds on the last check-point. */
+};
+
+/**
+ * Open/create cache with provided storage options.
+ * @param cache cache structure to be initialized
+ * @param api storage engine API
+ * @param opts storage-specific options (may be NULL for default)
+ * @param mm memory context.
+ * @return 0 or an error code
+ */
+KR_EXPORT
+int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm);
+
+/**
+ * Path to cache file to remove on critical out-of-space error. (do NOT modify it)
+ */
+KR_EXPORT extern
+const char *kr_cache_emergency_file_to_remove;
+
+/**
+ * Close persistent cache.
+ * @note This doesn't clear the data, just closes the connection to the database.
+ * @param cache structure
+ */
+KR_EXPORT
+void kr_cache_close(struct kr_cache *cache);
+
+/** Run after a row of operations to release transaction/lock if needed. */
+KR_EXPORT
+int kr_cache_sync(struct kr_cache *cache);
+
+/**
+ * Return true if cache is open and enabled.
+ */
+static inline bool kr_cache_is_open(struct kr_cache *cache)
+{
+ return cache->db != NULL;
+}
+
+/** (Re)set the time pair to the current values. */
+static inline void kr_cache_make_checkpoint(struct kr_cache *cache)
+{
+ cache->checkpoint_monotime = kr_now();
+ gettimeofday(&cache->checkpoint_walltime, NULL);
+}
+
+/**
+ * Insert RRSet into cache, replacing any existing data.
+ * @param cache cache structure
+ * @param rr inserted RRSet
+ * @param rrsig RRSIG for inserted RRSet (optional)
+ * @param rank rank of the data
+ * @param timestamp current time
+ * @return 0 or an errcode
+ */
+KR_EXPORT
+int kr_cache_insert_rr(struct kr_cache *cache, const knot_rrset_t *rr, const knot_rrset_t *rrsig, uint8_t rank, uint32_t timestamp);
+
+/**
+ * Clear all items from the cache.
+ * @param cache cache structure
+ * @return 0 or an errcode
+ */
+KR_EXPORT
+int kr_cache_clear(struct kr_cache *cache);
+
+
+/* ** This interface is temporary. ** */
+
+struct kr_cache_p {
+ uint32_t time; /**< The time of inception. */
+ uint32_t ttl; /**< TTL at inception moment. Assuming it fits into int32_t ATM. */
+ uint8_t rank; /**< See enum kr_rank */
+ struct {
+ /* internal: pointer to eh struct */
+ void *raw_data, *raw_bound;
+ };
+};
+KR_EXPORT
+int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
+ struct kr_cache_p *peek);
+/* Parameters (qry, name, type) are used for timestamp and stale-serving decisions. */
+KR_EXPORT
+int32_t kr_cache_ttl(const struct kr_cache_p *peek, const struct kr_query *qry,
+ const knot_dname_t *name, uint16_t type);
+
+KR_EXPORT
+int kr_cache_materialize(knot_rdataset_t *dst, const struct kr_cache_p *ref,
+ knot_mm_t *pool);
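A minimal sketch of how the peek interface fits together (assumed caller code; the helper name and the ESTALE choice are illustrative):

	static int peek_and_materialize_sketch(struct kr_cache *cache, const struct kr_query *qry,
					       const knot_dname_t *name, uint16_t type,
					       knot_rdataset_t *out, knot_mm_t *pool)
	{
		struct kr_cache_p peek;
		int ret = kr_cache_peek_exact(cache, name, type, &peek);
		if (ret) return ret;	/* typically kr_error(ENOENT) on a cache miss */
		if (kr_cache_ttl(&peek, qry, name, type) < 0)
			return kr_error(ESTALE);	/* expired, unless stale-serving applied */
		return kr_cache_materialize(out, &peek, pool);
	}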
+
+
+/**
+ * Remove an entry from cache.
+ * @param cache cache structure
+ * @param name dname
+ * @param type rr type
+ * @return number of deleted records, or negative error code
+ * @note only "exact hits" are considered ATM, and
+ * some other information may be removed alongside.
+ */
+KR_EXPORT
+int kr_cache_remove(struct kr_cache *cache, const knot_dname_t *name, uint16_t type);
+
+/**
+ * Get keys matching a dname lf prefix
+ * @param cache cache structure
+ * @param name dname
+ * @param exact_name whether to only consider exact name matches
+ * @param keyval matched key-value pairs
+ * @param maxcount limit on the number of returned key-value pairs
+ * @return result count or an errcode
+ * @note the cache keys are matched by prefix, i.e. it very much depends
+ * on their structure; CACHE_KEY_DEF.
+ */
+KR_EXPORT
+int kr_cache_match(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, knot_db_val_t keyval[][2], int maxcount);
+
+/**
+ * Remove a subtree in cache. It works like kr_cache_match() but removes the matched entries instead of returning them.
+ * @return number of deleted entries or an errcode
+ */
+KR_EXPORT
+int kr_cache_remove_subtree(struct kr_cache *cache, const knot_dname_t *name,
+ bool exact_name, int maxcount);
+
+/**
+ * Find the closest cached zone apex for a name (in cache).
+ * @param is_DS start searching one name higher
+ * @return the number of labels to remove from the name, or negative error code
+ * @note timestamp is found by a syscall, and stale-serving is not considered
+ */
+KR_EXPORT
+int kr_cache_closest_apex(struct kr_cache *cache, const knot_dname_t *name, bool is_DS,
+ knot_dname_t **apex);
+
+/**
+ * Unpack dname and type from db key
+ * @param key db key representation
+ * @param buf output buffer of domain name in dname format
+ * @param type output for type
+ * @return length of dname or an errcode
+ * @note only "exact hits" are considered ATM, moreover xNAME records
+ * are "hidden" as NS. (see comments in struct entry_h)
+ */
+KR_EXPORT
+int kr_unpack_cache_key(knot_db_val_t key, knot_dname_t *buf, uint16_t *type);
diff --git a/lib/cache/cdb_api.h b/lib/cache/cdb_api.h
new file mode 100644
index 0000000..814a5f5
--- /dev/null
+++ b/lib/cache/cdb_api.h
@@ -0,0 +1,68 @@
+/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <libknot/db/db.h>
+
+/* Cache options. */
+struct kr_cdb_opts {
+ const char *path; /*!< Cache URI path. */
+ size_t maxsize; /*!< Suggested cache size in bytes. */
+};
+
+/*! Cache database API.
+ * This is a simplified version of generic DB API from libknot,
+ * that is tailored to caching purposes.
+ */
+struct kr_cdb_api {
+ const char *name;
+
+ /* Context operations */
+
+ int (*open)(knot_db_t **db, struct kr_cdb_opts *opts, knot_mm_t *mm);
+ void (*close)(knot_db_t *db);
+ int (*count)(knot_db_t *db);
+ int (*clear)(knot_db_t *db);
+
+ /** Run after a row of operations to release transaction/lock if needed. */
+ int (*sync)(knot_db_t *db);
+
+ /* Data access */
+
+ int (*read)(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount);
+ int (*write)(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount);
+
+ /** Remove maxcount keys.
+	 * \returns the number of successfully removed keys or the first error code
+ * It returns on first error, but ENOENT is not considered an error. */
+ int (*remove)(knot_db_t *db, knot_db_val_t keys[], int maxcount);
+
+ /* Specialised operations */
+
+ /** Find key-value pairs that are prefixed by the given key, limited by maxcount.
+ * \return the number of pairs or negative error. */
+ int (*match)(knot_db_t *db, knot_db_val_t *key, knot_db_val_t keyval[][2], int maxcount);
+ /** Not implemented ATM. */
+ int (*prune)(knot_db_t *db, int maxcount);
+
+ /** Less-or-equal search (lexicographic ordering).
+ * On successful return, key->data and val->data point to DB-owned data.
+ * return: 0 for equality, > 0 for less, < 0 kr_error */
+ int (*read_leq)(knot_db_t *db, knot_db_val_t *key, knot_db_val_t *val);
+};
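A self-contained usage sketch of this API with the LMDB backend from cdb_lmdb.h below; the path, sizes and value contents are illustrative and error handling is mostly omitted:

	#include "lib/cache/cdb_api.h"
	#include "lib/cache/cdb_lmdb.h"

	static void cdb_roundtrip_sketch(void)
	{
		const struct kr_cdb_api *api = kr_cdb_lmdb();
		struct kr_cdb_opts opts = { .path = "/tmp/kr_cache", .maxsize = 10 * 1024 * 1024 };
		knot_db_t *db = NULL;
		if (api->open(&db, &opts, NULL) != 0)
			return;

		knot_db_val_t key = { .data = "K", .len = 1 };
		knot_db_val_t val = { .data = "V", .len = 1 };
		api->write(db, &key, &val, 1);	/* runs inside the cached RW transaction */
		api->sync(db);			/* commits it */

		knot_db_val_t out = { NULL, 0 };
		api->read(db, &key, &out, 1);	/* out.data then points into the DB mapping */
		api->sync(db);			/* releases the RO transaction */

		api->close(db);
	}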
diff --git a/lib/cache/cdb_lmdb.c b/lib/cache/cdb_lmdb.c
new file mode 100644
index 0000000..621d2e8
--- /dev/null
+++ b/lib/cache/cdb_lmdb.c
@@ -0,0 +1,710 @@
+/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <lmdb.h>
+
+#include "contrib/cleanup.h"
+#include "lib/cache/cdb_lmdb.h"
+#include "lib/cache/api.h"
+#include "lib/utils.h"
+
+
+/* Defines */
+#define LMDB_DIR_MODE 0770
+#define LMDB_FILE_MODE 0660
+
+struct lmdb_env
+{
+ size_t mapsize;
+ MDB_dbi dbi;
+ MDB_env *env;
+
+ /** Cached transactions
+ *
+ * - only one of (ro,rw) may be active at once
+ * - non-NULL .ro may be active or reset
+ * - non-NULL .rw is always active
+ */
+ struct {
+ bool ro_active, ro_curs_active;
+ MDB_txn *ro, *rw;
+ MDB_cursor *ro_curs;
+ } txn;
+};
+
+/** @brief Convert LMDB error code. */
+static int lmdb_error(int error)
+{
+ /* _BAD_TXN may happen with overfull DB,
+ * even during mdb_get with a single fork :-/ */
+ if (error == MDB_BAD_TXN) {
+ kr_log_info("[cache] MDB_BAD_TXN, probably overfull\n");
+ error = ENOSPC;
+ }
+ switch (error) {
+ case MDB_SUCCESS:
+ return kr_ok();
+ case MDB_NOTFOUND:
+ return kr_error(ENOENT);
+ case ENOSPC:
+ case MDB_MAP_FULL:
+ case MDB_TXN_FULL:
+ return kr_error(ENOSPC);
+ default:
+ kr_log_error("[cache] LMDB error: %s\n", mdb_strerror(error));
+ return kr_error(error);
+ }
+}
+
+/** Conversion between knot and lmdb structs for values. */
+static inline knot_db_val_t val_mdb2knot(MDB_val v)
+{
+ return (knot_db_val_t){ .len = v.mv_size, .data = v.mv_data };
+}
+static inline MDB_val val_knot2mdb(knot_db_val_t v)
+{
+ return (MDB_val){ .mv_size = v.len, .mv_data = v.data };
+}
+
+
+/*! \brief Set the environment map size.
+ * \note This also sets the maximum database size, see \fn mdb_env_set_mapsize
+ */
+static int set_mapsize(MDB_env *env, size_t map_size)
+{
+ long page_size = sysconf(_SC_PAGESIZE);
+ if (page_size <= 0) {
+ return KNOT_ERROR;
+ }
+
+ /* Round to page size. */
+ map_size = (map_size / page_size) * page_size;
+ int ret = mdb_env_set_mapsize(env, map_size);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ return 0;
+}
+
+#define FLAG_RENEW (2*MDB_RDONLY)
+/** mdb_txn_begin or _renew + handle MDB_MAP_RESIZED.
+ *
+ * The retrying logic for MDB_MAP_RESIZED is so ugly that it has its own function.
+ * \note this assumes no transactions are active
+ * \return MDB_ errcode, not usual kr_error(...)
+ */
+static int txn_get_noresize(struct lmdb_env *env, unsigned int flag, MDB_txn **txn)
+{
+ assert(!env->txn.rw && (!env->txn.ro || !env->txn.ro_active));
+ int ret;
+ if (flag == FLAG_RENEW) {
+ ret = mdb_txn_renew(*txn);
+ } else {
+ ret = mdb_txn_begin(env->env, NULL, flag, txn);
+ }
+ if (ret != MDB_MAP_RESIZED) {
+ return ret;
+ }
+ //:unlikely
+ /* Another process increased the size; let's try to recover. */
+ kr_log_info("[cache] detected size increased by another process\n");
+ ret = mdb_env_set_mapsize(env->env, 0);
+ if (ret != MDB_SUCCESS) {
+ return ret;
+ }
+ if (flag == FLAG_RENEW) {
+ ret = mdb_txn_renew(*txn);
+ } else {
+ ret = mdb_txn_begin(env->env, NULL, flag, txn);
+ }
+ return ret;
+}
+
+/** Obtain a transaction. (they're cached in env->txn) */
+static int txn_get(struct lmdb_env *env, MDB_txn **txn, bool rdonly)
+{
+ assert(env && txn);
+ if (env->txn.rw) {
+ /* Reuse the *open* RW txn even if only reading is requested.
+ * We leave the management of this to the cdb_sync command.
+ * The user may e.g. want to do some reads between the writes. */
+ *txn = env->txn.rw;
+ return kr_ok();
+ }
+
+ if (!rdonly) {
+ /* avoid two active transactions */
+ if (env->txn.ro && env->txn.ro_active) {
+ mdb_txn_reset(env->txn.ro);
+ env->txn.ro_active = false;
+ env->txn.ro_curs_active = false;
+ }
+ int ret = txn_get_noresize(env, 0/*RW*/, &env->txn.rw);
+ if (ret == MDB_SUCCESS) {
+ *txn = env->txn.rw;
+ assert(*txn);
+ }
+ return lmdb_error(ret);
+ }
+
+ /* Get an active RO txn and return it. */
+ int ret = MDB_SUCCESS;
+ if (!env->txn.ro) { //:unlikely
+ ret = txn_get_noresize(env, MDB_RDONLY, &env->txn.ro);
+ } else if (!env->txn.ro_active) {
+ ret = txn_get_noresize(env, FLAG_RENEW, &env->txn.ro);
+ }
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+ env->txn.ro_active = true;
+ *txn = env->txn.ro;
+ assert(*txn);
+ return kr_ok();
+}
+
+static int cdb_sync(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ int ret = kr_ok();
+ if (env->txn.rw) {
+ ret = lmdb_error(mdb_txn_commit(env->txn.rw));
+ env->txn.rw = NULL; /* the transaction got freed even in case of errors */
+ } else if (env->txn.ro && env->txn.ro_active) {
+ mdb_txn_reset(env->txn.ro);
+ env->txn.ro_active = false;
+ env->txn.ro_curs_active = false;
+ }
+ return ret;
+}
+
+/** Obtain a read-only cursor (and a read-only transaction). */
+static int txn_curs_get(struct lmdb_env *env, MDB_cursor **curs)
+{
+ assert(env && curs);
+ if (env->txn.ro_curs_active) {
+ goto success;
+ }
+ /* Only in a read-only txn; TODO: it's a bit messy/coupled */
+ if (env->txn.rw) {
+ int ret = cdb_sync(env);
+ if (ret) return ret;
+ }
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret) return ret;
+
+ if (env->txn.ro_curs) {
+ ret = mdb_cursor_renew(txn, env->txn.ro_curs);
+ } else {
+ ret = mdb_cursor_open(txn, env->dbi, &env->txn.ro_curs);
+ }
+	if (ret) return lmdb_error(ret);
+ env->txn.ro_curs_active = true;
+success:
+ assert(env->txn.ro_curs_active && env->txn.ro && env->txn.ro_active
+ && !env->txn.rw);
+ *curs = env->txn.ro_curs;
+ assert(*curs);
+ return kr_ok();
+}
+
+static void free_txn_ro(struct lmdb_env *env)
+{
+ if (env->txn.ro) {
+ mdb_txn_abort(env->txn.ro);
+ env->txn.ro = NULL;
+ }
+ if (env->txn.ro_curs) {
+ mdb_cursor_close(env->txn.ro_curs);
+ env->txn.ro_curs = NULL;
+ }
+}
+
+/*! \brief Close the database. */
+static void cdb_close_env(struct lmdb_env *env)
+{
+ assert(env && env->env);
+
+ /* Get rid of any transactions. */
+ cdb_sync(env);
+ free_txn_ro(env);
+
+ mdb_env_sync(env->env, 1);
+ mdb_dbi_close(env->env, env->dbi);
+ mdb_env_close(env->env);
+ memset(env, 0, sizeof(*env));
+}
+
+/*! \brief Open database environment. */
+static int cdb_open_env(struct lmdb_env *env, unsigned flags, const char *path, size_t mapsize)
+{
+ int ret = mkdir(path, LMDB_DIR_MODE);
+ if (ret == -1 && errno != EEXIST) {
+ return kr_error(errno);
+ }
+
+ MDB_env *mdb_env = NULL;
+ ret = mdb_env_create(&mdb_env);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ ret = set_mapsize(mdb_env, mapsize);
+ if (ret != 0) {
+ mdb_env_close(mdb_env);
+ return ret;
+ }
+
+ ret = mdb_env_open(mdb_env, path, flags, LMDB_FILE_MODE);
+ if (ret != MDB_SUCCESS) {
+ mdb_env_close(mdb_env);
+ return lmdb_error(ret);
+ }
+
+ /* Keep the environment pointer. */
+ env->env = mdb_env;
+ env->mapsize = mapsize;
+ return 0;
+}
+
+static int cdb_open(struct lmdb_env *env, const char *path, size_t mapsize)
+{
+	/* Cache doesn't require durability, so we can be
+	 * loose with the requirements as a tradeoff for speed. */
+ const unsigned flags = MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOTLS;
+ int ret = cdb_open_env(env, flags, path, mapsize);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* Open the database. */
+ MDB_txn *txn = NULL;
+ ret = mdb_txn_begin(env->env, NULL, 0, &txn);
+ if (ret != MDB_SUCCESS) {
+ mdb_env_close(env->env);
+ return lmdb_error(ret);
+ }
+
+ ret = mdb_dbi_open(txn, NULL, 0, &env->dbi);
+ if (ret != MDB_SUCCESS) {
+ mdb_txn_abort(txn);
+ mdb_env_close(env->env);
+ return lmdb_error(ret);
+ }
+
+ ret = mdb_txn_commit(txn);
+ if (ret != MDB_SUCCESS) {
+ mdb_env_close(env->env);
+ return lmdb_error(ret);
+ }
+
+ return 0;
+}
+
+static int cdb_init(knot_db_t **db, struct kr_cdb_opts *opts, knot_mm_t *pool)
+{
+ if (!db || !opts) {
+ return kr_error(EINVAL);
+ }
+
+ struct lmdb_env *env = malloc(sizeof(*env));
+ if (!env) {
+ return kr_error(ENOMEM);
+ }
+ memset(env, 0, sizeof(struct lmdb_env));
+
+ /* Clear stale lockfiles. */
+ auto_free char *lockfile = kr_strcatdup(2, opts->path, "/.cachelock");
+ if (lockfile) {
+ if (unlink(lockfile) == 0) {
+ kr_log_info("[cache] cleared stale lockfile '%s'\n", lockfile);
+ } else if (errno != ENOENT) {
+ kr_log_info("[cache] failed to clear stale lockfile '%s': %s\n", lockfile,
+ strerror(errno));
+ }
+ }
+
+ /* Open the database. */
+ int ret = cdb_open(env, opts->path, opts->maxsize);
+ if (ret != 0) {
+ free(env);
+ return ret;
+ }
+
+ *db = env;
+ return 0;
+}
+
+static void cdb_deinit(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ cdb_close_env(env);
+ free(env);
+}
+
+static int cdb_count(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret != 0) {
+ return ret;
+ }
+
+ MDB_stat stat;
+ ret = mdb_stat(txn, env->dbi, &stat);
+
+ return (ret == MDB_SUCCESS) ? stat.ms_entries : lmdb_error(ret);
+}
+
+static int cdb_clear(knot_db_t *db)
+{
+ struct lmdb_env *env = db;
+ /* First try mdb_drop() to clear the DB; this may fail with ENOSPC. */
+ /* If we didn't do this, explicit cache.clear() ran on an instance
+ * would lead to the instance detaching from the cache of others,
+ * until they reopened cache explicitly or cleared it for some reason.
+ */
+ {
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+ if (ret == kr_ok()) {
+ ret = lmdb_error(mdb_drop(txn, env->dbi, 0));
+ if (ret == kr_ok()) {
+ ret = cdb_sync(db);
+ }
+ if (ret == kr_ok()) {
+ return ret;
+ }
+ }
+ kr_log_info("[cache] clearing error, falling back\n");
+ }
+
+ /* We are about to switch to a different file, so end all txns, to be sure. */
+ (void) cdb_sync(db);
+ free_txn_ro(db);
+
+	/* Since there is no guarantee that there will be enough free
+	 * pages to hold the whole dirtied DB for a transaction-safe clear,
+	 * we simply remove the database files and reopen.
+	 * We can afford this since other readers will continue to read
+	 * from the removed file, but will reopen when encountering the next
+	 * error. */
+ mdb_filehandle_t fd = -1;
+ int ret = mdb_env_get_fd(env->env, &fd);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+ const char *path = NULL;
+ ret = mdb_env_get_path(env->env, &path);
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ auto_free char *mdb_datafile = kr_strcatdup(2, path, "/data.mdb");
+ auto_free char *mdb_lockfile = kr_strcatdup(2, path, "/lock.mdb");
+ auto_free char *lockfile = kr_strcatdup(2, path, "/.cachelock");
+ if (!mdb_datafile || !mdb_lockfile || !lockfile) {
+ return kr_error(ENOMEM);
+ }
+ /* Find if we get a lock on lockfile. */
+ ret = open(lockfile, O_CREAT|O_EXCL|O_RDONLY, S_IRUSR);
+ if (ret == -1) {
+ kr_log_error("[cache] clearing failed to get ./.cachelock; retry later\n");
+ /* As we're out of space (almost certainly - mdb_drop didn't work),
+ * we will retry on the next failing write operation. */
+ return kr_error(errno);
+ }
+ close(ret);
+ /* We acquired lockfile. Now find whether *.mdb are what we have open now. */
+ struct stat old_stat, new_stat;
+ if (fstat(fd, &new_stat) || stat(mdb_datafile, &old_stat)) {
+ ret = errno;
+ unlink(lockfile);
+ return kr_error(ret);
+ }
+ /* Remove underlying files only if current open environment
+ * points to file on the disk. Otherwise just reopen as someone
+ * else has already removed the files.
+ */
+ if (old_stat.st_dev == new_stat.st_dev && old_stat.st_ino == new_stat.st_ino) {
+ kr_log_verbose("[cache] clear: identical files, unlinking\n");
+ // coverity[toctou]
+ unlink(mdb_datafile);
+ unlink(mdb_lockfile);
+ } else
+ kr_log_verbose("[cache] clear: not identical files, reopening\n");
+ /* Keep copy as it points to current handle internals. */
+ auto_free char *path_copy = strdup(path);
+ size_t mapsize = env->mapsize;
+ cdb_close_env(env);
+ ret = cdb_open(env, path_copy, mapsize);
+ /* Environment updated, release lockfile. */
+ unlink(lockfile);
+ return ret;
+}
+
+static int cdb_readv(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret) {
+ return ret;
+ }
+
+ for (int i = 0; i < maxcount; ++i) {
+ /* Convert key structs */
+ MDB_val _key = val_knot2mdb(key[i]);
+ MDB_val _val = val_knot2mdb(val[i]);
+ ret = mdb_get(txn, env->dbi, &_key, &_val);
+ if (ret != MDB_SUCCESS) {
+ ret = lmdb_error(ret);
+ if (ret == kr_error(ENOSPC)) {
+ /* we're likely to be forced to cache clear anyway */
+ ret = kr_error(ENOENT);
+ }
+ return ret;
+ }
+ /* Update the result. */
+ val[i] = val_mdb2knot(_val);
+ }
+ return kr_ok();
+}
+
+static int cdb_write(struct lmdb_env *env, MDB_txn **txn, const knot_db_val_t *key,
+ knot_db_val_t *val, unsigned flags)
+{
+ /* Convert key structs and write */
+ MDB_val _key = val_knot2mdb(*key);
+ MDB_val _val = val_knot2mdb(*val);
+ int ret = mdb_put(*txn, env->dbi, &_key, &_val, flags);
+
+ /* Try to recover from doing too much writing in a single transaction. */
+	if (ret == MDB_TXN_FULL) {
+		ret = cdb_sync(env);
+		if (ret == kr_ok()) {
+			ret = txn_get(env, txn, false);
+		}
+		if (ret == kr_ok()) {
+			ret = mdb_put(*txn, env->dbi, &_key, &_val, flags);
+		}
+	}
+ if (ret != MDB_SUCCESS) {
+ return lmdb_error(ret);
+ }
+
+ /* Update the result. */
+ val->data = _val.mv_data;
+ val->len = _val.mv_size;
+ return kr_ok();
+}
+
+static int cdb_writev(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val,
+ int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+
+ for (int i = 0; ret == kr_ok() && i < maxcount; ++i) {
+		/* This is an LMDB-specific optimisation: if the caller specifies
+		 * a value with NULL data and non-zero length, LMDB preallocates
+		 * the entry for the caller and leaves the write transaction open;
+		 * the caller is then responsible for syncing, i.e. committing the transaction.
+		 */
+ unsigned mdb_flags = 0;
+ if (val[i].len > 0 && val[i].data == NULL) {
+ mdb_flags |= MDB_RESERVE;
+ }
+ ret = cdb_write(env, &txn, &key[i], &val[i], mdb_flags);
+ }
+
+ return ret;
+}
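A sketch of the MDB_RESERVE path from the caller's side (this mirrors how entry_h_splice() and the stashing code in api.c allocate entries in place; the names needed_len/payload are illustrative):

	/*	knot_db_val_t val = { .data = NULL, .len = needed_len };
	 *	ret = cache_op(cache, write, &key, &val, 1);
	 *	if (ret == 0) {
	 *		// val.data now points into the open LMDB write transaction,
	 *		// so the entry is filled in place and committed later:
	 *		memcpy(val.data, payload, needed_len);
	 *		kr_cache_sync(cache);
	 *	}
	 */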
+
+static int cdb_remove(knot_db_t *db, knot_db_val_t keys[], int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+ int deleted = 0;
+
+ for (int i = 0; ret == kr_ok() && i < maxcount; ++i) {
+ MDB_val _key = val_knot2mdb(keys[i]);
+ MDB_val val = { 0, NULL };
+ ret = lmdb_error(mdb_del(txn, env->dbi, &_key, &val));
+ if (ret == kr_ok())
+ deleted++;
+ else if (ret == KNOT_ENOENT)
+ ret = kr_ok(); /* skip over non-existing entries */
+ }
+
+ return ret < 0 ? ret : deleted;
+}
+
+static int cdb_match(knot_db_t *db, knot_db_val_t *key, knot_db_val_t keyval[][2], int maxcount)
+{
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, true);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* LATER(optim.): use txn_curs_get() instead, to save resources. */
+ MDB_cursor *cur = NULL;
+ ret = mdb_cursor_open(txn, env->dbi, &cur);
+ if (ret != 0) {
+ return lmdb_error(ret);
+ }
+
+ MDB_val cur_key = val_knot2mdb(*key);
+ MDB_val cur_val = { 0, NULL };
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_SET_RANGE);
+ if (ret != MDB_SUCCESS) {
+ mdb_cursor_close(cur);
+ return lmdb_error(ret);
+ }
+
+ int results = 0;
+ while (ret == MDB_SUCCESS) {
+ /* Retrieve current key and compare with prefix */
+ if (cur_key.mv_size < key->len || memcmp(cur_key.mv_data, key->data, key->len) != 0) {
+ break;
+ }
+ /* Add to result set */
+ if (results < maxcount) {
+ keyval[results][0] = val_mdb2knot(cur_key);
+ keyval[results][1] = val_mdb2knot(cur_val);
+ ++results;
+ } else {
+ break;
+ }
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ }
+
+ mdb_cursor_close(cur);
+ return results;
+}
+
+
+static int cdb_prune(knot_db_t *db, int limit)
+{
+ return -1;
+#if 0
+ /* Sync in-flight transactions */
+ cdb_sync(db);
+
+ /* Prune old records */
+ struct lmdb_env *env = db;
+ MDB_txn *txn = NULL;
+ int ret = txn_get(env, &txn, false);
+ if (ret != 0) {
+ return ret;
+ }
+
+ MDB_cursor *cur = NULL;
+ ret = mdb_cursor_open(txn, env->dbi, &cur);
+ if (ret != 0) {
+ return lmdb_error(ret);
+ }
+
+ MDB_val cur_key, cur_val;
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_FIRST);
+ if (ret != 0) {
+ mdb_cursor_close(cur);
+ return lmdb_error(ret);
+ }
+
+ int results = 0;
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ while (ret == 0 && results < limit) {
+ /* Ignore special namespaces. */
+ if (cur_key.mv_size < 2 || ((const char *)cur_key.mv_data)[0] == 'V') {
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ continue;
+ }
+ /* Check entry age. */
+ struct kr_cache_entry *entry = cur_val.mv_data;
+ if (entry->timestamp > now.tv_sec ||
+ (now.tv_sec - entry->timestamp) < entry->ttl) {
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ continue;
+ }
+ /* Remove entry */
+ mdb_cursor_del(cur, 0);
+ ++results;
+ ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT);
+ }
+ mdb_cursor_close(cur);
+ return ret < 0 ? ret : results;
+#endif
+}
+
+static int cdb_read_leq(knot_db_t *env, knot_db_val_t *key, knot_db_val_t *val)
+{
+ assert(env && key && key->data && val);
+ MDB_cursor *curs = NULL;
+ int ret = txn_curs_get(env, &curs);
+ if (ret) return ret;
+
+ MDB_val key2_m = val_knot2mdb(*key);
+ MDB_val val2_m = { 0, NULL };
+ ret = mdb_cursor_get(curs, &key2_m, &val2_m, MDB_SET_RANGE);
+ if (ret) return lmdb_error(ret);
+ /* test for equality //:unlikely */
+ if (key2_m.mv_size == key->len
+ && memcmp(key2_m.mv_data, key->data, key->len) == 0) {
+ ret = 0; /* equality */
+ goto success;
+ }
+ /* we must be greater than key; do one step to smaller */
+ ret = mdb_cursor_get(curs, &key2_m, &val2_m, MDB_PREV);
+ if (ret) return lmdb_error(ret);
+ ret = 1;
+success:
+ /* finalize the output */
+ *key = val_mdb2knot(key2_m);
+ *val = val_mdb2knot(val2_m);
+ return ret;
+}
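How callers are expected to interpret the return value of read_leq() (a sketch; variable names are illustrative):

	/*	knot_db_val_t key = ..., val = { NULL, 0 };
	 *	int ret = api->read_leq(db, &key, &val);
	 *	if (ret < 0)       { }	// kr_error code: the lookup failed
	 *	else if (ret == 0) { }	// exact match; key/val point to DB-owned data
	 *	else               { }	// ret > 0: key was rewritten to the nearest smaller key
	 */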
+
+
+const struct kr_cdb_api *kr_cdb_lmdb(void)
+{
+ static const struct kr_cdb_api api = {
+ "lmdb",
+ cdb_init, cdb_deinit, cdb_count, cdb_clear, cdb_sync,
+ cdb_readv, cdb_writev, cdb_remove,
+ cdb_match, cdb_prune,
+ cdb_read_leq
+ };
+
+ return &api;
+}
diff --git a/lib/cache/cdb_lmdb.h b/lib/cache/cdb_lmdb.h
new file mode 100644
index 0000000..bc2c7d6
--- /dev/null
+++ b/lib/cache/cdb_lmdb.h
@@ -0,0 +1,23 @@
+/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "lib/cache/cdb_api.h"
+#include "lib/defines.h"
+
+KR_EXPORT KR_CONST
+const struct kr_cdb_api *kr_cdb_lmdb(void);
diff --git a/lib/cache/entry_list.c b/lib/cache/entry_list.c
new file mode 100644
index 0000000..6a5001c
--- /dev/null
+++ b/lib/cache/entry_list.c
@@ -0,0 +1,293 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of chaining in struct entry_h. Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+#include "lib/utils.h"
+
+
+static int entry_h_len(knot_db_val_t val);
+
+
+void entry_list_memcpy(struct entry_apex *ea, entry_list_t list)
+{
+ assert(ea);
+ memset(ea, 0, offsetof(struct entry_apex, data));
+ ea->has_ns = list[EL_NS ].len;
+ ea->has_cname = list[EL_CNAME ].len;
+ ea->has_dname = list[EL_DNAME ].len;
+ for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
+ ea->nsecs[i] = list[i].len == 0 ? 0 :
+ (list[i].len == 4 ? 1 : 3);
+ }
+ uint8_t *it = ea->data;
+ for (int i = 0; i < EL_LENGTH; ++i) {
+ if (list[i].data) {
+ memcpy(it, list[i].data, list[i].len);
+ /* LATER(optim.): coalesce consecutive writes? */
+ } else {
+ list[i].data = it;
+ }
+ it += to_even(list[i].len);
+ }
+}
+
+int entry_list_parse(const knot_db_val_t val, entry_list_t list)
+{
+ const bool ok = val.data && val.len && list;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ /* Parse the apex itself (nsec parameters). */
+ const struct entry_apex *ea = entry_apex_consistent(val);
+ if (!ea) {
+ return kr_error(EILSEQ);
+ }
+ const uint8_t *it = ea->data,
+ *it_bound = knot_db_val_bound(val);
+ for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
+ if (it > it_bound) {
+ return kr_error(EILSEQ);
+ }
+ list[i].data = (void *)it;
+ switch (ea->nsecs[i]) {
+ case 0:
+ list[i].len = 0;
+ break;
+ case 1:
+ list[i].len = sizeof(uint32_t); /* just timestamp */
+ break;
+ case 3: { /* timestamp + NSEC3PARAM wire */
+ if (it + sizeof(uint32_t) + 4 > it_bound) {
+ return kr_error(EILSEQ);
+ }
+ list[i].len = sizeof(uint32_t)
+ + nsec_p_rdlen(it + sizeof(uint32_t));
+ break;
+ }
+ default:
+ return kr_error(EILSEQ);
+ };
+ it += to_even(list[i].len);
+ }
+ /* Parse every entry_h. */
+ for (int i = ENTRY_APEX_NSECS_CNT; i < EL_LENGTH; ++i) {
+ list[i].data = (void *)it;
+ bool has_type;
+ switch (i) {
+ case EL_NS: has_type = ea->has_ns; break;
+ case EL_CNAME: has_type = ea->has_cname; break;
+ case EL_DNAME: has_type = ea->has_dname; break;
+ default: assert(false); return kr_error(EINVAL); /* something very bad */
+ }
+ if (!has_type) {
+ list[i].len = 0;
+ continue;
+ }
+ if (it >= it_bound) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ const int len = entry_h_len(
+ (knot_db_val_t){ .data = (void *)it, .len = it_bound - it });
+ if (len < 0) {
+ assert(false);
+ return kr_error(len);
+ }
+ list[i].len = len;
+ it += to_even(len);
+ }
+ assert(it == it_bound);
+ return kr_ok();
+}
+
+/** Given a valid entry header, find its length (i.e. offset of the next entry).
+ * \param val The beginning of the data and the bound (read only).
+ */
+static int entry_h_len(const knot_db_val_t val)
+{
+ const bool ok = val.data && ((ssize_t)val.len) > 0;
+ if (!ok) return kr_error(EINVAL);
+ const struct entry_h *eh = val.data;
+ const uint8_t *d = eh->data; /* iterates over the data in entry */
+ const uint8_t *data_bound = knot_db_val_bound(val);
+ if (d >= data_bound) return kr_error(EILSEQ);
+ if (!eh->is_packet) { /* Positive RRset + its RRsig set (may be empty). */
+ int sets = 2;
+ while (sets-- > 0) {
+ d += rdataset_dematerialized_size(d);
+ if (d > data_bound) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ }
+ } else { /* A "packet" (opaque ATM). */
+ uint16_t len;
+ if (d + sizeof(len) > data_bound) return kr_error(EILSEQ);
+ memcpy(&len, d, sizeof(len));
+ d += 2 + to_even(len);
+ }
+ if (d > data_bound) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ return d - (uint8_t *)val.data;
+}
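+/* Layout of entry_h::data walked above (a sketch, inferred from this file):
+ *   is_packet == false: serialized RRset followed by its serialized RRSIG set
+ *                       (each is uint16_t count + rdata; count may be 0);
+ *   is_packet == true:  uint16_t length + packet wire, padded to even length. */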
+
+struct entry_apex * entry_apex_consistent(knot_db_val_t val)
+{
+ //XXX: check lengths, etc.
+ return val.data;
+}
+
+/* See the header file. */
+int entry_h_seek(knot_db_val_t *val, uint16_t type)
+{
+ int i = -1;
+ switch (type) {
+ case KNOT_RRTYPE_NS: i = EL_NS; break;
+ case KNOT_RRTYPE_CNAME: i = EL_CNAME; break;
+ case KNOT_RRTYPE_DNAME: i = EL_DNAME; break;
+ default: return kr_ok();
+ }
+
+ entry_list_t el;
+ int ret = entry_list_parse(*val, el);
+ if (ret) return ret;
+ *val = el[i];
+ return val->len ? kr_ok() : kr_error(ENOENT);
+}
+
+static int cache_write_or_clear(struct kr_cache *cache, const knot_db_val_t *key,
+ knot_db_val_t *val, const struct kr_query *qry)
+{
+ int ret = cache_op(cache, write, key, val, 1);
+ if (!ret) return kr_ok();
+ /* Clear cache if overfull. It's nontrivial to do better with LMDB.
+ * LATER: some garbage-collection mechanism. */
+ if (ret == kr_error(ENOSPC)) {
+ ret = kr_cache_clear(cache);
+ const char *msg = "[cache] clearing because overfull, ret = %d\n";
+ if (ret) {
+ kr_log_error(msg, ret);
+ } else {
+ kr_log_info(msg, ret);
+ ret = kr_error(ENOSPC);
+ }
+ return ret;
+ }
+ VERBOSE_MSG(qry, "=> failed backend write, ret = %d\n", ret);
+ return kr_error(ret ? ret : ENOSPC);
+}
+
+
+/* See the header file. */
+int entry_h_splice(
+ knot_db_val_t *val_new_entry, uint8_t rank,
+ const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
+ const knot_dname_t *owner/*log only*/,
+ const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp)
+{
+	//TODO: another review, perhaps including the API
+ const bool ok = val_new_entry && val_new_entry->len > 0;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ int i_type;
+ switch (type) {
+ case KNOT_RRTYPE_NS: i_type = EL_NS; break;
+ case KNOT_RRTYPE_CNAME: i_type = EL_CNAME; break;
+ case KNOT_RRTYPE_DNAME: i_type = EL_DNAME; break;
+ default: i_type = 0;
+ }
+
+ /* Get eh_orig (original entry), and also el list if multi-entry case. */
+ const struct entry_h *eh_orig = NULL;
+ entry_list_t el;
+ int ret = -1;
+ if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) {
+ knot_db_val_t val;
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (i_type) {
+ if (!ret) ret = entry_list_parse(val, el);
+ if (ret) memset(el, 0, sizeof(el));
+ val = el[i_type];
+ }
+ /* val is on the entry, in either case (or error) */
+ if (!ret) {
+ eh_orig = entry_h_consistent(val, type);
+ }
+ } else {
+ /* We want to fully overwrite the entry, so don't even read it. */
+ memset(el, 0, sizeof(el));
+ }
+
+ if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) {
+ /* If equal rank was accepted, spoofing a *single* answer would be
+ * enough to e.g. override NS record in AUTHORITY section.
+ * This way they would have to hit the first answer
+ * (whenever TTL nears expiration).
+ * Stale-serving is NOT considered, but TTL 1 would be considered
+ * as expiring anyway, ... */
+ int32_t old_ttl = get_new_ttl(eh_orig, qry, NULL, 0, timestamp);
+ if (old_ttl > 0 && !is_expiring(eh_orig->ttl, old_ttl)
+ && rank <= eh_orig->rank) {
+ WITH_VERBOSE(qry) {
+ auto_free char *type_str = kr_rrtype_text(type),
+ *owner_str = kr_dname_text(owner);
+ VERBOSE_MSG(qry, "=> not overwriting %s %s\n",
+ type_str, owner_str);
+ }
+ return kr_error(EEXIST);
+ }
+ }
+
+ if (!i_type) {
+ /* The non-list types are trivial now. */
+ return cache_write_or_clear(cache, &key, val_new_entry, qry);
+ }
+	/* Now we're in trouble. In some cases, part of the data to be written
+	 * is an lmdb entry that may be invalidated by our write request.
+	 * (lmdb does even in-place updates!) Therefore we copy it all into a buffer.
+	 * (We don't bother deallocating from the mempool.)
+	 * LATER(optim.): do this only when necessary, or perhaps another approach.
+ * This is also complicated by the fact that the val_new_entry part
+ * is to be written *afterwards* by the caller.
+ */
+ el[i_type] = (knot_db_val_t){
+ .len = val_new_entry->len,
+ .data = NULL, /* perhaps unclear in the entry_h_splice() API */
+ };
+ knot_db_val_t val = {
+ .len = entry_list_serial_size(el),
+ .data = NULL,
+ };
+ void *buf = mm_alloc(&qry->request->pool, val.len);
+ entry_list_memcpy(buf, el);
+ ret = cache_write_or_clear(cache, &key, &val, qry);
+ if (ret) return kr_error(ret);
+ memcpy(val.data, buf, val.len); /* we also copy the "empty" space, but well... */
+ val_new_entry->data = (uint8_t *)val.data
+ + ((uint8_t *)el[i_type].data - (uint8_t *)buf);
+ return kr_ok();
+}
+
diff --git a/lib/cache/entry_pkt.c b/lib/cache/entry_pkt.c
new file mode 100644
index 0000000..00e73d4
--- /dev/null
+++ b/lib/cache/entry_pkt.c
@@ -0,0 +1,224 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of packet-caching. Prototypes in ./impl.h
+ *
+ * The packet is stashed in entry_h::data as uint16_t length + full packet wire format.
+ */
+
+#include "lib/utils.h"
+#include "lib/layer/iterate.h" /* kr_response_classify */
+#include "lib/cache/impl.h"
+
+
+/** Compute TTL for a packet. Generally it's minimum TTL, with extra conditions. */
+static uint32_t packet_ttl(const knot_pkt_t *pkt, bool is_negative)
+{
+ bool has_ttl = false;
+ uint32_t ttl = UINT32_MAX;
+ /* Find minimum entry TTL in the packet or SOA minimum TTL. */
+ for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
+ const knot_pktsection_t *sec = knot_pkt_section(pkt, i);
+ for (unsigned k = 0; k < sec->count; ++k) {
+ const knot_rrset_t *rr = knot_pkt_rr(sec, k);
+ if (is_negative) {
+ /* Use SOA minimum TTL for negative answers. */
+ if (rr->type == KNOT_RRTYPE_SOA) {
+ return MIN(rr->ttl, knot_soa_minimum(rr->rrs.rdata));
+ } else {
+ continue; /* Use SOA only for negative answers. */
+ }
+ }
+ if (knot_rrtype_is_metatype(rr->type)) {
+ continue; /* Skip metatypes. */
+ }
+			ttl = MIN(ttl, rr->ttl);
+			has_ttl = true;
+ }
+ }
+ /* If no valid TTL present, go with zero (will get clamped to minimum). */
+ return has_ttl ? ttl : 0;
+}
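+/* Illustration (not from the sources): for an NXDOMAIN answer carrying a SOA
+ * with TTL 300 and SOA minimum 60, packet_ttl() returns MIN(300, 60) == 60;
+ * for a positive answer it is the minimum TTL over all non-meta records. */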
+
+
+void stash_pkt(const knot_pkt_t *pkt, const struct kr_query *qry,
+ const struct kr_request *req, const bool has_optout)
+{
+ /* In some cases, stash also the packet. */
+ const bool is_negative = kr_response_classify(pkt)
+ & (PKT_NODATA|PKT_NXDOMAIN);
+ const struct kr_qflags * const qf = &qry->flags;
+ const bool want_negative = qf->DNSSEC_INSECURE || !qf->DNSSEC_WANT || has_optout;
+ const bool want_pkt = qf->DNSSEC_BOGUS /*< useful for +cd answers */
+ || (is_negative && want_negative);
+
+ if (!want_pkt || !knot_wire_get_aa(pkt->wire)
+ || pkt->parsed != pkt->size /*< malformed packet; still can't detect KNOT_EFEWDATA */
+ ) {
+ return;
+ }
+
+ /* Compute rank. If cd bit is set or we got answer via non-validated
+ * forwarding, make the rank bad; otherwise it depends on flags.
+ * TODO: probably make validator attempt validation even with +cd. */
+ uint8_t rank = KR_RANK_AUTH;
+ const bool risky_vldr = is_negative && qf->FORWARD && qf->CNAME;
+ /* ^^ CNAME'ed NXDOMAIN answer in forwarding mode can contain
+ * unvalidated records; original commit: d6e22f476. */
+ if (knot_wire_get_cd(req->qsource.packet->wire) || qf->STUB || risky_vldr) {
+ kr_rank_set(&rank, KR_RANK_OMIT);
+ } else {
+ if (qf->DNSSEC_BOGUS) {
+ kr_rank_set(&rank, KR_RANK_BOGUS);
+ } else if (qf->DNSSEC_INSECURE) {
+ kr_rank_set(&rank, KR_RANK_INSECURE);
+ } else if (!qf->DNSSEC_WANT) {
+ /* no TAs at all, leave _RANK_AUTH */
+ } else if (has_optout) {
+ /* All bad cases should be filtered above,
+ * at least the same way as pktcache in kresd 1.5.x. */
+ kr_rank_set(&rank, KR_RANK_SECURE);
+ } else assert(false);
+ }
+
+ const uint16_t pkt_type = knot_pkt_qtype(pkt);
+ const knot_dname_t *owner = knot_pkt_qname(pkt); /* qname can't be compressed */
+
+ // LATER: nothing exists under NXDOMAIN. Implement that (optionally)?
+#if 0
+ if (knot_wire_get_rcode(pkt->wire) == KNOT_RCODE_NXDOMAIN
+ /* && !qf->DNSSEC_INSECURE */ ) {
+ pkt_type = KNOT_RRTYPE_NS;
+ }
+#endif
+
+ /* Construct the key under which the pkt will be stored. */
+ struct key k_storage, *k = &k_storage;
+ knot_db_val_t key;
+ int ret = kr_dname_lf(k->buf, owner, false);
+ if (ret) {
+ /* A server might (incorrectly) reply with QDCOUNT=0. */
+ assert(owner == NULL);
+ return;
+ }
+ key = key_exact_type_maypkt(k, pkt_type);
+
+ /* For now we stash the full packet byte-exactly as it came from upstream. */
+ const uint16_t pkt_size = pkt->size;
+ knot_db_val_t val_new_entry = {
+ .data = NULL,
+ .len = offsetof(struct entry_h, data) + sizeof(pkt_size) + pkt->size,
+ };
+ /* Prepare raw memory for the new entry and fill it. */
+ struct kr_cache *cache = &req->ctx->cache;
+ ret = entry_h_splice(&val_new_entry, rank, key, k->type, pkt_type,
+ owner, qry, cache, qry->timestamp.tv_sec);
+ if (ret) return; /* some aren't really errors */
+ assert(val_new_entry.data);
+ struct entry_h *eh = val_new_entry.data;
+ memset(eh, 0, offsetof(struct entry_h, data));
+ eh->time = qry->timestamp.tv_sec;
+ eh->ttl = MAX(MIN(packet_ttl(pkt, is_negative), cache->ttl_max), cache->ttl_min);
+ eh->rank = rank;
+ eh->is_packet = true;
+ eh->has_optout = qf->DNSSEC_OPTOUT;
+ memcpy(eh->data, &pkt_size, sizeof(pkt_size));
+ memcpy(eh->data + sizeof(pkt_size), pkt->wire, pkt_size);
+
+ WITH_VERBOSE(qry) {
+ auto_free char *type_str = kr_rrtype_text(pkt_type),
+ *owner_str = kr_dname_text(owner);
+ VERBOSE_MSG(qry, "=> stashed packet: rank 0%.2o, TTL %d, "
+ "%s %s (%d B)\n",
+ eh->rank, eh->ttl,
+ type_str, owner_str, (int)val_new_entry.len);
+ }
+}
+
+
+int answer_from_pkt(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+
+ uint16_t pkt_len;
+ memcpy(&pkt_len, eh->data, sizeof(pkt_len));
+ if (pkt_len > pkt->max_size) {
+ return kr_error(ENOENT);
+ }
+
+ /* Copy answer and reparse it, but keep the original message id. */
+ uint16_t msgid = knot_wire_get_id(pkt->wire);
+ knot_pkt_clear(pkt);
+ memcpy(pkt->wire, eh->data + 2, pkt_len);
+ pkt->size = pkt_len;
+ int ret = knot_pkt_parse(pkt, 0);
+ if (ret == KNOT_EFEWDATA || ret == KNOT_EMALF) {
+ return kr_error(ENOENT);
+ /* LATER(opt): try harder to avoid stashing such packets */
+ }
+ if (ret != KNOT_EOK) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ knot_wire_set_id(pkt->wire, msgid);
+
+ /* Add rank into the additional field. */
+ for (size_t i = 0; i < pkt->rrset_count; ++i) {
+ assert(!pkt->rr[i].additional);
+ uint8_t *rr_rank = mm_alloc(&pkt->mm, sizeof(*rr_rank));
+ if (!rr_rank) {
+ return kr_error(ENOMEM);
+ }
+ *rr_rank = eh->rank;
+ pkt->rr[i].additional = rr_rank;
+ }
+
+ /* Adjust TTL in each record. */
+ const uint32_t drift = eh->ttl - new_ttl;
+ for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
+ const knot_pktsection_t *sec = knot_pkt_section(pkt, i);
+ for (unsigned k = 0; k < sec->count; ++k) {
+ knot_rrset_t *rrs = // vv FIXME??
+ /*const-cast*/(knot_rrset_t *)knot_pkt_rr(sec, k);
+ /* We need to be careful: due to enforcing minimum TTL
+ * on packet, some records may be below that value.
+ * We keep those records at TTL 0. */
+ if (rrs->ttl >= drift) {
+ rrs->ttl -= drift;
+ } else {
+ rrs->ttl = 0;
+ }
+ }
+ }
+
+ /* Finishing touches. TODO: perhaps factor out */
+ struct kr_qflags * const qf = &qry->flags;
+ qf->EXPIRING = is_expiring(eh->ttl, new_ttl);
+ qf->CACHED = true;
+ qf->NO_MINIMIZE = true;
+ qf->DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE);
+ qf->DNSSEC_BOGUS = kr_rank_test(eh->rank, KR_RANK_BOGUS);
+ if (qf->DNSSEC_INSECURE || qf->DNSSEC_BOGUS) {
+ qf->DNSSEC_WANT = false;
+ }
+ qf->DNSSEC_OPTOUT = eh->has_optout;
+ VERBOSE_MSG(qry, "=> satisfied by exact packet: rank 0%.2o, new TTL %d\n",
+ eh->rank, new_ttl);
+ return kr_ok();
+}
+
diff --git a/lib/cache/entry_rr.c b/lib/cache/entry_rr.c
new file mode 100644
index 0000000..94dd0e8
--- /dev/null
+++ b/lib/cache/entry_rr.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of RRset (de)materialization, i.e. (de)serialization to storage
+ * format used in cache (some repeated fields are omitted). Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+
+int rdataset_dematerialize(const knot_rdataset_t *rds, uint8_t * restrict data)
+{
+ /* FIXME: either give up on even alignment and thus direct usability
+ * of rdatasets as they are in lmdb, or align inside cdb_* functions
+ * (request sizes one byte longer and shift iff on an odd address). */
+ //if ((size_t)data & 1) VERBOSE_MSG(NULL, "dematerialize: odd address\n");
+ //const uint8_t *data0 = data;
+ if (!data) {
+ assert(data);
+ return kr_error(EINVAL);
+ }
+ const uint16_t rr_count = rds ? rds->count : 0;
+ memcpy(data, &rr_count, sizeof(rr_count));
+ data += sizeof(rr_count);
+ if (rr_count) {
+ size_t size = knot_rdataset_size(rds);
+ memcpy(data, rds->rdata, size);
+ data += size;
+ }
+ //VERBOSE_MSG(NULL, "dematerialized to %d B\n", (int)(data - data0));
+ (void)data;
+ return kr_ok();
+}
+
+/** Materialize a knot_rdataset_t from cache with given TTL.
+ * Return the number of bytes consumed or an error code.
+ */
+static int rdataset_materialize(knot_rdataset_t * restrict rds, const uint8_t * const data,
+ const uint8_t *data_bound, knot_mm_t *pool)
+{
+ assert(rds && data && data_bound && data_bound > data && !rds->rdata
+ /*&& !((size_t)data & 1)*/);
+ assert(pool); /* not required, but that's our current usage; guard leaks */
+ const uint8_t *d = data; /* iterates over the cache data */
+ {
+ uint16_t rr_count;
+ memcpy(&rr_count, d, sizeof(rr_count));
+ d += sizeof(rr_count);
+ rds->count = rr_count;
+ if (!rr_count) { /* avoid mm_alloc(pool, 0); etc. */
+ return d - data;
+ }
+ }
+ /* First sum up the sizes for wire format length. */
+ const knot_rdataset_t rds_tmp = {
+ .count = rds->count,
+ .rdata = (knot_rdata_t *)d,
+ };
+ size_t rds_size = knot_rdataset_size(&rds_tmp); /* TODO: we might overrun here already,
+ but we need to trust cache anyway...*/
+ if (d + rds_size > data_bound) {
+ VERBOSE_MSG(NULL, "materialize: EILSEQ!\n");
+ return kr_error(EILSEQ);
+ }
+ rds->rdata = mm_alloc(pool, rds_size);
+ if (!rds->rdata) {
+ return kr_error(ENOMEM);
+ }
+ memcpy(rds->rdata, d, rds_size);
+ d += rds_size;
+ //VERBOSE_MSG(NULL, "materialized from %d B\n", (int)(d - data));
+ return d - data;
+}
+
+int kr_cache_materialize(knot_rdataset_t *dst, const struct kr_cache_p *ref,
+ knot_mm_t *pool)
+{
+ struct entry_h *eh = ref->raw_data;
+ return rdataset_materialize(dst, eh->data, ref->raw_bound, pool);
+}
+
+
+int entry2answer(struct answer *ans, int id,
+ const struct entry_h *eh, const uint8_t *eh_bound,
+ const knot_dname_t *owner, uint16_t type, uint32_t new_ttl)
+{
+ /* We assume it's zeroed. Do basic sanity check. */
+ if (ans->rrsets[id].set.rr || ans->rrsets[id].sig_rds.rdata
+ || (type == KNOT_RRTYPE_NSEC && ans->nsec_p.raw)
+ || (type == KNOT_RRTYPE_NSEC3 && !ans->nsec_p.raw)
+ )
+ {
+ assert(false);
+ return kr_error(EINVAL);
+ }
+ /* Materialize the base RRset. */
+ knot_rrset_t *rr = ans->rrsets[id].set.rr
+ = knot_rrset_new(owner, type, KNOT_CLASS_IN, new_ttl, ans->mm);
+ if (!rr) {
+ assert(!ENOMEM);
+ return kr_error(ENOMEM);
+ }
+ int ret = rdataset_materialize(&rr->rrs, eh->data, eh_bound, ans->mm);
+ if (ret < 0) goto fail;
+ size_t data_off = ret;
+ ans->rrsets[id].set.rank = eh->rank;
+ ans->rrsets[id].set.expiring = is_expiring(eh->ttl, new_ttl);
+ /* Materialize the RRSIG RRset for the answer in (pseudo-)packet. */
+ bool want_rrsigs = true; /* LATER(optim.): might be omitted in some cases. */
+ if (want_rrsigs) {
+ ret = rdataset_materialize(&ans->rrsets[id].sig_rds, eh->data + data_off,
+ eh_bound, ans->mm);
+ if (ret < 0) goto fail;
+ /* Sanity check: we consumed exactly all data. */
+ int unused_bytes = eh_bound - (uint8_t *)eh->data - data_off - ret;
+ if (unused_bytes) {
+ kr_log_error("[cach] entry2answer ERROR: unused bytes: %d\n",
+ unused_bytes);
+ assert(!EILSEQ);
+ ret = kr_error(EILSEQ);
+ goto fail; /* to be on the safe side */
+ }
+ }
+ return kr_ok();
+fail:
+ assert(/*false*/!ret);
+ /* Cleanup the item that we might've (partially) written to. */
+ knot_rrset_free(ans->rrsets[id].set.rr, ans->mm);
+ knot_rdataset_clear(&ans->rrsets[id].sig_rds, ans->mm);
+ memset(&ans->rrsets[id], 0, sizeof(ans->rrsets[id]));
+ return kr_error(ret);
+}
+
diff --git a/lib/cache/impl.h b/lib/cache/impl.h
new file mode 100644
index 0000000..a08f355
--- /dev/null
+++ b/lib/cache/impl.h
@@ -0,0 +1,412 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Header internal for cache implementation(s).
+ * Only LMDB works for now.
+ */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <libdnssec/error.h>
+#include <libdnssec/nsec.h>
+#include <libknot/consts.h>
+#include <libknot/db/db.h>
+#include <libknot/dname.h>
+
+#include "contrib/cleanup.h"
+#include "contrib/murmurhash3/murmurhash3.h" /* hash() for nsec_p_hash() */
+#include "lib/cache/cdb_api.h"
+#include "lib/resolve.h"
+
+/* Cache entry values - binary layout.
+ *
+ * It depends on type which is recognizable by the key.
+ * Code depending on the contents of the key is marked by CACHE_KEY_DEF.
+ *
+ * 'E' entry (exact hit):
+ * - ktype == NS: struct entry_apex - multiple types inside (NS and xNAME);
+ * - ktype != NS: struct entry_h
+ * * is_packet: uint16_t length, the rest is opaque and handled by ./entry_pkt.c
+ * * otherwise RRset + its RRSIG set (possibly empty).
+ * '1' or '3' entry (NSEC or NSEC3)
+ * - struct entry_h, contents is the same as for exact hit
+ * - flags don't make sense there
+ */
+
+struct entry_h {
+ uint32_t time; /**< The time of inception. */
+ uint32_t ttl; /**< TTL at inception moment. Assuming it fits into int32_t ATM. */
+ uint8_t rank : 6; /**< See enum kr_rank */
+ bool is_packet : 1; /**< Negative-answer packet for insecure/bogus name. */
+ bool has_optout : 1; /**< Only for packets; persisted DNSSEC_OPTOUT. */
+ uint8_t _pad; /**< We need even alignment for data now. */
+ uint8_t data[];
+};
+struct entry_apex;
+
+/** Check basic consistency of entry_h for 'E' entries, not looking into ->data.
+ * (for is_packet the length of data is checked)
+ */
+struct entry_h * entry_h_consistent(knot_db_val_t data, uint16_t type);
+
+struct entry_apex * entry_apex_consistent(knot_db_val_t val);
+
+/** Consistency check, ATM common for NSEC and NSEC3. */
+static inline struct entry_h * entry_h_consistent_NSEC(knot_db_val_t data)
+{
+ /* ATM it's enough to just extend the checks for exact entries. */
+ const struct entry_h *eh = entry_h_consistent(data, KNOT_RRTYPE_NSEC);
+ bool ok = eh != NULL;
+ ok = ok && !eh->is_packet && !eh->has_optout;
+ return ok ? /*const-cast*/(struct entry_h *)eh : NULL;
+}
+
+
+/* nsec_p* - NSEC* chain parameters */
+
+static inline int nsec_p_rdlen(const uint8_t *rdata)
+{
+ //TODO: do we really need the zero case?
+ return rdata ? 5 + rdata[4] : 0; /* rfc5155 4.2 and 3.2. */
+}
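+/* NSEC3PARAM rdata is alg(1) + flags(1) + iterations(2) + salt_len(1) + salt,
+ * so its total length is 5 + rdata[4] (the salt-length octet); see RFC 5155. */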
+static const int NSEC_P_MAXLEN = sizeof(uint32_t) + 5 + 255; // TODO: remove??
+
+/** Hash of NSEC3 parameters, used as a tag to separate different chains for same zone. */
+typedef uint32_t nsec_p_hash_t;
+static inline nsec_p_hash_t nsec_p_mkHash(const uint8_t *nsec_p)
+{
+ assert(nsec_p && !(KNOT_NSEC3_FLAG_OPT_OUT & nsec_p[1]));
+ return hash((const char *)nsec_p, nsec_p_rdlen(nsec_p));
+}
+
+/** NSEC* parameters for the chain. */
+struct nsec_p {
+ const uint8_t *raw; /**< Pointer to raw NSEC3 parameters; NULL for NSEC. */
+ nsec_p_hash_t hash; /**< Hash of `raw`, used for cache keys. */
+ dnssec_nsec3_params_t libknot; /**< Format for libknot; owns malloced memory! */
+};
+
+
+
+/** LATER(optim.): this is overshot, but struct key usage should be cheap ATM. */
+#define KR_CACHE_KEY_MAXLEN (KNOT_DNAME_MAXLEN + 100) /* CACHE_KEY_DEF */
+
+struct key {
+ const knot_dname_t *zname; /**< current zone name (points within qry->sname) */
+ uint8_t zlf_len; /**< length of current zone's lookup format */
+
+ /** Corresponding key type; e.g. NS for CNAME.
+ * Note: NSEC type is ambiguous (exact and range key). */
+ uint16_t type;
+ /** The key data start at buf+1, and buf[0] contains some length.
+ * For details see key_exact* and key_NSEC* functions. */
+ uint8_t buf[KR_CACHE_KEY_MAXLEN];
+ /* LATER(opt.): ^^ probably change the anchoring, so that kr_dname_lf()
+ * doesn't need to move data after knot_dname_lf(). */
+};
+
+static inline size_t key_nwz_off(const struct key *k)
+{
+ /* CACHE_KEY_DEF: zone name lf + 0 ('1' or '3').
+ * NSEC '1' case continues just with the name within zone. */
+ return k->zlf_len + 2;
+}
+static inline size_t key_nsec3_hash_off(const struct key *k)
+{
+	/* CACHE_KEY_DEF NSEC3: tag (nsec_p_hash_t) + 20 bytes of NSEC3 name hash */
+ return key_nwz_off(k) + sizeof(nsec_p_hash_t);
+}
+/** Hash is always SHA1; I see no plans to standardize anything else.
+ * https://www.iana.org/assignments/dnssec-nsec3-parameters/dnssec-nsec3-parameters.xhtml#dnssec-nsec3-parameters-3
+ */
+static const int NSEC3_HASH_LEN = 20,
+ NSEC3_HASH_TXT_LEN = 32;
+
+/** Finish constructing a string key for exact search.
+ * It's assumed that kr_dname_lf(k->buf, owner, *) has been run beforehand.
+ */
+knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type);
+
+/** Like key_exact_type_maypkt but with extra checks if used for RRs only. */
+static inline knot_db_val_t key_exact_type(struct key *k, uint16_t type)
+{
+ switch (type) {
+ /* Sanity check: forbidden types represented in other way(s). */
+ case KNOT_RRTYPE_NSEC:
+ case KNOT_RRTYPE_NSEC3:
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ }
+ return key_exact_type_maypkt(k, type);
+}
+
+
+/* entry_h chaining; implementation in ./entry_list.c */
+
+enum { ENTRY_APEX_NSECS_CNT = 2 };
+
+/** Header of 'E' entry with ktype == NS. Inside is private to ./entry_list.c
+ *
+ * We store xNAME at NS type to lower the number of searches in closest_NS().
+ * CNAME is only considered for equal name, of course.
+ * We also store NSEC* parameters at NS type.
+ */
+struct entry_apex {
+ /* ENTRY_H_FLAGS */
+ bool has_ns : 1;
+ bool has_cname : 1;
+ bool has_dname : 1;
+
+	uint8_t pad_; /**< Padding so data[] starts at an even offset: 1+2+x bytes would be odd, so use 2+2+x. */
+ int8_t nsecs[ENTRY_APEX_NSECS_CNT]; /**< values: 0: none, 1: NSEC, 3: NSEC3 */
+ uint8_t data[];
+ /* XXX: if not first, stamp of last being the first?
+ * Purpose: save cache operations if rolled the algo/params long ago. */
+};
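+/* Serialized apex data[] layout (a sketch, inferred from entry_list_memcpy/parse),
+ * in entry_list_t order:
+ *  - ENTRY_APEX_NSECS_CNT items with NSEC* chain parameters (per nsecs[i]:
+ *    nothing, just a timestamp, or timestamp + NSEC3PARAM wire),
+ *  - then the NS, CNAME and DNAME entry_h blobs, each present iff the
+ *    corresponding has_* flag is set; every item is padded to an even length. */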
+
+/** Indices for decompressed entry_list_t. */
+enum EL {
+ EL_NS = ENTRY_APEX_NSECS_CNT,
+ EL_CNAME,
+ EL_DNAME,
+ EL_LENGTH
+};
+/** Decompressed entry_apex. It's an array of unparsed entry_h references.
+ * Note: arrays are passed "by reference" to functions (in C99). */
+typedef knot_db_val_t entry_list_t[EL_LENGTH];
+
+static inline uint16_t EL2RRTYPE(enum EL i)
+{
+ switch (i) {
+ case EL_NS: return KNOT_RRTYPE_NS;
+ case EL_CNAME: return KNOT_RRTYPE_CNAME;
+ case EL_DNAME: return KNOT_RRTYPE_DNAME;
+ default: assert(false); return 0;
+ }
+}
+
+/** There may be multiple entries within, so rewind `val` to the one we want.
+ *
+ * ATM there are multiple types only for the NS ktype - it also accommodates xNAMEs.
+ * \note `val->len` represents the bound of the whole list, not of a single entry.
+ * \note in case of ENOENT, `val` is still rewound to the beginning of the next entry.
+ * \return error code
+ * TODO: maybe get rid of this API?
+ */
+int entry_h_seek(knot_db_val_t *val, uint16_t type);
+
+/** Prepare space to insert an entry.
+ *
+ * Some checks are performed (rank, TTL), the current entry in cache is copied
+ * with a hole ready for the new entry (old one of the same type is cut out).
+ *
+ * \param val_new_entry The only changing parameter; ->len is read, ->data written.
+ * \return error code
+ */
+int entry_h_splice(
+ knot_db_val_t *val_new_entry, uint8_t rank,
+ const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
+ const knot_dname_t *owner/*log only*/,
+ const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp);
+
+/** Parse an entry_apex into individual items. @return error code. */
+int entry_list_parse(const knot_db_val_t val, entry_list_t list);
+
+static inline size_t to_even(size_t n)
+{
+ return n + (n & 1);
+}
+
+static inline int entry_list_serial_size(const entry_list_t list)
+{
+ int size = offsetof(struct entry_apex, data);
+ for (int i = 0; i < EL_LENGTH; ++i) {
+ size += to_even(list[i].len);
+ }
+ return size;
+}
+
+/** Fill contents of an entry_apex.
+ *
+ * @note NULL pointers are overwritten - caller may like to fill the space later.
+ */
+void entry_list_memcpy(struct entry_apex *ea, entry_list_t list);
+
+
+
+/* Packet caching; implementation in ./entry_pkt.c */
+
+/** Stash the packet into cache (if suitable, etc.)
+ * \param has_optout whether the packet contains an opt-out NSEC3 */
+void stash_pkt(const knot_pkt_t *pkt, const struct kr_query *qry,
+ const struct kr_request *req, bool has_optout);
+
+/** Try answering from packet cache, given an entry_h.
+ *
+ * This assumes the TTL is OK and entry_h_consistent, but it may still return error.
+ * On success it handles all the rest, incl. qry->flags.
+ */
+int answer_from_pkt(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl);
+
+
+/** Record is expiring if its remaining TTL is below 1% of the original (plus a 5 s margin). */
+static inline bool is_expiring(uint32_t orig_ttl, uint32_t new_ttl)
+{
+ int64_t nttl = new_ttl; /* avoid potential over/under-flow */
+ return 100 * (nttl - 5) < orig_ttl;
+}
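+/* Illustration: with orig_ttl = 3600 the record counts as expiring once
+ * new_ttl <= 40, since 100 * (40 - 5) = 3500 < 3600, while 100 * (41 - 5) = 3600 is not. */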
+
+/** Returns a signed result so you can inspect how stale the RR is.
+ *
+ * @param owner name for stale-serving decisions. You may pass NULL to disable stale.
+ * @note: NSEC* uses zone name ATM; for NSEC3 the owner may not even be knowable.
+ * @param type for stale-serving.
+ */
+int32_t get_new_ttl(const struct entry_h *entry, const struct kr_query *qry,
+ const knot_dname_t *owner, uint16_t type, uint32_t now);
+
+
+/* RRset (de)materialization; implementation in ./entry_rr.c */
+
+/** Size of the RR count field */
+#define KR_CACHE_RR_COUNT_SIZE sizeof(uint16_t)
+
+/** Compute size of serialized rdataset. NULL is accepted as empty set. */
+static inline int rdataset_dematerialize_size(const knot_rdataset_t *rds)
+{
+ return KR_CACHE_RR_COUNT_SIZE + (rds == NULL ? 0 : knot_rdataset_size(rds));
+}
+
+static inline int rdataset_dematerialized_size(const uint8_t *data)
+{
+ knot_rdataset_t rds;
+ memcpy(&rds.count, data, sizeof(rds.count));
+ rds.rdata = (knot_rdata_t *)(data + sizeof(rds.count));
+ return sizeof(rds.count) + knot_rdataset_size(&rds);
+}
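+/* Serialized rdataset layout (sketch):
+ *   uint16_t rr_count;               // KR_CACHE_RR_COUNT_SIZE bytes
+ *   knot_rdata_t rdata[rr_count];    // verbatim copy, knot_rdataset_size() bytes
+ * An empty set is stored as just rr_count == 0. */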
+
+/** Serialize an rdataset. */
+int rdataset_dematerialize(const knot_rdataset_t *rds, uint8_t * restrict data);
+
+
+/** Partially constructed answer when gathering RRsets from cache. */
+struct answer {
+ int rcode; /**< PKT_NODATA, etc. */
+ struct nsec_p nsec_p; /**< Don't mix different NSEC* parameters in one answer. */
+ knot_mm_t *mm; /**< Allocator for rrsets */
+ struct answer_rrset {
+ ranked_rr_array_entry_t set; /**< set+rank for the main data */
+ knot_rdataset_t sig_rds; /**< RRSIG data, if any */
+ } rrsets[1+1+3]; /**< see AR_ANSWER and friends; only required records are filled */
+};
+enum {
+ AR_ANSWER = 0, /**< Positive answer record. It might be wildcard-expanded. */
+ AR_SOA, /**< SOA record. */
+ AR_NSEC, /**< NSEC* covering or matching the SNAME (next closer name in NSEC3 case). */
+ AR_WILD, /**< NSEC* covering or matching the source of synthesis. */
+ AR_CPE, /**< NSEC3 matching the closest provable encloser. */
+};
+
+/** Materialize RRset + RRSIGs into ans->rrsets[id].
+ * LATER(optim.): it's slightly wasteful that we allocate knot_rrset_t for the packet
+ *
+ * \return error code. They are all bad conditions and "guarded" by assert.
+ */
+int entry2answer(struct answer *ans, int id,
+ const struct entry_h *eh, const uint8_t *eh_bound,
+ const knot_dname_t *owner, uint16_t type, uint32_t new_ttl);
+
+
+/* Preparing knot_pkt_t for cache answer from RRs; implementation in ./knot_pkt.c */
+
+/** Prepare answer packet to be filled by RRs (without RR data in wire). */
+int pkt_renew(knot_pkt_t *pkt, const knot_dname_t *name, uint16_t type);
+
+/** Append RRset + its RRSIGs into the current section (*shallow* copy), with given rank.
+ * \note it works with empty set as well (skipped)
+ * \note pkt->wire is not updated in any way
+ * \note KNOT_CLASS_IN is assumed
+ */
+int pkt_append(knot_pkt_t *pkt, const struct answer_rrset *rrset, uint8_t rank);
+
+
+
+/* NSEC (1) stuff. Implementation in ./nsec1.c */
+
+/** Construct a string key for NSEC (1) predecessor-search.
+ * \param add_wildcard Act as if the name was extended by "*."
+ * \note k->zlf_len is assumed to have been correctly set */
+knot_db_val_t key_NSEC1(struct key *k, const knot_dname_t *name, bool add_wildcard);
+
+/** Closest encloser check for NSEC (1).
+ * To understand the interface, see the call point.
+ * \param k space to store key + input: zname and zlf_len
+ * \return 0: success; >0: try other (NSEC3); <0: exit cache immediately. */
+int nsec1_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ knot_db_val_t *cover_low_kwz, knot_db_val_t *cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+/** Source of synthesis (SS) check for NSEC (1).
+ * To understand the interface, see the call point.
+ * \return 0: continue; <0: exit cache immediately;
+ * AR_SOA: skip to adding SOA (SS was covered or matched for NODATA). */
+int nsec1_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ knot_db_val_t cover_low_kwz, knot_db_val_t cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+
+/* NSEC3 stuff. Implementation in ./nsec3.c */
+
+/** Construct a string key for NSEC3 predecessor-search, from an NSEC3 name.
+ * \note k->zlf_len is assumed to have been correctly set */
+knot_db_val_t key_NSEC3(struct key *k, const knot_dname_t *nsec3_name,
+ const nsec_p_hash_t nsec_p_hash);
+
+/** TODO. See nsec1_encloser(...) */
+int nsec3_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+/** TODO. See nsec1_src_synth(...) */
+int nsec3_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+
+
+#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "cach", ## __VA_ARGS__)
+
+/** Shorthand for operations on cache backend */
+#define cache_op(cache, op, ...) (cache)->api->op((cache)->db, ## __VA_ARGS__)
+
+
+static inline uint16_t get_uint16(const void *address)
+{
+ uint16_t tmp;
+ memcpy(&tmp, address, sizeof(tmp));
+ return tmp;
+}
+
+/** Useful pattern, especially as void-pointer arithmetic isn't standard-compliant. */
+static inline uint8_t * knot_db_val_bound(knot_db_val_t val)
+{
+ return (uint8_t *)val.data + val.len;
+}
+
diff --git a/lib/cache/knot_pkt.c b/lib/cache/knot_pkt.c
new file mode 100644
index 0000000..56836a6
--- /dev/null
+++ b/lib/cache/knot_pkt.c
@@ -0,0 +1,106 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of preparing knot_pkt_t for filling with RRs.
+ * Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+
+int pkt_renew(knot_pkt_t *pkt, const knot_dname_t *name, uint16_t type)
+{
+ /* Update packet question if needed. */
+ if (!knot_dname_is_equal(knot_pkt_qname(pkt), name)
+ || knot_pkt_qtype(pkt) != type || knot_pkt_qclass(pkt) != KNOT_CLASS_IN) {
+ int ret = kr_pkt_recycle(pkt);
+ if (ret) return kr_error(ret);
+ ret = knot_pkt_put_question(pkt, name, KNOT_CLASS_IN, type);
+ if (ret) return kr_error(ret);
+ }
+
+ pkt->parsed = pkt->size = PKT_SIZE_NOWIRE;
+ knot_wire_set_qr(pkt->wire);
+ knot_wire_set_aa(pkt->wire);
+ return kr_ok();
+}
+
+/** Reserve space for additional `count` RRsets.
+ * \note pkt->rr_info gets correct length but is always zeroed
+ */
+static int pkt_alloc_space(knot_pkt_t *pkt, int count)
+{
+ size_t allocd_orig = pkt->rrset_allocd;
+ if (pkt->rrset_count + count <= allocd_orig) {
+ return kr_ok();
+ }
+ /* A simple growth strategy, amortized O(count). */
+ pkt->rrset_allocd = MAX(
+ pkt->rrset_count + count,
+ pkt->rrset_count + allocd_orig);
+
+ pkt->rr = mm_realloc(&pkt->mm, pkt->rr,
+ sizeof(pkt->rr[0]) * pkt->rrset_allocd,
+ sizeof(pkt->rr[0]) * allocd_orig);
+ if (!pkt->rr) {
+ return kr_error(ENOMEM);
+ }
+ /* Allocate pkt->rr_info to be certain, but just leave it zeroed. */
+ mm_free(&pkt->mm, pkt->rr_info);
+ pkt->rr_info = mm_alloc(&pkt->mm, sizeof(pkt->rr_info[0]) * pkt->rrset_allocd);
+ if (!pkt->rr_info) {
+ return kr_error(ENOMEM);
+ }
+ memset(pkt->rr_info, 0, sizeof(pkt->rr_info[0]) * pkt->rrset_allocd);
+ return kr_ok();
+}
+
+int pkt_append(knot_pkt_t *pkt, const struct answer_rrset *rrset, uint8_t rank)
+{
+ /* allocate space, to be sure */
+ int rrset_cnt = (rrset->set.rr->rrs.count > 0) + (rrset->sig_rds.count > 0);
+ int ret = pkt_alloc_space(pkt, rrset_cnt);
+ if (ret) return kr_error(ret);
+ /* write both sets */
+ const knot_rdataset_t *rdss[2] = { &rrset->set.rr->rrs, &rrset->sig_rds };
+ for (int i = 0; i < rrset_cnt; ++i) {
+ assert(rdss[i]->count);
+ /* allocate rank */
+ uint8_t *rr_rank = mm_alloc(&pkt->mm, sizeof(*rr_rank));
+ if (!rr_rank) return kr_error(ENOMEM);
+ *rr_rank = (i == 0) ? rank : (KR_RANK_OMIT | KR_RANK_AUTH);
+ /* rank for RRSIGs isn't really useful: ^^ */
+ if (i == 0) {
+ pkt->rr[pkt->rrset_count] = *rrset->set.rr;
+ pkt->rr[pkt->rrset_count].additional = rr_rank;
+ } else {
+ /* append the RR array */
+ pkt->rr[pkt->rrset_count] = (knot_rrset_t){
+ .owner = knot_dname_copy(rrset->set.rr->owner, &pkt->mm),
+ /* ^^ well, another copy isn't really needed */
+ .ttl = rrset->set.rr->ttl,
+ .type = KNOT_RRTYPE_RRSIG,
+ .rclass = KNOT_CLASS_IN,
+ .rrs = *rdss[i],
+ .additional = rr_rank,
+ };
+ }
+ ++pkt->rrset_count;
+ ++(pkt->sections[pkt->current].count);
+ }
+ return kr_ok();
+}
+
diff --git a/lib/cache/nsec1.c b/lib/cache/nsec1.c
new file mode 100644
index 0000000..74b3d34
--- /dev/null
+++ b/lib/cache/nsec1.c
@@ -0,0 +1,511 @@
+/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of NSEC (1) handling. Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+#include "lib/dnssec/nsec.h"
+#include "lib/layer/iterate.h"
+
+
+/** Reconstruct a name into a buffer (assuming length at least KNOT_DNAME_MAXLEN).
+ * \return kr_ok() or error code (<0). */
+static int dname_wire_reconstruct(knot_dname_t *buf, const struct key *k,
+ knot_db_val_t kwz)
+{
+ /* Reconstruct from key: first the ending, then zone name. */
+ int ret = knot_dname_lf2wire(buf, kwz.len, kwz.data);
+ if (ret < 0) {
+ VERBOSE_MSG(NULL, "=> NSEC: LF2wire ret = %d\n", ret);
+ assert(false);
+ return ret;
+ }
+ /* The last written byte is the zero label for root -> overwrite. */
+ knot_dname_t *zone_start = buf + ret - 1;
+ assert(*zone_start == '\0');
+ ret = knot_dname_to_wire(zone_start, k->zname, KNOT_DNAME_MAXLEN - kwz.len);
+ if (ret != k->zlf_len + 1) {
+ assert(false);
+ return ret < 0 ? ret : kr_error(EILSEQ);
+ }
+ return kr_ok();
+}
+
+
+knot_db_val_t key_NSEC1(struct key *k, const knot_dname_t *name, bool add_wildcard)
+{
+	/* We basically need dname_lf with two bytes added
+	 * in the right place within the name (at the zone cut). */
+ int ret;
+ const bool ok = k && name
+ && !(ret = kr_dname_lf(k->buf, name, add_wildcard));
+ if (!ok) {
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ }
+
+ uint8_t *begin = k->buf + 1 + k->zlf_len; /* one byte after zone's zero */
+ uint8_t *end = k->buf + 1 + k->buf[0]; /* we don't use the final zero in key,
+ * but move it anyway */
+ if (end < begin) {
+ assert(false);
+ return (knot_db_val_t){ NULL, 0 };
+ }
+ int key_len;
+ if (end > begin) {
+ memmove(begin + 2, begin, end - begin);
+ key_len = k->buf[0] + 1;
+ } else {
+ key_len = k->buf[0] + 2;
+ }
+ /* CACHE_KEY_DEF: key == zone's dname_lf + '\0' + '1' + dname_lf
+ * of the name within the zone without the final 0. Iff the latter is empty,
+ * there's no zero to cut and thus the key_len difference.
+ */
+ begin[0] = 0;
+ begin[1] = '1'; /* tag for NSEC1 */
+ k->type = KNOT_RRTYPE_NSEC;
+
+ /*
+ VERBOSE_MSG(NULL, "<> key_NSEC1; name: ");
+ kr_dname_print(name, add_wildcard ? "*." : "" , " ");
+ kr_log_verbose("(zone name LF length: %d; total key length: %d)\n",
+ k->zlf_len, key_len);
+ */
+
+ return (knot_db_val_t){ k->buf + 1, key_len };
+}
+
+
+/** Assuming that k1 < k4, find where k2 is. (Considers DNS wrap-around.)
+ *
+ * \return Intuition: position of k2 among kX.
+ * 0: k2 < k1; 1: k1 == k2; 2: k1 is a prefix of k2 < k4;
+ * 3: k1 < k2 < k4 (and not 2); 4: k2 == k4; 5: k2 > k4
+ * \note k1.data may be NULL, meaning assumption that k1 < k2 and not a prefix
+ * (i.e. return code will be > 2)
+ */
+static int kwz_between(knot_db_val_t k1, knot_db_val_t k2, knot_db_val_t k4)
+{
+ assert(k2.data && k4.data);
+ /* CACHE_KEY_DEF; we need to beware of one key being a prefix of another */
+ int ret_maybe; /**< result, assuming we confirm k2 < k4 */
+ if (k1.data) {
+ const int cmp12 = memcmp(k1.data, k2.data, MIN(k1.len, k2.len));
+ if (cmp12 == 0 && k1.len == k2.len) /* iff k1 == k2 */
+ return 1;
+ if (cmp12 > 0 || (cmp12 == 0 && k1.len > k2.len)) /* iff k1 > k2 */
+ return 0;
+ ret_maybe = cmp12 == 0 ? 2 : 3;
+ } else {
+ ret_maybe = 3;
+ }
+ if (k4.len == 0) { /* wrap-around */
+ return k2.len > 0 ? ret_maybe : 4;
+ } else {
+ const int cmp24 = memcmp(k2.data, k4.data, MIN(k2.len, k4.len));
+ if (cmp24 == 0 && k2.len == k4.len) /* iff k2 == k4 */
+ return 4;
+ if (cmp24 > 0 || (cmp24 == 0 && k2.len > k4.len)) /* iff k2 > k4 */
+ return 5;
+ return ret_maybe;
+ }
+}
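+/* Illustration with plain byte-string keys (not from the sources), k1 = "a", k4 = "c":
+ *   k2 = "a"  -> 1 (equal to k1),
+ *   k2 = "ab" -> 2 (k1 is a prefix of k2),
+ *   k2 = "b"  -> 3 (strictly between),
+ *   k2 = "c"  -> 4 (equal to k4),
+ *   k2 = "d"  -> 5 (past k4). */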
+
+
+/** NSEC1 range search.
+ *
+ * \param key Pass output of key_NSEC1(k, ...)
+ * \param value[out] The raw data of the NSEC cache record (optional; consistency checked).
+ * \param exact_match[out] Whether the key was matched exactly or just covered (optional).
+ * \param kwz_low[out] Output the low end of covering NSEC, pointing within DB (optional).
+ * \param kwz_high[in,out] Storage for the high end of covering NSEC (optional).
+ * It's only set if !exact_match.
+ * \param new_ttl[out] New TTL of the NSEC (optional).
+ * \return Error message or NULL.
+ * \note The function itself does *no* bitmap checks, e.g. RFC 6840 sec. 4.
+ */
+static const char * find_leq_NSEC1(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_db_val_t key, const struct key *k, knot_db_val_t *value,
+ bool *exact_match, knot_db_val_t *kwz_low, knot_db_val_t *kwz_high,
+ uint32_t *new_ttl)
+{
+ /* Do the cache operation. */
+ const size_t nwz_off = key_nwz_off(k);
+ if (!key.data || key.len < nwz_off) {
+ assert(false);
+ return "range search ERROR";
+ }
+ knot_db_val_t key_nsec = key;
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read_leq, &key_nsec, &val);
+ if (ret < 0) {
+ if (ret == kr_error(ENOENT)) {
+ return "range search miss";
+ } else {
+ assert(false);
+ return "range search ERROR";
+ }
+ }
+ if (value) {
+ *value = val;
+ }
+ /* Check consistency, TTL, rank. */
+ const bool is_exact = (ret == 0);
+ if (exact_match) {
+ *exact_match = is_exact;
+ }
+ const struct entry_h *eh = entry_h_consistent_NSEC(val);
+ if (!eh) {
+ /* This might be just finding something else than NSEC1 entry,
+ * in case we searched before the very first one in the zone. */
+ return "range search found inconsistent entry";
+ }
+ /* Passing just zone name instead of owner, as we don't
+ * have it reconstructed at this point. */
+ int32_t new_ttl_ = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_NSEC,
+ qry->timestamp.tv_sec);
+ if (new_ttl_ < 0 || !kr_rank_test(eh->rank, KR_RANK_SECURE)) {
+ return "range search found stale or insecure entry";
+ /* TODO: remove the stale record *and* retry,
+ * in case we haven't run off. Perhaps start by in_zone check. */
+ }
+ if (new_ttl) {
+ *new_ttl = new_ttl_;
+ }
+ if (kwz_low) {
+ *kwz_low = (knot_db_val_t){
+ .data = (uint8_t *)key_nsec.data + nwz_off,
+ .len = key_nsec.len - nwz_off,
+ }; /* CACHE_KEY_DEF */
+ }
+ if (is_exact) {
+ /* Nothing else to do. */
+ return NULL;
+ }
+ /* The NSEC starts strictly before our target name;
+ * now check that it still belongs into that zone. */
+ const bool nsec_in_zone = key_nsec.len >= nwz_off
+ /* CACHE_KEY_DEF */
+ && memcmp(key.data, key_nsec.data, nwz_off) == 0;
+ if (!nsec_in_zone) {
+ return "range search miss (!nsec_in_zone)";
+ }
+ /* We know it starts before sname, so let's check the other end.
+ * 1. construct the key for the next name - kwz_hi. */
+ /* it's *full* name ATM */
+ const knot_rdata_t *next = (const knot_rdata_t *)
+ (eh->data + KR_CACHE_RR_COUNT_SIZE);
+ if (KR_CACHE_RR_COUNT_SIZE != 2 || get_uint16(eh->data) == 0) {
+ assert(false);
+ return "ERROR";
+ /* TODO: more checks? */
+ }
+ /*
+ WITH_VERBOSE {
+ VERBOSE_MSG(qry, "=> NSEC: next name: ");
+ kr_dname_print(next, "", "\n");
+ }
+ */
+ knot_dname_t ch_buf[KNOT_DNAME_MAXLEN];
+ knot_dname_t *chs = kwz_high ? kwz_high->data : ch_buf;
+ if (!chs) {
+ assert(false);
+ return "EINVAL";
+ }
+ {
+ /* Lower-case chs; see also RFC 6840 5.1.
+ * LATER(optim.): we do lots of copying etc. */
+ knot_dname_t lower_buf[KNOT_DNAME_MAXLEN];
+ ret = knot_dname_to_wire(lower_buf, next->data,
+ MIN(next->len, KNOT_DNAME_MAXLEN));
+ if (ret < 0) { /* _ESPACE */
+ return "range search found record with incorrect contents";
+ }
+ knot_dname_to_lower(lower_buf);
+ ret = kr_dname_lf(chs, lower_buf, false);
+ }
+ if (ret) {
+ assert(false);
+ return "ERROR";
+ }
+ knot_db_val_t kwz_hi = { /* skip the zone name */
+ .data = chs + 1 + k->zlf_len,
+ .len = chs[0] - k->zlf_len,
+ };
+ assert((ssize_t)(kwz_hi.len) >= 0);
+ /* 2. do the actual range check. */
+ const knot_db_val_t kwz_sname = {
+ .data = (void *)/*const-cast*/(k->buf + 1 + nwz_off),
+ .len = k->buf[0] - k->zlf_len,
+ };
+ assert((ssize_t)(kwz_sname.len) >= 0);
+ bool covers = /* we know for sure that the low end is before kwz_sname */
+ 3 == kwz_between((knot_db_val_t){ NULL, 0 }, kwz_sname, kwz_hi);
+ if (!covers) {
+ return "range search miss (!covers)";
+ }
+ if (kwz_high) {
+ *kwz_high = kwz_hi;
+ }
+ return NULL;
+}
+
+
+int nsec1_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ knot_db_val_t *cover_low_kwz, knot_db_val_t *cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ static const int ESKIP = ABS(ENOENT);
+ /* Basic sanity check. */
+ const bool ok = k && ans && clencl_labels && cover_low_kwz && cover_hi_kwz
+ && qry && cache;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ /* Find a previous-or-equal name+NSEC in cache covering the QNAME,
+ * checking TTL etc. */
+ knot_db_val_t key = key_NSEC1(k, qry->sname, false);
+ knot_db_val_t val = { NULL, 0 };
+ bool exact_match;
+ uint32_t new_ttl;
+ const char *err = find_leq_NSEC1(cache, qry, key, k, &val,
+ &exact_match, cover_low_kwz, cover_hi_kwz, &new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry, "=> NSEC sname: %s\n", err);
+ return ESKIP;
+ }
+
+ /* Get owner name of the record. */
+ const knot_dname_t *owner;
+ knot_dname_t owner_buf[KNOT_DNAME_MAXLEN];
+ if (exact_match) {
+ owner = qry->sname;
+ } else {
+ int ret = dname_wire_reconstruct(owner_buf, k, *cover_low_kwz);
+ if (unlikely(ret)) return ESKIP;
+ owner = owner_buf;
+ }
+ /* Basic checks OK -> materialize data. */
+ {
+ const struct entry_h *nsec_eh = val.data;
+ int ret = entry2answer(ans, AR_NSEC, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC, new_ttl);
+ if (ret) return kr_error(ret);
+ }
+
+ /* Final checks, split for matching vs. covering our sname. */
+ const knot_rrset_t *nsec_rr = ans->rrsets[AR_NSEC].set.rr;
+ const uint8_t *bm = knot_nsec_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec_bitmap_len(nsec_rr->rrs.rdata);
+ assert(bm);
+
+ if (exact_match) {
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC sname: match but failed type check\n");
+ return ESKIP;
+ }
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ VERBOSE_MSG(qry, "=> NSEC sname: match proved NODATA, new TTL %d\n",
+ new_ttl);
+ ans->rcode = PKT_NODATA;
+ return kr_ok();
+ } /* else */
+
+ /* Inexact match. First check if sname is delegated by that NSEC. */
+ const int nsec_matched = knot_dname_matched_labels(nsec_rr->owner, qry->sname);
+ const bool is_sub = nsec_matched == knot_dname_labels(nsec_rr->owner, NULL);
+ if (is_sub && kr_nsec_children_in_zone_check(bm, bm_size) != 0) {
+ VERBOSE_MSG(qry, "=> NSEC sname: covered but delegated (or error)\n");
+ return ESKIP;
+ }
+ /* NXDOMAIN proven *except* for wildcards. */
+ WITH_VERBOSE(qry) {
+ auto_free char *owner_str = kr_dname_text(nsec_rr->owner),
+ *next_str = kr_dname_text(knot_nsec_next(nsec_rr->rrs.rdata));
+ VERBOSE_MSG(qry, "=> NSEC sname: covered by: %s -> %s, new TTL %d\n",
+ owner_str, next_str, new_ttl);
+ }
+
+ /* Find label count of the closest encloser.
+ * Both endpoints in an NSEC do exist (though possibly in a child zone)
+ * and any prefixes of those names as well (empty non-terminals),
+ * but nothing else exists inside this "triangle".
+ *
+ * Note that we have to lower-case the next name for comparison,
+ * even though we have canonicalized NSEC already; see RFC 6840 5.1.
+ * LATER(optim.): it might be faster to use the LFs we already have.
+ */
+ knot_dname_t next[KNOT_DNAME_MAXLEN];
+ int ret = knot_dname_to_wire(next, knot_nsec_next(nsec_rr->rrs.rdata), sizeof(next));
+ if (ret < 0) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ knot_dname_to_lower(next);
+ *clencl_labels = MAX(
+ nsec_matched,
+ knot_dname_matched_labels(qry->sname, next)
+ );
+
+ /* Empty non-terminals don't need to have
+ * a matching NSEC record. */
+ if (sname_labels == *clencl_labels) {
+ ans->rcode = PKT_NODATA;
+ VERBOSE_MSG(qry,
+ "=> NSEC sname: empty non-terminal by the same RR\n");
+ } else {
+ ans->rcode = PKT_NXDOMAIN;
+ }
+ return kr_ok();
+}
+
+/** Verify non-existence after kwz_between() call. */
+static bool nonexistence_ok(int cmp, const knot_rrset_t *rrs)
+{
+ if (cmp == 3) {
+ return true;
+ }
+ if (cmp != 2) {
+ return false;
+ }
+ const uint8_t *bm = knot_nsec_bitmap(rrs->rrs.rdata);
+ uint16_t bm_size = knot_nsec_bitmap_len(rrs->rrs.rdata);
+ return kr_nsec_children_in_zone_check(bm, bm_size) != 0;
+}
+
+int nsec1_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ knot_db_val_t cover_low_kwz, knot_db_val_t cover_hi_kwz,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ /* Construct key for the source of synthesis. */
+ knot_db_val_t key = key_NSEC1(k, clencl_name, true);
+ const size_t nwz_off = key_nwz_off(k);
+ if (!key.data || key.len < nwz_off) {
+ assert(false);
+ return kr_error(1);
+ }
+ /* Check if our sname-covering NSEC also covers/matches SS. */
+ knot_db_val_t kwz = {
+ .data = (uint8_t *)key.data + nwz_off,
+ .len = key.len - nwz_off,
+ };
+ assert((ssize_t)(kwz.len) >= 0);
+ const int cmp = kwz_between(cover_low_kwz, kwz, cover_hi_kwz);
+ if (nonexistence_ok(cmp, ans->rrsets[AR_NSEC].set.rr)) {
+ VERBOSE_MSG(qry, "=> NSEC wildcard: covered by the same RR\n");
+ return AR_SOA;
+ }
+ const knot_rrset_t *nsec_rr = NULL; /**< the wildcard proof NSEC */
+ bool exact_match; /**< whether it matches the source of synthesis */
+ if (cmp == 1) {
+ exact_match = true;
+ nsec_rr = ans->rrsets[AR_NSEC].set.rr;
+ } else {
+ /* Try to find the NSEC for SS. */
+ knot_db_val_t val = { NULL, 0 };
+ knot_db_val_t wild_low_kwz = { NULL, 0 };
+ uint32_t new_ttl;
+ const char *err = find_leq_NSEC1(cache, qry, key, k, &val,
+ &exact_match, &wild_low_kwz, NULL, &new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry, "=> NSEC wildcard: %s\n", err);
+ return kr_ok();
+ }
+ /* Materialize the record into answer (speculatively). */
+ knot_dname_t owner[KNOT_DNAME_MAXLEN];
+ int ret = dname_wire_reconstruct(owner, k, wild_low_kwz);
+ if (ret) return kr_error(ret);
+ const struct entry_h *nsec_eh = val.data;
+ ret = entry2answer(ans, AR_WILD, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC, new_ttl);
+ if (ret) return kr_error(ret);
+ nsec_rr = ans->rrsets[AR_WILD].set.rr;
+ }
+
+ assert(nsec_rr);
+ const uint32_t new_ttl_log =
+ kr_verbose_status ? nsec_rr->ttl : -1;
+ const uint8_t *bm = knot_nsec_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec_bitmap_len(nsec_rr->rrs.rdata);
+ int ret;
+ struct answer_rrset * const arw = &ans->rrsets[AR_WILD];
+ if (!bm) {
+ assert(false);
+ ret = kr_error(1);
+ goto clean_wild;
+ }
+ if (!exact_match) {
+ /* Finish verification that the source of synthesis doesn't exist. */
+ const int nsec_matched =
+ knot_dname_matched_labels(nsec_rr->owner, clencl_name);
+ /* we don't need to use the full source of synthesis ^ */
+ const bool is_sub =
+ nsec_matched == knot_dname_labels(nsec_rr->owner, NULL);
+ if (is_sub && kr_nsec_children_in_zone_check(bm, bm_size) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC wildcard: covered but delegated (or error)\n");
+ ret = kr_ok();
+ goto clean_wild;
+ }
+ /* We have a record proving wildcard non-existence. */
+ WITH_VERBOSE(qry) {
+ auto_free char *owner_str = kr_dname_text(nsec_rr->owner),
+ *next_str = kr_dname_text(knot_nsec_next(nsec_rr->rrs.rdata));
+ VERBOSE_MSG(qry, "=> NSEC wildcard: covered by: %s -> %s, new TTL %d\n",
+ owner_str, next_str, new_ttl_log);
+ }
+ return AR_SOA;
+ }
+
+ /* The wildcard exists. Find if it's NODATA - check type bitmap. */
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) == 0) {
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ WITH_VERBOSE(qry) {
+ const char *msg_start = "=> NSEC wildcard: match proved NODATA";
+ if (arw->set.rr) {
+ auto_free char *owner_str = kr_dname_text(nsec_rr->owner);
+ VERBOSE_MSG(qry, "%s: %s, new TTL %d\n",
+ msg_start, owner_str, new_ttl_log);
+ } else {
+ /* don't repeat the RR if it's the same */
+ VERBOSE_MSG(qry, "%s, by the same RR\n", msg_start);
+ }
+ }
+ ans->rcode = PKT_NODATA;
+ return AR_SOA;
+
+ } /* else */
+ /* The data probably exists -> don't add this NSEC
+ * and (later) try to find the real wildcard data */
+ VERBOSE_MSG(qry, "=> NSEC wildcard: should exist (or error)\n");
+ ans->rcode = PKT_NOERROR;
+ ret = kr_ok();
+clean_wild:
+ if (arw->set.rr) { /* we may have matched AR_NSEC */
+ knot_rrset_free(arw->set.rr, ans->mm);
+ arw->set.rr = NULL;
+ knot_rdataset_clear(&arw->sig_rds, ans->mm);
+ }
+ return ret;
+}
+
diff --git a/lib/cache/nsec3.c b/lib/cache/nsec3.c
new file mode 100644
index 0000000..d2ae72a
--- /dev/null
+++ b/lib/cache/nsec3.c
@@ -0,0 +1,501 @@
+/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of NSEC3 handling. Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+
+#include "contrib/base32hex.h"
+#include "lib/dnssec/nsec.h"
+#include "lib/layer/iterate.h"
+
+#include <libknot/rrtype/nsec3.h>
+
+static const knot_db_val_t VAL_EMPTY = { NULL, 0 };
+
+/** Common part: write all but the NSEC3 hash. */
+static knot_db_val_t key_NSEC3_common(struct key *k, const knot_dname_t *zname,
+ const nsec_p_hash_t nsec_p_hash)
+{
+ int ret;
+ const bool ok = k && zname
+ && !(ret = kr_dname_lf(k->buf, zname, false));
+ if (!ok) {
+ assert(false);
+ return VAL_EMPTY;
+ }
+
+ /* CACHE_KEY_DEF: key == zone's dname_lf + '\0' + '3' + nsec_p hash (4B)
+ * + NSEC3 hash (20B == NSEC3_HASH_LEN binary!)
+ * LATER(optim.) nsec_p hash: perhaps 2B would give a sufficient probability
+ * of avoiding collisions.
+ */
+ uint8_t *begin = k->buf + 1 + k->zlf_len; /* one byte after zone's zero */
+ begin[0] = 0;
+ begin[1] = '3'; /* tag for NSEC3 */
+ k->type = KNOT_RRTYPE_NSEC3;
+ memcpy(begin + 2, &nsec_p_hash, sizeof(nsec_p_hash));
+ return (knot_db_val_t){
+ .data = k->buf + 1,
+ .len = begin + 2 + sizeof(nsec_p_hash) - (k->buf + 1),
+ };
+}
+
+knot_db_val_t key_NSEC3(struct key *k, const knot_dname_t *nsec3_name,
+ const nsec_p_hash_t nsec_p_hash)
+{
+ knot_db_val_t val = key_NSEC3_common(k, nsec3_name /*only zname required*/,
+ nsec_p_hash);
+ if (!val.data) return val;
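+	/* The first label of an NSEC3 owner is the base32hex-encoded hash;
+	 * decode it and append the raw hash bytes to the key. */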
+ int len = base32hex_decode(nsec3_name + 1, nsec3_name[0],
+ knot_db_val_bound(val), KR_CACHE_KEY_MAXLEN - val.len);
+ if (len != NSEC3_HASH_LEN) {
+ return VAL_EMPTY;
+ }
+ val.len += len;
+ return val;
+}
+
+/** Construct a string key for NSEC3 predecessor-search, from a non-NSEC3 name.
+ * \note k->zlf_len and k->zname are assumed to have been correctly set */
+static knot_db_val_t key_NSEC3_name(struct key *k, const knot_dname_t *name,
+ const bool add_wildcard, const struct nsec_p *nsec_p)
+{
+ bool ok = k && name && nsec_p && nsec_p->raw;
+ if (!ok) return VAL_EMPTY;
+ knot_db_val_t val = key_NSEC3_common(k, k->zname, nsec_p->hash);
+ if (!val.data) return val;
+
+ /* Make `name` point to correctly wildcarded owner name. */
+ uint8_t buf[KNOT_DNAME_MAXLEN];
+ int name_len;
+ if (add_wildcard) {
+ buf[0] = '\1';
+ buf[1] = '*';
+ name_len = knot_dname_to_wire(buf + 2, name, sizeof(buf) - 2);
+ if (name_len < 0) return VAL_EMPTY; /* wants wildcard but doesn't fit */
+ name = buf;
+ name_len += 2;
+ } else {
+ name_len = knot_dname_size(name);
+ }
+ /* Append the NSEC3 hash. */
+ const dnssec_binary_t dname = {
+ .size = name_len,
+ .data = (uint8_t *)/*const-cast*/name,
+ };
+
+ #if 0 // LATER(optim.): this requires a patched libdnssec - tries to realloc()
+ dnssec_binary_t hash = {
+ .size = KR_CACHE_KEY_MAXLEN - val.len,
+ .data = val.data + val.len,
+ };
+ int ret = dnssec_nsec3_hash(&dname, &nsec_p->libknot, &hash);
+ if (ret != DNSSEC_EOK) return VAL_EMPTY;
+ assert(hash.size == NSEC3_HASH_LEN);
+
+ #else
+ dnssec_binary_t hash = { .size = 0, .data = NULL };
+ int ret = dnssec_nsec3_hash(&dname, &nsec_p->libknot, &hash);
+ if (ret != DNSSEC_EOK) return VAL_EMPTY;
+ if (hash.size != NSEC3_HASH_LEN || !hash.data) {
+ assert(false);
+ return VAL_EMPTY;
+ }
+ memcpy(knot_db_val_bound(val), hash.data, NSEC3_HASH_LEN);
+ free(hash.data);
+ #endif
+
+ val.len += hash.size;
+ return val;
+}
+
+/** Return h1 < h2, semantically on NSEC3 hashes. */
+static inline bool nsec3_hash_ordered(const uint8_t *h1, const uint8_t *h2)
+{
+ return memcmp(h1, h2, NSEC3_HASH_LEN) < 0;
+}
+
+/** NSEC3 range search.
+ *
+ * \param key Pass output of key_NSEC3(k, ...)
+ * \param nsec_p Restrict to this NSEC3 parameter-set.
+ * \param value[out] The raw data of the NSEC3 cache record (optional; consistency checked).
+ * \param exact_match[out] Whether the key was matched exactly or just covered (optional).
+ * \param hash_low[out] Output the low end hash of covering NSEC3, pointing within DB (optional).
+ * \param new_ttl[out] New TTL of the NSEC3 (optional).
+ * \return Error message or NULL.
+ * \note The function itself does *no* bitmap checks, e.g. RFC 6840 sec. 4.
+ */
+static const char * find_leq_NSEC3(struct kr_cache *cache, const struct kr_query *qry,
+ const knot_db_val_t key, const struct key *k, const struct nsec_p *nsec_p,
+ knot_db_val_t *value, bool *exact_match, const uint8_t **hash_low,
+ uint32_t *new_ttl)
+{
+ /* Do the cache operation. */
+ const size_t hash_off = key_nsec3_hash_off(k);
+ if (!key.data || key.len < hash_off) {
+ assert(false);
+ return "range search ERROR";
+ }
+ knot_db_val_t key_found = key;
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read_leq, &key_found, &val);
+ /* ^^ LATER(optim.): incrementing key and doing less-than search
+ * would probably be slightly more efficient with LMDB,
+ * but the code complexity would grow considerably. */
+ if (ret < 0) {
+ if (ret == kr_error(ENOENT)) {
+ return "range search miss";
+ } else {
+ assert(false);
+ return "range search ERROR";
+ }
+ }
+ if (value) {
+ *value = val;
+ }
+ /* Check consistency, TTL, rank. */
+ const bool is_exact = (ret == 0);
+ if (exact_match) {
+ *exact_match = is_exact;
+ }
+ const struct entry_h *eh = entry_h_consistent_NSEC(val);
+ if (!eh) {
+		/* This might just mean we found something other than an NSEC3 entry,
+		 * in case we searched before the very first one in the zone. */
+ return "range search found inconsistent entry";
+ }
+ /* Passing just zone name instead of owner. */
+ int32_t new_ttl_ = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_NSEC3,
+ qry->timestamp.tv_sec);
+ if (new_ttl_ < 0 || !kr_rank_test(eh->rank, KR_RANK_SECURE)) {
+ return "range search found stale or insecure entry";
+ /* TODO: remove the stale record *and* retry,
+ * in case we haven't run off. Perhaps start by in_zone check. */
+ }
+ if (new_ttl) {
+ *new_ttl = new_ttl_;
+ }
+ if (hash_low) {
+ *hash_low = (uint8_t *)key_found.data + hash_off;
+ }
+ if (is_exact) {
+ /* Nothing else to do. */
+ return NULL;
+ }
+ /* The NSEC3 starts strictly before our target name;
+ * now check that it still belongs into that zone and chain. */
+ const uint8_t *nsec_p_raw = eh->data + KR_CACHE_RR_COUNT_SIZE
+ + 2 /* RDLENGTH from rfc1034 */;
+ const int nsec_p_len = nsec_p_rdlen(nsec_p_raw);
+ const bool same_chain = key_found.len == hash_off + NSEC3_HASH_LEN
+ /* CACHE_KEY_DEF */
+ && memcmp(key.data, key_found.data, hash_off) == 0
+ /* exact comparison of NSEC3 parameters */
+ && nsec_p_len == nsec_p_rdlen(nsec_p->raw)
+ && memcmp(nsec_p_raw, nsec_p->raw, nsec_p_len) == 0;
+ if (!same_chain) {
+ return "range search miss (!same_chain)";
+ }
+ /* We know it starts before sname, so let's check the other end.
+ * A. find the next hash and check its length. */
+ if (KR_CACHE_RR_COUNT_SIZE != 2 || get_uint16(eh->data) == 0) {
+ assert(false);
+ return "ERROR";
+ /* TODO: more checks? Also, `next` computation is kinda messy. */
+ }
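+	/* NSEC3 RDATA layout (RFC 5155): hash algorithm (1B), flags (1B),
+	 * iterations (2B), salt length (1B), salt, hash length (1B),
+	 * next hashed owner name, type bitmap; nsec_p_len covers the
+	 * parameter fields up to and including the salt. */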
+ const uint8_t *hash_next = nsec_p_raw + nsec_p_len
+ + sizeof(uint8_t) /* hash length from rfc5155 */;
+ if (hash_next[-1] != NSEC3_HASH_LEN) {
+ return "unexpected next hash length";
+ }
+ /* B. do the actual range check. */
+ const uint8_t * const hash_searched = (uint8_t *)key.data + hash_off;
+ bool covers = /* we know for sure that the low end is before the searched name */
+ nsec3_hash_ordered(hash_searched, hash_next)
+ /* and the wrap-around case */
+ || nsec3_hash_ordered(hash_next, (const uint8_t *)key_found.data + hash_off);
+ if (!covers) {
+ return "range search miss (!covers)";
+ }
+ return NULL;
+}
+
+/** Extract textual representation of NSEC3 hash from a cache key.
+ * \param text must have length at least NSEC3_HASH_TXT_LEN+1 (will get 0-terminated). */
+static void key_NSEC3_hash2text(const knot_db_val_t key, char *text)
+{
+ assert(key.data && key.len > NSEC3_HASH_LEN);
+ const uint8_t *hash_raw = knot_db_val_bound(key) - NSEC3_HASH_LEN;
+ /* CACHE_KEY_DEF ^^ */
+ int len = base32hex_encode(hash_raw, NSEC3_HASH_LEN, (uint8_t *)text,
+ NSEC3_HASH_TXT_LEN);
+ assert(len == NSEC3_HASH_TXT_LEN); (void)len;
+ text[NSEC3_HASH_TXT_LEN] = '\0';
+}
+
+/** Reconstruct a name into a buffer (assumed to be at least KNOT_DNAME_MAXLEN long).
+ * \return kr_ok() or error code (<0). */
+static int dname_wire_reconstruct(knot_dname_t *buf, const knot_dname_t *zname,
+ const uint8_t *hash_raw)
+{
+ int len = base32hex_encode(hash_raw, NSEC3_HASH_LEN, buf + 1, NSEC3_HASH_TXT_LEN);
+ if (len != NSEC3_HASH_TXT_LEN) {
+ assert(false);
+ return kr_error(EINVAL);
+ }
+ buf[0] = len;
+ int ret = knot_dname_to_wire(buf + 1 + len, zname, KNOT_DNAME_MAXLEN - 1 - len);
+ return ret < 0 ? kr_error(ret) : kr_ok();
+}
+
+static void nsec3_hash2text(const knot_dname_t *owner, char *text)
+{
+ assert(owner[0] == NSEC3_HASH_TXT_LEN);
+ memcpy(text, owner + 1, MIN(owner[0], NSEC3_HASH_TXT_LEN));
+ text[NSEC3_HASH_TXT_LEN] = '\0';
+}
+
+int nsec3_encloser(struct key *k, struct answer *ans,
+ const int sname_labels, int *clencl_labels,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ static const int ESKIP = ABS(ENOENT);
+ /* Basic sanity check. */
+ const bool ok = k && k->zname && ans && clencl_labels
+ && qry && cache;
+ if (!ok) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ /*** Find the closest encloser - cycle: name starting at sname,
+ * proceeding while longer than zname, shortening by one label on step.
+ * We need a pair where a name doesn't exist *and* its parent does. */
+ /* LATER(optim.): perhaps iterate in the other order - that
+ * should help significantly against deep queries where we have
+ * a shallow proof in the cache. We can also optimize by using
+ * only exact search unless we had a match in the previous iteration. */
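+	/* E.g. for sname a.b.example. in zone example., the loop hashes and
+	 * looks up a.b.example., then b.example., and finally example. itself. */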
+ const int zname_labels = knot_dname_labels(k->zname, NULL);
+ int last_nxproven_labels = -1;
+ const knot_dname_t *name = qry->sname;
+ for (int name_labels = sname_labels; name_labels >= zname_labels;
+ --name_labels, name += 1 + name[0]) {
+ /* Find a previous-or-equal NSEC3 in cache covering the name,
+ * checking TTL etc. */
+ const knot_db_val_t key = key_NSEC3_name(k, name, false, &ans->nsec_p);
+ if (!key.data) continue;
+ WITH_VERBOSE(qry) {
+ char hash_txt[NSEC3_HASH_TXT_LEN + 1];
+ key_NSEC3_hash2text(key, hash_txt);
+ VERBOSE_MSG(qry, "=> NSEC3 depth %d: hash %s\n",
+ name_labels - zname_labels, hash_txt);
+ }
+ knot_db_val_t val = { NULL, 0 };
+ bool exact_match;
+ uint32_t new_ttl;
+ const uint8_t *hash_low;
+ const char *err = find_leq_NSEC3(cache, qry, key, k, &ans->nsec_p, &val,
+ &exact_match, &hash_low, &new_ttl);
+ if (err) {
+ WITH_VERBOSE(qry) {
+ auto_free char *name_str = kr_dname_text(name);
+ VERBOSE_MSG(qry, "=> NSEC3 encloser error for %s: %s\n",
+ name_str, err);
+ }
+ continue;
+ }
+ if (exact_match && name_labels != sname_labels
+ && name_labels + 1 != last_nxproven_labels) {
+ /* This name exists (checked rank and TTL), and it's
+ * neither of the two interesting cases, so we do not
+ * keep searching for non-existence above this name. */
+ VERBOSE_MSG(qry,
+ "=> NSEC3 encloser: only found existence of an ancestor\n");
+ return ESKIP;
+ }
+ /* Optimization: avoid the rest of the last iteration if pointless. */
+ if (!exact_match && name_labels == zname_labels
+ && last_nxproven_labels != name_labels + 1) {
+ break;
+ }
+
+ /* Basic checks OK -> materialize data, cleaning any previous
+ * records on that answer index (unsuccessful attempts). */
+ knot_dname_t owner[KNOT_DNAME_MAXLEN];
+ {
+ int ret = dname_wire_reconstruct(owner, k->zname, hash_low);
+ if (unlikely(ret)) continue;
+ }
+ const int ans_id = (exact_match && name_labels + 1 == last_nxproven_labels)
+ ? AR_CPE : AR_NSEC;
+ {
+ const struct entry_h *nsec_eh = val.data;
+ memset(&ans->rrsets[ans_id], 0, sizeof(ans->rrsets[ans_id]));
+ int ret = entry2answer(ans, ans_id, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC3, new_ttl);
+ if (ret) return kr_error(ret);
+ }
+
+ if (!exact_match) {
+ /* Non-existence proven, but we don't know if `name`
+ * is the next closer name.
+ * Note: we don't need to check for the sname being
+ * delegated away by this record, as with NSEC3 only
+ * *exact* match on an ancestor could do that. */
+ last_nxproven_labels = name_labels;
+ WITH_VERBOSE(qry) {
+ char hash_low_txt[NSEC3_HASH_TXT_LEN + 1];
+ nsec3_hash2text(owner, hash_low_txt);
+ VERBOSE_MSG(qry,
+ "=> NSEC3 depth %d: covered by %s -> TODO, new TTL %d\n",
+ name_labels - zname_labels, hash_low_txt, new_ttl);
+ }
+ continue;
+ }
+
+ /* Exactly matched NSEC3: two cases, one after another. */
+ const knot_rrset_t *nsec_rr = ans->rrsets[ans_id].set.rr;
+ const uint8_t *bm = knot_nsec3_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec3_bitmap_len(nsec_rr->rrs.rdata);
+ assert(bm);
+ if (name_labels == sname_labels) {
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype,
+ nsec_rr->owner) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC3 sname: match but failed type check\n");
+ return ESKIP;
+ }
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ VERBOSE_MSG(qry,
+ "=> NSEC3 sname: match proved NODATA, new TTL %d\n",
+ new_ttl);
+ ans->rcode = PKT_NODATA;
+ return kr_ok();
+
+ } /* else */
+
+ assert(name_labels + 1 == last_nxproven_labels);
+ if (kr_nsec_children_in_zone_check(bm, bm_size) != 0) {
+ VERBOSE_MSG(qry,
+ "=> NSEC3 encloser: found but delegated (or error)\n");
+ return ESKIP;
+ }
+ /* NXDOMAIN proven *except* for wildcards. */
+ WITH_VERBOSE(qry) {
+ auto_free char *name_str = kr_dname_text(name);
+ VERBOSE_MSG(qry,
+ "=> NSEC3 encloser: confirmed as %s, new TTL %d\n",
+ name_str, new_ttl);
+ }
+ *clencl_labels = name_labels;
+ ans->rcode = PKT_NXDOMAIN;
+		/* Avoid a repeated NSEC3 - remove one copy if the hashes match.
+ * This is very unlikely in larger zones: 1/size (per attempt).
+ * Well, deduplication would happen anyway when the answer
+ * from cache is read by kresd (internally). */
+ if (unlikely(0 == memcmp(ans->rrsets[AR_NSEC].set.rr->owner + 1,
+ ans->rrsets[AR_CPE ].set.rr->owner + 1,
+ NSEC3_HASH_LEN))) {
+ memset(&ans->rrsets[AR_CPE], 0, sizeof(ans->rrsets[AR_CPE]));
+ /* LATER(optim.): perhaps check this earlier and avoid some work? */
+ }
+ return kr_ok();
+ }
+
+	/* We've run out of options. */
+ if (last_nxproven_labels > 0) {
+ /* We didn't manage to prove existence of the closest encloser,
+ * meaning the only chance left is a *positive* wildcard record. */
+ *clencl_labels = last_nxproven_labels - 1;
+ ans->rcode = PKT_NXDOMAIN;
+ /* FIXME: review */
+ }
+ return ESKIP;
+}
+
+int nsec3_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ /* Find a previous-or-equal NSEC3 in cache covering or matching
+ * the source of synthesis, checking TTL etc. */
+ const knot_db_val_t key = key_NSEC3_name(k, clencl_name, true, &ans->nsec_p);
+ if (!key.data) return kr_error(1);
+ WITH_VERBOSE(qry) {
+ char hash_txt[NSEC3_HASH_TXT_LEN + 1];
+ key_NSEC3_hash2text(key, hash_txt);
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: hash %s\n", hash_txt);
+ }
+ knot_db_val_t val = { NULL, 0 };
+ bool exact_match;
+ uint32_t new_ttl;
+ const uint8_t *hash_low;
+ const char *err = find_leq_NSEC3(cache, qry, key, k, &ans->nsec_p, &val,
+ &exact_match, &hash_low, &new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: %s\n", err);
+ return kr_ok();
+ }
+
+ /* LATER(optim.): avoid duplicities in answer. */
+
+ /* Basic checks OK -> materialize the data (speculatively). */
+ knot_dname_t owner[KNOT_DNAME_MAXLEN];
+ {
+ int ret = dname_wire_reconstruct(owner, k->zname, hash_low);
+ if (unlikely(ret)) return kr_ok();
+ const struct entry_h *nsec_eh = val.data;
+ ret = entry2answer(ans, AR_WILD, nsec_eh, knot_db_val_bound(val),
+ owner, KNOT_RRTYPE_NSEC3, new_ttl);
+ if (ret) return kr_error(ret);
+ }
+ const knot_rrset_t *nsec_rr = ans->rrsets[AR_WILD].set.rr;
+
+ if (!exact_match) {
+ /* The record proves wildcard non-existence. */
+ WITH_VERBOSE(qry) {
+ char hash_low_txt[NSEC3_HASH_TXT_LEN + 1];
+ nsec3_hash2text(owner, hash_low_txt);
+ VERBOSE_MSG(qry,
+ "=> NSEC3 wildcard: covered by %s -> TODO, new TTL %d\n",
+ hash_low_txt, new_ttl);
+ }
+ return AR_SOA;
+ }
+
+ /* The wildcard exists. Find if it's NODATA - check type bitmap. */
+ const uint8_t *bm = knot_nsec3_bitmap(nsec_rr->rrs.rdata);
+ uint16_t bm_size = knot_nsec3_bitmap_len(nsec_rr->rrs.rdata);
+ assert(bm);
+ if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) == 0) {
+ /* NODATA proven; just need to add SOA+RRSIG later */
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: match proved NODATA, new TTL %d\n",
+ new_ttl);
+ ans->rcode = PKT_NODATA;
+ return AR_SOA;
+
+ } /* else */
+ /* The data probably exists -> don't add this NSEC3
+ * and (later) try to find the real wildcard data */
+ VERBOSE_MSG(qry, "=> NSEC3 wildcard: should exist (or error)\n");
+ ans->rcode = PKT_NOERROR;
+ memset(&ans->rrsets[AR_WILD], 0, sizeof(ans->rrsets[AR_WILD]));
+ return kr_ok();
+}
+
diff --git a/lib/cache/peek.c b/lib/cache/peek.c
new file mode 100644
index 0000000..1af1627
--- /dev/null
+++ b/lib/cache/peek.c
@@ -0,0 +1,724 @@
+/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "lib/cache/impl.h"
+
+#include "lib/dnssec/ta.h"
+#include "lib/layer/iterate.h"
+
+/* The whole file only exports peek_nosync().
+ * Forward declarations for larger chunks of code: */
+
+static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val,
+ uint8_t lowest_rank);
+static int closest_NS(struct kr_cache *cache, struct key *k, entry_list_t el,
+ struct kr_query *qry, bool only_NS, bool is_DS);
+static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl);
+static int try_wild(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ uint16_t type, uint8_t lowest_rank,
+ const struct kr_query *qry, struct kr_cache *cache);
+
+static int peek_encloser(
+ struct key *k, struct answer *ans, int sname_labels,
+ uint8_t lowest_rank, const struct kr_query *qry, struct kr_cache *cache);
+
+
+static int nsec_p_init(struct nsec_p *nsec_p, knot_db_val_t nsec_p_entry, bool with_knot)
+{
+ const size_t stamp_len = sizeof(uint32_t);
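+	/* Entry layout: 4B timestamp, then the raw NSEC3 parameters;
+	 * an entry with nothing after the timestamp means plain NSEC. */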
+ if (nsec_p_entry.len <= stamp_len) { /* plain NSEC if equal */
+ nsec_p->raw = NULL;
+ nsec_p->hash = 0;
+ return kr_ok();
+ }
+ nsec_p->raw = (uint8_t *)nsec_p_entry.data + stamp_len;
+ nsec_p->hash = nsec_p_mkHash(nsec_p->raw);
+ if (!with_knot) return kr_ok();
+ /* Convert NSEC3 params to another format. */
+ const dnssec_binary_t rdata = {
+ .size = nsec_p_rdlen(nsec_p->raw),
+ .data = (uint8_t *)/*const-cast*/nsec_p->raw,
+ };
+ int ret = dnssec_nsec3_params_from_rdata(&nsec_p->libknot, &rdata);
+ return ret == DNSSEC_EOK ? kr_ok() : kr_error(ret);
+}
+
+static void nsec_p_cleanup(struct nsec_p *nsec_p)
+{
+ dnssec_binary_free(&nsec_p->libknot.salt);
+ /* We don't really need to clear it, but it's not large. (`salt` zeroed above) */
+ memset(nsec_p, 0, sizeof(*nsec_p));
+}
+
+/** Compute new TTL for nsec_p entry, using SOA serial arith.
+ * \param new_ttl (optionally) write the new TTL (even if negative)
+ * \return error code, e.g. kr_error(ESTALE) */
+static int nsec_p_ttl(knot_db_val_t entry, const uint32_t timestamp, int32_t *new_ttl)
+{
+ if (!entry.data) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ uint32_t stamp;
+ if (!entry.len) {
+ return kr_error(ENOENT);
+ }
+ if (entry.len < sizeof(stamp)) {
+ assert(!EILSEQ);
+ return kr_error(EILSEQ);
+ }
+ memcpy(&stamp, entry.data, sizeof(stamp));
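+	/* Serial-number-style arithmetic: the wrapping uint32_t difference
+	 * is reinterpreted as a signed remaining TTL. */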
+ int32_t newttl = stamp - timestamp;
+ if (new_ttl) *new_ttl = newttl;
+ return newttl < 0 ? kr_error(ESTALE) : kr_ok();
+}
+
+static uint8_t get_lowest_rank(const struct kr_request *req, const struct kr_query *qry)
+{
+ /* TODO: move rank handling into the iterator (DNSSEC_* flags)? */
+ const bool allow_unverified =
+ knot_wire_get_cd(req->qsource.packet->wire) || qry->flags.STUB;
+ /* in stub mode we don't trust RRs anyway ^^ */
+ if (qry->flags.NONAUTH) {
+ return KR_RANK_INITIAL;
+ /* Note: there's little sense in validation status for non-auth records.
+		 * When using NONAUTH to get NS IPs, knowing that you ask the correct
+		 * IP doesn't matter much for security; what matters is whether you can
+ * validate the answers from the NS.
+ */
+ } else if (!allow_unverified) {
+ /* Records not present under any TA don't have their security
+ * verified at all, so we also accept low ranks in that case. */
+ const bool ta_covers = kr_ta_covers_qry(req->ctx, qry->sname, qry->stype);
+ /* ^ TODO: performance? TODO: stype - call sites */
+ if (ta_covers) {
+ return KR_RANK_INSECURE | KR_RANK_AUTH;
+		} /* else fallthrough */
+ }
+ return KR_RANK_INITIAL | KR_RANK_AUTH;
+}
+
+
+/** Almost whole .produce phase for the cache module.
+ * \note we don't transition to KR_STATE_FAIL even in case of "unexpected errors".
+ */
+int peek_nosync(kr_layer_t *ctx, knot_pkt_t *pkt)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+ struct kr_cache *cache = &req->ctx->cache;
+
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, qry->sname, false);
+ if (unlikely(ret)) {
+ assert(false);
+ return ctx->state;
+ }
+
+ const uint8_t lowest_rank = get_lowest_rank(req, qry);
+
+ /**** 1. find the name or the closest (available) zone, not considering wildcards
+ **** 1a. exact name+type match (can be negative answer in insecure zones) */
+ {
+ knot_db_val_t key = key_exact_type_maypkt(k, qry->stype);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ if (!ret) {
+ /* found an entry: test conditions, materialize into pkt, etc. */
+ ret = found_exact_hit(ctx, pkt, val, lowest_rank);
+ }
+ }
+ if (ret && ret != -abs(ENOENT)) {
+ VERBOSE_MSG(qry, "=> exact hit error: %d %s\n", ret, kr_strerror(ret));
+ assert(false);
+ return ctx->state;
+ } else if (!ret) {
+ return KR_STATE_DONE;
+ }
+
+ /**** 1b. otherwise, find the longest prefix zone/xNAME (with OK time+rank). [...] */
+ k->zname = qry->sname;
+ ret = kr_dname_lf(k->buf, k->zname, false); /* LATER(optim.): probably remove */
+ if (unlikely(ret)) {
+ assert(false);
+ return ctx->state;
+ }
+ entry_list_t el;
+ ret = closest_NS(cache, k, el, qry, false, qry->stype == KNOT_RRTYPE_DS);
+ if (ret) {
+ assert(ret == kr_error(ENOENT));
+ if (ret != kr_error(ENOENT) || !el[0].len) {
+ return ctx->state;
+ }
+ }
+ switch (k->type) {
+ case KNOT_RRTYPE_CNAME: {
+ const knot_db_val_t v = el[EL_CNAME];
+ assert(v.data && v.len);
+ const int32_t new_ttl = get_new_ttl(v.data, qry, qry->sname,
+ KNOT_RRTYPE_CNAME, qry->timestamp.tv_sec);
+ ret = answer_simple_hit(ctx, pkt, KNOT_RRTYPE_CNAME, v.data,
+ knot_db_val_bound(v), new_ttl);
+ /* TODO: ^^ cumbersome code; we also recompute the TTL */
+ return ret == kr_ok() ? KR_STATE_DONE : ctx->state;
+ }
+ case KNOT_RRTYPE_DNAME:
+ VERBOSE_MSG(qry, "=> DNAME not supported yet\n"); // LATER
+ return ctx->state;
+ }
+
+ /* We have to try proving from NSEC*. */
+ auto_free char *log_zname = NULL;
+ WITH_VERBOSE(qry) {
+ log_zname = kr_dname_text(k->zname);
+ if (!el[0].len) {
+ VERBOSE_MSG(qry, "=> no NSEC* cached for zone: %s\n", log_zname);
+ }
+ }
+
+#if 0
+ if (!eh) { /* fall back to root hints? */
+ ret = kr_zonecut_set_sbelt(req->ctx, &qry->zone_cut);
+ if (ret) return ctx->state;
+ assert(!qry->zone_cut.parent);
+
+ //VERBOSE_MSG(qry, "=> using root hints\n");
+ //qry->flags.AWAIT_CUT = false;
+ return ctx->state;
+ }
+
+ /* Now `eh` points to the closest NS record that we've found,
+ * and that's the only place to start - we may either find
+ * a negative proof or we may query upstream from that point. */
+ kr_zonecut_set(&qry->zone_cut, k->zname);
+ ret = kr_make_query(qry, pkt); // TODO: probably not yet - qname minimization
+ if (ret) return ctx->state;
+#endif
+
+ /** Structure for collecting multiple NSEC* + RRSIG records,
+ * in preparation for the answer, and for tracking the progress. */
+ struct answer ans;
+ memset(&ans, 0, sizeof(ans));
+ ans.mm = &pkt->mm;
+ const int sname_labels = knot_dname_labels(qry->sname, NULL);
+
+ /* Try the NSEC* parameters in order, until success.
+ * Let's not mix different parameters for NSEC* RRs in a single proof. */
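+	/* The first ENTRY_APEX_NSECS_CNT slots of el hold the nsec_p entries
+	 * parsed from the zone-apex record; try each in turn. */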
+ for (int i = 0; ;) {
+ int32_t log_new_ttl = -123456789; /* visually recognizable value */
+ ret = nsec_p_ttl(el[i], qry->timestamp.tv_sec, &log_new_ttl);
+ if (!ret || VERBOSE_STATUS) {
+ nsec_p_init(&ans.nsec_p, el[i], !ret);
+ }
+ if (ret) {
+			VERBOSE_MSG(qry, "=> skipping zone: %s, %s, hash %x; "
+ "new TTL %d, ret %d\n",
+ log_zname, (ans.nsec_p.raw ? "NSEC3" : "NSEC"),
+ (unsigned)ans.nsec_p.hash, (int)log_new_ttl, ret);
+ /* no need for nsec_p_cleanup() in this case */
+ goto cont;
+ }
+ VERBOSE_MSG(qry, "=> trying zone: %s, %s, hash %x\n",
+ log_zname, (ans.nsec_p.raw ? "NSEC3" : "NSEC"),
+ (unsigned)ans.nsec_p.hash);
+ /**** 2. and 3. inside */
+ ret = peek_encloser(k, &ans, sname_labels,
+ lowest_rank, qry, cache);
+ nsec_p_cleanup(&ans.nsec_p);
+ if (!ret) break;
+ if (ret < 0) return ctx->state;
+ cont:
+ /* Otherwise we try another nsec_p, if available. */
+ if (++i == ENTRY_APEX_NSECS_CNT) return ctx->state;
+ /* clear possible partial answers in `ans` (no need to deallocate) */
+ ans.rcode = 0;
+ memset(&ans.rrsets, 0, sizeof(ans.rrsets));
+ }
+
+ /**** 4. add SOA iff needed */
+ if (ans.rcode != PKT_NOERROR) {
+ /* Assuming k->buf still starts with zone's prefix,
+ * look up the SOA in cache. */
+ k->buf[0] = k->zlf_len;
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_SOA);
+ knot_db_val_t val = { NULL, 0 };
+ ret = cache_op(cache, read, &key, &val, 1);
+ const struct entry_h *eh;
+ if (ret || !(eh = entry_h_consistent(val, KNOT_RRTYPE_SOA))) {
+ assert(ret); /* only want to catch `eh` failures */
+ VERBOSE_MSG(qry, "=> SOA missed\n");
+ return ctx->state;
+ }
+ /* Check if the record is OK. */
+ int32_t new_ttl = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_SOA,
+ qry->timestamp.tv_sec);
+ if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_packet) {
+ VERBOSE_MSG(qry, "=> SOA unfit %s: rank 0%.2o, new TTL %d\n",
+ (eh->is_packet ? "packet" : "RR"),
+ eh->rank, new_ttl);
+ return ctx->state;
+ }
+ /* Add the SOA into the answer. */
+ ret = entry2answer(&ans, AR_SOA, eh, knot_db_val_bound(val),
+ k->zname, KNOT_RRTYPE_SOA, new_ttl);
+ if (ret) return ctx->state;
+ }
+
+ /* Find our target RCODE. */
+ int real_rcode;
+ switch (ans.rcode) {
+ case PKT_NODATA:
+ case PKT_NOERROR: /* positive wildcarded response */
+ real_rcode = KNOT_RCODE_NOERROR;
+ break;
+ case PKT_NXDOMAIN:
+ real_rcode = KNOT_RCODE_NXDOMAIN;
+ break;
+ default:
+ assert(false);
+ case 0: /* i.e. nothing was found */
+ /* LATER(optim.): zone cut? */
+ VERBOSE_MSG(qry, "=> cache miss\n");
+ return ctx->state;
+ }
+
+ if (pkt_renew(pkt, qry->sname, qry->stype)
+ || knot_pkt_begin(pkt, KNOT_ANSWER)
+ ) {
+ assert(false);
+ return ctx->state;
+ }
+ knot_wire_set_rcode(pkt->wire, real_rcode);
+
+ bool expiring = false; // TODO
+ VERBOSE_MSG(qry, "=> writing RRsets: ");
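+	/* Index 0 lands in the ANSWER section; the remaining slots are written
+	 * into AUTHORITY (the section switch at i == 1 below). */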
+ for (int i = 0; i < sizeof(ans.rrsets) / sizeof(ans.rrsets[0]); ++i) {
+ if (i == 1) knot_pkt_begin(pkt, KNOT_AUTHORITY);
+ if (!ans.rrsets[i].set.rr) continue;
+ expiring = expiring || ans.rrsets[i].set.expiring;
+ ret = pkt_append(pkt, &ans.rrsets[i], ans.rrsets[i].set.rank);
+ if (ret) {
+ assert(false);
+ return ctx->state;
+ }
+ kr_log_verbose(kr_rank_test(ans.rrsets[i].set.rank, KR_RANK_SECURE)
+ ? "+" : "-");
+ }
+ kr_log_verbose("\n");
+
+ /* Finishing touches. */
+ struct kr_qflags * const qf = &qry->flags;
+ qf->EXPIRING = expiring;
+ qf->CACHED = true;
+ qf->NO_MINIMIZE = true;
+
+ return KR_STATE_DONE;
+}
+
+/**
+ * This is where the high-level "business logic" of the aggressive cache lives.
+ * \return 0: success (may need SOA); >0: try other nsec_p; <0: exit cache immediately.
+ */
+static int peek_encloser(
+ struct key *k, struct answer *ans, const int sname_labels,
+ uint8_t lowest_rank, const struct kr_query *qry, struct kr_cache *cache)
+{
+ /** Start of NSEC* covering the sname;
+ * it's part of key - the one within zone (read only) */
+ knot_db_val_t cover_low_kwz = { NULL, 0 };
+ knot_dname_t cover_hi_storage[KNOT_DNAME_MAXLEN];
+ /** End of NSEC* covering the sname. */
+ knot_db_val_t cover_hi_kwz = {
+ .data = cover_hi_storage,
+ .len = sizeof(cover_hi_storage),
+ };
+
+ /**** 2. Find a closest (provable) encloser (of sname). */
+ int clencl_labels = -1;
+ bool clencl_is_tentative = false;
+ if (!ans->nsec_p.raw) { /* NSEC */
+ int ret = nsec1_encloser(k, ans, sname_labels, &clencl_labels,
+ &cover_low_kwz, &cover_hi_kwz, qry, cache);
+ if (ret) return ret;
+ } else {
+ int ret = nsec3_encloser(k, ans, sname_labels, &clencl_labels,
+ qry, cache);
+ clencl_is_tentative = ret == ABS(ENOENT) && clencl_labels >= 0;
+ /* ^^ Last chance: *positive* wildcard record under this clencl. */
+ if (ret && !clencl_is_tentative) return ret;
+ }
+
+ /* We should have either a match or a cover at this point. */
+ if (ans->rcode != PKT_NODATA && ans->rcode != PKT_NXDOMAIN) {
+ assert(false);
+ return kr_error(EINVAL);
+ }
+ const bool ncloser_covered = ans->rcode == PKT_NXDOMAIN;
+
+ /** Name of the closest (provable) encloser. */
+ const knot_dname_t *clencl_name = qry->sname;
+ for (int l = sname_labels; l > clencl_labels; --l)
+ clencl_name = knot_wire_next_label(clencl_name, NULL);
+
+ /**** 3. source of synthesis checks, in case the next closer name was covered.
+ **** 3a. We want to query for NSEC* of source of synthesis (SS) or its
+ * predecessor, providing us with a proof of its existence or non-existence. */
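+	/* Terminology (RFC 4592/5155): if the closest encloser is example. and
+	 * sname is a.b.example., the next closer name is b.example. and the
+	 * source of synthesis is *.example. */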
+ if (ncloser_covered && !ans->nsec_p.raw) {
+ int ret = nsec1_src_synth(k, ans, clencl_name,
+ cover_low_kwz, cover_hi_kwz, qry, cache);
+ if (ret == AR_SOA) return 0;
+ assert(ret <= 0);
+ if (ret) return ret;
+
+ } else if (ncloser_covered && ans->nsec_p.raw && !clencl_is_tentative) {
+ int ret = nsec3_src_synth(k, ans, clencl_name, qry, cache);
+ if (ret == AR_SOA) return 0;
+ assert(ret <= 0);
+ if (ret) return ret;
+
+ } /* else (!ncloser_covered) so no wildcard checks needed,
+ * as we proved that sname exists. */
+
+ /**** 3b. find wildcarded answer, if next closer name was covered
+ * and we don't have a full proof yet. (common for NSEC*) */
+ if (!ncloser_covered)
+ return kr_ok(); /* decrease indentation */
+ /* Construct key for exact qry->stype + source of synthesis. */
+ int ret = kr_dname_lf(k->buf, clencl_name, true);
+ if (ret) {
+ assert(!ret);
+ return kr_error(ret);
+ }
+ const uint16_t types[] = { qry->stype, KNOT_RRTYPE_CNAME };
+ for (int i = 0; i < (2 - (qry->stype == KNOT_RRTYPE_CNAME)); ++i) {
+ ret = try_wild(k, ans, clencl_name, types[i],
+ lowest_rank, qry, cache);
+ if (ret == kr_ok()) {
+ return kr_ok();
+ } else if (ret != -ABS(ENOENT) && ret != -ABS(ESTALE)) {
+ assert(false);
+ return kr_error(ret);
+ }
+ /* else continue */
+ }
+ /* Neither attempt succeeded, but the NSEC* proofs were found,
+	 * so skip trying other parameters, as repeating the same wildcard
+	 * search would very likely turn out the same way. */
+ return -ABS(ENOENT);
+}
+
+
+static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+ const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl)
+#define CHECK_RET(ret) do { \
+ if ((ret) < 0) { assert(false); return kr_error((ret)); } \
+} while (false)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+
+ /* All OK, so start constructing the (pseudo-)packet. */
+ int ret = pkt_renew(pkt, qry->sname, qry->stype);
+ CHECK_RET(ret);
+
+ /* Materialize the sets for the answer in (pseudo-)packet. */
+ struct answer ans;
+ memset(&ans, 0, sizeof(ans));
+ ans.mm = &pkt->mm;
+ ret = entry2answer(&ans, AR_ANSWER, eh, eh_bound,
+ qry->sname, type, new_ttl);
+ CHECK_RET(ret);
+ /* Put links to the materialized data into the pkt. */
+ ret = pkt_append(pkt, &ans.rrsets[AR_ANSWER], eh->rank);
+ CHECK_RET(ret);
+
+ /* Finishing touches. */
+ struct kr_qflags * const qf = &qry->flags;
+ qf->EXPIRING = is_expiring(eh->ttl, new_ttl);
+ qf->CACHED = true;
+ qf->NO_MINIMIZE = true;
+ qf->DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE);
+ if (qf->DNSSEC_INSECURE) {
+ qf->DNSSEC_WANT = false;
+ }
+ VERBOSE_MSG(qry, "=> satisfied by exact %s: rank 0%.2o, new TTL %d\n",
+ (type == KNOT_RRTYPE_CNAME ? "CNAME" : "RRset"),
+ eh->rank, new_ttl);
+ return kr_ok();
+}
+#undef CHECK_RET
+
+
+/** TODO: description; see the single call site for now. */
+static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val,
+ uint8_t lowest_rank)
+{
+ struct kr_request *req = ctx->req;
+ struct kr_query *qry = req->current_query;
+
+ int ret = entry_h_seek(&val, qry->stype);
+ if (ret) return ret;
+ const struct entry_h *eh = entry_h_consistent(val, qry->stype);
+ if (!eh) {
+ assert(false);
+ return kr_error(ENOENT);
+ // LATER: recovery in case of error, perhaps via removing the entry?
+		// LATER(optim): perhaps optimize the zone cut search
+ }
+
+ int32_t new_ttl = get_new_ttl(eh, qry, qry->sname, qry->stype,
+ qry->timestamp.tv_sec);
+ if (new_ttl < 0 || eh->rank < lowest_rank) {
+ /* Positive record with stale TTL or bad rank.
+ * LATER(optim.): It's unlikely that we find a negative one,
+ * so we might theoretically skip all the cache code. */
+
+ VERBOSE_MSG(qry, "=> skipping exact %s: rank 0%.2o (min. 0%.2o), new TTL %d\n",
+ eh->is_packet ? "packet" : "RR", eh->rank, lowest_rank, new_ttl);
+ return kr_error(ENOENT);
+ }
+
+ const uint8_t *eh_bound = knot_db_val_bound(val);
+ if (eh->is_packet) {
+ /* Note: we answer here immediately, even if it's (theoretically)
+ * possible that we could generate a higher-security negative proof.
+ * Rank is high-enough so we take it to save time searching. */
+ return answer_from_pkt (ctx, pkt, qry->stype, eh, eh_bound, new_ttl);
+ } else {
+ return answer_simple_hit(ctx, pkt, qry->stype, eh, eh_bound, new_ttl);
+ }
+}
+
+
+/** Try to satisfy via wildcard (positively). See the single call site. */
+static int try_wild(struct key *k, struct answer *ans, const knot_dname_t *clencl_name,
+ const uint16_t type, const uint8_t lowest_rank,
+ const struct kr_query *qry, struct kr_cache *cache)
+{
+ knot_db_val_t key = key_exact_type(k, type);
+ /* Find the record. */
+ knot_db_val_t val = { NULL, 0 };
+ int ret = cache_op(cache, read, &key, &val, 1);
+ if (!ret) {
+ ret = entry_h_seek(&val, type);
+ }
+ if (ret) {
+ if (ret != -ABS(ENOENT)) {
+ VERBOSE_MSG(qry, "=> wildcard: hit error %d %s\n",
+ ret, strerror(abs(ret)));
+ assert(false);
+ }
+ WITH_VERBOSE(qry) {
+ auto_free char *clencl_str = kr_dname_text(clencl_name),
+ *type_str = kr_rrtype_text(type);
+ VERBOSE_MSG(qry, "=> wildcard: not found: *.%s %s\n",
+ clencl_str, type_str);
+ }
+ return ret;
+ }
+ /* Check if the record is OK. */
+ const struct entry_h *eh = entry_h_consistent(val, type);
+ if (!eh) {
+ assert(false);
+ return kr_error(ret);
+ // LATER: recovery in case of error, perhaps via removing the entry?
+ }
+ int32_t new_ttl = get_new_ttl(eh, qry, qry->sname, type, qry->timestamp.tv_sec);
+ /* ^^ here we use the *expanded* wildcard name */
+ if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_packet) {
+ /* Wildcard record with stale TTL, bad rank or packet. */
+ VERBOSE_MSG(qry, "=> wildcard: skipping %s, rank 0%.2o, new TTL %d\n",
+ eh->is_packet ? "packet" : "RR", eh->rank, new_ttl);
+ return -ABS(ESTALE);
+ }
+ /* Add the RR into the answer. */
+ ret = entry2answer(ans, AR_ANSWER, eh, knot_db_val_bound(val),
+ qry->sname, type, new_ttl);
+ VERBOSE_MSG(qry, "=> wildcard: answer expanded, ret = %d, new TTL %d\n",
+ ret, (int)new_ttl);
+ if (ret) return kr_error(ret);
+ ans->rcode = PKT_NOERROR;
+ return kr_ok();
+}
+
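+/* Illustrative usage sketch (hypothetical caller): the out-parameter must be
+ * NULL on entry, and the returned copy comes from knot_dname_copy(..., NULL),
+ * i.e. the default allocator, so plain free() releases it.
+ *
+ *	knot_dname_t *apex = NULL;
+ *	if (kr_cache_closest_apex(cache, qname, false, &apex) == kr_ok()) {
+ *		// ... use apex ...
+ *		free(apex);
+ *	}
+ */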
+int kr_cache_closest_apex(struct kr_cache *cache, const knot_dname_t *name, bool is_DS,
+ knot_dname_t ** apex)
+{
+ if (!cache || !cache->db || !name || !apex || *apex) {
+ assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+ struct key k_storage, *k = &k_storage;
+ int ret = kr_dname_lf(k->buf, name, false);
+ if (ret)
+ return kr_error(ret);
+ entry_list_t el_;
+ k->zname = name;
+ ret = closest_NS(cache, k, el_, NULL, true, is_DS);
+ if (ret && ret != -abs(ENOENT))
+ return ret;
+ *apex = knot_dname_copy(k->zname, NULL);
+ if (!*apex)
+ return kr_error(ENOMEM);
+ return kr_ok();
+}
+
+/** \internal for closest_NS. Check suitability of a single entry, setting k->type if OK.
+ * \return error code, negative iff whole list should be skipped.
+ */
+static int check_NS_entry(struct key *k, knot_db_val_t entry, int i,
+ bool exact_match, bool is_DS,
+ const struct kr_query *qry, uint32_t timestamp);
+
+/**
+ * Find the longest prefix zone/xNAME (with OK time+rank), starting at k->*.
+ *
+ * The found type is returned via k->type; the values are returned in el.
+ * \note we use k->type = KNOT_RRTYPE_NS also for the nsec_p result.
+ * \param qry can be NULL (-> gettimeofday(), but you lose the stale-serve hook)
+ * \param only_NS don't consider xNAMEs
+ * \return error code
+ */
+static int closest_NS(struct kr_cache *cache, struct key *k, entry_list_t el,
+ struct kr_query *qry, const bool only_NS, const bool is_DS)
+{
+ /* get the current timestamp */
+ uint32_t timestamp;
+ if (qry) {
+ timestamp = qry->timestamp.tv_sec;
+ } else {
+ struct timeval tv;
+ if (gettimeofday(&tv, NULL)) return kr_error(errno);
+ timestamp = tv.tv_sec;
+ }
+
+ int zlf_len = k->buf[0];
+
+ // LATER(optim): if stype is NS, we check the same value again
+ bool exact_match = true;
+ bool need_zero = true;
+ /* Inspect the NS/xNAME entries, shortening by a label on each iteration. */
+ do {
+ k->buf[0] = zlf_len;
+ knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_NS);
+ knot_db_val_t val;
+ int ret = cache_op(cache, read, &key, &val, 1);
+ if (ret == -abs(ENOENT)) goto next_label;
+ if (ret) {
+ assert(!ret);
+ if (need_zero) memset(el, 0, sizeof(entry_list_t));
+ return kr_error(ret);
+ }
+
+ /* Check consistency, find any type;
+ * using `goto` for shortening by another label. */
+ ret = entry_list_parse(val, el);
+ if (ret) {
+ assert(!ret); // do something about it?
+ goto next_label;
+ }
+ need_zero = false;
+ /* More types are possible; try in order.
+ * For non-fatal failures just "continue;" to try the next type. */
+ const int el_count = only_NS ? EL_NS + 1 : EL_LENGTH;
+ for (int i = 0; i < el_count; ++i) {
+ ret = check_NS_entry(k, el[i], i, exact_match, is_DS,
+ qry, timestamp);
+ if (ret < 0) goto next_label; else
+ if (!ret) {
+ /* We found our match. */
+ k->zlf_len = zlf_len;
+ return kr_ok();
+ }
+ }
+
+ next_label:
+ /* remove one more label */
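+		/* kr_dname_lf() stores labels in reversed order, so dropping the
+		 * leftmost label of zname corresponds to truncating the tail of
+		 * the lookup-format name held in k->buf. */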
+ exact_match = false;
+ if (k->zname[0] == 0) {
+			/* The root NS is missing from the cache, but let's at least assume it exists. */
+ k->type = KNOT_RRTYPE_NS;
+ k->zlf_len = zlf_len;
+ assert(zlf_len == 0);
+ if (need_zero) memset(el, 0, sizeof(entry_list_t));
+ return kr_error(ENOENT);
+ }
+ zlf_len -= (k->zname[0] + 1);
+ k->zname += (k->zname[0] + 1);
+ k->buf[zlf_len + 1] = 0;
+ } while (true);
+}
+
+static int check_NS_entry(struct key *k, const knot_db_val_t entry, const int i,
+ const bool exact_match, const bool is_DS,
+ const struct kr_query *qry, uint32_t timestamp)
+{
+ const int ESKIP = ABS(ENOENT);
+ if (!entry.len
+ /* On a zone cut we want DS from the parent zone. */
+ || (i <= EL_NS && exact_match && is_DS)
+ /* CNAME is interesting only if we
+ * directly hit the name that was asked.
+ * Note that we want it even in the DS case. */
+ || (i == EL_CNAME && !exact_match)
+ /* DNAME is interesting only if we did NOT
+ * directly hit the name that was asked. */
+ || (i == EL_DNAME && exact_match)
+ ) {
+ return ESKIP;
+ }
+
+ uint16_t type;
+ if (i < ENTRY_APEX_NSECS_CNT) {
+ type = KNOT_RRTYPE_NS;
+ int32_t log_new_ttl = -123456789; /* visually recognizable value */
+ const int err = nsec_p_ttl(entry, timestamp, &log_new_ttl);
+ if (err) {
+ VERBOSE_MSG(qry,
+ "=> skipping unfit nsec_p: new TTL %d, error %d\n",
+ (int)log_new_ttl, err);
+ return ESKIP;
+ }
+ } else {
+ type = EL2RRTYPE(i);
+ /* Find the entry for the type, check positivity, TTL */
+ const struct entry_h *eh = entry_h_consistent(entry, type);
+ if (!eh) {
+ VERBOSE_MSG(qry, "=> EH not consistent\n");
+ assert(false);
+ return kr_error(EILSEQ);
+ }
+ const int32_t log_new_ttl = get_new_ttl(eh, qry, k->zname, type, timestamp);
+ const uint8_t rank_min = KR_RANK_INSECURE | KR_RANK_AUTH;
+ const bool ok = /* For NS any kr_rank is accepted,
+ * as insecure or even nonauth is OK */
+ (type == KNOT_RRTYPE_NS || eh->rank >= rank_min)
+ /* Not interested in negative bogus or outdated RRs. */
+ && !eh->is_packet && log_new_ttl >= 0;
+ WITH_VERBOSE(qry) { if (!ok) {
+ auto_free char *type_str = kr_rrtype_text(type);
+ const char *packet_str = eh->is_packet ? "packet" : "RR";
+ VERBOSE_MSG(qry,
+ "=> skipping unfit %s %s: rank 0%.2o, new TTL %d\n",
+ type_str, packet_str, eh->rank, (int)log_new_ttl);
+ } }
+ if (!ok) return ESKIP;
+ }
+ k->type = type;
+ return kr_ok();
+}
+