1 files changed, 293 insertions, 0 deletions
diff --git a/lib/cache/entry_list.c b/lib/cache/entry_list.c
new file mode 100644
index 0000000..6a5001c
--- /dev/null
+++ b/lib/cache/entry_list.c
@@ -0,0 +1,293 @@
+/*  Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ * Implementation of chaining in struct entry_h.  Prototypes in ./impl.h
+ */
+
+#include "lib/cache/impl.h"
+#include "lib/utils.h"
+
+
+static int entry_h_len(knot_db_val_t val);
+
+
+/** Serialize `list` into `*ea`; slots with NULL data only get their space reserved. */
+void entry_list_memcpy(struct entry_apex *ea, entry_list_t list)
+{
+	assert(ea);
+	memset(ea, 0, offsetof(struct entry_apex, data)); /* zero the header part only */
+	ea->has_ns = list[EL_NS].len;
+	ea->has_cname = list[EL_CNAME].len;
+	ea->has_dname = list[EL_DNAME].len;
+	for (int slot = 0; slot < ENTRY_APEX_NSECS_CNT; ++slot) { /* length -> marker */
+		ea->nsecs[slot] = !list[slot].len ? 0 : (list[slot].len == 4 ? 1 : 3);
+	}
+	uint8_t *dst = ea->data;
+	for (int k = 0; k < EL_LENGTH; ++k) {
+		if (!list[k].data) {
+			list[k].data = dst; /* tell the caller where this part belongs */
+		} else {
+			memcpy(dst, list[k].data, list[k].len);
+			/* LATER(optim.): coalesce consecutive writes? */
+		}
+		dst += to_even(list[k].len);
+	}
+}
+
+/** Split a serialized apex value into per-slot (data, len) views stored in `list`.
+ * \return kr_ok(), kr_error(EINVAL) on bad arguments, kr_error(EILSEQ) on bad layout. */
+int entry_list_parse(const knot_db_val_t val, entry_list_t list)
+{
+	const bool ok = val.data && val.len && list;
+	if (!ok) {
+		assert(!EINVAL);
+		return kr_error(EINVAL);
+	}
+	/* Parse the apex itself (nsec parameters). */
+	const struct entry_apex *ea = entry_apex_consistent(val);
+	if (!ea) {
+		return kr_error(EILSEQ);
+	}
+	const uint8_t *it = ea->data, *it_bound = knot_db_val_bound(val);
+	for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
+		if (it > it_bound) {
+			return kr_error(EILSEQ);
+		}
+		list[i].data = (void *)it;
+		switch (ea->nsecs[i]) {
+		case 0:
+			list[i].len = 0;
+			break;
+		case 1:
+			list[i].len = sizeof(uint32_t); /* just timestamp */
+			break;
+		case 3: /* timestamp + NSEC3PARAM wire */
+			if (it + sizeof(uint32_t) + 4 > it_bound) {
+				return kr_error(EILSEQ);
+			}
+			list[i].len = sizeof(uint32_t) + nsec_p_rdlen(it + sizeof(uint32_t));
+			break;
+		default:
+			return kr_error(EILSEQ);
+		}
+		it += to_even(list[i].len);
+	}
+	/* Parse every entry_h. */
+	for (int i = ENTRY_APEX_NSECS_CNT; i < EL_LENGTH; ++i) {
+		list[i].data = (void *)it;
+		bool has_type;
+		switch (i) {
+		case EL_NS:	has_type = ea->has_ns;		break;
+		case EL_CNAME:	has_type = ea->has_cname;	break;
+		case EL_DNAME:	has_type = ea->has_dname;	break;
+		default: assert(false); return kr_error(EINVAL); /* something very bad */
+		}
+		if (!has_type) {
+			list[i].len = 0;
+			continue;
+		}
+		if (it >= it_bound) {
+			assert(!EILSEQ);
+			return kr_error(EILSEQ);
+		}
+		const int len = entry_h_len(
+			(knot_db_val_t){ .data = (void *)it, .len = it_bound - it });
+		if (len < 0) {
+			assert(false);
+			return kr_error(len);
+		}
+		list[i].len = len;
+		it += to_even(len);
+	}
+	assert(it == it_bound);
+	/* The slots may have walked past the end of val; never report that as success. */
+	return it <= it_bound ? kr_ok() : kr_error(EILSEQ);
+}
+
+/** Given a valid entry header, find its length (i.e. offset of the next entry).
+ * \param val The beginning of the data and the bound (read only).
+ * \return The length in bytes on success, otherwise a negative error code:
+ *	kr_error(EINVAL) for an unusable val, kr_error(EILSEQ) if the entry doesn't fit. */
+static int entry_h_len(const knot_db_val_t val)
+{
+	const bool ok = val.data && ((ssize_t)val.len) > 0;
+	if (!ok) return kr_error(EINVAL);
+	const struct entry_h *eh = val.data;
+	const uint8_t *d = eh->data; /* iterates over the data in entry */
+	const uint8_t *data_bound = knot_db_val_bound(val);
+	if (d >= data_bound) return kr_error(EILSEQ);
+	if (!eh->is_packet) { /* Positive RRset + its RRsig set (may be empty). */
+		for (int sets = 2; sets > 0; --sets) {
+			if (d >= data_bound) { /* the size is found by reading at d */
+				assert(!EILSEQ);
+				return kr_error(EILSEQ);
+			}
+			d += rdataset_dematerialized_size(d);
+		}
+	} else { /* A "packet" (opaque ATM). */
+		uint16_t len;
+		if (d + sizeof(len) > data_bound) return kr_error(EILSEQ);
+		memcpy(&len, d, sizeof(len));
+		d += 2 + to_even(len);
+	}
+	if (d > data_bound) {
+		assert(!EILSEQ);
+		return kr_error(EILSEQ);
+	}
+	return d - (uint8_t *)val.data;
+}
+
+struct entry_apex * entry_apex_consistent(knot_db_val_t val)
+{
+	struct entry_apex *apex = val.data; /* XXX: check lengths, etc. (nothing is verified yet) */
+	return apex;
+}
+
+/* See the header file.  Types that have no slot in the list leave *val as it is. */
+int entry_h_seek(knot_db_val_t *val, uint16_t type)
+{
+	int slot;
+	if (type == KNOT_RRTYPE_NS) slot = EL_NS;
+	else if (type == KNOT_RRTYPE_CNAME) slot = EL_CNAME;
+	else if (type == KNOT_RRTYPE_DNAME) slot = EL_DNAME;
+	else return kr_ok(); /* nothing to seek for the other types */
+	entry_list_t parts;
+	const int err = entry_list_parse(*val, parts);
+	if (err) return err;
+	*val = parts[slot];
+	if (!val->len) {
+		return kr_error(ENOENT);
+	}
+	return kr_ok();
+}
+
+/** Write one key-value pair; if the backend is full, clear the whole cache instead.
+ * \return kr_ok() after a successful write, an error code otherwise (even after clearing). */
+static int cache_write_or_clear(struct kr_cache *cache, const knot_db_val_t *key,
+				knot_db_val_t *val, const struct kr_query *qry)
+{
+	int ret = cache_op(cache, write, key, val, 1);
+	if (ret == 0) return kr_ok();
+	if (ret != kr_error(ENOSPC)) {
+		VERBOSE_MSG(qry, "=> failed backend write, ret = %d\n", ret);
+		return kr_error(ret ? ret : ENOSPC);
+	}
+	/* Overfull; doing better with LMDB is nontrivial.  LATER: some garbage collection. */
+	const char *msg = "[cache] clearing because overfull, ret = %d\n";
+	ret = kr_cache_clear(cache);
+	if (ret) {
+		kr_log_error(msg, ret);
+		return ret;
+	}
+	kr_log_info(msg, ret);
+	return kr_error(ENOSPC);
+}
+
+
+/* See the header file.  For the list types (NS, CNAME, DNAME) val_new_entry->data
+ * is pointed into the freshly written value.  Returns kr_ok(), kr_error(EEXIST) when
+ * the cached entry is kept instead, or kr_error() of EINVAL, ENOMEM or a write failure. */
+int entry_h_splice(
+	knot_db_val_t *val_new_entry, uint8_t rank,
+	const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
+	const knot_dname_t *owner/*log only*/,
+	const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp)
+{
+	//TODO: another review, perhaps including the API
+	const bool ok = val_new_entry && val_new_entry->len > 0;
+	if (!ok) {
+		assert(!EINVAL);
+		return kr_error(EINVAL);
+	}
+
+	int i_type;
+	switch (type) {
+	case KNOT_RRTYPE_NS:	i_type = EL_NS;		break;
+	case KNOT_RRTYPE_CNAME:	i_type = EL_CNAME;	break;
+	case KNOT_RRTYPE_DNAME:	i_type = EL_DNAME;	break;
+	default:		i_type = 0; /* marks the non-list types, see below */
+	}
+
+	/* Get eh_orig (original entry), and also el list if multi-entry case. */
+	const struct entry_h *eh_orig = NULL;
+	entry_list_t el;
+	int ret = -1;
+	if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) {
+		knot_db_val_t val;
+		ret = cache_op(cache, read, &key, &val, 1);
+		if (i_type) {
+			if (!ret) ret = entry_list_parse(val, el);
+			if (ret) memset(el, 0, sizeof(el));
+			val = el[i_type];
+		}
+		/* val is on the entry, in either case (or error) */
+		if (!ret) {
+			eh_orig = entry_h_consistent(val, type);
+		}
+	} else {
+		/* We want to fully overwrite the entry, so don't even read it. */
+		memset(el, 0, sizeof(el));
+	}
+
+	if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) {
+		/* If equal rank was accepted, spoofing a *single* answer would be
+		 * enough to e.g. override NS record in AUTHORITY section.
+		 * This way they would have to hit the first answer
+		 * (whenever TTL nears expiration).
+		 * Stale-serving is NOT considered, but TTL 1 would be considered
+		 * as expiring anyway, ... */
+		int32_t old_ttl = get_new_ttl(eh_orig, qry, NULL, 0, timestamp);
+		if (old_ttl > 0 && !is_expiring(eh_orig->ttl, old_ttl)
+		    && rank <= eh_orig->rank) {
+			WITH_VERBOSE(qry) {
+				auto_free char *type_str = kr_rrtype_text(type),
+					*owner_str = kr_dname_text(owner);
+				VERBOSE_MSG(qry, "=> not overwriting %s %s\n",
+						type_str, owner_str);
+			}
+			return kr_error(EEXIST);
+		}
+	}
+
+	if (!i_type) {
+		/* The non-list types are trivial now. */
+		return cache_write_or_clear(cache, &key, val_new_entry, qry);
+	}
+	/* Now we're in trouble.  In some cases, part of the data to be written
+	 * is an lmdb entry that may be invalidated by our write request.
+	 * (lmdb does even in-place updates!) Therefore we copy all into a buffer.
+	 * (We don't bother deallocating from the mempool.)
+	 * LATER(optim.): do this only when necessary, or perhaps another approach.
+	 * This is also complicated by the fact that the val_new_entry part
+	 * is to be written *afterwards* by the caller.
+	 */
+	el[i_type] = (knot_db_val_t){
+		.len = val_new_entry->len,
+		.data = NULL, /* perhaps unclear in the entry_h_splice() API */
+	};
+	knot_db_val_t val = { .len = entry_list_serial_size(el), .data = NULL };
+	void *buf = mm_alloc(&qry->request->pool, val.len);
+	if (!buf) return kr_error(ENOMEM); /* entry_list_memcpy() must not get NULL */
+	entry_list_memcpy(buf, el);
+	ret = cache_write_or_clear(cache, &key, &val, qry);
+	if (ret) return kr_error(ret);
+	memcpy(val.data, buf, val.len); /* we also copy the "empty" space, but well... */
+	val_new_entry->data = (uint8_t *)val.data
+			    + ((uint8_t *)el[i_type].data - (uint8_t *)buf);
+	return kr_ok();
+}
+