/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

/** @file
 * Implementation of chaining in struct entry_h. Prototypes in ./impl.h
 */

#include "lib/cache/impl.h"
#include "lib/utils.h"

static int entry_h_len(knot_db_val_t val);

void entry_list_memcpy(struct entry_apex *ea, entry_list_t list)
{
	if (kr_fails_assert(ea))
		return;
	memset(ea, 0, offsetof(struct entry_apex, data));
	ea->has_ns    = list[EL_NS   ].len;
	ea->has_cname = list[EL_CNAME].len;
	ea->has_dname = list[EL_DNAME].len;
	for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
		ea->nsecs[i] = list[i].len == 0 ? 0 :
			(list[i].len == sizeof(uint32_t) /* just timestamp */ ? 1 : 3);
	}
	uint8_t *it = ea->data;
	for (int i = 0; i < EL_LENGTH; ++i) {
		if (list[i].data) {
			memcpy(it, list[i].data, list[i].len);
			/* LATER(optim.): coalesce consecutive writes? */
		} else {
			list[i].data = it;
		}
		it += to_even(list[i].len);
	}
}
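
/* Illustrative sketch only (kept out of the build): the intended write-side
 * pattern around entry_list_memcpy(), mirroring what entry_h_splice() does
 * near the end of this file.  It assumes the caller has filled `list`;
 * slots left with .data == NULL get pointed into the serialized buffer. */
#if 0
	entry_list_t list;
	/* ... fill list[EL_NS], list[EL_CNAME], ... as needed ... */
	knot_db_val_t val = {
		.len = entry_list_serial_size(list), /* apex header + all pieces */
		.data = NULL,
	};
	uint8_t buf[val.len];
	entry_list_memcpy((struct entry_apex *)buf, list);
	/* buf now holds the whole serialized apex value, ready for a DB write. */
#endif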

int entry_list_parse(const knot_db_val_t val, entry_list_t list)
{
	if (kr_fails_assert(val.data && val.len && list))
		return kr_error(EINVAL);
	/* Parse the apex itself (nsec parameters). */
	const struct entry_apex *ea = entry_apex_consistent(val);
	if (!ea) {
		return kr_error(EILSEQ);
	}
	const uint8_t *it = ea->data,
		*it_bound = knot_db_val_bound(val);
	for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
		if (it > it_bound) {
			return kr_error(EILSEQ);
		}
		list[i].data = (void *)it;
		switch (ea->nsecs[i]) {
		case 0:
			list[i].len = 0;
			break;
		case 1:
			list[i].len = sizeof(uint32_t); /* just timestamp */
			break;
		case 3: { /* timestamp + NSEC3PARAM wire */
			if (it + sizeof(uint32_t) + 4 > it_bound) {
				return kr_error(EILSEQ);
			}
			list[i].len = sizeof(uint32_t)
				+ nsec_p_rdlen(it + sizeof(uint32_t));
			break;
			}
		default:
			return kr_error(EILSEQ);
		}
		it += to_even(list[i].len);
	}
	/* Parse every entry_h. */
	for (int i = ENTRY_APEX_NSECS_CNT; i < EL_LENGTH; ++i) {
		list[i].data = (void *)it;
		bool has_type;
		switch (i) {
		case EL_NS:	has_type = ea->has_ns;    break;
		case EL_CNAME:	has_type = ea->has_cname; break;
		case EL_DNAME:	has_type = ea->has_dname; break;
		default:
			kr_assert(!EINVAL);
			return kr_error(EINVAL); /* something very bad */
		}
		if (!has_type) {
			list[i].len = 0;
			continue;
		}
		if (kr_fails_assert(it < it_bound))
			return kr_error(EILSEQ);
		const int len = entry_h_len(
			(knot_db_val_t){ .data = (void *)it, .len = it_bound - it });
		if (kr_fails_assert(len >= 0))
			return kr_error(len);
		list[i].len = len;
		it += to_even(len);
	}
	if (kr_fails_assert(it == it_bound)) /* better not use it; might be "damaged" */
		return kr_error(EILSEQ);
	return kr_ok();
}
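
/* For orientation, a sketch of the serialized apex value parsed above,
 * as implied by the code in this file (a reading aid, not a format spec):
 *
 *   struct entry_apex  header: has_ns/has_cname/has_dname flags + nsecs[]
 *   ENTRY_APEX_NSECS_CNT nsec slots, each according to its nsecs[i] tag:
 *       0 -> empty,  1 -> uint32_t timestamp,
 *       3 -> uint32_t timestamp + NSEC3PARAM rdata wire
 *   then one entry_h per present NS/CNAME/DNAME flag, in that order
 *
 * Every piece is padded with to_even() before the next one starts. */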

/** Given a valid entry header, find its length (i.e. the offset of the next entry).
 * \param val The beginning of the data and the bound (read only).
 */
static int entry_h_len(const knot_db_val_t val)
{
	const bool ok = val.data && ((ssize_t)val.len) > 0;
	if (!ok) return kr_error(EINVAL);

	const struct entry_h *eh = val.data;
	const uint8_t *d = eh->data; /* iterates over the data in entry */
	const uint8_t *data_bound = knot_db_val_bound(val);
	if (d >= data_bound) return kr_error(EILSEQ);
	if (!eh->is_packet) { /* Positive RRset + its RRSIG set (may be empty). */
		int sets = 2;
		while (sets-- > 0) {
			d += KR_CACHE_RR_COUNT_SIZE + rdataset_dematerialized_size(d, NULL);
			if (kr_fails_assert(d <= data_bound))
				return kr_error(EILSEQ);
		}
	} else { /* A "packet" (opaque ATM). */
		uint16_t len;
		if (d + sizeof(len) > data_bound) return kr_error(EILSEQ);
		memcpy(&len, d, sizeof(len));
		d += sizeof(len) + to_even(len);
	}
	if (kr_fails_assert(d <= data_bound))
		return kr_error(EILSEQ);
	return d - (uint8_t *)val.data;
}
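
/* Shape of the entry_h payload walked above, derived from this function
 * (again a reading aid, not a format spec):
 *   is_packet == false:  two dematerialized rdatasets back to back, each
 *                        prefixed by KR_CACHE_RR_COUNT_SIZE bytes of RR count
 *                        (the RRset itself, then its RRSIG set, possibly empty);
 *   is_packet == true:   a uint16_t length followed by that many opaque bytes,
 *                        padded via to_even().
 */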

struct entry_apex * entry_apex_consistent(knot_db_val_t val)
{
	//XXX: check lengths, etc.
	/* Minimal sanity check (an editorial assumption, not the full
	 * consistency check the XXX asks for): refuse values too short to
	 * hold the fixed header, so callers can't read past the value. */
	if (val.len < offsetof(struct entry_apex, data))
		return NULL;
	return val.data;
}

/* See the header file. */
int entry_h_seek(knot_db_val_t *val, uint16_t type)
{
	int i = -1;
	switch (type) {
	case KNOT_RRTYPE_NS:	i = EL_NS;    break;
	case KNOT_RRTYPE_CNAME:	i = EL_CNAME; break;
	case KNOT_RRTYPE_DNAME:	i = EL_DNAME; break;
	default: return kr_ok();
	}

	entry_list_t el;
	int ret = entry_list_parse(*val, el);
	if (ret) return ret;
	*val = el[i];
	return val->len ? kr_ok() : kr_error(ENOENT);
}
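
/* Illustrative only (hypothetical caller; `cache` and `key` are made-up
 * names): a read path narrowing a whole apex value to one record type. */
#if 0
	knot_db_val_t val;
	int ret = cache_op(cache, read, &key, &val, 1);
	if (!ret) ret = entry_h_seek(&val, KNOT_RRTYPE_CNAME);
	if (!ret) {
		/* val now spans just the CNAME entry_h inside the apex value. */
	}
#endif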

static int cache_write_or_clear(struct kr_cache *cache, const knot_db_val_t *key,
				knot_db_val_t *val, const struct kr_query *qry)
{
	static uint64_t ignoring_errors_until = 0; /// zero or a timestamp
	int ret = cache_op(cache, write, key, val, 1);
	if (!ret) {
		ignoring_errors_until = 0;
		return kr_ok();
	}
	VERBOSE_MSG(qry, "=> failed backend write, ret = %d\n", ret);

	if (ret == kr_error(ENOSPC) && cache->api->usage_percent(cache->db) > 90) {
		// Cache seems overfull.  Maybe the kres-cache-gc service isn't working.
		goto recovery;
	}

	/* If we get ENOSPC with usage < 90% (especially just above 80%, where GC fires),
	 * it most likely isn't a real overfull state but some LMDB bug related
	 * to transactions.  Upstream seems unlikely to address it:
	 * https://lists.openldap.org/hyperkitty/list/openldap-technical@openldap.org/thread/QHOTE2Y3WZ6E7J27OOKI44P344ETUOSF/
	 *
	 * In real life we see all processes getting an LMDB failure,
	 * but it should recover after the transactions get reopened.
	 *
	 * Fortunately the kresd cache can afford to be slightly lossy,
	 * so we ignore this and other errors for a short while.
	 */
	const uint64_t now = kr_now();
	if (!ignoring_errors_until) { // First error after a success.
		kr_log_info(CACHE, "LMDB refusing writes (ignored for 5-9s): %s\n",
				kr_strerror(ret));
		/* kr_rand_bytes(2) is 0..65535, so /16 adds 0..4095 ms of jitter
		 * on top of the 5 s base -- hence the "5-9s" in the log line. */
		ignoring_errors_until = now + 5000 + kr_rand_bytes(2)/16;
		return kr_error(ret);
	}
	if (now < ignoring_errors_until)
		return kr_error(ret);
	// We've lost patience with cache writes failing continuously.

recovery: // Try to recover by clearing the cache.
	ret = kr_cache_clear(cache);
	switch (ret) {
	default:
		kr_log_crit(CACHE, "CRITICAL: clearing cache failed: %s; fatal error, aborting\n",
				kr_strerror(ret));
		abort();
	case 0:
		kr_log_info(CACHE, "stuck cache cleared\n");
		ignoring_errors_until = 0;
		/* fall through */
	case -EAGAIN: // krcachelock race -> retry later
		return kr_error(ENOSPC);
	}
}


/* See the header file. */
int entry_h_splice(
	knot_db_val_t *val_new_entry, uint8_t rank,
	const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
	const knot_dname_t *owner/*log only*/,
	const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp)
{
	//TODO: another review, perhaps including the API
	if (kr_fails_assert(val_new_entry && val_new_entry->len > 0))
		return kr_error(EINVAL);

	int i_type;
	switch (type) {
	case KNOT_RRTYPE_NS:	i_type = EL_NS;    break;
	case KNOT_RRTYPE_CNAME:	i_type = EL_CNAME; break;
	case KNOT_RRTYPE_DNAME:	i_type = EL_DNAME; break;
	default:		i_type = 0;
	}

	/* Get eh_orig (the original entry), and also the el list in the multi-entry case. */
	const struct entry_h *eh_orig = NULL;
	entry_list_t el;
	int ret = -1;
	if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) {
		knot_db_val_t val;
		ret = cache_op(cache, read, &key, &val, 1);
		if (i_type) {
			if (!ret) ret = entry_list_parse(val, el);
			if (ret) memset(el, 0, sizeof(el));
			val = el[i_type];
		}
		/* val is on the entry, in either case (even on error) */
		if (!ret) {
			eh_orig = entry_h_consistent_E(val, type);
		}
	} else {
		/* We want to fully overwrite the entry, so don't even read it. */
		memset(el, 0, sizeof(el));
	}

	if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) {
		/* If an equal rank were accepted, spoofing a *single* answer
		 * would be enough to e.g. override an NS record in the AUTHORITY section.
		 * This way an attacker has to hit the first answer
		 * (whenever the TTL nears expiration).
		 * Stale-serving is NOT considered, but a TTL of 1 would be considered
		 * as expiring anyway, ... */
		int32_t old_ttl = get_new_ttl(eh_orig, qry, NULL, 0, timestamp);
		if (old_ttl > 0 && !is_expiring(eh_orig->ttl, old_ttl)
		    && rank <= eh_orig->rank) {
			WITH_VERBOSE(qry) {
				auto_free char *type_str = kr_rrtype_text(type),
					*owner_str = kr_dname_text(owner);
				VERBOSE_MSG(qry, "=> not overwriting %s %s\n",
						type_str, owner_str);
			}
			return kr_error(EEXIST);
		}
	}

	if (!i_type) {
		/* The non-list types are trivial now. */
		return cache_write_or_clear(cache, &key, val_new_entry, qry);
	}
	/* Now we're in trouble.  In some cases, part of the data to be written
	 * is an LMDB entry that may be invalidated by our own write request
	 * (LMDB does even in-place updates!), so we copy it all into a buffer first.
	 * LATER(optim.): do this only when necessary, or perhaps take another approach.
	 * This is also complicated by the fact that the val_new_entry part
	 * is to be written *afterwards* by the caller.
	 */
	el[i_type] = (knot_db_val_t){
		.len = val_new_entry->len,
		.data = NULL, /* perhaps unclear in the entry_h_splice() API */
	};
	knot_db_val_t val = {
		.len = entry_list_serial_size(el),
		.data = NULL,
	};
	uint8_t buf[val.len];
	entry_list_memcpy((struct entry_apex *)buf, el);
	ret = cache_write_or_clear(cache, &key, &val, qry);
	if (ret) return kr_error(ret);
	memcpy(val.data, buf, val.len); /* we also copy the "empty" space, but well... */
	val_new_entry->data = (uint8_t *)val.data
			+ ((uint8_t *)el[i_type].data - buf);
	return kr_ok();
}
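
/* Illustrative only (hypothetical caller; new_len, new_entry_bytes and the
 * other locals are made-up names): entry_h_splice() reserves space inside the
 * apex value and points val_new_entry->data into it; the caller then fills
 * the reserved space in afterwards, as the comment above describes. */
#if 0
	knot_db_val_t val_new_entry = { .data = NULL, .len = new_len };
	int ret = entry_h_splice(&val_new_entry, rank, key, KNOT_RRTYPE_NS,
				 KNOT_RRTYPE_NS, owner, qry, cache, timestamp);
	if (!ret)
		memcpy(val_new_entry.data, new_entry_bytes, new_len);
#endif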