diff options
Diffstat (limited to '')
-rw-r--r-- | lib/cache/entry_list.c | 301 |
1 files changed, 301 insertions, 0 deletions
diff --git a/lib/cache/entry_list.c b/lib/cache/entry_list.c new file mode 100644 index 0000000..4dced2f --- /dev/null +++ b/lib/cache/entry_list.c @@ -0,0 +1,301 @@ +/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz> + * SPDX-License-Identifier: GPL-3.0-or-later + */ + +/** @file + * Implementation of chaining in struct entry_h. Prototypes in ./impl.h + */ + +#include "lib/cache/impl.h" +#include "lib/utils.h" + + +static int entry_h_len(knot_db_val_t val); + + +void entry_list_memcpy(struct entry_apex *ea, entry_list_t list) +{ + if (kr_fails_assert(ea)) + return; + memset(ea, 0, offsetof(struct entry_apex, data)); + ea->has_ns = list[EL_NS ].len; + ea->has_cname = list[EL_CNAME ].len; + ea->has_dname = list[EL_DNAME ].len; + for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) { + ea->nsecs[i] = list[i].len == 0 ? 0 : + (list[i].len == 4 ? 1 : 3); + } + uint8_t *it = ea->data; + for (int i = 0; i < EL_LENGTH; ++i) { + if (list[i].data) { + memcpy(it, list[i].data, list[i].len); + /* LATER(optim.): coalesce consecutive writes? */ + } else { + list[i].data = it; + } + it += to_even(list[i].len); + } +} + +int entry_list_parse(const knot_db_val_t val, entry_list_t list) +{ + if (kr_fails_assert(val.data && val.len && list)) + return kr_error(EINVAL); + /* Parse the apex itself (nsec parameters). */ + const struct entry_apex *ea = entry_apex_consistent(val); + if (!ea) { + return kr_error(EILSEQ); + } + const uint8_t *it = ea->data, + *it_bound = knot_db_val_bound(val); + for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) { + if (it > it_bound) { + return kr_error(EILSEQ); + } + list[i].data = (void *)it; + switch (ea->nsecs[i]) { + case 0: + list[i].len = 0; + break; + case 1: + list[i].len = sizeof(uint32_t); /* just timestamp */ + break; + case 3: { /* timestamp + NSEC3PARAM wire */ + if (it + sizeof(uint32_t) + 4 > it_bound) { + return kr_error(EILSEQ); + } + list[i].len = sizeof(uint32_t) + + nsec_p_rdlen(it + sizeof(uint32_t)); + break; + } + default: + return kr_error(EILSEQ); + }; + it += to_even(list[i].len); + } + /* Parse every entry_h. */ + for (int i = ENTRY_APEX_NSECS_CNT; i < EL_LENGTH; ++i) { + list[i].data = (void *)it; + bool has_type; + switch (i) { + case EL_NS: has_type = ea->has_ns; break; + case EL_CNAME: has_type = ea->has_cname; break; + case EL_DNAME: has_type = ea->has_dname; break; + default: + kr_assert(!EINVAL); + return kr_error(EINVAL); /* something very bad */ + } + if (!has_type) { + list[i].len = 0; + continue; + } + if (kr_fails_assert(it < it_bound)) + return kr_error(EILSEQ); + const int len = entry_h_len( + (knot_db_val_t){ .data = (void *)it, .len = it_bound - it }); + if (kr_fails_assert(len >= 0)) + return kr_error(len); + list[i].len = len; + it += to_even(len); + } + if (kr_fails_assert(it == it_bound)) /* better not use it; might be "damaged" */ + return kr_error(EILSEQ); + return kr_ok(); +} + +/** Given a valid entry header, find its length (i.e. offset of the next entry). + * \param val The beginning of the data and the bound (read only). + */ +static int entry_h_len(const knot_db_val_t val) +{ + const bool ok = val.data && ((ssize_t)val.len) > 0; + if (!ok) return kr_error(EINVAL); + const struct entry_h *eh = val.data; + const uint8_t *d = eh->data; /* iterates over the data in entry */ + const uint8_t *data_bound = knot_db_val_bound(val); + if (d >= data_bound) return kr_error(EILSEQ); + if (!eh->is_packet) { /* Positive RRset + its RRsig set (may be empty). */ + int sets = 2; + while (sets-- > 0) { + d += KR_CACHE_RR_COUNT_SIZE + rdataset_dematerialized_size(d, NULL); + if (kr_fails_assert(d <= data_bound)) + return kr_error(EILSEQ); + } + } else { /* A "packet" (opaque ATM). */ + uint16_t len; + if (d + sizeof(len) > data_bound) return kr_error(EILSEQ); + memcpy(&len, d, sizeof(len)); + d += 2 + to_even(len); + } + if (kr_fails_assert(d <= data_bound)) + return kr_error(EILSEQ); + return d - (uint8_t *)val.data; +} + +struct entry_apex * entry_apex_consistent(knot_db_val_t val) +{ + //XXX: check lengths, etc. + return val.data; +} + +/* See the header file. */ +int entry_h_seek(knot_db_val_t *val, uint16_t type) +{ + int i = -1; + switch (type) { + case KNOT_RRTYPE_NS: i = EL_NS; break; + case KNOT_RRTYPE_CNAME: i = EL_CNAME; break; + case KNOT_RRTYPE_DNAME: i = EL_DNAME; break; + default: return kr_ok(); + } + + entry_list_t el; + int ret = entry_list_parse(*val, el); + if (ret) return ret; + *val = el[i]; + return val->len ? kr_ok() : kr_error(ENOENT); +} + +static int cache_write_or_clear(struct kr_cache *cache, const knot_db_val_t *key, + knot_db_val_t *val, const struct kr_query *qry) +{ + static uint64_t ignoring_errors_until = 0; /// zero or a timestamp + int ret = cache_op(cache, write, key, val, 1); + if (!ret) { + ignoring_errors_until = 0; + return kr_ok(); + } + VERBOSE_MSG(qry, "=> failed backend write, ret = %d\n", ret); + + if (ret == kr_error(ENOSPC) && cache->api->usage_percent(cache->db) > 90) { + // Cache seems overfull. Maybe kres-cache-gc service doesn't work. + goto recovery; + } + + /* If we get ENOSPC with usage < 90% (especially just above 80% when GC fires), + * it most likely isn't real overfull state but some LMDB bug related + * to transactions. Upstream seems unlikely to address it: + https://lists.openldap.org/hyperkitty/list/openldap-technical@openldap.org/thread/QHOTE2Y3WZ6E7J27OOKI44P344ETUOSF/ + * + * In real life we see all processes getting a LMDB failure + * but it should recover after the transactions get reopened. + * + * Fortunately the kresd cache can afford to be slightly lossy, + * so we ignore this and other errors for a short while. + */ + const uint64_t now = kr_now(); + if (!ignoring_errors_until) { // First error after a success. + kr_log_info(CACHE, "LMDB refusing writes (ignored for 5-9s): %s\n", + kr_strerror(ret)); + ignoring_errors_until = now + 5000 + kr_rand_bytes(2)/16; + return kr_error(ret); + } + if (now < ignoring_errors_until) + return kr_error(ret); + // We've lost patience with cache writes not working continuously. + +recovery: // Try to recover by clearing cache. + ret = kr_cache_clear(cache); + switch (ret) { + default: + kr_log_crit(CACHE, "CRITICAL: clearing cache failed: %s; fatal error, aborting\n", + kr_strerror(ret)); + abort(); + case 0: + kr_log_info(CACHE, "stuck cache cleared\n"); + ignoring_errors_until = 0; + case -EAGAIN: // fall-through; krcachelock race -> retry later + return kr_error(ENOSPC); + } +} + + +/* See the header file. */ +int entry_h_splice( + knot_db_val_t *val_new_entry, uint8_t rank, + const knot_db_val_t key, const uint16_t ktype, const uint16_t type, + const knot_dname_t *owner/*log only*/, + const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp) +{ + //TODO: another review, perhaps including the API + if (kr_fails_assert(val_new_entry && val_new_entry->len > 0)) + return kr_error(EINVAL); + + int i_type; + switch (type) { + case KNOT_RRTYPE_NS: i_type = EL_NS; break; + case KNOT_RRTYPE_CNAME: i_type = EL_CNAME; break; + case KNOT_RRTYPE_DNAME: i_type = EL_DNAME; break; + default: i_type = 0; + } + + /* Get eh_orig (original entry), and also el list if multi-entry case. */ + const struct entry_h *eh_orig = NULL; + entry_list_t el; + int ret = -1; + if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) { + knot_db_val_t val; + ret = cache_op(cache, read, &key, &val, 1); + if (i_type) { + if (!ret) ret = entry_list_parse(val, el); + if (ret) memset(el, 0, sizeof(el)); + val = el[i_type]; + } + /* val is on the entry, in either case (or error) */ + if (!ret) { + eh_orig = entry_h_consistent_E(val, type); + } + } else { + /* We want to fully overwrite the entry, so don't even read it. */ + memset(el, 0, sizeof(el)); + } + + if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) { + /* If equal rank was accepted, spoofing a *single* answer would be + * enough to e.g. override NS record in AUTHORITY section. + * This way they would have to hit the first answer + * (whenever TTL nears expiration). + * Stale-serving is NOT considered, but TTL 1 would be considered + * as expiring anyway, ... */ + int32_t old_ttl = get_new_ttl(eh_orig, qry, NULL, 0, timestamp); + if (old_ttl > 0 && !is_expiring(eh_orig->ttl, old_ttl) + && rank <= eh_orig->rank) { + WITH_VERBOSE(qry) { + auto_free char *type_str = kr_rrtype_text(type), + *owner_str = kr_dname_text(owner); + VERBOSE_MSG(qry, "=> not overwriting %s %s\n", + type_str, owner_str); + } + return kr_error(EEXIST); + } + } + + if (!i_type) { + /* The non-list types are trivial now. */ + return cache_write_or_clear(cache, &key, val_new_entry, qry); + } + /* Now we're in trouble. In some cases, parts of data to be written + * is an lmdb entry that may be invalidated by our write request. + * (lmdb does even in-place updates!) Therefore we copy all into a buffer. + * LATER(optim.): do this only when necessary, or perhaps another approach. + * This is also complicated by the fact that the val_new_entry part + * is to be written *afterwards* by the caller. + */ + el[i_type] = (knot_db_val_t){ + .len = val_new_entry->len, + .data = NULL, /* perhaps unclear in the entry_h_splice() API */ + }; + knot_db_val_t val = { + .len = entry_list_serial_size(el), + .data = NULL, + }; + uint8_t buf[val.len]; + entry_list_memcpy((struct entry_apex *)buf, el); + ret = cache_write_or_clear(cache, &key, &val, qry); + if (ret) return kr_error(ret); + memcpy(val.data, buf, val.len); /* we also copy the "empty" space, but well... */ + val_new_entry->data = (uint8_t *)val.data + + ((uint8_t *)el[i_type].data - buf); + return kr_ok(); +} + |