diff options
Diffstat (limited to '')
-rw-r--r-- | lib/dns/journal.c | 2364 |
1 files changed, 2364 insertions, 0 deletions
diff --git a/lib/dns/journal.c b/lib/dns/journal.c new file mode 100644 index 0000000..bdbc459 --- /dev/null +++ b/lib/dns/journal.c @@ -0,0 +1,2364 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <config.h> + +#include <inttypes.h> +#include <stdbool.h> + +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> + +#include <isc/file.h> +#include <isc/mem.h> +#include <isc/print.h> +#include <isc/stdio.h> +#include <isc/string.h> +#include <isc/util.h> + +#include <dns/compress.h> +#include <dns/db.h> +#include <dns/dbiterator.h> +#include <dns/diff.h> +#include <dns/fixedname.h> +#include <dns/journal.h> +#include <dns/log.h> +#include <dns/rdataset.h> +#include <dns/rdatasetiter.h> +#include <dns/result.h> +#include <dns/soa.h> + +/*! \file + * \brief Journaling. + * + * A journal file consists of + * + * \li A fixed-size header of type journal_rawheader_t. + * + * \li The index. This is an unordered array of index entries + * of type journal_rawpos_t giving the locations + * of some arbitrary subset of the journal's addressable + * transactions. The index entries are used as hints to + * speed up the process of locating a transaction with a given + * serial number. Unused index entries have an "offset" + * field of zero. The size of the index can vary between + * journal files, but does not change during the lifetime + * of a file. The size can be zero. + * + * \li The journal data. This consists of one or more transactions. + * Each transaction begins with a transaction header of type + * journal_rawxhdr_t. The transaction header is followed by a + * sequence of RRs, similar in structure to an IXFR difference + * sequence (RFC1995). That is, the pre-transaction SOA, + * zero or more other deleted RRs, the post-transaction SOA, + * and zero or more other added RRs. Unlike in IXFR, each RR + * is prefixed with a 32-bit length. + * + * The journal data part grows as new transactions are + * appended to the file. Only those transactions + * whose serial number is current-(2^31-1) to current + * are considered "addressable" and may be pointed + * to from the header or index. They may be preceded + * by old transactions that are no longer addressable, + * and they may be followed by transactions that were + * appended to the journal but never committed by updating + * the "end" position in the header. The latter will + * be overwritten when new transactions are added. + */ +/*% + * When true, accept IXFR difference sequences where the + * SOA serial number does not change (BIND 8 sends such + * sequences). + */ +static bool bind8_compat = true; /* XXX config */ + +/**************************************************************************/ +/* + * Miscellaneous utilities. + */ + +#define JOURNAL_COMMON_LOGARGS \ + dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL + +#define JOURNAL_DEBUG_LOGARGS(n) \ + JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n) + +/*% + * It would be non-sensical (or at least obtuse) to use FAIL() with an + * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler + * from complaining about "end-of-loop code not reached". + */ +#define FAIL(code) \ + do { result = (code); \ + if (result != ISC_R_SUCCESS) goto failure; \ + } while (0) + +#define CHECK(op) \ + do { result = (op); \ + if (result != ISC_R_SUCCESS) goto failure; \ + } while (0) + +#define JOURNAL_SERIALSET 0x01U + +static isc_result_t index_to_disk(dns_journal_t *); + +static inline uint32_t +decode_uint32(unsigned char *p) { + return ((p[0] << 24) + + (p[1] << 16) + + (p[2] << 8) + + (p[3] << 0)); +} + +static inline void +encode_uint32(uint32_t val, unsigned char *p) { + p[0] = (uint8_t)(val >> 24); + p[1] = (uint8_t)(val >> 16); + p[2] = (uint8_t)(val >> 8); + p[3] = (uint8_t)(val >> 0); +} + +isc_result_t +dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx, + dns_diffop_t op, dns_difftuple_t **tp) +{ + isc_result_t result; + dns_dbnode_t *node; + dns_rdataset_t rdataset; + dns_rdata_t rdata = DNS_RDATA_INIT; + dns_fixedname_t fixed; + dns_name_t *zonename; + + zonename = dns_fixedname_initname(&fixed); + dns_name_copy(dns_db_origin(db), zonename, NULL); + + node = NULL; + result = dns_db_findnode(db, zonename, false, &node); + if (result != ISC_R_SUCCESS) + goto nonode; + + dns_rdataset_init(&rdataset); + result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0, + (isc_stdtime_t)0, &rdataset, NULL); + if (result != ISC_R_SUCCESS) + goto freenode; + + result = dns_rdataset_first(&rdataset); + if (result != ISC_R_SUCCESS) + goto freenode; + + dns_rdataset_current(&rdataset, &rdata); + dns_rdataset_getownercase(&rdataset, zonename); + + result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl, + &rdata, tp); + + dns_rdataset_disassociate(&rdataset); + dns_db_detachnode(db, &node); + return (result); + + freenode: + dns_db_detachnode(db, &node); + nonode: + UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA"); + return (result); +} + +/* Journaling */ + +/*% + * On-disk representation of a "pointer" to a journal entry. + * These are used in the journal header to locate the beginning + * and end of the journal, and in the journal index to locate + * other transactions. + */ +typedef struct { + unsigned char serial[4]; /*%< SOA serial before update. */ + /* + * XXXRTH Should offset be 8 bytes? + * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs. + * XXXAG ... but we will not be able to seek >2G anyway on many + * platforms as long as we are using fseek() rather + * than lseek(). + */ + unsigned char offset[4]; /*%< Offset from beginning of file. */ +} journal_rawpos_t; + + +/*% + * The header is of a fixed size, with some spare room for future + * extensions. + */ +#define JOURNAL_HEADER_SIZE 64 /* Bytes. */ + +/*% + * The on-disk representation of the journal header. + * All numbers are stored in big-endian order. + */ +typedef union { + struct { + /*% File format version ID. */ + unsigned char format[16]; + /*% Position of the first addressable transaction */ + journal_rawpos_t begin; + /*% Position of the next (yet nonexistent) transaction. */ + journal_rawpos_t end; + /*% Number of index entries following the header. */ + unsigned char index_size[4]; + /*% Source serial number. */ + unsigned char sourceserial[4]; + unsigned char flags; + } h; + /* Pad the header to a fixed size. */ + unsigned char pad[JOURNAL_HEADER_SIZE]; +} journal_rawheader_t; + +/*% + * The on-disk representation of the transaction header. + * There is one of these at the beginning of each transaction. + */ +typedef struct { + unsigned char size[4]; /*%< In bytes, excluding header. */ + unsigned char serial0[4]; /*%< SOA serial before update. */ + unsigned char serial1[4]; /*%< SOA serial after update. */ +} journal_rawxhdr_t; + +/*% + * The on-disk representation of the RR header. + * There is one of these at the beginning of each RR. + */ +typedef struct { + unsigned char size[4]; /*%< In bytes, excluding header. */ +} journal_rawrrhdr_t; + +/*% + * The in-core representation of the journal header. + */ +typedef struct { + uint32_t serial; + isc_offset_t offset; +} journal_pos_t; + +#define POS_VALID(pos) ((pos).offset != 0) +#define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0) + +typedef struct { + unsigned char format[16]; + journal_pos_t begin; + journal_pos_t end; + uint32_t index_size; + uint32_t sourceserial; + bool serialset; +} journal_header_t; + +/*% + * The in-core representation of the transaction header. + */ + +typedef struct { + uint32_t size; + uint32_t serial0; + uint32_t serial1; +} journal_xhdr_t; + +/*% + * The in-core representation of the RR header. + */ +typedef struct { + uint32_t size; +} journal_rrhdr_t; + + +/*% + * Initial contents to store in the header of a newly created + * journal file. + * + * The header starts with the magic string ";BIND LOG V9\n" + * to identify the file as a BIND 9 journal file. An ASCII + * identification string is used rather than a binary magic + * number to be consistent with BIND 8 (BIND 8 journal files + * are ASCII text files). + */ + +static journal_header_t +initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0 }; + +#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset) + +typedef enum { + JOURNAL_STATE_INVALID, + JOURNAL_STATE_READ, + JOURNAL_STATE_WRITE, + JOURNAL_STATE_TRANSACTION, + JOURNAL_STATE_INLINE +} journal_state_t; + +struct dns_journal { + unsigned int magic; /*%< JOUR */ + isc_mem_t *mctx; /*%< Memory context */ + journal_state_t state; + char *filename; /*%< Journal file name */ + FILE * fp; /*%< File handle */ + isc_offset_t offset; /*%< Current file offset */ + journal_header_t header; /*%< In-core journal header */ + unsigned char *rawindex; /*%< In-core buffer for journal index in on-disk format */ + journal_pos_t *index; /*%< In-core journal index */ + + /*% Current transaction state (when writing). */ + struct { + unsigned int n_soa; /*%< Number of SOAs seen */ + journal_pos_t pos[2]; /*%< Begin/end position */ + } x; + + /*% Iteration state (when reading). */ + struct { + /* These define the part of the journal we iterate over. */ + journal_pos_t bpos; /*%< Position before first, */ + journal_pos_t epos; /*%< and after last transaction */ + /* The rest is iterator state. */ + uint32_t current_serial; /*%< Current SOA serial */ + isc_buffer_t source; /*%< Data from disk */ + isc_buffer_t target; /*%< Data from _fromwire check */ + dns_decompress_t dctx; /*%< Dummy decompression ctx */ + dns_name_t name; /*%< Current domain name */ + dns_rdata_t rdata; /*%< Current rdata */ + uint32_t ttl; /*%< Current TTL */ + unsigned int xsize; /*%< Size of transaction data */ + unsigned int xpos; /*%< Current position in it */ + isc_result_t result; /*%< Result of last call */ + } it; +}; + +#define DNS_JOURNAL_MAGIC ISC_MAGIC('J', 'O', 'U', 'R') +#define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC) + +static void +journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) { + cooked->serial = decode_uint32(raw->serial); + cooked->offset = decode_uint32(raw->offset); +} + +static void +journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) { + encode_uint32(cooked->serial, raw->serial); + encode_uint32(cooked->offset, raw->offset); +} + +static void +journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) { + INSIST(sizeof(cooked->format) == sizeof(raw->h.format)); + memmove(cooked->format, raw->h.format, sizeof(cooked->format)); + journal_pos_decode(&raw->h.begin, &cooked->begin); + journal_pos_decode(&raw->h.end, &cooked->end); + cooked->index_size = decode_uint32(raw->h.index_size); + cooked->sourceserial = decode_uint32(raw->h.sourceserial); + cooked->serialset = (raw->h.flags & JOURNAL_SERIALSET); +} + +static void +journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) { + unsigned char flags = 0; + + INSIST(sizeof(cooked->format) == sizeof(raw->h.format)); + memset(raw->pad, 0, sizeof(raw->pad)); + memmove(raw->h.format, cooked->format, sizeof(raw->h.format)); + journal_pos_encode(&raw->h.begin, &cooked->begin); + journal_pos_encode(&raw->h.end, &cooked->end); + encode_uint32(cooked->index_size, raw->h.index_size); + encode_uint32(cooked->sourceserial, raw->h.sourceserial); + if (cooked->serialset) + flags |= JOURNAL_SERIALSET; + raw->h.flags = flags; +} + +/* + * Journal file I/O subroutines, with error checking and reporting. + */ +static isc_result_t +journal_seek(dns_journal_t *j, uint32_t offset) { + isc_result_t result; + + result = isc_stdio_seek(j->fp, (off_t)offset, SEEK_SET); + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: seek: %s", j->filename, + isc_result_totext(result)); + return (ISC_R_UNEXPECTED); + } + j->offset = offset; + return (ISC_R_SUCCESS); +} + +static isc_result_t +journal_read(dns_journal_t *j, void *mem, size_t nbytes) { + isc_result_t result; + + result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL); + if (result != ISC_R_SUCCESS) { + if (result == ISC_R_EOF) + return (ISC_R_NOMORE); + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: read: %s", + j->filename, isc_result_totext(result)); + return (ISC_R_UNEXPECTED); + } + j->offset += (isc_offset_t)nbytes; + return (ISC_R_SUCCESS); +} + +static isc_result_t +journal_write(dns_journal_t *j, void *mem, size_t nbytes) { + isc_result_t result; + + result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL); + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: write: %s", + j->filename, isc_result_totext(result)); + return (ISC_R_UNEXPECTED); + } + j->offset += (isc_offset_t)nbytes; + return (ISC_R_SUCCESS); +} + +static isc_result_t +journal_fsync(dns_journal_t *j) { + isc_result_t result; + result = isc_stdio_flush(j->fp); + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: flush: %s", + j->filename, isc_result_totext(result)); + return (ISC_R_UNEXPECTED); + } + result = isc_stdio_sync(j->fp); + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: fsync: %s", + j->filename, isc_result_totext(result)); + return (ISC_R_UNEXPECTED); + } + return (ISC_R_SUCCESS); +} + +/* + * Read/write a transaction header at the current file position. + */ + +static isc_result_t +journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) { + journal_rawxhdr_t raw; + isc_result_t result; + result = journal_read(j, &raw, sizeof(raw)); + if (result != ISC_R_SUCCESS) + return (result); + xhdr->size = decode_uint32(raw.size); + xhdr->serial0 = decode_uint32(raw.serial0); + xhdr->serial1 = decode_uint32(raw.serial1); + return (ISC_R_SUCCESS); +} + +static isc_result_t +journal_write_xhdr(dns_journal_t *j, uint32_t size, + uint32_t serial0, uint32_t serial1) +{ + journal_rawxhdr_t raw; + encode_uint32(size, raw.size); + encode_uint32(serial0, raw.serial0); + encode_uint32(serial1, raw.serial1); + return (journal_write(j, &raw, sizeof(raw))); +} + + +/* + * Read an RR header at the current file position. + */ + +static isc_result_t +journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) { + journal_rawrrhdr_t raw; + isc_result_t result; + result = journal_read(j, &raw, sizeof(raw)); + if (result != ISC_R_SUCCESS) + return (result); + rrhdr->size = decode_uint32(raw.size); + return (ISC_R_SUCCESS); +} + +static isc_result_t +journal_file_create(isc_mem_t *mctx, const char *filename) { + FILE *fp = NULL; + isc_result_t result; + journal_header_t header; + journal_rawheader_t rawheader; + int index_size = 56; /* XXX configurable */ + int size; + void *mem; /* Memory for temporary index image. */ + + INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE); + + result = isc_stdio_open(filename, "wb", &fp); + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: create: %s", + filename, isc_result_totext(result)); + return (ISC_R_UNEXPECTED); + } + + header = initial_journal_header; + header.index_size = index_size; + journal_header_encode(&header, &rawheader); + + size = sizeof(journal_rawheader_t) + + index_size * sizeof(journal_rawpos_t); + + mem = isc_mem_get(mctx, size); + if (mem == NULL) { + (void)isc_stdio_close(fp); + (void)isc_file_remove(filename); + return (ISC_R_NOMEMORY); + } + memset(mem, 0, size); + memmove(mem, &rawheader, sizeof(rawheader)); + + result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL); + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: write: %s", + filename, isc_result_totext(result)); + (void)isc_stdio_close(fp); + (void)isc_file_remove(filename); + isc_mem_put(mctx, mem, size); + return (ISC_R_UNEXPECTED); + } + isc_mem_put(mctx, mem, size); + + result = isc_stdio_close(fp); + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: close: %s", + filename, isc_result_totext(result)); + (void)isc_file_remove(filename); + return (ISC_R_UNEXPECTED); + } + + return (ISC_R_SUCCESS); +} + +static isc_result_t +journal_open(isc_mem_t *mctx, const char *filename, bool writable, + bool create, dns_journal_t **journalp) +{ + FILE *fp = NULL; + isc_result_t result; + journal_rawheader_t rawheader; + dns_journal_t *j; + + INSIST(journalp != NULL && *journalp == NULL); + j = isc_mem_get(mctx, sizeof(*j)); + if (j == NULL) + return (ISC_R_NOMEMORY); + + j->mctx = NULL; + isc_mem_attach(mctx, &j->mctx); + j->state = JOURNAL_STATE_INVALID; + j->fp = NULL; + j->filename = isc_mem_strdup(mctx, filename); + j->index = NULL; + j->rawindex = NULL; + + if (j->filename == NULL) + FAIL(ISC_R_NOMEMORY); + + result = isc_stdio_open(j->filename, writable ? "rb+" : "rb", &fp); + + if (result == ISC_R_FILENOTFOUND) { + if (create) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1), + "journal file %s does not exist, " + "creating it", j->filename); + CHECK(journal_file_create(mctx, filename)); + /* + * Retry. + */ + result = isc_stdio_open(j->filename, "rb+", &fp); + } else { + FAIL(ISC_R_NOTFOUND); + } + } + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: open: %s", + j->filename, isc_result_totext(result)); + FAIL(ISC_R_UNEXPECTED); + } + + j->fp = fp; + + /* + * Set magic early so that seek/read can succeed. + */ + j->magic = DNS_JOURNAL_MAGIC; + + CHECK(journal_seek(j, 0)); + CHECK(journal_read(j, &rawheader, sizeof(rawheader))); + + if (memcmp(rawheader.h.format, initial_journal_header.format, + sizeof(initial_journal_header.format)) != 0) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal format not recognized", + j->filename); + FAIL(ISC_R_UNEXPECTED); + } + journal_header_decode(&rawheader, &j->header); + + /* + * If there is an index, read the raw index into a dynamically + * allocated buffer and then convert it into a cooked index. + */ + if (j->header.index_size != 0) { + unsigned int i; + unsigned int rawbytes; + unsigned char *p; + + rawbytes = j->header.index_size * sizeof(journal_rawpos_t); + j->rawindex = isc_mem_get(mctx, rawbytes); + if (j->rawindex == NULL) + FAIL(ISC_R_NOMEMORY); + + CHECK(journal_read(j, j->rawindex, rawbytes)); + + j->index = isc_mem_get(mctx, j->header.index_size * + sizeof(journal_pos_t)); + if (j->index == NULL) + FAIL(ISC_R_NOMEMORY); + + p = j->rawindex; + for (i = 0; i < j->header.index_size; i++) { + j->index[i].serial = decode_uint32(p); + p += 4; + j->index[i].offset = decode_uint32(p); + p += 4; + } + INSIST(p == j->rawindex + rawbytes); + } + j->offset = -1; /* Invalid, must seek explicitly. */ + + /* + * Initialize the iterator. + */ + dns_name_init(&j->it.name, NULL); + dns_rdata_init(&j->it.rdata); + + /* + * Set up empty initial buffers for unchecked and checked + * wire format RR data. They will be reallocated + * later. + */ + isc_buffer_init(&j->it.source, NULL, 0); + isc_buffer_init(&j->it.target, NULL, 0); + dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE); + + j->state = + writable ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ; + + *journalp = j; + return (ISC_R_SUCCESS); + + failure: + j->magic = 0; + if (j->rawindex != NULL) + isc_mem_put(j->mctx, j->rawindex, j->header.index_size * + sizeof(journal_rawpos_t)); + if (j->index != NULL) + isc_mem_put(j->mctx, j->index, j->header.index_size * + sizeof(journal_pos_t)); + if (j->filename != NULL) + isc_mem_free(j->mctx, j->filename); + if (j->fp != NULL) + (void)isc_stdio_close(j->fp); + isc_mem_putanddetach(&j->mctx, j, sizeof(*j)); + return (result); +} + +isc_result_t +dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode, + dns_journal_t **journalp) +{ + isc_result_t result; + size_t namelen; + char backup[1024]; + bool writable, create; + + create = (mode & DNS_JOURNAL_CREATE); + writable = (mode & (DNS_JOURNAL_WRITE|DNS_JOURNAL_CREATE)); + + result = journal_open(mctx, filename, writable, create, journalp); + if (result == ISC_R_NOTFOUND) { + namelen = strlen(filename); + if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0) + namelen -= 4; + + result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk", + (int)namelen, filename); + if (result != ISC_R_SUCCESS) + return (result); + result = journal_open(mctx, backup, writable, writable, + journalp); + } + return (result); +} + +/* + * A comparison function defining the sorting order for + * entries in the IXFR-style journal file. + * + * The IXFR format requires that deletions are sorted before + * additions, and within either one, SOA records are sorted + * before others. + * + * Also sort the non-SOA records by type as a courtesy to the + * server receiving the IXFR - it may help reduce the amount of + * rdataset merging it has to do. + */ +static int +ixfr_order(const void *av, const void *bv) { + dns_difftuple_t const * const *ap = av; + dns_difftuple_t const * const *bp = bv; + dns_difftuple_t const *a = *ap; + dns_difftuple_t const *b = *bp; + int r; + int bop = 0, aop = 0; + + switch (a->op) { + case DNS_DIFFOP_DEL: + case DNS_DIFFOP_DELRESIGN: + aop = 1; + break; + case DNS_DIFFOP_ADD: + case DNS_DIFFOP_ADDRESIGN: + aop = 0; + break; + default: + INSIST(0); + } + + switch (b->op) { + case DNS_DIFFOP_DEL: + case DNS_DIFFOP_DELRESIGN: + bop = 1; + break; + case DNS_DIFFOP_ADD: + case DNS_DIFFOP_ADDRESIGN: + bop = 0; + break; + default: + INSIST(0); + } + + r = bop - aop; + if (r != 0) + return (r); + + r = (b->rdata.type == dns_rdatatype_soa) - + (a->rdata.type == dns_rdatatype_soa); + if (r != 0) + return (r); + + r = (a->rdata.type - b->rdata.type); + return (r); +} + +/* + * Advance '*pos' to the next journal transaction. + * + * Requires: + * *pos refers to a valid journal transaction. + * + * Ensures: + * When ISC_R_SUCCESS is returned, + * *pos refers to the next journal transaction. + * + * Returns one of: + * + * ISC_R_SUCCESS + * ISC_R_NOMORE *pos pointed at the last transaction + * Other results due to file errors are possible. + */ +static isc_result_t +journal_next(dns_journal_t *j, journal_pos_t *pos) { + isc_result_t result; + journal_xhdr_t xhdr; + REQUIRE(DNS_JOURNAL_VALID(j)); + + result = journal_seek(j, pos->offset); + if (result != ISC_R_SUCCESS) + return (result); + + if (pos->serial == j->header.end.serial) + return (ISC_R_NOMORE); + /* + * Read the header of the current transaction. + * This will return ISC_R_NOMORE if we are at EOF. + */ + result = journal_read_xhdr(j, &xhdr); + if (result != ISC_R_SUCCESS) + return (result); + + /* + * Check serial number consistency. + */ + if (xhdr.serial0 != pos->serial) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal file corrupt: " + "expected serial %u, got %u", + j->filename, pos->serial, xhdr.serial0); + return (ISC_R_UNEXPECTED); + } + + /* + * Check for offset wraparound. + */ + if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size) + < pos->offset) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: offset too large", j->filename); + return (ISC_R_UNEXPECTED); + } + + pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size; + pos->serial = xhdr.serial1; + return (ISC_R_SUCCESS); +} + +/* + * If the index of the journal 'j' contains an entry "better" + * than '*best_guess', replace '*best_guess' with it. + * + * "Better" means having a serial number closer to 'serial' + * but not greater than 'serial'. + */ +static void +index_find(dns_journal_t *j, uint32_t serial, journal_pos_t *best_guess) { + unsigned int i; + if (j->index == NULL) + return; + for (i = 0; i < j->header.index_size; i++) { + if (POS_VALID(j->index[i]) && + DNS_SERIAL_GE(serial, j->index[i].serial) && + DNS_SERIAL_GT(j->index[i].serial, best_guess->serial)) + *best_guess = j->index[i]; + } +} + +/* + * Add a new index entry. If there is no room, make room by removing + * the odd-numbered entries and compacting the others into the first + * half of the index. This decimates old index entries exponentially + * over time, so that the index always contains a much larger fraction + * of recent serial numbers than of old ones. This is deliberate - + * most index searches are for outgoing IXFR, and IXFR tends to request + * recent versions more often than old ones. + */ +static void +index_add(dns_journal_t *j, journal_pos_t *pos) { + unsigned int i; + if (j->index == NULL) + return; + /* + * Search for a vacant position. + */ + for (i = 0; i < j->header.index_size; i++) { + if (! POS_VALID(j->index[i])) + break; + } + if (i == j->header.index_size) { + unsigned int k = 0; + /* + * Found no vacant position. Make some room. + */ + for (i = 0; i < j->header.index_size; i += 2) { + j->index[k++] = j->index[i]; + } + i = k; /* 'i' identifies the first vacant position. */ + while (k < j->header.index_size) { + POS_INVALIDATE(j->index[k]); + k++; + } + } + INSIST(i < j->header.index_size); + INSIST(! POS_VALID(j->index[i])); + + /* + * Store the new index entry. + */ + j->index[i] = *pos; +} + +/* + * Invalidate any existing index entries that could become + * ambiguous when a new transaction with number 'serial' is added. + */ +static void +index_invalidate(dns_journal_t *j, uint32_t serial) { + unsigned int i; + if (j->index == NULL) + return; + for (i = 0; i < j->header.index_size; i++) { + if (! DNS_SERIAL_GT(serial, j->index[i].serial)) + POS_INVALIDATE(j->index[i]); + } +} + +/* + * Try to find a transaction with initial serial number 'serial' + * in the journal 'j'. + * + * If found, store its position at '*pos' and return ISC_R_SUCCESS. + * + * If 'serial' is current (= the ending serial number of the + * last transaction in the journal), set '*pos' to + * the position immediately following the last transaction and + * return ISC_R_SUCCESS. + * + * If 'serial' is within the range of addressable serial numbers + * covered by the journal but that particular serial number is missing + * (from the journal, not just from the index), return ISC_R_NOTFOUND. + * + * If 'serial' is outside the range of addressable serial numbers + * covered by the journal, return ISC_R_RANGE. + * + */ +static isc_result_t +journal_find(dns_journal_t *j, uint32_t serial, journal_pos_t *pos) { + isc_result_t result; + journal_pos_t current_pos; + REQUIRE(DNS_JOURNAL_VALID(j)); + + if (DNS_SERIAL_GT(j->header.begin.serial, serial)) + return (ISC_R_RANGE); + if (DNS_SERIAL_GT(serial, j->header.end.serial)) + return (ISC_R_RANGE); + if (serial == j->header.end.serial) { + *pos = j->header.end; + return (ISC_R_SUCCESS); + } + + current_pos = j->header.begin; + index_find(j, serial, ¤t_pos); + + while (current_pos.serial != serial) { + if (DNS_SERIAL_GT(current_pos.serial, serial)) + return (ISC_R_NOTFOUND); + result = journal_next(j, ¤t_pos); + if (result != ISC_R_SUCCESS) + return (result); + } + *pos = current_pos; + return (ISC_R_SUCCESS); +} + +isc_result_t +dns_journal_begin_transaction(dns_journal_t *j) { + uint32_t offset; + isc_result_t result; + journal_rawxhdr_t hdr; + + REQUIRE(DNS_JOURNAL_VALID(j)); + REQUIRE(j->state == JOURNAL_STATE_WRITE || + j->state == JOURNAL_STATE_INLINE); + + /* + * Find the file offset where the new transaction should + * be written, and seek there. + */ + if (JOURNAL_EMPTY(&j->header)) { + offset = sizeof(journal_rawheader_t) + + j->header.index_size * sizeof(journal_rawpos_t); + } else { + offset = j->header.end.offset; + } + j->x.pos[0].offset = offset; + j->x.pos[1].offset = offset; /* Initial value, will be incremented. */ + j->x.n_soa = 0; + + CHECK(journal_seek(j, offset)); + + /* + * Write a dummy transaction header of all zeroes to reserve + * space. It will be filled in when the transaction is + * finished. + */ + memset(&hdr, 0, sizeof(hdr)); + CHECK(journal_write(j, &hdr, sizeof(hdr))); + j->x.pos[1].offset = j->offset; + + j->state = JOURNAL_STATE_TRANSACTION; + result = ISC_R_SUCCESS; + failure: + return (result); +} + +isc_result_t +dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) { + dns_difftuple_t *t; + isc_buffer_t buffer; + void *mem = NULL; + uint64_t size; + isc_result_t result; + isc_region_t used; + + REQUIRE(DNS_DIFF_VALID(diff)); + REQUIRE(j->state == JOURNAL_STATE_TRANSACTION); + + isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal"); + (void)dns_diff_print(diff, NULL); + + /* + * Pass 1: determine the buffer size needed, and + * keep track of SOA serial numbers. + */ + size = 0; + for (t = ISC_LIST_HEAD(diff->tuples); t != NULL; + t = ISC_LIST_NEXT(t, link)) + { + if (t->rdata.type == dns_rdatatype_soa) { + if (j->x.n_soa < 2) + j->x.pos[j->x.n_soa].serial = + dns_soa_getserial(&t->rdata); + j->x.n_soa++; + } + size += sizeof(journal_rawrrhdr_t); + size += t->name.length; /* XXX should have access macro? */ + size += 10; + size += t->rdata.length; + } + + if (size >= INT32_MAX) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "dns_journal_writediff: %s: journal entry " + "too big to be stored: %" PRIu64 " bytes", + j->filename, size); + return (ISC_R_NOSPACE); + } + + mem = isc_mem_get(j->mctx, size); + if (mem == NULL) + return (ISC_R_NOMEMORY); + + isc_buffer_init(&buffer, mem, size); + + /* + * Pass 2. Write RRs to buffer. + */ + for (t = ISC_LIST_HEAD(diff->tuples); t != NULL; + t = ISC_LIST_NEXT(t, link)) + { + /* + * Write the RR header. + */ + isc_buffer_putuint32(&buffer, t->name.length + 10 + + t->rdata.length); + /* + * Write the owner name, RR header, and RR data. + */ + isc_buffer_putmem(&buffer, t->name.ndata, t->name.length); + isc_buffer_putuint16(&buffer, t->rdata.type); + isc_buffer_putuint16(&buffer, t->rdata.rdclass); + isc_buffer_putuint32(&buffer, t->ttl); + INSIST(t->rdata.length < 65536); + isc_buffer_putuint16(&buffer, (uint16_t)t->rdata.length); + INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length); + isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length); + } + + isc_buffer_usedregion(&buffer, &used); + INSIST(used.length == size); + + j->x.pos[1].offset += used.length; + + /* + * Write the buffer contents to the journal file. + */ + CHECK(journal_write(j, used.base, used.length)); + + result = ISC_R_SUCCESS; + + failure: + if (mem != NULL) + isc_mem_put(j->mctx, mem, size); + return (result); + +} + +isc_result_t +dns_journal_commit(dns_journal_t *j) { + isc_result_t result; + journal_rawheader_t rawheader; + uint64_t total; + + REQUIRE(DNS_JOURNAL_VALID(j)); + REQUIRE(j->state == JOURNAL_STATE_TRANSACTION || + j->state == JOURNAL_STATE_INLINE); + + /* + * Just write out a updated header. + */ + if (j->state == JOURNAL_STATE_INLINE) { + CHECK(journal_fsync(j)); + journal_header_encode(&j->header, &rawheader); + CHECK(journal_seek(j, 0)); + CHECK(journal_write(j, &rawheader, sizeof(rawheader))); + CHECK(journal_fsync(j)); + j->state = JOURNAL_STATE_WRITE; + return (ISC_R_SUCCESS); + } + + /* + * Perform some basic consistency checks. + */ + if (j->x.n_soa != 2) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: malformed transaction: %d SOAs", + j->filename, j->x.n_soa); + return (ISC_R_UNEXPECTED); + } + if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) || + (bind8_compat && + j->x.pos[1].serial == j->x.pos[0].serial))) + { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: malformed transaction: serial number " + "would decrease", j->filename); + return (ISC_R_UNEXPECTED); + } + if (! JOURNAL_EMPTY(&j->header)) { + if (j->x.pos[0].serial != j->header.end.serial) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "malformed transaction: " + "%s last serial %u != " + "transaction first serial %u", + j->filename, + j->header.end.serial, + j->x.pos[0].serial); + return (ISC_R_UNEXPECTED); + } + } + + /* + * We currently don't support huge journal entries. + */ + total = j->x.pos[1].offset - j->x.pos[0].offset; + if (total >= INT32_MAX) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "transaction too big to be stored in journal: " + "%" PRIu64 "b (max is %" PRIu64 "b)", total, + (uint64_t)INT32_MAX); + return (ISC_R_UNEXPECTED); + } + + /* + * Some old journal entries may become non-addressable + * when we increment the current serial number. Purge them + * by stepping header.begin forward to the first addressable + * transaction. Also purge them from the index. + */ + if (! JOURNAL_EMPTY(&j->header)) { + while (! DNS_SERIAL_GT(j->x.pos[1].serial, + j->header.begin.serial)) { + CHECK(journal_next(j, &j->header.begin)); + } + index_invalidate(j, j->x.pos[1].serial); + } +#ifdef notyet + if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) { + force_dump(...); + } +#endif + + /* + * Commit the transaction data to stable storage. + */ + CHECK(journal_fsync(j)); + + if (j->state == JOURNAL_STATE_TRANSACTION) { + isc_offset_t offset; + offset = (j->x.pos[1].offset - j->x.pos[0].offset) - + sizeof(journal_rawxhdr_t); + /* + * Update the transaction header. + */ + CHECK(journal_seek(j, j->x.pos[0].offset)); + CHECK(journal_write_xhdr(j, offset, j->x.pos[0].serial, + j->x.pos[1].serial)); + } + + /* + * Update the journal header. + */ + if (JOURNAL_EMPTY(&j->header)) + j->header.begin = j->x.pos[0]; + j->header.end = j->x.pos[1]; + journal_header_encode(&j->header, &rawheader); + CHECK(journal_seek(j, 0)); + CHECK(journal_write(j, &rawheader, sizeof(rawheader))); + + /* + * Update the index. + */ + index_add(j, &j->x.pos[0]); + + /* + * Convert the index into on-disk format and write + * it to disk. + */ + CHECK(index_to_disk(j)); + + /* + * Commit the header to stable storage. + */ + CHECK(journal_fsync(j)); + + /* + * We no longer have a transaction open. + */ + j->state = JOURNAL_STATE_WRITE; + + result = ISC_R_SUCCESS; + + failure: + return (result); +} + +isc_result_t +dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) { + isc_result_t result; + CHECK(dns_diff_sort(diff, ixfr_order)); + CHECK(dns_journal_begin_transaction(j)); + CHECK(dns_journal_writediff(j, diff)); + CHECK(dns_journal_commit(j)); + result = ISC_R_SUCCESS; + failure: + return (result); +} + +void +dns_journal_destroy(dns_journal_t **journalp) { + dns_journal_t *j = *journalp; + REQUIRE(DNS_JOURNAL_VALID(j)); + + j->it.result = ISC_R_FAILURE; + dns_name_invalidate(&j->it.name); + dns_decompress_invalidate(&j->it.dctx); + if (j->rawindex != NULL) + isc_mem_put(j->mctx, j->rawindex, j->header.index_size * + sizeof(journal_rawpos_t)); + if (j->index != NULL) + isc_mem_put(j->mctx, j->index, j->header.index_size * + sizeof(journal_pos_t)); + if (j->it.target.base != NULL) + isc_mem_put(j->mctx, j->it.target.base, j->it.target.length); + if (j->it.source.base != NULL) + isc_mem_put(j->mctx, j->it.source.base, j->it.source.length); + if (j->filename != NULL) + isc_mem_free(j->mctx, j->filename); + if (j->fp != NULL) + (void)isc_stdio_close(j->fp); + j->magic = 0; + isc_mem_putanddetach(&j->mctx, j, sizeof(*j)); + *journalp = NULL; +} + +/* + * Roll the open journal 'j' into the database 'db'. + * A new database version will be created. + */ + +/* XXX Share code with incoming IXFR? */ + +static isc_result_t +roll_forward(dns_journal_t *j, dns_db_t *db, unsigned int options) { + isc_buffer_t source; /* Transaction data from disk */ + isc_buffer_t target; /* Ditto after _fromwire check */ + uint32_t db_serial; /* Database SOA serial */ + uint32_t end_serial; /* Last journal SOA serial */ + isc_result_t result; + dns_dbversion_t *ver = NULL; + journal_pos_t pos; + dns_diff_t diff; + unsigned int n_soa = 0; + unsigned int n_put = 0; + dns_diffop_t op; + + REQUIRE(DNS_JOURNAL_VALID(j)); + REQUIRE(DNS_DB_VALID(db)); + + dns_diff_init(j->mctx, &diff); + + /* + * Set up empty initial buffers for unchecked and checked + * wire format transaction data. They will be reallocated + * later. + */ + isc_buffer_init(&source, NULL, 0); + isc_buffer_init(&target, NULL, 0); + + /* + * Create the new database version. + */ + CHECK(dns_db_newversion(db, &ver)); + + /* + * Get the current database SOA serial number. + */ + CHECK(dns_db_getsoaserial(db, ver, &db_serial)); + + /* + * Locate a journal entry for the current database serial. + */ + CHECK(journal_find(j, db_serial, &pos)); + /* + * XXX do more drastic things, like marking zone stale, + * if this fails? + */ + /* + * XXXRTH The zone code should probably mark the zone as bad and + * scream loudly into the log if this is a dynamic update + * log reply that failed. + */ + + end_serial = dns_journal_last_serial(j); + if (db_serial == end_serial) + CHECK(DNS_R_UPTODATE); + + CHECK(dns_journal_iter_init(j, db_serial, end_serial)); + + for (result = dns_journal_first_rr(j); + result == ISC_R_SUCCESS; + result = dns_journal_next_rr(j)) + { + dns_name_t *name; + uint32_t ttl; + dns_rdata_t *rdata; + dns_difftuple_t *tuple = NULL; + + name = NULL; + rdata = NULL; + dns_journal_current_rr(j, &name, &ttl, &rdata); + + if (rdata->type == dns_rdatatype_soa) { + n_soa++; + if (n_soa == 2) + db_serial = j->it.current_serial; + } + + if (n_soa == 3) + n_soa = 1; + if (n_soa == 0) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal file corrupt: missing " + "initial SOA", j->filename); + FAIL(ISC_R_UNEXPECTED); + } + if ((options & DNS_JOURNALOPT_RESIGN) != 0) + op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN : + DNS_DIFFOP_ADDRESIGN; + else + op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD; + + CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata, + &tuple)); + dns_diff_append(&diff, &tuple); + + if (++n_put > 100) { + isc_log_write(JOURNAL_DEBUG_LOGARGS(3), + "%s: applying diff to database (%u)", + j->filename, db_serial); + (void)dns_diff_print(&diff, NULL); + CHECK(dns_diff_apply(&diff, db, ver)); + dns_diff_clear(&diff); + n_put = 0; + } + } + if (result == ISC_R_NOMORE) + result = ISC_R_SUCCESS; + CHECK(result); + + if (n_put != 0) { + isc_log_write(JOURNAL_DEBUG_LOGARGS(3), + "%s: applying final diff to database (%u)", + j->filename, db_serial); + (void)dns_diff_print(&diff, NULL); + CHECK(dns_diff_apply(&diff, db, ver)); + dns_diff_clear(&diff); + } + + failure: + if (ver != NULL) + dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ? + true : false); + + if (source.base != NULL) + isc_mem_put(j->mctx, source.base, source.length); + if (target.base != NULL) + isc_mem_put(j->mctx, target.base, target.length); + + dns_diff_clear(&diff); + + INSIST(ver == NULL); + + return (result); +} + +isc_result_t +dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, unsigned int options, + const char *filename) +{ + dns_journal_t *j; + isc_result_t result; + + REQUIRE(DNS_DB_VALID(db)); + REQUIRE(filename != NULL); + + j = NULL; + result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j); + if (result == ISC_R_NOTFOUND) { + isc_log_write(JOURNAL_DEBUG_LOGARGS(3), + "no journal file, but that's OK"); + return (DNS_R_NOJOURNAL); + } + if (result != ISC_R_SUCCESS) + return (result); + if (JOURNAL_EMPTY(&j->header)) + result = DNS_R_UPTODATE; + else + result = roll_forward(j, db, options); + + dns_journal_destroy(&j); + + return (result); +} + +isc_result_t +dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) { + dns_journal_t *j; + isc_buffer_t source; /* Transaction data from disk */ + isc_buffer_t target; /* Ditto after _fromwire check */ + uint32_t start_serial; /* Database SOA serial */ + uint32_t end_serial; /* Last journal SOA serial */ + isc_result_t result; + dns_diff_t diff; + unsigned int n_soa = 0; + unsigned int n_put = 0; + + REQUIRE(filename != NULL); + + j = NULL; + result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j); + if (result == ISC_R_NOTFOUND) { + isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file"); + return (DNS_R_NOJOURNAL); + } + + if (result != ISC_R_SUCCESS) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "journal open failure: %s: %s", + isc_result_totext(result), filename); + return (result); + } + + if (j->header.serialset) + fprintf(file, "Source serial = %u\n", j->header.sourceserial); + dns_diff_init(j->mctx, &diff); + + /* + * Set up empty initial buffers for unchecked and checked + * wire format transaction data. They will be reallocated + * later. + */ + isc_buffer_init(&source, NULL, 0); + isc_buffer_init(&target, NULL, 0); + + start_serial = dns_journal_first_serial(j); + end_serial = dns_journal_last_serial(j); + + CHECK(dns_journal_iter_init(j, start_serial, end_serial)); + + for (result = dns_journal_first_rr(j); + result == ISC_R_SUCCESS; + result = dns_journal_next_rr(j)) + { + dns_name_t *name; + uint32_t ttl; + dns_rdata_t *rdata; + dns_difftuple_t *tuple = NULL; + + name = NULL; + rdata = NULL; + dns_journal_current_rr(j, &name, &ttl, &rdata); + + if (rdata->type == dns_rdatatype_soa) + n_soa++; + + if (n_soa == 3) + n_soa = 1; + if (n_soa == 0) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal file corrupt: missing " + "initial SOA", j->filename); + FAIL(ISC_R_UNEXPECTED); + } + CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ? + DNS_DIFFOP_DEL : DNS_DIFFOP_ADD, + name, ttl, rdata, &tuple)); + dns_diff_append(&diff, &tuple); + + if (++n_put > 100) { + result = dns_diff_print(&diff, file); + dns_diff_clear(&diff); + n_put = 0; + if (result != ISC_R_SUCCESS) + break; + } + } + if (result == ISC_R_NOMORE) + result = ISC_R_SUCCESS; + CHECK(result); + + if (n_put != 0) { + result = dns_diff_print(&diff, file); + dns_diff_clear(&diff); + } + goto cleanup; + + failure: + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: cannot print: journal file corrupt", j->filename); + + cleanup: + if (source.base != NULL) + isc_mem_put(j->mctx, source.base, source.length); + if (target.base != NULL) + isc_mem_put(j->mctx, target.base, target.length); + + dns_diff_clear(&diff); + dns_journal_destroy(&j); + + return (result); +} + +/**************************************************************************/ +/* + * Miscellaneous accessors. + */ +uint32_t +dns_journal_first_serial(dns_journal_t *j) { + return (j->header.begin.serial); +} + +uint32_t +dns_journal_last_serial(dns_journal_t *j) { + return (j->header.end.serial); +} + +void +dns_journal_set_sourceserial(dns_journal_t *j, uint32_t sourceserial) { + + REQUIRE(j->state == JOURNAL_STATE_WRITE || + j->state == JOURNAL_STATE_INLINE || + j->state == JOURNAL_STATE_TRANSACTION); + + j->header.sourceserial = sourceserial; + j->header.serialset = true; + if (j->state == JOURNAL_STATE_WRITE) + j->state = JOURNAL_STATE_INLINE; +} + +bool +dns_journal_get_sourceserial(dns_journal_t *j, uint32_t *sourceserial) { + REQUIRE(sourceserial != NULL); + + if (!j->header.serialset) + return (false); + *sourceserial = j->header.sourceserial; + return (true); +} + +/**************************************************************************/ +/* + * Iteration support. + * + * When serving an outgoing IXFR, we transmit a part the journal starting + * at the serial number in the IXFR request and ending at the serial + * number that is current when the IXFR request arrives. The ending + * serial number is not necessarily at the end of the journal: + * the journal may grow while the IXFR is in progress, but we stop + * when we reach the serial number that was current when the IXFR started. + */ + +static isc_result_t read_one_rr(dns_journal_t *j); + +/* + * Make sure the buffer 'b' is has at least 'size' bytes + * allocated, and clear it. + * + * Requires: + * Either b->base is NULL, or it points to b->length bytes of memory + * previously allocated by isc_mem_get(). + */ + +static isc_result_t +size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) { + if (b->length < size) { + void *mem = isc_mem_get(mctx, size); + if (mem == NULL) + return (ISC_R_NOMEMORY); + if (b->base != NULL) + isc_mem_put(mctx, b->base, b->length); + b->base = mem; + b->length = size; + } + isc_buffer_clear(b); + return (ISC_R_SUCCESS); +} + +isc_result_t +dns_journal_iter_init(dns_journal_t *j, + uint32_t begin_serial, uint32_t end_serial) +{ + isc_result_t result; + + CHECK(journal_find(j, begin_serial, &j->it.bpos)); + INSIST(j->it.bpos.serial == begin_serial); + + CHECK(journal_find(j, end_serial, &j->it.epos)); + INSIST(j->it.epos.serial == end_serial); + + result = ISC_R_SUCCESS; + failure: + j->it.result = result; + return (j->it.result); +} + + +isc_result_t +dns_journal_first_rr(dns_journal_t *j) { + isc_result_t result; + + /* + * Seek to the beginning of the first transaction we are + * interested in. + */ + CHECK(journal_seek(j, j->it.bpos.offset)); + j->it.current_serial = j->it.bpos.serial; + + j->it.xsize = 0; /* We have no transaction data yet... */ + j->it.xpos = 0; /* ...and haven't used any of it. */ + + return (read_one_rr(j)); + + failure: + return (result); +} + +static isc_result_t +read_one_rr(dns_journal_t *j) { + isc_result_t result; + + dns_rdatatype_t rdtype; + dns_rdataclass_t rdclass; + unsigned int rdlen; + uint32_t ttl; + journal_xhdr_t xhdr; + journal_rrhdr_t rrhdr; + + if (j->offset > j->it.epos.offset) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal corrupt: possible integer overflow", + j->filename); + return (ISC_R_UNEXPECTED); + } + if (j->offset == j->it.epos.offset) + return (ISC_R_NOMORE); + if (j->it.xpos == j->it.xsize) { + /* + * We are at a transaction boundary. + * Read another transaction header. + */ + CHECK(journal_read_xhdr(j, &xhdr)); + if (xhdr.size == 0) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal corrupt: empty transaction", + j->filename); + FAIL(ISC_R_UNEXPECTED); + } + if (xhdr.serial0 != j->it.current_serial) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal file corrupt: " + "expected serial %u, got %u", + j->filename, + j->it.current_serial, xhdr.serial0); + FAIL(ISC_R_UNEXPECTED); + } + j->it.xsize = xhdr.size; + j->it.xpos = 0; + } + /* + * Read an RR. + */ + CHECK(journal_read_rrhdr(j, &rrhdr)); + /* + * Perform a sanity check on the journal RR size. + * The smallest possible RR has a 1-byte owner name + * and a 10-byte header. The largest possible + * RR has 65535 bytes of data, a header, and a maximum- + * size owner name, well below 70 k total. + */ + if (rrhdr.size < 1+10 || rrhdr.size > 70000) { + isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR, + "%s: journal corrupt: impossible RR size " + "(%d bytes)", j->filename, rrhdr.size); + FAIL(ISC_R_UNEXPECTED); + } + + CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size)); + CHECK(journal_read(j, j->it.source.base, rrhdr.size)); + isc_buffer_add(&j->it.source, rrhdr.size); + + /* + * The target buffer is made the same size + * as the source buffer, with the assumption that when + * no compression in present, the output of dns_*_fromwire() + * is no larger than the input. + */ + CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size)); + + /* + * Parse the owner name. We don't know where it + * ends yet, so we make the entire "remaining" + * part of the buffer "active". + */ + isc_buffer_setactive(&j->it.source, + j->it.source.used - j->it.source.current); + CHECK(dns_name_fromwire(&j->it.name, &j->it.source, + &j->it.dctx, 0, &j->it.target)); + + /* + * Check that the RR header is there, and parse it. + */ + if (isc_buffer_remaininglength(&j->it.source) < 10) + FAIL(DNS_R_FORMERR); + + rdtype = isc_buffer_getuint16(&j->it.source); + rdclass = isc_buffer_getuint16(&j->it.source); + ttl = isc_buffer_getuint32(&j->it.source); + rdlen = isc_buffer_getuint16(&j->it.source); + + /* + * Parse the rdata. + */ + if (isc_buffer_remaininglength(&j->it.source) != rdlen) + FAIL(DNS_R_FORMERR); + isc_buffer_setactive(&j->it.source, rdlen); + dns_rdata_reset(&j->it.rdata); + CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass, + rdtype, &j->it.source, &j->it.dctx, + 0, &j->it.target)); + j->it.ttl = ttl; + + j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size; + if (rdtype == dns_rdatatype_soa) { + /* XXX could do additional consistency checks here */ + j->it.current_serial = dns_soa_getserial(&j->it.rdata); + } + + result = ISC_R_SUCCESS; + + failure: + j->it.result = result; + return (result); +} + +isc_result_t +dns_journal_next_rr(dns_journal_t *j) { + j->it.result = read_one_rr(j); + return (j->it.result); +} + +void +dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, uint32_t *ttl, + dns_rdata_t **rdata) +{ + REQUIRE(j->it.result == ISC_R_SUCCESS); + *name = &j->it.name; + *ttl = j->it.ttl; + *rdata = &j->it.rdata; +} + +/**************************************************************************/ +/* + * Generating diffs from databases + */ + +/* + * Construct a diff containing all the RRs at the current name of the + * database iterator 'dbit' in database 'db', version 'ver'. + * Set '*name' to the current name, and append the diff to 'diff'. + * All new tuples will have the operation 'op'. + * + * Requires: 'name' must have buffer large enough to hold the name. + * Typically, a dns_fixedname_t would be used. + */ +static isc_result_t +get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now, + dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op, + dns_diff_t *diff) +{ + isc_result_t result; + dns_dbnode_t *node = NULL; + dns_rdatasetiter_t *rdsiter = NULL; + dns_difftuple_t *tuple = NULL; + + result = dns_dbiterator_current(dbit, &node, name); + if (result != ISC_R_SUCCESS) + return (result); + + result = dns_db_allrdatasets(db, node, ver, now, &rdsiter); + if (result != ISC_R_SUCCESS) + goto cleanup_node; + + for (result = dns_rdatasetiter_first(rdsiter); + result == ISC_R_SUCCESS; + result = dns_rdatasetiter_next(rdsiter)) + { + dns_rdataset_t rdataset; + + dns_rdataset_init(&rdataset); + dns_rdatasetiter_current(rdsiter, &rdataset); + + for (result = dns_rdataset_first(&rdataset); + result == ISC_R_SUCCESS; + result = dns_rdataset_next(&rdataset)) + { + dns_rdata_t rdata = DNS_RDATA_INIT; + dns_rdataset_current(&rdataset, &rdata); + result = dns_difftuple_create(diff->mctx, op, name, + rdataset.ttl, &rdata, + &tuple); + if (result != ISC_R_SUCCESS) { + dns_rdataset_disassociate(&rdataset); + goto cleanup_iterator; + } + dns_diff_append(diff, &tuple); + } + dns_rdataset_disassociate(&rdataset); + if (result != ISC_R_NOMORE) + goto cleanup_iterator; + } + if (result != ISC_R_NOMORE) + goto cleanup_iterator; + + result = ISC_R_SUCCESS; + + cleanup_iterator: + dns_rdatasetiter_destroy(&rdsiter); + + cleanup_node: + dns_db_detachnode(db, &node); + + return (result); +} + +/* + * Comparison function for use by dns_diff_subtract when sorting + * the diffs to be subtracted. The sort keys are the rdata type + * and the rdata itself. The owner name is ignored, because + * it is known to be the same for all tuples. + */ +static int +rdata_order(const void *av, const void *bv) { + dns_difftuple_t const * const *ap = av; + dns_difftuple_t const * const *bp = bv; + dns_difftuple_t const *a = *ap; + dns_difftuple_t const *b = *bp; + int r; + r = (b->rdata.type - a->rdata.type); + if (r != 0) + return (r); + r = dns_rdata_compare(&a->rdata, &b->rdata); + return (r); +} + +static isc_result_t +dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) { + isc_result_t result; + dns_difftuple_t *p[2]; + int i, t; + bool append; + + CHECK(dns_diff_sort(&diff[0], rdata_order)); + CHECK(dns_diff_sort(&diff[1], rdata_order)); + + for (;;) { + p[0] = ISC_LIST_HEAD(diff[0].tuples); + p[1] = ISC_LIST_HEAD(diff[1].tuples); + if (p[0] == NULL && p[1] == NULL) + break; + + for (i = 0; i < 2; i++) + if (p[!i] == NULL) { + ISC_LIST_UNLINK(diff[i].tuples, p[i], link); + ISC_LIST_APPEND(r->tuples, p[i], link); + goto next; + } + t = rdata_order(&p[0], &p[1]); + if (t < 0) { + ISC_LIST_UNLINK(diff[0].tuples, p[0], link); + ISC_LIST_APPEND(r->tuples, p[0], link); + goto next; + } + if (t > 0) { + ISC_LIST_UNLINK(diff[1].tuples, p[1], link); + ISC_LIST_APPEND(r->tuples, p[1], link); + goto next; + } + INSIST(t == 0); + /* + * Identical RRs in both databases; skip them both + * if the ttl differs. + */ + append = (p[0]->ttl != p[1]->ttl); + for (i = 0; i < 2; i++) { + ISC_LIST_UNLINK(diff[i].tuples, p[i], link); + if (append) { + ISC_LIST_APPEND(r->tuples, p[i], link); + } else { + dns_difftuple_free(&p[i]); + } + } + next: ; + } + result = ISC_R_SUCCESS; + failure: + return (result); +} + +static isc_result_t +diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera, + dns_db_t *dbb, dns_dbversion_t *dbverb, + unsigned int options, dns_diff_t *resultdiff) +{ + dns_db_t *db[2]; + dns_dbversion_t *ver[2]; + dns_dbiterator_t *dbit[2] = { NULL, NULL }; + bool have[2] = { false, false }; + dns_fixedname_t fixname[2]; + isc_result_t result, itresult[2]; + dns_diff_t diff[2]; + int i, t; + + db[0] = dba, db[1] = dbb; + ver[0] = dbvera, ver[1] = dbverb; + + dns_diff_init(resultdiff->mctx, &diff[0]); + dns_diff_init(resultdiff->mctx, &diff[1]); + + dns_fixedname_init(&fixname[0]); + dns_fixedname_init(&fixname[1]); + + result = dns_db_createiterator(db[0], options, &dbit[0]); + if (result != ISC_R_SUCCESS) + return (result); + result = dns_db_createiterator(db[1], options, &dbit[1]); + if (result != ISC_R_SUCCESS) + goto cleanup_iterator; + + itresult[0] = dns_dbiterator_first(dbit[0]); + itresult[1] = dns_dbiterator_first(dbit[1]); + + for (;;) { + for (i = 0; i < 2; i++) { + if (! have[i] && itresult[i] == ISC_R_SUCCESS) { + CHECK(get_name_diff(db[i], ver[i], 0, dbit[i], + dns_fixedname_name(&fixname[i]), + i == 0 ? + DNS_DIFFOP_ADD : + DNS_DIFFOP_DEL, + &diff[i])); + itresult[i] = dns_dbiterator_next(dbit[i]); + have[i] = true; + } + } + + if (! have[0] && ! have[1]) { + INSIST(ISC_LIST_EMPTY(diff[0].tuples)); + INSIST(ISC_LIST_EMPTY(diff[1].tuples)); + break; + } + + for (i = 0; i < 2; i++) { + if (! have[!i]) { + ISC_LIST_APPENDLIST(resultdiff->tuples, + diff[i].tuples, link); + INSIST(ISC_LIST_EMPTY(diff[i].tuples)); + have[i] = false; + goto next; + } + } + + t = dns_name_compare(dns_fixedname_name(&fixname[0]), + dns_fixedname_name(&fixname[1])); + if (t < 0) { + ISC_LIST_APPENDLIST(resultdiff->tuples, + diff[0].tuples, link); + INSIST(ISC_LIST_EMPTY(diff[0].tuples)); + have[0] = false; + continue; + } + if (t > 0) { + ISC_LIST_APPENDLIST(resultdiff->tuples, + diff[1].tuples, link); + INSIST(ISC_LIST_EMPTY(diff[1].tuples)); + have[1] = false; + continue; + } + INSIST(t == 0); + CHECK(dns_diff_subtract(diff, resultdiff)); + INSIST(ISC_LIST_EMPTY(diff[0].tuples)); + INSIST(ISC_LIST_EMPTY(diff[1].tuples)); + have[0] = have[1] = false; + next: ; + } + if (itresult[0] != ISC_R_NOMORE) + FAIL(itresult[0]); + if (itresult[1] != ISC_R_NOMORE) + FAIL(itresult[1]); + + INSIST(ISC_LIST_EMPTY(diff[0].tuples)); + INSIST(ISC_LIST_EMPTY(diff[1].tuples)); + + failure: + dns_dbiterator_destroy(&dbit[1]); + + cleanup_iterator: + dns_dbiterator_destroy(&dbit[0]); + dns_diff_clear(&diff[0]); + dns_diff_clear(&diff[1]); + return (result); +} + +/* + * Compare the databases 'dba' and 'dbb' and generate a journal + * entry containing the changes to make 'dba' from 'dbb' (note + * the order). This journal entry will consist of a single, + * possibly very large transaction. + */ +isc_result_t +dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera, + dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) +{ + isc_result_t result; + dns_diff_t diff; + + dns_diff_init(mctx, &diff); + + result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename); + + dns_diff_clear(&diff); + + return (result); +} + +isc_result_t +dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera, + dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) +{ + isc_result_t result; + dns_journal_t *journal = NULL; + + if (filename != NULL) { + result = dns_journal_open(diff->mctx, filename, + DNS_JOURNAL_CREATE, &journal); + if (result != ISC_R_SUCCESS) + return (result); + } + + CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff)); + CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff)); + + if (journal != NULL) { + if (ISC_LIST_EMPTY(diff->tuples)) + isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes"); + else + CHECK(dns_journal_write_transaction(journal, diff)); + } + + failure: + if (journal != NULL) + dns_journal_destroy(&journal); + return (result); +} + +isc_result_t +dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial, + uint32_t target_size) +{ + unsigned int i; + journal_pos_t best_guess; + journal_pos_t current_pos; + dns_journal_t *j1 = NULL; + dns_journal_t *j2 = NULL; + journal_rawheader_t rawheader; + unsigned int copy_length; + size_t namelen; + char *buf = NULL; + unsigned int size = 0; + isc_result_t result; + unsigned int indexend; + char newname[1024]; + char backup[1024]; + bool is_backup = false; + + REQUIRE(filename != NULL); + + namelen = strlen(filename); + if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0) + namelen -= 4; + + result = isc_string_printf(newname, sizeof(newname), "%.*s.jnw", + (int)namelen, filename); + if (result != ISC_R_SUCCESS) + return (result); + + result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk", + (int)namelen, filename); + if (result != ISC_R_SUCCESS) + return (result); + + result = journal_open(mctx, filename, false, false, &j1); + if (result == ISC_R_NOTFOUND) { + is_backup = true; + result = journal_open(mctx, backup, false, false, &j1); + } + if (result != ISC_R_SUCCESS) + return (result); + + if (JOURNAL_EMPTY(&j1->header)) { + dns_journal_destroy(&j1); + return (ISC_R_SUCCESS); + } + + if (DNS_SERIAL_GT(j1->header.begin.serial, serial) || + DNS_SERIAL_GT(serial, j1->header.end.serial)) { + dns_journal_destroy(&j1); + return (ISC_R_RANGE); + } + + /* + * Cope with very small target sizes. + */ + indexend = sizeof(journal_rawheader_t) + + j1->header.index_size * sizeof(journal_rawpos_t); + if (target_size < indexend * 2) + target_size = target_size/2 + indexend; + + /* + * See if there is any work to do. + */ + if ((uint32_t) j1->header.end.offset < target_size) { + dns_journal_destroy(&j1); + return (ISC_R_SUCCESS); + } + + CHECK(journal_open(mctx, newname, true, true, &j2)); + + /* + * Remove overhead so space test below can succeed. + */ + if (target_size >= indexend) + target_size -= indexend; + + /* + * Find if we can create enough free space. + */ + best_guess = j1->header.begin; + for (i = 0; i < j1->header.index_size; i++) { + if (POS_VALID(j1->index[i]) && + DNS_SERIAL_GE(serial, j1->index[i].serial) && + ((uint32_t)(j1->header.end.offset - j1->index[i].offset) + >= target_size / 2) && + j1->index[i].offset > best_guess.offset) + best_guess = j1->index[i]; + } + + current_pos = best_guess; + while (current_pos.serial != serial) { + CHECK(journal_next(j1, ¤t_pos)); + if (current_pos.serial == j1->header.end.serial) + break; + + if (DNS_SERIAL_GE(serial, current_pos.serial) && + ((uint32_t)(j1->header.end.offset - current_pos.offset) + >= (target_size / 2)) && + current_pos.offset > best_guess.offset) + best_guess = current_pos; + else + break; + } + + INSIST(best_guess.serial != j1->header.end.serial); + if (best_guess.serial != serial) + CHECK(journal_next(j1, &best_guess)); + + /* + * We should now be roughly half target_size provided + * we did not reach 'serial'. If not we will just copy + * all uncommitted deltas regardless of the size. + */ + copy_length = j1->header.end.offset - best_guess.offset; + + if (copy_length != 0) { + /* + * Copy best_guess to end into space just freed. + */ + size = 64*1024; + if (copy_length < size) + size = copy_length; + buf = isc_mem_get(mctx, size); + if (buf == NULL) { + result = ISC_R_NOMEMORY; + goto failure; + } + + CHECK(journal_seek(j1, best_guess.offset)); + CHECK(journal_seek(j2, indexend)); + for (i = 0; i < copy_length; i += size) { + unsigned int len = (copy_length - i) > size ? size : + (copy_length - i); + CHECK(journal_read(j1, buf, len)); + CHECK(journal_write(j2, buf, len)); + } + + CHECK(journal_fsync(j2)); + + /* + * Compute new header. + */ + j2->header.begin.serial = best_guess.serial; + j2->header.begin.offset = indexend; + j2->header.end.serial = j1->header.end.serial; + j2->header.end.offset = indexend + copy_length; + j2->header.sourceserial = j1->header.sourceserial; + j2->header.serialset = j1->header.serialset; + + /* + * Update the journal header. + */ + journal_header_encode(&j2->header, &rawheader); + CHECK(journal_seek(j2, 0)); + CHECK(journal_write(j2, &rawheader, sizeof(rawheader))); + CHECK(journal_fsync(j2)); + + /* + * Build new index. + */ + current_pos = j2->header.begin; + while (current_pos.serial != j2->header.end.serial) { + index_add(j2, ¤t_pos); + CHECK(journal_next(j2, ¤t_pos)); + } + + /* + * Write index. + */ + CHECK(index_to_disk(j2)); + CHECK(journal_fsync(j2)); + + indexend = j2->header.end.offset; + POST(indexend); + } + + /* + * Close both journals before trying to rename files (this is + * necessary on WIN32). + */ + dns_journal_destroy(&j1); + dns_journal_destroy(&j2); + + /* + * With a UFS file system this should just succeed and be atomic. + * Any IXFR outs will just continue and the old journal will be + * removed on final close. + * + * With MSDOS / NTFS we need to do a two stage rename, triggered + * by EEXIST. (If any IXFR's are running in other threads, however, + * this will fail, and the journal will not be compacted. But + * if so, hopefully they'll be finished by the next time we + * compact.) + */ + if (rename(newname, filename) == -1) { + if (errno == EEXIST && !is_backup) { + result = isc_file_remove(backup); + if (result != ISC_R_SUCCESS && + result != ISC_R_FILENOTFOUND) + goto failure; + if (rename(filename, backup) == -1) + goto maperrno; + if (rename(newname, filename) == -1) + goto maperrno; + (void)isc_file_remove(backup); + } else { + maperrno: + result = ISC_R_FAILURE; + goto failure; + } + } + + result = ISC_R_SUCCESS; + + failure: + (void)isc_file_remove(newname); + if (buf != NULL) + isc_mem_put(mctx, buf, size); + if (j1 != NULL) + dns_journal_destroy(&j1); + if (j2 != NULL) + dns_journal_destroy(&j2); + return (result); +} + +static isc_result_t +index_to_disk(dns_journal_t *j) { + isc_result_t result = ISC_R_SUCCESS; + + if (j->header.index_size != 0) { + unsigned int i; + unsigned char *p; + unsigned int rawbytes; + + rawbytes = j->header.index_size * sizeof(journal_rawpos_t); + + p = j->rawindex; + for (i = 0; i < j->header.index_size; i++) { + encode_uint32(j->index[i].serial, p); + p += 4; + encode_uint32(j->index[i].offset, p); + p += 4; + } + INSIST(p == j->rawindex + rawbytes); + + CHECK(journal_seek(j, sizeof(journal_rawheader_t))); + CHECK(journal_write(j, j->rawindex, rawbytes)); + } +failure: + return (result); +} |