summaryrefslogtreecommitdiffstats
path: root/lib/dns/journal.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--lib/dns/journal.c2364
1 files changed, 2364 insertions, 0 deletions
diff --git a/lib/dns/journal.c b/lib/dns/journal.c
new file mode 100644
index 0000000..bdbc459
--- /dev/null
+++ b/lib/dns/journal.c
@@ -0,0 +1,2364 @@
+/*
+ * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * See the COPYRIGHT file distributed with this work for additional
+ * information regarding copyright ownership.
+ */
+
+#include <config.h>
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <isc/file.h>
+#include <isc/mem.h>
+#include <isc/print.h>
+#include <isc/stdio.h>
+#include <isc/string.h>
+#include <isc/util.h>
+
+#include <dns/compress.h>
+#include <dns/db.h>
+#include <dns/dbiterator.h>
+#include <dns/diff.h>
+#include <dns/fixedname.h>
+#include <dns/journal.h>
+#include <dns/log.h>
+#include <dns/rdataset.h>
+#include <dns/rdatasetiter.h>
+#include <dns/result.h>
+#include <dns/soa.h>
+
+/*! \file
+ * \brief Journaling.
+ *
+ * A journal file consists of
+ *
+ * \li A fixed-size header of type journal_rawheader_t.
+ *
+ * \li The index. This is an unordered array of index entries
+ * of type journal_rawpos_t giving the locations
+ * of some arbitrary subset of the journal's addressable
+ * transactions. The index entries are used as hints to
+ * speed up the process of locating a transaction with a given
+ * serial number. Unused index entries have an "offset"
+ * field of zero. The size of the index can vary between
+ * journal files, but does not change during the lifetime
+ * of a file. The size can be zero.
+ *
+ * \li The journal data. This consists of one or more transactions.
+ * Each transaction begins with a transaction header of type
+ * journal_rawxhdr_t. The transaction header is followed by a
+ * sequence of RRs, similar in structure to an IXFR difference
+ * sequence (RFC1995). That is, the pre-transaction SOA,
+ * zero or more other deleted RRs, the post-transaction SOA,
+ * and zero or more other added RRs. Unlike in IXFR, each RR
+ * is prefixed with a 32-bit length.
+ *
+ * The journal data part grows as new transactions are
+ * appended to the file. Only those transactions
+ * whose serial number is current-(2^31-1) to current
+ * are considered "addressable" and may be pointed
+ * to from the header or index. They may be preceded
+ * by old transactions that are no longer addressable,
+ * and they may be followed by transactions that were
+ * appended to the journal but never committed by updating
+ * the "end" position in the header. The latter will
+ * be overwritten when new transactions are added.
+ */
+/*%
+ * When true, accept IXFR difference sequences where the
+ * SOA serial number does not change (BIND 8 sends such
+ * sequences).
+ */
+static bool bind8_compat = true; /* XXX config */
+
+/**************************************************************************/
+/*
+ * Miscellaneous utilities.
+ */
+
+#define JOURNAL_COMMON_LOGARGS \
+ dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL
+
+#define JOURNAL_DEBUG_LOGARGS(n) \
+ JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
+
+/*%
+ * It would be non-sensical (or at least obtuse) to use FAIL() with an
+ * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
+ * from complaining about "end-of-loop code not reached".
+ */
+#define FAIL(code) \
+ do { result = (code); \
+ if (result != ISC_R_SUCCESS) goto failure; \
+ } while (0)
+
+#define CHECK(op) \
+ do { result = (op); \
+ if (result != ISC_R_SUCCESS) goto failure; \
+ } while (0)
+
+#define JOURNAL_SERIALSET 0x01U
+
+static isc_result_t index_to_disk(dns_journal_t *);
+
+static inline uint32_t
+decode_uint32(unsigned char *p) {
+ return ((p[0] << 24) +
+ (p[1] << 16) +
+ (p[2] << 8) +
+ (p[3] << 0));
+}
+
+static inline void
+encode_uint32(uint32_t val, unsigned char *p) {
+ p[0] = (uint8_t)(val >> 24);
+ p[1] = (uint8_t)(val >> 16);
+ p[2] = (uint8_t)(val >> 8);
+ p[3] = (uint8_t)(val >> 0);
+}
+
+isc_result_t
+dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
+ dns_diffop_t op, dns_difftuple_t **tp)
+{
+ isc_result_t result;
+ dns_dbnode_t *node;
+ dns_rdataset_t rdataset;
+ dns_rdata_t rdata = DNS_RDATA_INIT;
+ dns_fixedname_t fixed;
+ dns_name_t *zonename;
+
+ zonename = dns_fixedname_initname(&fixed);
+ dns_name_copy(dns_db_origin(db), zonename, NULL);
+
+ node = NULL;
+ result = dns_db_findnode(db, zonename, false, &node);
+ if (result != ISC_R_SUCCESS)
+ goto nonode;
+
+ dns_rdataset_init(&rdataset);
+ result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
+ (isc_stdtime_t)0, &rdataset, NULL);
+ if (result != ISC_R_SUCCESS)
+ goto freenode;
+
+ result = dns_rdataset_first(&rdataset);
+ if (result != ISC_R_SUCCESS)
+ goto freenode;
+
+ dns_rdataset_current(&rdataset, &rdata);
+ dns_rdataset_getownercase(&rdataset, zonename);
+
+ result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
+ &rdata, tp);
+
+ dns_rdataset_disassociate(&rdataset);
+ dns_db_detachnode(db, &node);
+ return (result);
+
+ freenode:
+ dns_db_detachnode(db, &node);
+ nonode:
+ UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
+ return (result);
+}
+
+/* Journaling */
+
+/*%
+ * On-disk representation of a "pointer" to a journal entry.
+ * These are used in the journal header to locate the beginning
+ * and end of the journal, and in the journal index to locate
+ * other transactions.
+ */
+typedef struct {
+ unsigned char serial[4]; /*%< SOA serial before update. */
+ /*
+ * XXXRTH Should offset be 8 bytes?
+ * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
+ * XXXAG ... but we will not be able to seek >2G anyway on many
+ * platforms as long as we are using fseek() rather
+ * than lseek().
+ */
+ unsigned char offset[4]; /*%< Offset from beginning of file. */
+} journal_rawpos_t;
+
+
+/*%
+ * The header is of a fixed size, with some spare room for future
+ * extensions.
+ */
+#define JOURNAL_HEADER_SIZE 64 /* Bytes. */
+
+/*%
+ * The on-disk representation of the journal header.
+ * All numbers are stored in big-endian order.
+ */
+typedef union {
+ struct {
+ /*% File format version ID. */
+ unsigned char format[16];
+ /*% Position of the first addressable transaction */
+ journal_rawpos_t begin;
+ /*% Position of the next (yet nonexistent) transaction. */
+ journal_rawpos_t end;
+ /*% Number of index entries following the header. */
+ unsigned char index_size[4];
+ /*% Source serial number. */
+ unsigned char sourceserial[4];
+ unsigned char flags;
+ } h;
+ /* Pad the header to a fixed size. */
+ unsigned char pad[JOURNAL_HEADER_SIZE];
+} journal_rawheader_t;
+
+/*%
+ * The on-disk representation of the transaction header.
+ * There is one of these at the beginning of each transaction.
+ */
+typedef struct {
+ unsigned char size[4]; /*%< In bytes, excluding header. */
+ unsigned char serial0[4]; /*%< SOA serial before update. */
+ unsigned char serial1[4]; /*%< SOA serial after update. */
+} journal_rawxhdr_t;
+
+/*%
+ * The on-disk representation of the RR header.
+ * There is one of these at the beginning of each RR.
+ */
+typedef struct {
+ unsigned char size[4]; /*%< In bytes, excluding header. */
+} journal_rawrrhdr_t;
+
+/*%
+ * The in-core representation of the journal header.
+ */
+typedef struct {
+ uint32_t serial;
+ isc_offset_t offset;
+} journal_pos_t;
+
+#define POS_VALID(pos) ((pos).offset != 0)
+#define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0)
+
+typedef struct {
+ unsigned char format[16];
+ journal_pos_t begin;
+ journal_pos_t end;
+ uint32_t index_size;
+ uint32_t sourceserial;
+ bool serialset;
+} journal_header_t;
+
+/*%
+ * The in-core representation of the transaction header.
+ */
+
+typedef struct {
+ uint32_t size;
+ uint32_t serial0;
+ uint32_t serial1;
+} journal_xhdr_t;
+
+/*%
+ * The in-core representation of the RR header.
+ */
+typedef struct {
+ uint32_t size;
+} journal_rrhdr_t;
+
+
+/*%
+ * Initial contents to store in the header of a newly created
+ * journal file.
+ *
+ * The header starts with the magic string ";BIND LOG V9\n"
+ * to identify the file as a BIND 9 journal file. An ASCII
+ * identification string is used rather than a binary magic
+ * number to be consistent with BIND 8 (BIND 8 journal files
+ * are ASCII text files).
+ */
+
+static journal_header_t
+initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0 };
+
+#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
+
+typedef enum {
+ JOURNAL_STATE_INVALID,
+ JOURNAL_STATE_READ,
+ JOURNAL_STATE_WRITE,
+ JOURNAL_STATE_TRANSACTION,
+ JOURNAL_STATE_INLINE
+} journal_state_t;
+
+struct dns_journal {
+ unsigned int magic; /*%< JOUR */
+ isc_mem_t *mctx; /*%< Memory context */
+ journal_state_t state;
+ char *filename; /*%< Journal file name */
+ FILE * fp; /*%< File handle */
+ isc_offset_t offset; /*%< Current file offset */
+ journal_header_t header; /*%< In-core journal header */
+ unsigned char *rawindex; /*%< In-core buffer for journal index in on-disk format */
+ journal_pos_t *index; /*%< In-core journal index */
+
+ /*% Current transaction state (when writing). */
+ struct {
+ unsigned int n_soa; /*%< Number of SOAs seen */
+ journal_pos_t pos[2]; /*%< Begin/end position */
+ } x;
+
+ /*% Iteration state (when reading). */
+ struct {
+ /* These define the part of the journal we iterate over. */
+ journal_pos_t bpos; /*%< Position before first, */
+ journal_pos_t epos; /*%< and after last transaction */
+ /* The rest is iterator state. */
+ uint32_t current_serial; /*%< Current SOA serial */
+ isc_buffer_t source; /*%< Data from disk */
+ isc_buffer_t target; /*%< Data from _fromwire check */
+ dns_decompress_t dctx; /*%< Dummy decompression ctx */
+ dns_name_t name; /*%< Current domain name */
+ dns_rdata_t rdata; /*%< Current rdata */
+ uint32_t ttl; /*%< Current TTL */
+ unsigned int xsize; /*%< Size of transaction data */
+ unsigned int xpos; /*%< Current position in it */
+ isc_result_t result; /*%< Result of last call */
+ } it;
+};
+
+#define DNS_JOURNAL_MAGIC ISC_MAGIC('J', 'O', 'U', 'R')
+#define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
+
+static void
+journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
+ cooked->serial = decode_uint32(raw->serial);
+ cooked->offset = decode_uint32(raw->offset);
+}
+
+static void
+journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
+ encode_uint32(cooked->serial, raw->serial);
+ encode_uint32(cooked->offset, raw->offset);
+}
+
+static void
+journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
+ INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
+ memmove(cooked->format, raw->h.format, sizeof(cooked->format));
+ journal_pos_decode(&raw->h.begin, &cooked->begin);
+ journal_pos_decode(&raw->h.end, &cooked->end);
+ cooked->index_size = decode_uint32(raw->h.index_size);
+ cooked->sourceserial = decode_uint32(raw->h.sourceserial);
+ cooked->serialset = (raw->h.flags & JOURNAL_SERIALSET);
+}
+
+static void
+journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
+ unsigned char flags = 0;
+
+ INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
+ memset(raw->pad, 0, sizeof(raw->pad));
+ memmove(raw->h.format, cooked->format, sizeof(raw->h.format));
+ journal_pos_encode(&raw->h.begin, &cooked->begin);
+ journal_pos_encode(&raw->h.end, &cooked->end);
+ encode_uint32(cooked->index_size, raw->h.index_size);
+ encode_uint32(cooked->sourceserial, raw->h.sourceserial);
+ if (cooked->serialset)
+ flags |= JOURNAL_SERIALSET;
+ raw->h.flags = flags;
+}
+
+/*
+ * Journal file I/O subroutines, with error checking and reporting.
+ */
+static isc_result_t
+journal_seek(dns_journal_t *j, uint32_t offset) {
+ isc_result_t result;
+
+ result = isc_stdio_seek(j->fp, (off_t)offset, SEEK_SET);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: seek: %s", j->filename,
+ isc_result_totext(result));
+ return (ISC_R_UNEXPECTED);
+ }
+ j->offset = offset;
+ return (ISC_R_SUCCESS);
+}
+
+static isc_result_t
+journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
+ isc_result_t result;
+
+ result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
+ if (result != ISC_R_SUCCESS) {
+ if (result == ISC_R_EOF)
+ return (ISC_R_NOMORE);
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: read: %s",
+ j->filename, isc_result_totext(result));
+ return (ISC_R_UNEXPECTED);
+ }
+ j->offset += (isc_offset_t)nbytes;
+ return (ISC_R_SUCCESS);
+}
+
+static isc_result_t
+journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
+ isc_result_t result;
+
+ result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: write: %s",
+ j->filename, isc_result_totext(result));
+ return (ISC_R_UNEXPECTED);
+ }
+ j->offset += (isc_offset_t)nbytes;
+ return (ISC_R_SUCCESS);
+}
+
+static isc_result_t
+journal_fsync(dns_journal_t *j) {
+ isc_result_t result;
+ result = isc_stdio_flush(j->fp);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: flush: %s",
+ j->filename, isc_result_totext(result));
+ return (ISC_R_UNEXPECTED);
+ }
+ result = isc_stdio_sync(j->fp);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: fsync: %s",
+ j->filename, isc_result_totext(result));
+ return (ISC_R_UNEXPECTED);
+ }
+ return (ISC_R_SUCCESS);
+}
+
+/*
+ * Read/write a transaction header at the current file position.
+ */
+
+static isc_result_t
+journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
+ journal_rawxhdr_t raw;
+ isc_result_t result;
+ result = journal_read(j, &raw, sizeof(raw));
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ xhdr->size = decode_uint32(raw.size);
+ xhdr->serial0 = decode_uint32(raw.serial0);
+ xhdr->serial1 = decode_uint32(raw.serial1);
+ return (ISC_R_SUCCESS);
+}
+
+static isc_result_t
+journal_write_xhdr(dns_journal_t *j, uint32_t size,
+ uint32_t serial0, uint32_t serial1)
+{
+ journal_rawxhdr_t raw;
+ encode_uint32(size, raw.size);
+ encode_uint32(serial0, raw.serial0);
+ encode_uint32(serial1, raw.serial1);
+ return (journal_write(j, &raw, sizeof(raw)));
+}
+
+
+/*
+ * Read an RR header at the current file position.
+ */
+
+static isc_result_t
+journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
+ journal_rawrrhdr_t raw;
+ isc_result_t result;
+ result = journal_read(j, &raw, sizeof(raw));
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ rrhdr->size = decode_uint32(raw.size);
+ return (ISC_R_SUCCESS);
+}
+
+static isc_result_t
+journal_file_create(isc_mem_t *mctx, const char *filename) {
+ FILE *fp = NULL;
+ isc_result_t result;
+ journal_header_t header;
+ journal_rawheader_t rawheader;
+ int index_size = 56; /* XXX configurable */
+ int size;
+ void *mem; /* Memory for temporary index image. */
+
+ INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
+
+ result = isc_stdio_open(filename, "wb", &fp);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: create: %s",
+ filename, isc_result_totext(result));
+ return (ISC_R_UNEXPECTED);
+ }
+
+ header = initial_journal_header;
+ header.index_size = index_size;
+ journal_header_encode(&header, &rawheader);
+
+ size = sizeof(journal_rawheader_t) +
+ index_size * sizeof(journal_rawpos_t);
+
+ mem = isc_mem_get(mctx, size);
+ if (mem == NULL) {
+ (void)isc_stdio_close(fp);
+ (void)isc_file_remove(filename);
+ return (ISC_R_NOMEMORY);
+ }
+ memset(mem, 0, size);
+ memmove(mem, &rawheader, sizeof(rawheader));
+
+ result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: write: %s",
+ filename, isc_result_totext(result));
+ (void)isc_stdio_close(fp);
+ (void)isc_file_remove(filename);
+ isc_mem_put(mctx, mem, size);
+ return (ISC_R_UNEXPECTED);
+ }
+ isc_mem_put(mctx, mem, size);
+
+ result = isc_stdio_close(fp);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: close: %s",
+ filename, isc_result_totext(result));
+ (void)isc_file_remove(filename);
+ return (ISC_R_UNEXPECTED);
+ }
+
+ return (ISC_R_SUCCESS);
+}
+
+static isc_result_t
+journal_open(isc_mem_t *mctx, const char *filename, bool writable,
+ bool create, dns_journal_t **journalp)
+{
+ FILE *fp = NULL;
+ isc_result_t result;
+ journal_rawheader_t rawheader;
+ dns_journal_t *j;
+
+ INSIST(journalp != NULL && *journalp == NULL);
+ j = isc_mem_get(mctx, sizeof(*j));
+ if (j == NULL)
+ return (ISC_R_NOMEMORY);
+
+ j->mctx = NULL;
+ isc_mem_attach(mctx, &j->mctx);
+ j->state = JOURNAL_STATE_INVALID;
+ j->fp = NULL;
+ j->filename = isc_mem_strdup(mctx, filename);
+ j->index = NULL;
+ j->rawindex = NULL;
+
+ if (j->filename == NULL)
+ FAIL(ISC_R_NOMEMORY);
+
+ result = isc_stdio_open(j->filename, writable ? "rb+" : "rb", &fp);
+
+ if (result == ISC_R_FILENOTFOUND) {
+ if (create) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
+ "journal file %s does not exist, "
+ "creating it", j->filename);
+ CHECK(journal_file_create(mctx, filename));
+ /*
+ * Retry.
+ */
+ result = isc_stdio_open(j->filename, "rb+", &fp);
+ } else {
+ FAIL(ISC_R_NOTFOUND);
+ }
+ }
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: open: %s",
+ j->filename, isc_result_totext(result));
+ FAIL(ISC_R_UNEXPECTED);
+ }
+
+ j->fp = fp;
+
+ /*
+ * Set magic early so that seek/read can succeed.
+ */
+ j->magic = DNS_JOURNAL_MAGIC;
+
+ CHECK(journal_seek(j, 0));
+ CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
+
+ if (memcmp(rawheader.h.format, initial_journal_header.format,
+ sizeof(initial_journal_header.format)) != 0) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal format not recognized",
+ j->filename);
+ FAIL(ISC_R_UNEXPECTED);
+ }
+ journal_header_decode(&rawheader, &j->header);
+
+ /*
+ * If there is an index, read the raw index into a dynamically
+ * allocated buffer and then convert it into a cooked index.
+ */
+ if (j->header.index_size != 0) {
+ unsigned int i;
+ unsigned int rawbytes;
+ unsigned char *p;
+
+ rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
+ j->rawindex = isc_mem_get(mctx, rawbytes);
+ if (j->rawindex == NULL)
+ FAIL(ISC_R_NOMEMORY);
+
+ CHECK(journal_read(j, j->rawindex, rawbytes));
+
+ j->index = isc_mem_get(mctx, j->header.index_size *
+ sizeof(journal_pos_t));
+ if (j->index == NULL)
+ FAIL(ISC_R_NOMEMORY);
+
+ p = j->rawindex;
+ for (i = 0; i < j->header.index_size; i++) {
+ j->index[i].serial = decode_uint32(p);
+ p += 4;
+ j->index[i].offset = decode_uint32(p);
+ p += 4;
+ }
+ INSIST(p == j->rawindex + rawbytes);
+ }
+ j->offset = -1; /* Invalid, must seek explicitly. */
+
+ /*
+ * Initialize the iterator.
+ */
+ dns_name_init(&j->it.name, NULL);
+ dns_rdata_init(&j->it.rdata);
+
+ /*
+ * Set up empty initial buffers for unchecked and checked
+ * wire format RR data. They will be reallocated
+ * later.
+ */
+ isc_buffer_init(&j->it.source, NULL, 0);
+ isc_buffer_init(&j->it.target, NULL, 0);
+ dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
+
+ j->state =
+ writable ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
+
+ *journalp = j;
+ return (ISC_R_SUCCESS);
+
+ failure:
+ j->magic = 0;
+ if (j->rawindex != NULL)
+ isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
+ sizeof(journal_rawpos_t));
+ if (j->index != NULL)
+ isc_mem_put(j->mctx, j->index, j->header.index_size *
+ sizeof(journal_pos_t));
+ if (j->filename != NULL)
+ isc_mem_free(j->mctx, j->filename);
+ if (j->fp != NULL)
+ (void)isc_stdio_close(j->fp);
+ isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
+ return (result);
+}
+
+isc_result_t
+dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
+ dns_journal_t **journalp)
+{
+ isc_result_t result;
+ size_t namelen;
+ char backup[1024];
+ bool writable, create;
+
+ create = (mode & DNS_JOURNAL_CREATE);
+ writable = (mode & (DNS_JOURNAL_WRITE|DNS_JOURNAL_CREATE));
+
+ result = journal_open(mctx, filename, writable, create, journalp);
+ if (result == ISC_R_NOTFOUND) {
+ namelen = strlen(filename);
+ if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
+ namelen -= 4;
+
+ result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
+ (int)namelen, filename);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ result = journal_open(mctx, backup, writable, writable,
+ journalp);
+ }
+ return (result);
+}
+
+/*
+ * A comparison function defining the sorting order for
+ * entries in the IXFR-style journal file.
+ *
+ * The IXFR format requires that deletions are sorted before
+ * additions, and within either one, SOA records are sorted
+ * before others.
+ *
+ * Also sort the non-SOA records by type as a courtesy to the
+ * server receiving the IXFR - it may help reduce the amount of
+ * rdataset merging it has to do.
+ */
+static int
+ixfr_order(const void *av, const void *bv) {
+ dns_difftuple_t const * const *ap = av;
+ dns_difftuple_t const * const *bp = bv;
+ dns_difftuple_t const *a = *ap;
+ dns_difftuple_t const *b = *bp;
+ int r;
+ int bop = 0, aop = 0;
+
+ switch (a->op) {
+ case DNS_DIFFOP_DEL:
+ case DNS_DIFFOP_DELRESIGN:
+ aop = 1;
+ break;
+ case DNS_DIFFOP_ADD:
+ case DNS_DIFFOP_ADDRESIGN:
+ aop = 0;
+ break;
+ default:
+ INSIST(0);
+ }
+
+ switch (b->op) {
+ case DNS_DIFFOP_DEL:
+ case DNS_DIFFOP_DELRESIGN:
+ bop = 1;
+ break;
+ case DNS_DIFFOP_ADD:
+ case DNS_DIFFOP_ADDRESIGN:
+ bop = 0;
+ break;
+ default:
+ INSIST(0);
+ }
+
+ r = bop - aop;
+ if (r != 0)
+ return (r);
+
+ r = (b->rdata.type == dns_rdatatype_soa) -
+ (a->rdata.type == dns_rdatatype_soa);
+ if (r != 0)
+ return (r);
+
+ r = (a->rdata.type - b->rdata.type);
+ return (r);
+}
+
+/*
+ * Advance '*pos' to the next journal transaction.
+ *
+ * Requires:
+ * *pos refers to a valid journal transaction.
+ *
+ * Ensures:
+ * When ISC_R_SUCCESS is returned,
+ * *pos refers to the next journal transaction.
+ *
+ * Returns one of:
+ *
+ * ISC_R_SUCCESS
+ * ISC_R_NOMORE *pos pointed at the last transaction
+ * Other results due to file errors are possible.
+ */
+static isc_result_t
+journal_next(dns_journal_t *j, journal_pos_t *pos) {
+ isc_result_t result;
+ journal_xhdr_t xhdr;
+ REQUIRE(DNS_JOURNAL_VALID(j));
+
+ result = journal_seek(j, pos->offset);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ if (pos->serial == j->header.end.serial)
+ return (ISC_R_NOMORE);
+ /*
+ * Read the header of the current transaction.
+ * This will return ISC_R_NOMORE if we are at EOF.
+ */
+ result = journal_read_xhdr(j, &xhdr);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ /*
+ * Check serial number consistency.
+ */
+ if (xhdr.serial0 != pos->serial) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal file corrupt: "
+ "expected serial %u, got %u",
+ j->filename, pos->serial, xhdr.serial0);
+ return (ISC_R_UNEXPECTED);
+ }
+
+ /*
+ * Check for offset wraparound.
+ */
+ if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
+ < pos->offset) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: offset too large", j->filename);
+ return (ISC_R_UNEXPECTED);
+ }
+
+ pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
+ pos->serial = xhdr.serial1;
+ return (ISC_R_SUCCESS);
+}
+
+/*
+ * If the index of the journal 'j' contains an entry "better"
+ * than '*best_guess', replace '*best_guess' with it.
+ *
+ * "Better" means having a serial number closer to 'serial'
+ * but not greater than 'serial'.
+ */
+static void
+index_find(dns_journal_t *j, uint32_t serial, journal_pos_t *best_guess) {
+ unsigned int i;
+ if (j->index == NULL)
+ return;
+ for (i = 0; i < j->header.index_size; i++) {
+ if (POS_VALID(j->index[i]) &&
+ DNS_SERIAL_GE(serial, j->index[i].serial) &&
+ DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
+ *best_guess = j->index[i];
+ }
+}
+
+/*
+ * Add a new index entry. If there is no room, make room by removing
+ * the odd-numbered entries and compacting the others into the first
+ * half of the index. This decimates old index entries exponentially
+ * over time, so that the index always contains a much larger fraction
+ * of recent serial numbers than of old ones. This is deliberate -
+ * most index searches are for outgoing IXFR, and IXFR tends to request
+ * recent versions more often than old ones.
+ */
+static void
+index_add(dns_journal_t *j, journal_pos_t *pos) {
+ unsigned int i;
+ if (j->index == NULL)
+ return;
+ /*
+ * Search for a vacant position.
+ */
+ for (i = 0; i < j->header.index_size; i++) {
+ if (! POS_VALID(j->index[i]))
+ break;
+ }
+ if (i == j->header.index_size) {
+ unsigned int k = 0;
+ /*
+ * Found no vacant position. Make some room.
+ */
+ for (i = 0; i < j->header.index_size; i += 2) {
+ j->index[k++] = j->index[i];
+ }
+ i = k; /* 'i' identifies the first vacant position. */
+ while (k < j->header.index_size) {
+ POS_INVALIDATE(j->index[k]);
+ k++;
+ }
+ }
+ INSIST(i < j->header.index_size);
+ INSIST(! POS_VALID(j->index[i]));
+
+ /*
+ * Store the new index entry.
+ */
+ j->index[i] = *pos;
+}
+
+/*
+ * Invalidate any existing index entries that could become
+ * ambiguous when a new transaction with number 'serial' is added.
+ */
+static void
+index_invalidate(dns_journal_t *j, uint32_t serial) {
+ unsigned int i;
+ if (j->index == NULL)
+ return;
+ for (i = 0; i < j->header.index_size; i++) {
+ if (! DNS_SERIAL_GT(serial, j->index[i].serial))
+ POS_INVALIDATE(j->index[i]);
+ }
+}
+
+/*
+ * Try to find a transaction with initial serial number 'serial'
+ * in the journal 'j'.
+ *
+ * If found, store its position at '*pos' and return ISC_R_SUCCESS.
+ *
+ * If 'serial' is current (= the ending serial number of the
+ * last transaction in the journal), set '*pos' to
+ * the position immediately following the last transaction and
+ * return ISC_R_SUCCESS.
+ *
+ * If 'serial' is within the range of addressable serial numbers
+ * covered by the journal but that particular serial number is missing
+ * (from the journal, not just from the index), return ISC_R_NOTFOUND.
+ *
+ * If 'serial' is outside the range of addressable serial numbers
+ * covered by the journal, return ISC_R_RANGE.
+ *
+ */
+static isc_result_t
+journal_find(dns_journal_t *j, uint32_t serial, journal_pos_t *pos) {
+ isc_result_t result;
+ journal_pos_t current_pos;
+ REQUIRE(DNS_JOURNAL_VALID(j));
+
+ if (DNS_SERIAL_GT(j->header.begin.serial, serial))
+ return (ISC_R_RANGE);
+ if (DNS_SERIAL_GT(serial, j->header.end.serial))
+ return (ISC_R_RANGE);
+ if (serial == j->header.end.serial) {
+ *pos = j->header.end;
+ return (ISC_R_SUCCESS);
+ }
+
+ current_pos = j->header.begin;
+ index_find(j, serial, &current_pos);
+
+ while (current_pos.serial != serial) {
+ if (DNS_SERIAL_GT(current_pos.serial, serial))
+ return (ISC_R_NOTFOUND);
+ result = journal_next(j, &current_pos);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ }
+ *pos = current_pos;
+ return (ISC_R_SUCCESS);
+}
+
+isc_result_t
+dns_journal_begin_transaction(dns_journal_t *j) {
+ uint32_t offset;
+ isc_result_t result;
+ journal_rawxhdr_t hdr;
+
+ REQUIRE(DNS_JOURNAL_VALID(j));
+ REQUIRE(j->state == JOURNAL_STATE_WRITE ||
+ j->state == JOURNAL_STATE_INLINE);
+
+ /*
+ * Find the file offset where the new transaction should
+ * be written, and seek there.
+ */
+ if (JOURNAL_EMPTY(&j->header)) {
+ offset = sizeof(journal_rawheader_t) +
+ j->header.index_size * sizeof(journal_rawpos_t);
+ } else {
+ offset = j->header.end.offset;
+ }
+ j->x.pos[0].offset = offset;
+ j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
+ j->x.n_soa = 0;
+
+ CHECK(journal_seek(j, offset));
+
+ /*
+ * Write a dummy transaction header of all zeroes to reserve
+ * space. It will be filled in when the transaction is
+ * finished.
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ CHECK(journal_write(j, &hdr, sizeof(hdr)));
+ j->x.pos[1].offset = j->offset;
+
+ j->state = JOURNAL_STATE_TRANSACTION;
+ result = ISC_R_SUCCESS;
+ failure:
+ return (result);
+}
+
+isc_result_t
+dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
+ dns_difftuple_t *t;
+ isc_buffer_t buffer;
+ void *mem = NULL;
+ uint64_t size;
+ isc_result_t result;
+ isc_region_t used;
+
+ REQUIRE(DNS_DIFF_VALID(diff));
+ REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
+
+ isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
+ (void)dns_diff_print(diff, NULL);
+
+ /*
+ * Pass 1: determine the buffer size needed, and
+ * keep track of SOA serial numbers.
+ */
+ size = 0;
+ for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
+ t = ISC_LIST_NEXT(t, link))
+ {
+ if (t->rdata.type == dns_rdatatype_soa) {
+ if (j->x.n_soa < 2)
+ j->x.pos[j->x.n_soa].serial =
+ dns_soa_getserial(&t->rdata);
+ j->x.n_soa++;
+ }
+ size += sizeof(journal_rawrrhdr_t);
+ size += t->name.length; /* XXX should have access macro? */
+ size += 10;
+ size += t->rdata.length;
+ }
+
+ if (size >= INT32_MAX) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "dns_journal_writediff: %s: journal entry "
+ "too big to be stored: %" PRIu64 " bytes",
+ j->filename, size);
+ return (ISC_R_NOSPACE);
+ }
+
+ mem = isc_mem_get(j->mctx, size);
+ if (mem == NULL)
+ return (ISC_R_NOMEMORY);
+
+ isc_buffer_init(&buffer, mem, size);
+
+ /*
+ * Pass 2. Write RRs to buffer.
+ */
+ for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
+ t = ISC_LIST_NEXT(t, link))
+ {
+ /*
+ * Write the RR header.
+ */
+ isc_buffer_putuint32(&buffer, t->name.length + 10 +
+ t->rdata.length);
+ /*
+ * Write the owner name, RR header, and RR data.
+ */
+ isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
+ isc_buffer_putuint16(&buffer, t->rdata.type);
+ isc_buffer_putuint16(&buffer, t->rdata.rdclass);
+ isc_buffer_putuint32(&buffer, t->ttl);
+ INSIST(t->rdata.length < 65536);
+ isc_buffer_putuint16(&buffer, (uint16_t)t->rdata.length);
+ INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
+ isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
+ }
+
+ isc_buffer_usedregion(&buffer, &used);
+ INSIST(used.length == size);
+
+ j->x.pos[1].offset += used.length;
+
+ /*
+ * Write the buffer contents to the journal file.
+ */
+ CHECK(journal_write(j, used.base, used.length));
+
+ result = ISC_R_SUCCESS;
+
+ failure:
+ if (mem != NULL)
+ isc_mem_put(j->mctx, mem, size);
+ return (result);
+
+}
+
+isc_result_t
+dns_journal_commit(dns_journal_t *j) {
+ isc_result_t result;
+ journal_rawheader_t rawheader;
+ uint64_t total;
+
+ REQUIRE(DNS_JOURNAL_VALID(j));
+ REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
+ j->state == JOURNAL_STATE_INLINE);
+
+ /*
+ * Just write out a updated header.
+ */
+ if (j->state == JOURNAL_STATE_INLINE) {
+ CHECK(journal_fsync(j));
+ journal_header_encode(&j->header, &rawheader);
+ CHECK(journal_seek(j, 0));
+ CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
+ CHECK(journal_fsync(j));
+ j->state = JOURNAL_STATE_WRITE;
+ return (ISC_R_SUCCESS);
+ }
+
+ /*
+ * Perform some basic consistency checks.
+ */
+ if (j->x.n_soa != 2) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: malformed transaction: %d SOAs",
+ j->filename, j->x.n_soa);
+ return (ISC_R_UNEXPECTED);
+ }
+ if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
+ (bind8_compat &&
+ j->x.pos[1].serial == j->x.pos[0].serial)))
+ {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: malformed transaction: serial number "
+ "would decrease", j->filename);
+ return (ISC_R_UNEXPECTED);
+ }
+ if (! JOURNAL_EMPTY(&j->header)) {
+ if (j->x.pos[0].serial != j->header.end.serial) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "malformed transaction: "
+ "%s last serial %u != "
+ "transaction first serial %u",
+ j->filename,
+ j->header.end.serial,
+ j->x.pos[0].serial);
+ return (ISC_R_UNEXPECTED);
+ }
+ }
+
+ /*
+ * We currently don't support huge journal entries.
+ */
+ total = j->x.pos[1].offset - j->x.pos[0].offset;
+ if (total >= INT32_MAX) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "transaction too big to be stored in journal: "
+ "%" PRIu64 "b (max is %" PRIu64 "b)", total,
+ (uint64_t)INT32_MAX);
+ return (ISC_R_UNEXPECTED);
+ }
+
+ /*
+ * Some old journal entries may become non-addressable
+ * when we increment the current serial number. Purge them
+ * by stepping header.begin forward to the first addressable
+ * transaction. Also purge them from the index.
+ */
+ if (! JOURNAL_EMPTY(&j->header)) {
+ while (! DNS_SERIAL_GT(j->x.pos[1].serial,
+ j->header.begin.serial)) {
+ CHECK(journal_next(j, &j->header.begin));
+ }
+ index_invalidate(j, j->x.pos[1].serial);
+ }
+#ifdef notyet
+ if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
+ force_dump(...);
+ }
+#endif
+
+ /*
+ * Commit the transaction data to stable storage.
+ */
+ CHECK(journal_fsync(j));
+
+ if (j->state == JOURNAL_STATE_TRANSACTION) {
+ isc_offset_t offset;
+ offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
+ sizeof(journal_rawxhdr_t);
+ /*
+ * Update the transaction header.
+ */
+ CHECK(journal_seek(j, j->x.pos[0].offset));
+ CHECK(journal_write_xhdr(j, offset, j->x.pos[0].serial,
+ j->x.pos[1].serial));
+ }
+
+ /*
+ * Update the journal header.
+ */
+ if (JOURNAL_EMPTY(&j->header))
+ j->header.begin = j->x.pos[0];
+ j->header.end = j->x.pos[1];
+ journal_header_encode(&j->header, &rawheader);
+ CHECK(journal_seek(j, 0));
+ CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
+
+ /*
+ * Update the index.
+ */
+ index_add(j, &j->x.pos[0]);
+
+ /*
+ * Convert the index into on-disk format and write
+ * it to disk.
+ */
+ CHECK(index_to_disk(j));
+
+ /*
+ * Commit the header to stable storage.
+ */
+ CHECK(journal_fsync(j));
+
+ /*
+ * We no longer have a transaction open.
+ */
+ j->state = JOURNAL_STATE_WRITE;
+
+ result = ISC_R_SUCCESS;
+
+ failure:
+ return (result);
+}
+
+isc_result_t
+dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
+ isc_result_t result;
+ CHECK(dns_diff_sort(diff, ixfr_order));
+ CHECK(dns_journal_begin_transaction(j));
+ CHECK(dns_journal_writediff(j, diff));
+ CHECK(dns_journal_commit(j));
+ result = ISC_R_SUCCESS;
+ failure:
+ return (result);
+}
+
+void
+dns_journal_destroy(dns_journal_t **journalp) {
+ dns_journal_t *j = *journalp;
+ REQUIRE(DNS_JOURNAL_VALID(j));
+
+ j->it.result = ISC_R_FAILURE;
+ dns_name_invalidate(&j->it.name);
+ dns_decompress_invalidate(&j->it.dctx);
+ if (j->rawindex != NULL)
+ isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
+ sizeof(journal_rawpos_t));
+ if (j->index != NULL)
+ isc_mem_put(j->mctx, j->index, j->header.index_size *
+ sizeof(journal_pos_t));
+ if (j->it.target.base != NULL)
+ isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
+ if (j->it.source.base != NULL)
+ isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
+ if (j->filename != NULL)
+ isc_mem_free(j->mctx, j->filename);
+ if (j->fp != NULL)
+ (void)isc_stdio_close(j->fp);
+ j->magic = 0;
+ isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
+ *journalp = NULL;
+}
+
+/*
+ * Roll the open journal 'j' into the database 'db'.
+ * A new database version will be created.
+ */
+
+/* XXX Share code with incoming IXFR? */
+
+static isc_result_t
+roll_forward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
+ isc_buffer_t source; /* Transaction data from disk */
+ isc_buffer_t target; /* Ditto after _fromwire check */
+ uint32_t db_serial; /* Database SOA serial */
+ uint32_t end_serial; /* Last journal SOA serial */
+ isc_result_t result;
+ dns_dbversion_t *ver = NULL;
+ journal_pos_t pos;
+ dns_diff_t diff;
+ unsigned int n_soa = 0;
+ unsigned int n_put = 0;
+ dns_diffop_t op;
+
+ REQUIRE(DNS_JOURNAL_VALID(j));
+ REQUIRE(DNS_DB_VALID(db));
+
+ dns_diff_init(j->mctx, &diff);
+
+ /*
+ * Set up empty initial buffers for unchecked and checked
+ * wire format transaction data. They will be reallocated
+ * later.
+ */
+ isc_buffer_init(&source, NULL, 0);
+ isc_buffer_init(&target, NULL, 0);
+
+ /*
+ * Create the new database version.
+ */
+ CHECK(dns_db_newversion(db, &ver));
+
+ /*
+ * Get the current database SOA serial number.
+ */
+ CHECK(dns_db_getsoaserial(db, ver, &db_serial));
+
+ /*
+ * Locate a journal entry for the current database serial.
+ */
+ CHECK(journal_find(j, db_serial, &pos));
+ /*
+ * XXX do more drastic things, like marking zone stale,
+ * if this fails?
+ */
+ /*
+ * XXXRTH The zone code should probably mark the zone as bad and
+ * scream loudly into the log if this is a dynamic update
+ * log reply that failed.
+ */
+
+ end_serial = dns_journal_last_serial(j);
+ if (db_serial == end_serial)
+ CHECK(DNS_R_UPTODATE);
+
+ CHECK(dns_journal_iter_init(j, db_serial, end_serial));
+
+ for (result = dns_journal_first_rr(j);
+ result == ISC_R_SUCCESS;
+ result = dns_journal_next_rr(j))
+ {
+ dns_name_t *name;
+ uint32_t ttl;
+ dns_rdata_t *rdata;
+ dns_difftuple_t *tuple = NULL;
+
+ name = NULL;
+ rdata = NULL;
+ dns_journal_current_rr(j, &name, &ttl, &rdata);
+
+ if (rdata->type == dns_rdatatype_soa) {
+ n_soa++;
+ if (n_soa == 2)
+ db_serial = j->it.current_serial;
+ }
+
+ if (n_soa == 3)
+ n_soa = 1;
+ if (n_soa == 0) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal file corrupt: missing "
+ "initial SOA", j->filename);
+ FAIL(ISC_R_UNEXPECTED);
+ }
+ if ((options & DNS_JOURNALOPT_RESIGN) != 0)
+ op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN :
+ DNS_DIFFOP_ADDRESIGN;
+ else
+ op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
+
+ CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
+ &tuple));
+ dns_diff_append(&diff, &tuple);
+
+ if (++n_put > 100) {
+ isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
+ "%s: applying diff to database (%u)",
+ j->filename, db_serial);
+ (void)dns_diff_print(&diff, NULL);
+ CHECK(dns_diff_apply(&diff, db, ver));
+ dns_diff_clear(&diff);
+ n_put = 0;
+ }
+ }
+ if (result == ISC_R_NOMORE)
+ result = ISC_R_SUCCESS;
+ CHECK(result);
+
+ if (n_put != 0) {
+ isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
+ "%s: applying final diff to database (%u)",
+ j->filename, db_serial);
+ (void)dns_diff_print(&diff, NULL);
+ CHECK(dns_diff_apply(&diff, db, ver));
+ dns_diff_clear(&diff);
+ }
+
+ failure:
+ if (ver != NULL)
+ dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
+ true : false);
+
+ if (source.base != NULL)
+ isc_mem_put(j->mctx, source.base, source.length);
+ if (target.base != NULL)
+ isc_mem_put(j->mctx, target.base, target.length);
+
+ dns_diff_clear(&diff);
+
+ INSIST(ver == NULL);
+
+ return (result);
+}
+
+isc_result_t
+dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, unsigned int options,
+ const char *filename)
+{
+ dns_journal_t *j;
+ isc_result_t result;
+
+ REQUIRE(DNS_DB_VALID(db));
+ REQUIRE(filename != NULL);
+
+ j = NULL;
+ result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
+ if (result == ISC_R_NOTFOUND) {
+ isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
+ "no journal file, but that's OK");
+ return (DNS_R_NOJOURNAL);
+ }
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ if (JOURNAL_EMPTY(&j->header))
+ result = DNS_R_UPTODATE;
+ else
+ result = roll_forward(j, db, options);
+
+ dns_journal_destroy(&j);
+
+ return (result);
+}
+
+isc_result_t
+dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
+ dns_journal_t *j;
+ isc_buffer_t source; /* Transaction data from disk */
+ isc_buffer_t target; /* Ditto after _fromwire check */
+ uint32_t start_serial; /* Database SOA serial */
+ uint32_t end_serial; /* Last journal SOA serial */
+ isc_result_t result;
+ dns_diff_t diff;
+ unsigned int n_soa = 0;
+ unsigned int n_put = 0;
+
+ REQUIRE(filename != NULL);
+
+ j = NULL;
+ result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
+ if (result == ISC_R_NOTFOUND) {
+ isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
+ return (DNS_R_NOJOURNAL);
+ }
+
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "journal open failure: %s: %s",
+ isc_result_totext(result), filename);
+ return (result);
+ }
+
+ if (j->header.serialset)
+ fprintf(file, "Source serial = %u\n", j->header.sourceserial);
+ dns_diff_init(j->mctx, &diff);
+
+ /*
+ * Set up empty initial buffers for unchecked and checked
+ * wire format transaction data. They will be reallocated
+ * later.
+ */
+ isc_buffer_init(&source, NULL, 0);
+ isc_buffer_init(&target, NULL, 0);
+
+ start_serial = dns_journal_first_serial(j);
+ end_serial = dns_journal_last_serial(j);
+
+ CHECK(dns_journal_iter_init(j, start_serial, end_serial));
+
+ for (result = dns_journal_first_rr(j);
+ result == ISC_R_SUCCESS;
+ result = dns_journal_next_rr(j))
+ {
+ dns_name_t *name;
+ uint32_t ttl;
+ dns_rdata_t *rdata;
+ dns_difftuple_t *tuple = NULL;
+
+ name = NULL;
+ rdata = NULL;
+ dns_journal_current_rr(j, &name, &ttl, &rdata);
+
+ if (rdata->type == dns_rdatatype_soa)
+ n_soa++;
+
+ if (n_soa == 3)
+ n_soa = 1;
+ if (n_soa == 0) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal file corrupt: missing "
+ "initial SOA", j->filename);
+ FAIL(ISC_R_UNEXPECTED);
+ }
+ CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
+ DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
+ name, ttl, rdata, &tuple));
+ dns_diff_append(&diff, &tuple);
+
+ if (++n_put > 100) {
+ result = dns_diff_print(&diff, file);
+ dns_diff_clear(&diff);
+ n_put = 0;
+ if (result != ISC_R_SUCCESS)
+ break;
+ }
+ }
+ if (result == ISC_R_NOMORE)
+ result = ISC_R_SUCCESS;
+ CHECK(result);
+
+ if (n_put != 0) {
+ result = dns_diff_print(&diff, file);
+ dns_diff_clear(&diff);
+ }
+ goto cleanup;
+
+ failure:
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: cannot print: journal file corrupt", j->filename);
+
+ cleanup:
+ if (source.base != NULL)
+ isc_mem_put(j->mctx, source.base, source.length);
+ if (target.base != NULL)
+ isc_mem_put(j->mctx, target.base, target.length);
+
+ dns_diff_clear(&diff);
+ dns_journal_destroy(&j);
+
+ return (result);
+}
+
+/**************************************************************************/
+/*
+ * Miscellaneous accessors.
+ */
+uint32_t
+dns_journal_first_serial(dns_journal_t *j) {
+ return (j->header.begin.serial);
+}
+
+uint32_t
+dns_journal_last_serial(dns_journal_t *j) {
+ return (j->header.end.serial);
+}
+
+void
+dns_journal_set_sourceserial(dns_journal_t *j, uint32_t sourceserial) {
+
+ REQUIRE(j->state == JOURNAL_STATE_WRITE ||
+ j->state == JOURNAL_STATE_INLINE ||
+ j->state == JOURNAL_STATE_TRANSACTION);
+
+ j->header.sourceserial = sourceserial;
+ j->header.serialset = true;
+ if (j->state == JOURNAL_STATE_WRITE)
+ j->state = JOURNAL_STATE_INLINE;
+}
+
+bool
+dns_journal_get_sourceserial(dns_journal_t *j, uint32_t *sourceserial) {
+ REQUIRE(sourceserial != NULL);
+
+ if (!j->header.serialset)
+ return (false);
+ *sourceserial = j->header.sourceserial;
+ return (true);
+}
+
+/**************************************************************************/
+/*
+ * Iteration support.
+ *
+ * When serving an outgoing IXFR, we transmit a part the journal starting
+ * at the serial number in the IXFR request and ending at the serial
+ * number that is current when the IXFR request arrives. The ending
+ * serial number is not necessarily at the end of the journal:
+ * the journal may grow while the IXFR is in progress, but we stop
+ * when we reach the serial number that was current when the IXFR started.
+ */
+
+static isc_result_t read_one_rr(dns_journal_t *j);
+
+/*
+ * Make sure the buffer 'b' is has at least 'size' bytes
+ * allocated, and clear it.
+ *
+ * Requires:
+ * Either b->base is NULL, or it points to b->length bytes of memory
+ * previously allocated by isc_mem_get().
+ */
+
+static isc_result_t
+size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
+ if (b->length < size) {
+ void *mem = isc_mem_get(mctx, size);
+ if (mem == NULL)
+ return (ISC_R_NOMEMORY);
+ if (b->base != NULL)
+ isc_mem_put(mctx, b->base, b->length);
+ b->base = mem;
+ b->length = size;
+ }
+ isc_buffer_clear(b);
+ return (ISC_R_SUCCESS);
+}
+
+isc_result_t
+dns_journal_iter_init(dns_journal_t *j,
+ uint32_t begin_serial, uint32_t end_serial)
+{
+ isc_result_t result;
+
+ CHECK(journal_find(j, begin_serial, &j->it.bpos));
+ INSIST(j->it.bpos.serial == begin_serial);
+
+ CHECK(journal_find(j, end_serial, &j->it.epos));
+ INSIST(j->it.epos.serial == end_serial);
+
+ result = ISC_R_SUCCESS;
+ failure:
+ j->it.result = result;
+ return (j->it.result);
+}
+
+
+isc_result_t
+dns_journal_first_rr(dns_journal_t *j) {
+ isc_result_t result;
+
+ /*
+ * Seek to the beginning of the first transaction we are
+ * interested in.
+ */
+ CHECK(journal_seek(j, j->it.bpos.offset));
+ j->it.current_serial = j->it.bpos.serial;
+
+ j->it.xsize = 0; /* We have no transaction data yet... */
+ j->it.xpos = 0; /* ...and haven't used any of it. */
+
+ return (read_one_rr(j));
+
+ failure:
+ return (result);
+}
+
+static isc_result_t
+read_one_rr(dns_journal_t *j) {
+ isc_result_t result;
+
+ dns_rdatatype_t rdtype;
+ dns_rdataclass_t rdclass;
+ unsigned int rdlen;
+ uint32_t ttl;
+ journal_xhdr_t xhdr;
+ journal_rrhdr_t rrhdr;
+
+ if (j->offset > j->it.epos.offset) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal corrupt: possible integer overflow",
+ j->filename);
+ return (ISC_R_UNEXPECTED);
+ }
+ if (j->offset == j->it.epos.offset)
+ return (ISC_R_NOMORE);
+ if (j->it.xpos == j->it.xsize) {
+ /*
+ * We are at a transaction boundary.
+ * Read another transaction header.
+ */
+ CHECK(journal_read_xhdr(j, &xhdr));
+ if (xhdr.size == 0) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal corrupt: empty transaction",
+ j->filename);
+ FAIL(ISC_R_UNEXPECTED);
+ }
+ if (xhdr.serial0 != j->it.current_serial) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal file corrupt: "
+ "expected serial %u, got %u",
+ j->filename,
+ j->it.current_serial, xhdr.serial0);
+ FAIL(ISC_R_UNEXPECTED);
+ }
+ j->it.xsize = xhdr.size;
+ j->it.xpos = 0;
+ }
+ /*
+ * Read an RR.
+ */
+ CHECK(journal_read_rrhdr(j, &rrhdr));
+ /*
+ * Perform a sanity check on the journal RR size.
+ * The smallest possible RR has a 1-byte owner name
+ * and a 10-byte header. The largest possible
+ * RR has 65535 bytes of data, a header, and a maximum-
+ * size owner name, well below 70 k total.
+ */
+ if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
+ isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
+ "%s: journal corrupt: impossible RR size "
+ "(%d bytes)", j->filename, rrhdr.size);
+ FAIL(ISC_R_UNEXPECTED);
+ }
+
+ CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
+ CHECK(journal_read(j, j->it.source.base, rrhdr.size));
+ isc_buffer_add(&j->it.source, rrhdr.size);
+
+ /*
+ * The target buffer is made the same size
+ * as the source buffer, with the assumption that when
+ * no compression in present, the output of dns_*_fromwire()
+ * is no larger than the input.
+ */
+ CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
+
+ /*
+ * Parse the owner name. We don't know where it
+ * ends yet, so we make the entire "remaining"
+ * part of the buffer "active".
+ */
+ isc_buffer_setactive(&j->it.source,
+ j->it.source.used - j->it.source.current);
+ CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
+ &j->it.dctx, 0, &j->it.target));
+
+ /*
+ * Check that the RR header is there, and parse it.
+ */
+ if (isc_buffer_remaininglength(&j->it.source) < 10)
+ FAIL(DNS_R_FORMERR);
+
+ rdtype = isc_buffer_getuint16(&j->it.source);
+ rdclass = isc_buffer_getuint16(&j->it.source);
+ ttl = isc_buffer_getuint32(&j->it.source);
+ rdlen = isc_buffer_getuint16(&j->it.source);
+
+ /*
+ * Parse the rdata.
+ */
+ if (isc_buffer_remaininglength(&j->it.source) != rdlen)
+ FAIL(DNS_R_FORMERR);
+ isc_buffer_setactive(&j->it.source, rdlen);
+ dns_rdata_reset(&j->it.rdata);
+ CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
+ rdtype, &j->it.source, &j->it.dctx,
+ 0, &j->it.target));
+ j->it.ttl = ttl;
+
+ j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
+ if (rdtype == dns_rdatatype_soa) {
+ /* XXX could do additional consistency checks here */
+ j->it.current_serial = dns_soa_getserial(&j->it.rdata);
+ }
+
+ result = ISC_R_SUCCESS;
+
+ failure:
+ j->it.result = result;
+ return (result);
+}
+
+isc_result_t
+dns_journal_next_rr(dns_journal_t *j) {
+ j->it.result = read_one_rr(j);
+ return (j->it.result);
+}
+
+void
+dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, uint32_t *ttl,
+ dns_rdata_t **rdata)
+{
+ REQUIRE(j->it.result == ISC_R_SUCCESS);
+ *name = &j->it.name;
+ *ttl = j->it.ttl;
+ *rdata = &j->it.rdata;
+}
+
+/**************************************************************************/
+/*
+ * Generating diffs from databases
+ */
+
+/*
+ * Construct a diff containing all the RRs at the current name of the
+ * database iterator 'dbit' in database 'db', version 'ver'.
+ * Set '*name' to the current name, and append the diff to 'diff'.
+ * All new tuples will have the operation 'op'.
+ *
+ * Requires: 'name' must have buffer large enough to hold the name.
+ * Typically, a dns_fixedname_t would be used.
+ */
+static isc_result_t
+get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
+ dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
+ dns_diff_t *diff)
+{
+ isc_result_t result;
+ dns_dbnode_t *node = NULL;
+ dns_rdatasetiter_t *rdsiter = NULL;
+ dns_difftuple_t *tuple = NULL;
+
+ result = dns_dbiterator_current(dbit, &node, name);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
+ if (result != ISC_R_SUCCESS)
+ goto cleanup_node;
+
+ for (result = dns_rdatasetiter_first(rdsiter);
+ result == ISC_R_SUCCESS;
+ result = dns_rdatasetiter_next(rdsiter))
+ {
+ dns_rdataset_t rdataset;
+
+ dns_rdataset_init(&rdataset);
+ dns_rdatasetiter_current(rdsiter, &rdataset);
+
+ for (result = dns_rdataset_first(&rdataset);
+ result == ISC_R_SUCCESS;
+ result = dns_rdataset_next(&rdataset))
+ {
+ dns_rdata_t rdata = DNS_RDATA_INIT;
+ dns_rdataset_current(&rdataset, &rdata);
+ result = dns_difftuple_create(diff->mctx, op, name,
+ rdataset.ttl, &rdata,
+ &tuple);
+ if (result != ISC_R_SUCCESS) {
+ dns_rdataset_disassociate(&rdataset);
+ goto cleanup_iterator;
+ }
+ dns_diff_append(diff, &tuple);
+ }
+ dns_rdataset_disassociate(&rdataset);
+ if (result != ISC_R_NOMORE)
+ goto cleanup_iterator;
+ }
+ if (result != ISC_R_NOMORE)
+ goto cleanup_iterator;
+
+ result = ISC_R_SUCCESS;
+
+ cleanup_iterator:
+ dns_rdatasetiter_destroy(&rdsiter);
+
+ cleanup_node:
+ dns_db_detachnode(db, &node);
+
+ return (result);
+}
+
+/*
+ * Comparison function for use by dns_diff_subtract when sorting
+ * the diffs to be subtracted. The sort keys are the rdata type
+ * and the rdata itself. The owner name is ignored, because
+ * it is known to be the same for all tuples.
+ */
+static int
+rdata_order(const void *av, const void *bv) {
+ dns_difftuple_t const * const *ap = av;
+ dns_difftuple_t const * const *bp = bv;
+ dns_difftuple_t const *a = *ap;
+ dns_difftuple_t const *b = *bp;
+ int r;
+ r = (b->rdata.type - a->rdata.type);
+ if (r != 0)
+ return (r);
+ r = dns_rdata_compare(&a->rdata, &b->rdata);
+ return (r);
+}
+
+static isc_result_t
+dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
+ isc_result_t result;
+ dns_difftuple_t *p[2];
+ int i, t;
+ bool append;
+
+ CHECK(dns_diff_sort(&diff[0], rdata_order));
+ CHECK(dns_diff_sort(&diff[1], rdata_order));
+
+ for (;;) {
+ p[0] = ISC_LIST_HEAD(diff[0].tuples);
+ p[1] = ISC_LIST_HEAD(diff[1].tuples);
+ if (p[0] == NULL && p[1] == NULL)
+ break;
+
+ for (i = 0; i < 2; i++)
+ if (p[!i] == NULL) {
+ ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
+ ISC_LIST_APPEND(r->tuples, p[i], link);
+ goto next;
+ }
+ t = rdata_order(&p[0], &p[1]);
+ if (t < 0) {
+ ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
+ ISC_LIST_APPEND(r->tuples, p[0], link);
+ goto next;
+ }
+ if (t > 0) {
+ ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
+ ISC_LIST_APPEND(r->tuples, p[1], link);
+ goto next;
+ }
+ INSIST(t == 0);
+ /*
+ * Identical RRs in both databases; skip them both
+ * if the ttl differs.
+ */
+ append = (p[0]->ttl != p[1]->ttl);
+ for (i = 0; i < 2; i++) {
+ ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
+ if (append) {
+ ISC_LIST_APPEND(r->tuples, p[i], link);
+ } else {
+ dns_difftuple_free(&p[i]);
+ }
+ }
+ next: ;
+ }
+ result = ISC_R_SUCCESS;
+ failure:
+ return (result);
+}
+
+static isc_result_t
+diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera,
+ dns_db_t *dbb, dns_dbversion_t *dbverb,
+ unsigned int options, dns_diff_t *resultdiff)
+{
+ dns_db_t *db[2];
+ dns_dbversion_t *ver[2];
+ dns_dbiterator_t *dbit[2] = { NULL, NULL };
+ bool have[2] = { false, false };
+ dns_fixedname_t fixname[2];
+ isc_result_t result, itresult[2];
+ dns_diff_t diff[2];
+ int i, t;
+
+ db[0] = dba, db[1] = dbb;
+ ver[0] = dbvera, ver[1] = dbverb;
+
+ dns_diff_init(resultdiff->mctx, &diff[0]);
+ dns_diff_init(resultdiff->mctx, &diff[1]);
+
+ dns_fixedname_init(&fixname[0]);
+ dns_fixedname_init(&fixname[1]);
+
+ result = dns_db_createiterator(db[0], options, &dbit[0]);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ result = dns_db_createiterator(db[1], options, &dbit[1]);
+ if (result != ISC_R_SUCCESS)
+ goto cleanup_iterator;
+
+ itresult[0] = dns_dbiterator_first(dbit[0]);
+ itresult[1] = dns_dbiterator_first(dbit[1]);
+
+ for (;;) {
+ for (i = 0; i < 2; i++) {
+ if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
+ CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
+ dns_fixedname_name(&fixname[i]),
+ i == 0 ?
+ DNS_DIFFOP_ADD :
+ DNS_DIFFOP_DEL,
+ &diff[i]));
+ itresult[i] = dns_dbiterator_next(dbit[i]);
+ have[i] = true;
+ }
+ }
+
+ if (! have[0] && ! have[1]) {
+ INSIST(ISC_LIST_EMPTY(diff[0].tuples));
+ INSIST(ISC_LIST_EMPTY(diff[1].tuples));
+ break;
+ }
+
+ for (i = 0; i < 2; i++) {
+ if (! have[!i]) {
+ ISC_LIST_APPENDLIST(resultdiff->tuples,
+ diff[i].tuples, link);
+ INSIST(ISC_LIST_EMPTY(diff[i].tuples));
+ have[i] = false;
+ goto next;
+ }
+ }
+
+ t = dns_name_compare(dns_fixedname_name(&fixname[0]),
+ dns_fixedname_name(&fixname[1]));
+ if (t < 0) {
+ ISC_LIST_APPENDLIST(resultdiff->tuples,
+ diff[0].tuples, link);
+ INSIST(ISC_LIST_EMPTY(diff[0].tuples));
+ have[0] = false;
+ continue;
+ }
+ if (t > 0) {
+ ISC_LIST_APPENDLIST(resultdiff->tuples,
+ diff[1].tuples, link);
+ INSIST(ISC_LIST_EMPTY(diff[1].tuples));
+ have[1] = false;
+ continue;
+ }
+ INSIST(t == 0);
+ CHECK(dns_diff_subtract(diff, resultdiff));
+ INSIST(ISC_LIST_EMPTY(diff[0].tuples));
+ INSIST(ISC_LIST_EMPTY(diff[1].tuples));
+ have[0] = have[1] = false;
+ next: ;
+ }
+ if (itresult[0] != ISC_R_NOMORE)
+ FAIL(itresult[0]);
+ if (itresult[1] != ISC_R_NOMORE)
+ FAIL(itresult[1]);
+
+ INSIST(ISC_LIST_EMPTY(diff[0].tuples));
+ INSIST(ISC_LIST_EMPTY(diff[1].tuples));
+
+ failure:
+ dns_dbiterator_destroy(&dbit[1]);
+
+ cleanup_iterator:
+ dns_dbiterator_destroy(&dbit[0]);
+ dns_diff_clear(&diff[0]);
+ dns_diff_clear(&diff[1]);
+ return (result);
+}
+
+/*
+ * Compare the databases 'dba' and 'dbb' and generate a journal
+ * entry containing the changes to make 'dba' from 'dbb' (note
+ * the order). This journal entry will consist of a single,
+ * possibly very large transaction.
+ */
+isc_result_t
+dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
+ dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename)
+{
+ isc_result_t result;
+ dns_diff_t diff;
+
+ dns_diff_init(mctx, &diff);
+
+ result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename);
+
+ dns_diff_clear(&diff);
+
+ return (result);
+}
+
+isc_result_t
+dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
+ dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename)
+{
+ isc_result_t result;
+ dns_journal_t *journal = NULL;
+
+ if (filename != NULL) {
+ result = dns_journal_open(diff->mctx, filename,
+ DNS_JOURNAL_CREATE, &journal);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+ }
+
+ CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff));
+ CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff));
+
+ if (journal != NULL) {
+ if (ISC_LIST_EMPTY(diff->tuples))
+ isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
+ else
+ CHECK(dns_journal_write_transaction(journal, diff));
+ }
+
+ failure:
+ if (journal != NULL)
+ dns_journal_destroy(&journal);
+ return (result);
+}
+
+isc_result_t
+dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
+ uint32_t target_size)
+{
+ unsigned int i;
+ journal_pos_t best_guess;
+ journal_pos_t current_pos;
+ dns_journal_t *j1 = NULL;
+ dns_journal_t *j2 = NULL;
+ journal_rawheader_t rawheader;
+ unsigned int copy_length;
+ size_t namelen;
+ char *buf = NULL;
+ unsigned int size = 0;
+ isc_result_t result;
+ unsigned int indexend;
+ char newname[1024];
+ char backup[1024];
+ bool is_backup = false;
+
+ REQUIRE(filename != NULL);
+
+ namelen = strlen(filename);
+ if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
+ namelen -= 4;
+
+ result = isc_string_printf(newname, sizeof(newname), "%.*s.jnw",
+ (int)namelen, filename);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
+ (int)namelen, filename);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ result = journal_open(mctx, filename, false, false, &j1);
+ if (result == ISC_R_NOTFOUND) {
+ is_backup = true;
+ result = journal_open(mctx, backup, false, false, &j1);
+ }
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ if (JOURNAL_EMPTY(&j1->header)) {
+ dns_journal_destroy(&j1);
+ return (ISC_R_SUCCESS);
+ }
+
+ if (DNS_SERIAL_GT(j1->header.begin.serial, serial) ||
+ DNS_SERIAL_GT(serial, j1->header.end.serial)) {
+ dns_journal_destroy(&j1);
+ return (ISC_R_RANGE);
+ }
+
+ /*
+ * Cope with very small target sizes.
+ */
+ indexend = sizeof(journal_rawheader_t) +
+ j1->header.index_size * sizeof(journal_rawpos_t);
+ if (target_size < indexend * 2)
+ target_size = target_size/2 + indexend;
+
+ /*
+ * See if there is any work to do.
+ */
+ if ((uint32_t) j1->header.end.offset < target_size) {
+ dns_journal_destroy(&j1);
+ return (ISC_R_SUCCESS);
+ }
+
+ CHECK(journal_open(mctx, newname, true, true, &j2));
+
+ /*
+ * Remove overhead so space test below can succeed.
+ */
+ if (target_size >= indexend)
+ target_size -= indexend;
+
+ /*
+ * Find if we can create enough free space.
+ */
+ best_guess = j1->header.begin;
+ for (i = 0; i < j1->header.index_size; i++) {
+ if (POS_VALID(j1->index[i]) &&
+ DNS_SERIAL_GE(serial, j1->index[i].serial) &&
+ ((uint32_t)(j1->header.end.offset - j1->index[i].offset)
+ >= target_size / 2) &&
+ j1->index[i].offset > best_guess.offset)
+ best_guess = j1->index[i];
+ }
+
+ current_pos = best_guess;
+ while (current_pos.serial != serial) {
+ CHECK(journal_next(j1, &current_pos));
+ if (current_pos.serial == j1->header.end.serial)
+ break;
+
+ if (DNS_SERIAL_GE(serial, current_pos.serial) &&
+ ((uint32_t)(j1->header.end.offset - current_pos.offset)
+ >= (target_size / 2)) &&
+ current_pos.offset > best_guess.offset)
+ best_guess = current_pos;
+ else
+ break;
+ }
+
+ INSIST(best_guess.serial != j1->header.end.serial);
+ if (best_guess.serial != serial)
+ CHECK(journal_next(j1, &best_guess));
+
+ /*
+ * We should now be roughly half target_size provided
+ * we did not reach 'serial'. If not we will just copy
+ * all uncommitted deltas regardless of the size.
+ */
+ copy_length = j1->header.end.offset - best_guess.offset;
+
+ if (copy_length != 0) {
+ /*
+ * Copy best_guess to end into space just freed.
+ */
+ size = 64*1024;
+ if (copy_length < size)
+ size = copy_length;
+ buf = isc_mem_get(mctx, size);
+ if (buf == NULL) {
+ result = ISC_R_NOMEMORY;
+ goto failure;
+ }
+
+ CHECK(journal_seek(j1, best_guess.offset));
+ CHECK(journal_seek(j2, indexend));
+ for (i = 0; i < copy_length; i += size) {
+ unsigned int len = (copy_length - i) > size ? size :
+ (copy_length - i);
+ CHECK(journal_read(j1, buf, len));
+ CHECK(journal_write(j2, buf, len));
+ }
+
+ CHECK(journal_fsync(j2));
+
+ /*
+ * Compute new header.
+ */
+ j2->header.begin.serial = best_guess.serial;
+ j2->header.begin.offset = indexend;
+ j2->header.end.serial = j1->header.end.serial;
+ j2->header.end.offset = indexend + copy_length;
+ j2->header.sourceserial = j1->header.sourceserial;
+ j2->header.serialset = j1->header.serialset;
+
+ /*
+ * Update the journal header.
+ */
+ journal_header_encode(&j2->header, &rawheader);
+ CHECK(journal_seek(j2, 0));
+ CHECK(journal_write(j2, &rawheader, sizeof(rawheader)));
+ CHECK(journal_fsync(j2));
+
+ /*
+ * Build new index.
+ */
+ current_pos = j2->header.begin;
+ while (current_pos.serial != j2->header.end.serial) {
+ index_add(j2, &current_pos);
+ CHECK(journal_next(j2, &current_pos));
+ }
+
+ /*
+ * Write index.
+ */
+ CHECK(index_to_disk(j2));
+ CHECK(journal_fsync(j2));
+
+ indexend = j2->header.end.offset;
+ POST(indexend);
+ }
+
+ /*
+ * Close both journals before trying to rename files (this is
+ * necessary on WIN32).
+ */
+ dns_journal_destroy(&j1);
+ dns_journal_destroy(&j2);
+
+ /*
+ * With a UFS file system this should just succeed and be atomic.
+ * Any IXFR outs will just continue and the old journal will be
+ * removed on final close.
+ *
+ * With MSDOS / NTFS we need to do a two stage rename, triggered
+ * by EEXIST. (If any IXFR's are running in other threads, however,
+ * this will fail, and the journal will not be compacted. But
+ * if so, hopefully they'll be finished by the next time we
+ * compact.)
+ */
+ if (rename(newname, filename) == -1) {
+ if (errno == EEXIST && !is_backup) {
+ result = isc_file_remove(backup);
+ if (result != ISC_R_SUCCESS &&
+ result != ISC_R_FILENOTFOUND)
+ goto failure;
+ if (rename(filename, backup) == -1)
+ goto maperrno;
+ if (rename(newname, filename) == -1)
+ goto maperrno;
+ (void)isc_file_remove(backup);
+ } else {
+ maperrno:
+ result = ISC_R_FAILURE;
+ goto failure;
+ }
+ }
+
+ result = ISC_R_SUCCESS;
+
+ failure:
+ (void)isc_file_remove(newname);
+ if (buf != NULL)
+ isc_mem_put(mctx, buf, size);
+ if (j1 != NULL)
+ dns_journal_destroy(&j1);
+ if (j2 != NULL)
+ dns_journal_destroy(&j2);
+ return (result);
+}
+
+static isc_result_t
+index_to_disk(dns_journal_t *j) {
+ isc_result_t result = ISC_R_SUCCESS;
+
+ if (j->header.index_size != 0) {
+ unsigned int i;
+ unsigned char *p;
+ unsigned int rawbytes;
+
+ rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
+
+ p = j->rawindex;
+ for (i = 0; i < j->header.index_size; i++) {
+ encode_uint32(j->index[i].serial, p);
+ p += 4;
+ encode_uint32(j->index[i].offset, p);
+ p += 4;
+ }
+ INSIST(p == j->rawindex + rawbytes);
+
+ CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
+ CHECK(journal_write(j, j->rawindex, rawbytes));
+ }
+failure:
+ return (result);
+}