1 files changed, 1685 insertions, 0 deletions
diff --git a/src/lib-index/mail-transaction-log-file.c b/src/lib-index/mail-transaction-log-file.c
new file mode 100644
index 0000000..1820169
--- /dev/null
+++ b/src/lib-index/mail-transaction-log-file.c
@@ -0,0 +1,1685 @@
+/* Copyright (c) 2003-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "ioloop.h"
+#include "file-dotlock.h"
+#include "nfs-workarounds.h"
+#include "read-full.h"
+#include "write-full.h"
+#include "mmap-util.h"
+#include "mail-index-private.h"
+#include "mail-index-modseq.h"
+#include "mail-transaction-log-private.h"
+
+#define LOG_PREFETCH IO_BLOCK_SIZE
+#define MEMORY_LOG_NAME "(in-memory transaction log file)"
+#define LOG_NEW_DOTLOCK_SUFFIX ".newlock"
+
+static int
+mail_transaction_log_file_sync(struct mail_transaction_log_file *file,
+			       bool *retry_r, const char **reason_r);
+
+static void
+log_file_set_syscall_error(struct mail_transaction_log_file *file,
+			   const char *function)
+{
+	mail_index_file_set_syscall_error(file->log->index,
+					  file->filepath, function);
+}
+
+static void
+mail_transaction_log_mark_corrupted(struct mail_transaction_log_file *file)
+{
+	unsigned int offset =
+		offsetof(struct mail_transaction_log_header, indexid);
+	int flags;
+
+	if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file) ||
+	    file->log->index->readonly)
+		return;
+
+	/* indexid=0 marks the log file as corrupted. we opened the file with
+	   O_APPEND, and now we need to drop it for pwrite() to work (at least
+	   in Linux) */
+	flags = fcntl(file->fd, F_GETFL, 0);
+	if (flags < 0) {
+		mail_index_file_set_syscall_error(file->log->index,
+			file->filepath, "fcntl(F_GETFL)");
+		return;
+	}
+	if (fcntl(file->fd, F_SETFL, flags & ~O_APPEND) < 0) {
+		mail_index_file_set_syscall_error(file->log->index,
+			file->filepath, "fcntl(F_SETFL)");
+		return;
+	}
+	if (pwrite_full(file->fd, &file->hdr.indexid,
+			sizeof(file->hdr.indexid), offset) < 0) {
+		mail_index_file_set_syscall_error(file->log->index,
+						  file->filepath, "pwrite()");
+	}
+}
+
+void
+mail_transaction_log_file_set_corrupted(struct mail_transaction_log_file *file,
+					const char *fmt, ...)
+{
+	va_list va;
+
+	file->corrupted = TRUE;
+	file->hdr.indexid = 0;
+	mail_transaction_log_mark_corrupted(file);
+
+	va_start(va, fmt);
+	T_BEGIN {
+		mail_index_set_error(file->log->index,
+			"Corrupted transaction log file %s seq %u: %s "
+			"(sync_offset=%"PRIuUOFF_T")",
+			file->filepath, file->hdr.file_seq,
+			t_strdup_vprintf(fmt, va), file->sync_offset);
+	} T_END;
+	va_end(va);
+}
+
+struct mail_transaction_log_file *
+mail_transaction_log_file_alloc(struct mail_transaction_log *log,
+				const char *path)
+{
+	struct mail_transaction_log_file *file;
+
+	file = i_new(struct mail_transaction_log_file, 1);
+	file->log = log;
+	file->filepath = i_strdup(path);
+	file->fd = -1;
+	return file;
+}
+
+void mail_transaction_log_file_free(struct mail_transaction_log_file **_file)
+{
+	struct mail_transaction_log_file *file = *_file;
+	struct mail_transaction_log_file **p;
+	int old_errno = errno;
+
+	*_file = NULL;
+
+	i_assert(!file->locked);
+	i_assert(file->refcount == 0);
+
+	for (p = &file->log->files; *p != NULL; p = &(*p)->next) {
+		if (*p == file) {
+			*p = file->next;
+			break;
+		}
+	}
+
+	if (file == file->log->head)
+		file->log->head = NULL;
+
+	buffer_free(&file->buffer);
+
+	if (file->mmap_base != NULL) {
+		if (munmap(file->mmap_base, file->mmap_size) < 0)
+			log_file_set_syscall_error(file, "munmap()");
+	}
+
+	if (file->fd != -1) {
+		if (close(file->fd) < 0)
+			log_file_set_syscall_error(file, "close()");
+	}
+
+	i_free(file->filepath);
+	i_free(file->need_rotate);
+        i_free(file);
+
+        errno = old_errno;
+}
+
+static void
+mail_transaction_log_file_skip_to_head(struct mail_transaction_log_file *file)
+{
+	struct mail_transaction_log *log = file->log;
+	struct mail_index_map *map = log->index->map;
+	const struct mail_index_modseq_header *modseq_hdr;
+	uoff_t head_offset;
+
+	if (map == NULL || file->hdr.file_seq != map->hdr.log_file_seq ||
+	    map->hdr.log_file_head_offset == 0)
+		return;
+
+	/* we can get a valid log offset from index file. initialize
+	   sync_offset from it so we don't have to read the whole log
+	   file from beginning. */
+	head_offset = map->hdr.log_file_head_offset;
+
+	modseq_hdr = mail_index_map_get_modseq_header(map);
+	if (head_offset < file->hdr.hdr_size) {
+		mail_index_set_error(log->index,
+				     "%s: log_file_head_offset too small",
+				     log->index->filepath);
+		file->sync_offset = file->hdr.hdr_size;
+		file->sync_highest_modseq = file->hdr.initial_modseq;
+	} else if (modseq_hdr == NULL && file->hdr.initial_modseq == 0) {
+		/* modseqs not used yet */
+		file->sync_offset = head_offset;
+		file->sync_highest_modseq = 0;
+	} else if (modseq_hdr == NULL ||
+		   modseq_hdr->log_seq != file->hdr.file_seq) {
+		/* highest_modseq not synced, start from beginning */
+		file->sync_offset = file->hdr.hdr_size;
+		file->sync_highest_modseq = file->hdr.initial_modseq;
+	} else if (modseq_hdr->log_offset > head_offset) {
+		mail_index_set_error(log->index,
+				     "%s: modseq_hdr.log_offset too large",
+				     log->index->filepath);
+		file->sync_offset = file->hdr.hdr_size;
+		file->sync_highest_modseq = file->hdr.initial_modseq;
+	} else {
+		/* start from where we last stopped tracking modseqs */
+		file->sync_offset = modseq_hdr->log_offset;
+		file->sync_highest_modseq = modseq_hdr->highest_modseq;
+	}
+	if (file->hdr.file_seq == log->index->map->hdr.log_file_seq) {
+		file->last_read_hdr_tail_offset =
+			log->index->map->hdr.log_file_tail_offset;
+	}
+	if (file->last_read_hdr_tail_offset > file->max_tail_offset)
+		file->max_tail_offset = file->last_read_hdr_tail_offset;
+}
+
+static void
+mail_transaction_log_file_add_to_list(struct mail_transaction_log_file *file)
+{
+	struct mail_transaction_log_file **p;
+	const char *reason;
+	bool retry;
+
+	file->sync_offset = file->hdr.hdr_size;
+	file->sync_highest_modseq = file->hdr.initial_modseq;
+	mail_transaction_log_file_skip_to_head(file);
+
+	/* insert it to correct position */
+	for (p = &file->log->files; *p != NULL; p = &(*p)->next) {
+		if ((*p)->hdr.file_seq > file->hdr.file_seq)
+			break;
+		i_assert((*p)->hdr.file_seq < file->hdr.file_seq);
+	}
+
+	file->next = *p;
+	*p = file;
+
+	if (file->buffer != NULL) {
+		/* if we read any unfinished data, make sure the buffer gets
+		   truncated. */
+		(void)mail_transaction_log_file_sync(file, &retry, &reason);
+		buffer_set_used_size(file->buffer,
+				     file->sync_offset - file->buffer_offset);
+	}
+}
+
+static int
+mail_transaction_log_init_hdr(struct mail_transaction_log *log,
+			      struct mail_transaction_log_header *hdr)
+{
+	struct mail_index *index = log->index;
+	struct mail_transaction_log_file *file;
+
+	i_assert(index->indexid != 0);
+
+	i_zero(hdr);
+	hdr->major_version = MAIL_TRANSACTION_LOG_MAJOR_VERSION;
+	hdr->minor_version = MAIL_TRANSACTION_LOG_MINOR_VERSION;
+	hdr->hdr_size = sizeof(struct mail_transaction_log_header);
+	hdr->indexid = log->index->indexid;
+	hdr->create_stamp = ioloop_time;
+#ifndef WORDS_BIGENDIAN
+	hdr->compat_flags |= MAIL_INDEX_COMPAT_LITTLE_ENDIAN;
+#endif
+
+	if (index->fd != -1) {
+		/* not creating index - make sure we have latest header */
+		if (!index->mapping) {
+			if (mail_index_map(index,
+					   MAIL_INDEX_SYNC_HANDLER_HEAD) <= 0)
+				return -1;
+		} else {
+			/* if we got here from mapping, the .log file is
+			   corrupted. use whatever values we got from index
+			   file */
+		}
+	}
+	if (index->map != NULL) {
+		hdr->prev_file_seq = index->map->hdr.log_file_seq;
+		hdr->prev_file_offset = index->map->hdr.log_file_head_offset;
+		hdr->file_seq = index->map->hdr.log_file_seq + 1;
+		hdr->initial_modseq =
+			mail_index_map_modseq_get_highest(index->map);
+	} else {
+		hdr->file_seq = 1;
+	}
+	if (hdr->initial_modseq == 0) {
+		/* modseq tracking in log files is required for many reasons
+		   nowadays, even if per-message modseqs aren't enabled in
+		   dovecot.index. */
+		hdr->initial_modseq = 1;
+	}
+
+	if (log->head != NULL) {
+		/* make sure the sequence always increases to avoid crashes
+		   later. this catches the buggy case where two processes
+		   happen to replace the same log file. */
+		for (file = log->head->next; file != NULL; file = file->next) {
+			if (hdr->file_seq <= file->hdr.file_seq)
+				hdr->file_seq = file->hdr.file_seq + 1;
+		}
+
+		if (hdr->file_seq <= log->head->hdr.file_seq) {
+			/* make sure the sequence grows */
+			hdr->file_seq = log->head->hdr.file_seq+1;
+		}
+		if (hdr->initial_modseq < log->head->sync_highest_modseq) {
+			/* this should be always up-to-date */
+			hdr->initial_modseq = log->head->sync_highest_modseq;
+		}
+	}
+	return 0;
+}
+
+struct mail_transaction_log_file *
+mail_transaction_log_file_alloc_in_memory(struct mail_transaction_log *log)
+{
+	struct mail_transaction_log_file *file;
+
+	file = mail_transaction_log_file_alloc(log, MEMORY_LOG_NAME);
+	if (mail_transaction_log_init_hdr(log, &file->hdr) < 0) {
+		i_free(file);
+		return NULL;
+	}
+
+	file->buffer = buffer_create_dynamic(default_pool, 4096);
+	file->buffer_offset = sizeof(file->hdr);
+
+	mail_transaction_log_file_add_to_list(file);
+	return file;
+}
+
+static int
+mail_transaction_log_file_dotlock(struct mail_transaction_log_file *file)
+{
+	struct dotlock_settings dotlock_set;
+	int ret;
+
+	if (file->log->dotlock_refcount > 0)
+		ret = 1;
+	else {
+		i_assert(file->log->dotlock_refcount == 0);
+		mail_transaction_log_get_dotlock_set(file->log, &dotlock_set);
+		ret = file_dotlock_create(&dotlock_set, file->filepath, 0,
+					  &file->log->dotlock);
+	}
+	if (ret > 0) {
+		file->log->dotlock_refcount++;
+		file->locked = TRUE;
+		file->lock_create_time = time(NULL);
+		return 0;
+	}
+	if (ret < 0) {
+		log_file_set_syscall_error(file, "file_dotlock_create()");
+		return -1;
+	}
+
+	mail_index_set_error(file->log->index,
+			     "Timeout (%us) while waiting for "
+			     "dotlock for transaction log file %s",
+			     dotlock_set.timeout, file->filepath);
+	return -1;
+}
+
+static int
+mail_transaction_log_file_undotlock(struct mail_transaction_log_file *file)
+{
+	int ret;
+
+	i_assert(file->log->dotlock_refcount >= 0);
+	if (--file->log->dotlock_refcount > 0)
+		return 0;
+
+	ret = file_dotlock_delete(&file->log->dotlock);
+	if (ret < 0) {
+		log_file_set_syscall_error(file, "file_dotlock_delete()");
+		return -1;
+	}
+
+	if (ret == 0) {
+		mail_index_set_error(file->log->index,
+			"Dotlock was lost for transaction log file %s",
+			file->filepath);
+		return -1;
+	}
+	return 0;
+}
+
+int mail_transaction_log_file_lock(struct mail_transaction_log_file *file)
+{
+	unsigned int lock_timeout_secs;
+	int ret;
+
+	if (file->locked)
+		return 0;
+
+	if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) {
+		file->locked = TRUE;
+		return 0;
+	}
+
+	if (file->log->index->set.lock_method == FILE_LOCK_METHOD_DOTLOCK)
+		return mail_transaction_log_file_dotlock(file);
+
+	if (file->log->index->readonly) {
+		mail_index_set_error(file->log->index,
+			"Index is read-only, can't write-lock %s",
+			file->filepath);
+		return -1;
+	}
+
+	i_assert(file->file_lock == NULL);
+	lock_timeout_secs = I_MIN(MAIL_TRANSACTION_LOG_LOCK_TIMEOUT,
+				  file->log->index->set.max_lock_timeout_secs);
+	ret = mail_index_lock_fd(file->log->index, file->filepath, file->fd,
+				 F_WRLCK, lock_timeout_secs,
+				 &file->file_lock);
+	if (ret > 0) {
+		file->locked = TRUE;
+		file->lock_create_time = time(NULL);
+		return 0;
+	}
+	if (ret < 0) {
+		log_file_set_syscall_error(file, "mail_index_wait_lock_fd()");
+		return -1;
+	}
+
+	mail_index_set_error(file->log->index,
+		"Timeout (%us) while waiting for lock for "
+		"transaction log file %s%s",
+		lock_timeout_secs, file->filepath,
+		file_lock_find(file->fd, file->log->index->set.lock_method, F_WRLCK));
+	return -1;
+}
+
+void mail_transaction_log_file_unlock(struct mail_transaction_log_file *file,
+				      const char *lock_reason)
+{
+	unsigned int lock_time;
+
+	if (!file->locked)
+		return;
+
+	file->locked = FALSE;
+	file->locked_sync_offset_updated = FALSE;
+
+	if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file))
+		return;
+
+	lock_time = time(NULL) - file->lock_create_time;
+	if (lock_time >= MAIL_TRANSACTION_LOG_LOCK_WARN_SECS && lock_reason != NULL) {
+		i_warning("Transaction log file %s was locked for %u seconds (%s)",
+			  file->filepath, lock_time, lock_reason);
+	}
+
+	if (file->log->index->set.lock_method == FILE_LOCK_METHOD_DOTLOCK) {
+		(void)mail_transaction_log_file_undotlock(file);
+		return;
+	}
+
+	file_unlock(&file->file_lock);
+}
+
+static ssize_t
+mail_transaction_log_file_read_header(struct mail_transaction_log_file *file)
+{
+	void *dest;
+	size_t pos, dest_size;
+	ssize_t ret;
+
+	i_assert(file->buffer == NULL && file->mmap_base == NULL);
+
+	i_zero(&file->hdr);
+	if (file->last_size < mmap_get_page_size() && file->last_size > 0) {
+		/* just read the entire transaction log to memory.
+		   note that if some of the data hasn't been fully committed
+		   yet (hdr.size=0), the buffer must be truncated later */
+		file->buffer = buffer_create_dynamic(default_pool, 4096);
+		file->buffer_offset = 0;
+		dest_size = file->last_size;
+		dest = buffer_append_space_unsafe(file->buffer, dest_size);
+	} else {
+		/* read only the header */
+		dest = &file->hdr;
+		dest_size = sizeof(file->hdr);
+	}
+
+	/* it's not necessarily an error to read less than wanted header size,
+	   since older versions of the log format used smaller headers. */
+        pos = 0;
+	do {
+		ret = pread(file->fd, PTR_OFFSET(dest, pos),
+			    dest_size - pos, pos);
+		if (ret > 0)
+			pos += ret;
+	} while (ret > 0 && pos < dest_size);
+
+	if (file->buffer != NULL) {
+		buffer_set_used_size(file->buffer, pos);
+		memcpy(&file->hdr, file->buffer->data,
+		       I_MIN(pos, sizeof(file->hdr)));
+	}
+
+	return ret < 0 ? -1 : (ssize_t)pos;
+}
+
+static int
+mail_transaction_log_file_fail_dupe(struct mail_transaction_log_file *file)
+{
+	int ret;
+
+	/* mark the old file corrupted. we can't safely remove
+	   it from the list however, so return failure. */
+	file->hdr.indexid = 0;
+	if (strcmp(file->filepath, file->log->head->filepath) != 0) {
+		/* only mark .2 corrupted, just to make sure we don't lose any
+		   changes from .log in case we're somehow wrong */
+		mail_transaction_log_mark_corrupted(file);
+		ret = 0;
+	} else {
+		ret = -1;
+	}
+	if (!file->corrupted) {
+		file->corrupted = TRUE;
+		mail_index_set_error(file->log->index,
+				     "Transaction log %s: "
+				     "duplicate transaction log sequence (%u)",
+				     file->filepath, file->hdr.file_seq);
+	}
+	return ret;
+}
+
+static int
+mail_transaction_log_file_read_hdr(struct mail_transaction_log_file *file,
+				   bool ignore_estale)
+{
+        struct mail_transaction_log_file *f;
+	int ret;
+
+	i_assert(!MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file));
+
+	if (file->corrupted)
+		return 0;
+
+	ret = mail_transaction_log_file_read_header(file);
+	if (ret < 0) {
+                if (errno != ESTALE || !ignore_estale)
+			log_file_set_syscall_error(file, "pread()");
+		return -1;
+	}
+	if (file->hdr.major_version != MAIL_TRANSACTION_LOG_MAJOR_VERSION) {
+		/* incompatible version - fix silently */
+		return 0;
+	}
+	if (ret < MAIL_TRANSACTION_LOG_HEADER_MIN_SIZE) {
+		mail_transaction_log_file_set_corrupted(file,
+			"unexpected end of file while reading header");
+		return 0;
+	}
+
+	const unsigned int hdr_version =
+		MAIL_TRANSACTION_LOG_HDR_VERSION(&file->hdr);
+	if (MAIL_TRANSACTION_LOG_VERSION_HAVE(hdr_version, COMPAT_FLAGS)) {
+		/* we have compatibility flags */
+		enum mail_index_header_compat_flags compat_flags = 0;
+
+#ifndef WORDS_BIGENDIAN
+		compat_flags |= MAIL_INDEX_COMPAT_LITTLE_ENDIAN;
+#endif
+		if (file->hdr.compat_flags != compat_flags) {
+			/* architecture change */
+			mail_index_set_error(file->log->index,
+					     "Rebuilding index file %s: "
+					     "CPU architecture changed",
+					     file->log->index->filepath);
+			return 0;
+		}
+	}
+	if (file->hdr.hdr_size < MAIL_TRANSACTION_LOG_HEADER_MIN_SIZE) {
+		mail_transaction_log_file_set_corrupted(file,
+			"Header size too small");
+		return 0;
+	}
+	if (file->hdr.hdr_size < sizeof(file->hdr)) {
+		/* @UNSAFE: smaller than we expected - zero out the fields we
+		   shouldn't have filled */
+		memset(PTR_OFFSET(&file->hdr, file->hdr.hdr_size), 0,
+		       sizeof(file->hdr) - file->hdr.hdr_size);
+	}
+
+	if (file->hdr.indexid == 0) {
+		/* corrupted */
+		file->corrupted = TRUE;
+		mail_index_set_error(file->log->index,
+			"Transaction log file %s: marked corrupted",
+			file->filepath);
+		return 0;
+	}
+	if (file->hdr.indexid != file->log->index->indexid) {
+		if (file->log->index->indexid != 0 &&
+		    !file->log->index->initial_create) {
+			/* index file was probably just rebuilt and we don't
+			   know about it yet */
+			mail_transaction_log_file_set_corrupted(file,
+				"indexid changed: %u -> %u",
+				file->log->index->indexid, file->hdr.indexid);
+			return 0;
+		}
+
+		/* creating index file. since transaction log is created
+		   first, use the indexid in it to create the main index
+		   to avoid races. */
+		file->log->index->indexid = file->hdr.indexid;
+	}
+
+	/* make sure we already don't have a file with the same sequence
+	   opened. it shouldn't happen unless the old log file was
+	   corrupted. */
+	for (f = file->log->files; f != NULL; f = f->next) {
+		if (f->hdr.file_seq == file->hdr.file_seq) {
+			if (strcmp(f->filepath, f->log->head->filepath) != 0) {
+				/* old "f" is the .log.2 */
+				return mail_transaction_log_file_fail_dupe(f);
+			} else {
+				/* new "file" is probably the .log.2 */
+				return mail_transaction_log_file_fail_dupe(file);
+			}
+		}
+	}
+
+	file->sync_highest_modseq = file->hdr.initial_modseq;
+	return 1;
+}
+
+static int
+mail_transaction_log_file_stat(struct mail_transaction_log_file *file,
+			       bool ignore_estale)
+{
+	struct stat st;
+
+	if (fstat(file->fd, &st) < 0) {
+                if (!ESTALE_FSTAT(errno) || !ignore_estale)
+			log_file_set_syscall_error(file, "fstat()");
+		return -1;
+	}
+
+	file->st_dev = st.st_dev;
+	file->st_ino = st.st_ino;
+	file->last_mtime = st.st_mtime;
+	file->last_size = st.st_size;
+	return 0;
+}
+
+static bool
+mail_transaction_log_file_is_dupe(struct mail_transaction_log_file *file)
+{
+	struct mail_transaction_log_file *tmp;
+
+	for (tmp = file->log->files; tmp != NULL; tmp = tmp->next) {
+		if (tmp->st_ino == file->st_ino &&
+		    CMP_DEV_T(tmp->st_dev, file->st_dev))
+			return TRUE;
+	}
+	return FALSE;
+}
+
+static void log_write_ext_hdr_init_data(struct mail_index *index, buffer_t *buf)
+{
+	const struct mail_index_registered_ext *rext;
+	struct mail_transaction_header *hdr;
+	struct mail_transaction_ext_intro *intro;
+	struct mail_transaction_ext_hdr_update *ext_hdr;
+	unsigned int hdr_offset;
+
+	rext = array_idx(&index->extensions, index->set.ext_hdr_init_id);
+
+	/* introduce the extension */
+	hdr_offset = buf->used;
+	hdr = buffer_append_space_unsafe(buf, sizeof(*hdr));
+	hdr->type = MAIL_TRANSACTION_EXT_INTRO;
+
+	intro = buffer_append_space_unsafe(buf, sizeof(*intro));
+	intro->ext_id = (uint32_t)-1;
+	intro->hdr_size = rext->hdr_size;
+	intro->record_size = rext->record_size;
+	intro->record_align = rext->record_align;
+	intro->name_size = strlen(rext->name);
+	buffer_append(buf, rext->name, intro->name_size);
+	if (buf->used % 4 != 0)
+		buffer_append_zero(buf, 4 - buf->used % 4);
+
+	hdr = buffer_get_space_unsafe(buf, hdr_offset, sizeof(*hdr));
+	hdr->size = mail_index_uint32_to_offset(buf->used - hdr_offset);
+
+	/* add the extension header data */
+	hdr_offset = buf->used;
+	hdr = buffer_append_space_unsafe(buf, sizeof(*hdr));
+	hdr->type = MAIL_TRANSACTION_EXT_HDR_UPDATE;
+
+	ext_hdr = buffer_append_space_unsafe(buf, sizeof(*ext_hdr));
+	ext_hdr->size = rext->hdr_size;
+	buffer_append(buf, index->set.ext_hdr_init_data, rext->hdr_size);
+
+	hdr = buffer_get_space_unsafe(buf, hdr_offset, sizeof(*hdr));
+	hdr->size = mail_index_uint32_to_offset(buf->used - hdr_offset);
+}
+
+static int
+mail_transaction_log_file_create2(struct mail_transaction_log_file *file,
+				  int new_fd, bool reset,
+				  struct dotlock **dotlock)
+{
+	struct mail_index *index = file->log->index;
+	struct stat st;
+	const char *path2;
+	buffer_t *writebuf;
+	int fd, ret;
+	bool rename_existing, need_lock;
+
+	need_lock = file->log->head != NULL && file->log->head->locked;
+
+	if (fcntl(new_fd, F_SETFL, O_APPEND) < 0) {
+		log_file_set_syscall_error(file, "fcntl(O_APPEND)");
+		return -1;
+	}
+
+	if ((index->flags & MAIL_INDEX_OPEN_FLAG_NFS_FLUSH) != 0) {
+		/* although we check also mtime and file size below, it's done
+		   only to fix broken log files. we don't bother flushing
+		   attribute cache just for that. */
+		nfs_flush_file_handle_cache(file->filepath);
+	}
+
+	/* log creation is locked now - see if someone already created it.
+	   note that if we're rotating, we need to keep the log locked until
+	   the file has been rewritten. and because fcntl() locks are stupid,
+	   if we go and open()+close() the file and we had it already opened,
+	   its locks are lost. so we use stat() to check if the file has been
+	   recreated, although it almost never is. */
+	if (reset)
+		rename_existing = FALSE;
+	else if (nfs_safe_stat(file->filepath, &st) < 0) {
+		if (errno != ENOENT) {
+			log_file_set_syscall_error(file, "stat()");
+			return -1;
+		}
+		rename_existing = FALSE;
+	} else if (st.st_ino == file->st_ino &&
+		   CMP_DEV_T(st.st_dev, file->st_dev) &&
+		   /* inode/dev checks are enough when we're rotating the file,
+		      but not when we're replacing a broken log file */
+		   st.st_mtime == file->last_mtime &&
+		   (uoff_t)st.st_size == file->last_size) {
+		/* no-one else recreated the file */
+		rename_existing = TRUE;
+	} else {
+		/* recreated. use the file if its header is ok */
+		fd = nfs_safe_open(file->filepath, O_RDWR | O_APPEND);
+		if (fd == -1) {
+			if (errno != ENOENT) {
+				log_file_set_syscall_error(file, "open()");
+				return -1;
+			}
+		} else {
+			file->fd = fd;
+			file->last_size = 0;
+			if (mail_transaction_log_file_read_hdr(file,
+							       FALSE) > 0 &&
+			    mail_transaction_log_file_stat(file, FALSE) == 0) {
+				/* yes, it was ok */
+				file_dotlock_delete(dotlock);
+				mail_transaction_log_file_add_to_list(file);
+				return 0;
+			}
+			file->fd = -1;
+			if (close(fd) < 0)
+				log_file_set_syscall_error(file, "close()");
+		}
+		rename_existing = FALSE;
+	}
+
+	if (index->fd == -1 && !rename_existing) {
+		/* creating the initial index */
+		reset = TRUE;
+	}
+
+	if (mail_transaction_log_init_hdr(file->log, &file->hdr) < 0)
+		return -1;
+
+	if (reset) {
+		/* don't reset modseqs. if we're reseting due to rebuilding
+		   indexes we'll probably want to keep uidvalidity and in such
+		   cases we really don't want to shrink modseqs. */
+		file->hdr.prev_file_seq = 0;
+		file->hdr.prev_file_offset = 0;
+	}
+
+	writebuf = t_buffer_create(128);
+	buffer_append(writebuf, &file->hdr, sizeof(file->hdr));
+
+	if (index->set.ext_hdr_init_data != NULL && reset)
+		log_write_ext_hdr_init_data(index, writebuf);
+	if (write_full(new_fd, writebuf->data, writebuf->used) < 0) {
+		log_file_set_syscall_error(file, "write_full()");
+		return -1;
+	}
+
+	if (file->log->index->set.fsync_mode == FSYNC_MODE_ALWAYS) {
+		/* the header isn't important, so don't bother calling
+		   fdatasync() unless it's required */
+		if (fdatasync(new_fd) < 0) {
+			log_file_set_syscall_error(file, "fdatasync()");
+			return -1;
+		}
+	}
+
+	file->fd = new_fd;
+	ret = mail_transaction_log_file_stat(file, FALSE);
+
+	if (need_lock && ret == 0) {
+		/* we'll need to preserve the lock */
+		if (mail_transaction_log_file_lock(file) < 0)
+			ret = -1;
+	}
+
+	/* if we return -1 the dotlock deletion code closes the fd */
+	file->fd = -1;
+	if (ret < 0)
+		return -1;
+
+	/* keep two log files */
+	if (rename_existing) {
+		/* rename() would be nice and easy way to do this, except then
+		   there's a race condition between the rename and
+		   file_dotlock_replace(). during that time the log file
+		   doesn't exist, which could cause problems. */
+		path2 = t_strconcat(file->filepath, ".2", NULL);
+		if (i_unlink_if_exists(path2) < 0) {
+			/* try to link() anyway */
+		}
+		if (nfs_safe_link(file->filepath, path2, FALSE) < 0 &&
+		    errno != ENOENT && errno != EEXIST) {
+                        mail_index_set_error(index, "link(%s, %s) failed: %m",
+					     file->filepath, path2);
+			/* ignore the error. we don't care that much about the
+			   second log file and we're going to overwrite this
+			   first one. */
+		}
+		/* NOTE: here's a race condition where both .log and .log.2
+		   point to the same file. our reading code should ignore that
+		   though by comparing the inodes. */
+	}
+
+	if (file_dotlock_replace(dotlock,
+				 DOTLOCK_REPLACE_FLAG_DONT_CLOSE_FD) <= 0) {
+		/* need to unlock to avoid assert-crash in
+		   mail_transaction_log_file_free() */
+		mail_transaction_log_file_unlock(file, "creation failed");
+		return -1;
+	}
+
+	/* success */
+	file->fd = new_fd;
+	mail_transaction_log_file_add_to_list(file);
+
+	i_assert(!need_lock || file->locked);
+	return 1;
+}
+
+int mail_transaction_log_file_create(struct mail_transaction_log_file *file,
+				     bool reset)
+{
+	struct mail_index *index = file->log->index;
+	struct dotlock_settings new_dotlock_set;
+	struct dotlock *dotlock;
+	mode_t old_mask;
+	int fd, ret;
+
+	i_assert(!MAIL_INDEX_IS_IN_MEMORY(index));
+
+	if (file->log->index->readonly) {
+		mail_index_set_error(index,
+			"Can't create log file %s: Index is read-only",
+			file->filepath);
+		return -1;
+	}
+
+	if (index->indexid == 0) {
+		mail_index_set_error(index,
+			"Can't create log file %s: Index is marked corrupted",
+			file->filepath);
+		return -1;
+	}
+
+	mail_transaction_log_get_dotlock_set(file->log, &new_dotlock_set);
+	new_dotlock_set.lock_suffix = LOG_NEW_DOTLOCK_SUFFIX;
+
+	/* With dotlocking we might already have path.lock created, so this
+	   filename has to be different. */
+	old_mask = umask(index->set.mode ^ 0666);
+	fd = file_dotlock_open(&new_dotlock_set, file->filepath, 0, &dotlock);
+	umask(old_mask);
+
+	if (fd == -1) {
+		log_file_set_syscall_error(file, "file_dotlock_open()");
+		return -1;
+	}
+	mail_index_fchown(index, fd, file_dotlock_get_lock_path(dotlock));
+
+        /* either fd gets used or the dotlock gets deleted and returned fd
+           is for the existing file */
+	ret = mail_transaction_log_file_create2(file, fd, reset, &dotlock);
+	if (ret < 0) {
+		if (dotlock != NULL)
+			file_dotlock_delete(&dotlock);
+		return -1;
+	}
+	return ret;
+}
+
+int mail_transaction_log_file_open(struct mail_transaction_log_file *file,
+				   const char **reason_r)
+{
+	struct mail_index *index = file->log->index;
+        unsigned int i;
+	bool ignore_estale;
+	int ret;
+
+        for (i = 0;; i++) {
+		if (!index->readonly) {
+			file->fd = nfs_safe_open(file->filepath,
+						 O_RDWR | O_APPEND);
+		} else {
+			file->fd = nfs_safe_open(file->filepath, O_RDONLY);
+		}
+		if (file->fd == -1 && errno == EACCES) {
+			file->fd = nfs_safe_open(file->filepath, O_RDONLY);
+			index->readonly = TRUE;
+		}
+		if (file->fd == -1) {
+			if (errno == ENOENT) {
+				*reason_r = "File doesn't exist";
+				return 0;
+			}
+
+			log_file_set_syscall_error(file, "open()");
+			*reason_r = t_strdup_printf("open() failed: %m");
+			return -1;
+                }
+
+		ignore_estale = i < MAIL_INDEX_ESTALE_RETRY_COUNT;
+		if (mail_transaction_log_file_stat(file, ignore_estale) < 0)
+			ret = -1;
+		else if (mail_transaction_log_file_is_dupe(file)) {
+			/* probably our already opened .log file has been
+			   renamed to .log.2 and we're trying to reopen it.
+			   also possible that hit a race condition where .log
+			   and .log.2 are linked. */
+			*reason_r = "File is already open";
+			return 0;
+		} else {
+			ret = mail_transaction_log_file_read_hdr(file,
+								 ignore_estale);
+		}
+		if (ret > 0) {
+			/* success */
+			break;
+		}
+
+		if (ret == 0) {
+			/* corrupted */
+			if (index->readonly) {
+				/* don't delete */
+			} else {
+				i_unlink_if_exists(file->filepath);
+			}
+			*reason_r = "File is corrupted";
+			return 0;
+		}
+		if (errno != ESTALE ||
+		    i == MAIL_INDEX_ESTALE_RETRY_COUNT) {
+			/* syscall error */
+			*reason_r = t_strdup_printf("fstat() failed: %m");
+			return -1;
+		}
+
+		/* ESTALE - try again */
+		buffer_free(&file->buffer);
+        }
+
+	mail_transaction_log_file_add_to_list(file);
+	return 1;
+}
+
+static int
+log_file_track_mailbox_sync_offset_hdr(struct mail_transaction_log_file *file,
+				       const void *data, unsigned int trans_size,
+				       const char **error_r)
+{
+	const struct mail_transaction_header_update *u = data;
+	const struct mail_index_header *ihdr;
+	const unsigned int size = trans_size - sizeof(struct mail_transaction_header);
+	const unsigned int offset_pos =
+		offsetof(struct mail_index_header, log_file_tail_offset);
+	const unsigned int offset_size = sizeof(ihdr->log_file_tail_offset);
+	uint32_t tail_offset;
+
+	i_assert(offset_size == sizeof(tail_offset));
+
+	if (size < sizeof(*u) || size < sizeof(*u) + u->size) {
+		*error_r = "header update extends beyond record size";
+		mail_transaction_log_file_set_corrupted(file, "%s", *error_r);
+		return -1;
+	}
+
+	if (u->offset <= offset_pos &&
+	    u->offset + u->size >= offset_pos + offset_size) {
+		memcpy(&tail_offset,
+		       CONST_PTR_OFFSET(u + 1, offset_pos - u->offset),
+		       sizeof(tail_offset));
+
+		if (tail_offset < file->last_read_hdr_tail_offset) {
+			/* ignore shrinking tail offsets */
+			return 1;
+		} else if (tail_offset > file->sync_offset + trans_size) {
+			mail_transaction_log_file_set_corrupted(file,
+				"log_file_tail_offset %u goes past sync offset %"PRIuUOFF_T,
+				tail_offset, file->sync_offset + trans_size);
+		} else {
+			file->last_read_hdr_tail_offset = tail_offset;
+			if (tail_offset > file->max_tail_offset)
+				file->max_tail_offset = tail_offset;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static bool
+flag_updates_have_non_internal(const struct mail_transaction_flag_update *u,
+			       unsigned int count, unsigned int version)
+{
+	/* Hide internal flags from modseqs if the log file's version
+	   is new enough. This allows upgrading without the modseqs suddenly
+	   shrinking. */
+	if (!MAIL_TRANSACTION_LOG_VERSION_HAVE(version, HIDE_INTERNAL_MODSEQS))
+		return TRUE;
+
+	for (unsigned int i = 0; i < count; i++) {
+		if (!MAIL_TRANSACTION_FLAG_UPDATE_IS_INTERNAL(&u[i]))
+			return TRUE;
+	}
+	return FALSE;
+}
+
+void mail_transaction_update_modseq(const struct mail_transaction_header *hdr,
+				    const void *data, uint64_t *cur_modseq,
+				    unsigned int version)
+{
+	uint32_t trans_size;
+
+	trans_size = mail_index_offset_to_uint32(hdr->size);
+	i_assert(trans_size != 0);
+
+	if (*cur_modseq != 0) {
+		/* tracking modseqs */
+	} else if ((hdr->type & MAIL_TRANSACTION_TYPE_MASK) ==
+		   MAIL_TRANSACTION_EXT_INTRO) {
+		/* modseqs not tracked yet. see if this is a modseq
+		   extension introduction. */
+		const struct mail_transaction_ext_intro *intro = data;
+		const unsigned int modseq_ext_len =
+			strlen(MAIL_INDEX_MODSEQ_EXT_NAME);
+
+		if (intro->name_size == modseq_ext_len &&
+		    memcmp(intro + 1, MAIL_INDEX_MODSEQ_EXT_NAME,
+			   modseq_ext_len) == 0) {
+			/* modseq tracking started */
+			*cur_modseq += 1;
+		}
+		return;
+	} else {
+		/* not tracking modseqs */
+		return;
+	}
+
+	switch (hdr->type & MAIL_TRANSACTION_TYPE_MASK) {
+	case MAIL_TRANSACTION_EXPUNGE | MAIL_TRANSACTION_EXPUNGE_PROT:
+	case MAIL_TRANSACTION_EXPUNGE_GUID | MAIL_TRANSACTION_EXPUNGE_PROT:
+		if ((hdr->type & MAIL_TRANSACTION_EXTERNAL) == 0) {
+			/* ignore expunge requests */
+			break;
+		}
+		/* fall through */
+	case MAIL_TRANSACTION_APPEND:
+	case MAIL_TRANSACTION_KEYWORD_UPDATE:
+	case MAIL_TRANSACTION_KEYWORD_RESET:
+	case MAIL_TRANSACTION_ATTRIBUTE_UPDATE:
+		/* these changes increase modseq */
+		*cur_modseq += 1;
+		break;
+	case MAIL_TRANSACTION_FLAG_UPDATE: {
+		const struct mail_transaction_flag_update *rec = data;
+		unsigned int count;
+
+		count = (trans_size - sizeof(*hdr)) / sizeof(*rec);
+		if (flag_updates_have_non_internal(rec, count, version))
+			*cur_modseq += 1;
+		break;
+	}
+	case MAIL_TRANSACTION_MODSEQ_UPDATE: {
+		const struct mail_transaction_modseq_update *rec, *end;
+
+		end = CONST_PTR_OFFSET(data, trans_size - sizeof(*hdr));
+		for (rec = data; rec < end; rec++) {
+			uint64_t modseq = ((uint64_t)rec->modseq_high32 << 32) |
+				rec->modseq_low32;
+			if (*cur_modseq < modseq)
+				*cur_modseq = modseq;
+		}
+	}
+	}
+}
+
+static int
+log_file_track_sync(struct mail_transaction_log_file *file,
+		    const struct mail_transaction_header *hdr,
+		    unsigned int trans_size, const char **error_r)
+{
+	const void *data = hdr + 1;
+	int ret;
+
+	mail_transaction_update_modseq(hdr, hdr + 1, &file->sync_highest_modseq,
+		MAIL_TRANSACTION_LOG_HDR_VERSION(&file->hdr));
+	if ((hdr->type & MAIL_TRANSACTION_EXTERNAL) == 0)
+		return 1;
+
+	/* external transactions: */
+	switch (hdr->type & MAIL_TRANSACTION_TYPE_MASK) {
+	case MAIL_TRANSACTION_HEADER_UPDATE:
+		/* see if this updates mailbox_sync_offset */
+		ret = log_file_track_mailbox_sync_offset_hdr(file, data,
+							     trans_size, error_r);
+		if (ret != 0)
+			return ret < 0 ? -1 : 1;
+		break;
+	case MAIL_TRANSACTION_INDEX_DELETED:
+		if (file->sync_offset < file->index_undeleted_offset ||
+		    file->hdr.file_seq < file->log->index->index_delete_changed_file_seq)
+			break;
+		file->log->index->index_deleted = TRUE;
+		file->log->index->index_delete_requested = FALSE;
+		file->log->index->index_delete_changed_file_seq = file->hdr.file_seq;
+		file->index_deleted_offset = file->sync_offset + trans_size;
+		break;
+	case MAIL_TRANSACTION_INDEX_UNDELETED:
+		if (file->sync_offset < file->index_deleted_offset ||
+		    file->hdr.file_seq < file->log->index->index_delete_changed_file_seq)
+			break;
+		file->log->index->index_deleted = FALSE;
+		file->log->index->index_delete_requested = FALSE;
+		file->log->index->index_delete_changed_file_seq = file->hdr.file_seq;
+		file->index_undeleted_offset = file->sync_offset + trans_size;
+		break;
+	case MAIL_TRANSACTION_BOUNDARY: {
+		const struct mail_transaction_boundary *boundary =
+			(const void *)(hdr + 1);
+		size_t wanted_buffer_size;
+
+		wanted_buffer_size = file->sync_offset - file->buffer_offset +
+			boundary->size;
+		if (wanted_buffer_size > file->buffer->used) {
+			/* the full transaction hasn't been written yet */
+			return 0;
+		}
+		break;
+	}
+	}
+
+	if (file->max_tail_offset == file->sync_offset) {
+		/* external transactions aren't synced to mailbox. we can
+		   update mailbox sync offset to skip this transaction to
+		   avoid re-reading it at the next sync. */
+		file->max_tail_offset += trans_size;
+	}
+	return 1;
+}
+
+static int
+mail_transaction_log_file_sync(struct mail_transaction_log_file *file,
+			       bool *retry_r, const char **reason_r)
+{
+        const struct mail_transaction_header *hdr;
+	const void *data;
+	struct stat st;
+	size_t size, avail;
+	uint32_t trans_size = 0;
+	int ret;
+
+	i_assert(file->sync_offset >= file->buffer_offset);
+
+	*retry_r = FALSE;
+
+	data = buffer_get_data(file->buffer, &size);
+	if (file->buffer_offset + size < file->sync_offset) {
+		*reason_r = t_strdup_printf(
+			"log file shrank (%"PRIuUOFF_T" < %"PRIuUOFF_T")",
+			file->buffer_offset + (uoff_t)size, file->sync_offset);
+		mail_transaction_log_file_set_corrupted(file, "%s", *reason_r);
+		/* fix the sync_offset to avoid crashes later on */
+		file->sync_offset = file->buffer_offset + size;
+		return 0;
+	}
+	while (file->sync_offset - file->buffer_offset + sizeof(*hdr) <= size) {
+		hdr = CONST_PTR_OFFSET(data, file->sync_offset -
+				       file->buffer_offset);
+		trans_size = mail_index_offset_to_uint32(hdr->size);
+		if (trans_size == 0) {
+			/* unfinished or corrupted */
+			break;
+		}
+		if (trans_size < sizeof(*hdr)) {
+			*reason_r = t_strdup_printf(
+				"hdr.size too small (%u)", trans_size);
+			mail_transaction_log_file_set_corrupted(file, "%s", *reason_r);
+			return 0;
+		}
+
+		if (file->sync_offset - file->buffer_offset + trans_size > size)
+			break;
+
+		/* transaction has been fully written */
+		if ((ret = log_file_track_sync(file, hdr, trans_size, reason_r)) <= 0) {
+			if (ret < 0)
+				return 0;
+			break;
+		}
+
+		file->sync_offset += trans_size;
+	}
+
+	if (file->mmap_base != NULL && !file->locked) {
+		/* Now that all the mmaped pages have page faulted, check if
+		   the file had changed while doing that. Only after the last
+		   page has faulted, the size returned by fstat() can be
+		   trusted. Otherwise it might point to a page boundary while
+		   the next page is still being written.
+
+		   Without this check we might see partial transactions,
+		   sometimes causing "Extension record updated without intro
+		   prefix" errors. */
+		if (fstat(file->fd, &st) < 0) {
+			log_file_set_syscall_error(file, "fstat()");
+			*reason_r = t_strdup_printf("fstat() failed: %m");
+			return -1;
+		}
+		if ((uoff_t)st.st_size != file->last_size) {
+			file->last_size = st.st_size;
+			*retry_r = TRUE;
+			*reason_r = "File size changed - retrying";
+			return 0;
+		}
+	}
+
+	avail = file->sync_offset - file->buffer_offset;
+	if (avail != size) {
+		/* There's more data than we could sync at the moment. If the
+		   last record's size wasn't valid, we can't know if it will
+		   be updated unless we've locked the log. */
+		if (file->locked) {
+			*reason_r = "Unexpected garbage at EOF";
+			mail_transaction_log_file_set_corrupted(file, "%s", *reason_r);
+			return 0;
+		}
+		/* The size field will be updated soon */
+		mail_index_flush_read_cache(file->log->index, file->filepath,
+					    file->fd, file->locked);
+	}
+
+	if (file->next != NULL &&
+	    file->hdr.file_seq == file->next->hdr.prev_file_seq &&
+	    file->next->hdr.prev_file_offset != file->sync_offset) {
+		*reason_r = t_strdup_printf(
+			"Invalid transaction log size "
+			"(%"PRIuUOFF_T" vs %u): %s", file->sync_offset,
+			file->log->head->hdr.prev_file_offset, file->filepath);
+		mail_transaction_log_file_set_corrupted(file, "%s", *reason_r);
+		return 0;
+	}
+
+	return 1;
+}
+
+static int
+mail_transaction_log_file_insert_read(struct mail_transaction_log_file *file,
+				      uoff_t offset, const char **reason_r)
+{
+	void *data;
+	size_t size;
+	ssize_t ret;
+
+	size = file->buffer_offset - offset;
+	buffer_copy(file->buffer, size, file->buffer, 0, SIZE_MAX);
+
+	data = buffer_get_space_unsafe(file->buffer, 0, size);
+	ret = pread_full(file->fd, data, size, offset);
+	if (ret > 0) {
+		/* success */
+		file->buffer_offset -= size;
+		return 1;
+	}
+
+	/* failure. don't leave ourself to inconsistent state */
+	buffer_copy(file->buffer, 0, file->buffer, size, SIZE_MAX);
+	buffer_set_used_size(file->buffer, file->buffer->used - size);
+
+	if (ret == 0) {
+		*reason_r = "file shrank unexpectedly";
+		mail_transaction_log_file_set_corrupted(file, "%s", *reason_r);
+		return 0;
+	} else if (errno == ESTALE) {
+		/* log file was deleted in NFS server, fail silently */
+		*reason_r = t_strdup_printf("read() failed: %m");
+		return 0;
+	} else {
+		log_file_set_syscall_error(file, "pread()");
+		*reason_r = t_strdup_printf("read() failed: %m");
+		return -1;
+	}
+}
+
+static int
+mail_transaction_log_file_read_more(struct mail_transaction_log_file *file,
+				    const char **reason_r)
+{
+	void *data;
+	size_t size;
+	uint32_t read_offset;
+	ssize_t ret;
+
+	read_offset = file->buffer_offset + file->buffer->used;
+
+	do {
+		data = buffer_append_space_unsafe(file->buffer, LOG_PREFETCH);
+		ret = pread(file->fd, data, LOG_PREFETCH, read_offset);
+		if (ret > 0)
+			read_offset += ret;
+
+		size = read_offset - file->buffer_offset;
+		buffer_set_used_size(file->buffer, size);
+	} while (ret > 0 || (ret < 0 && errno == EINTR));
+
+	file->last_size = read_offset;
+
+	if (ret < 0) {
+		*reason_r = t_strdup_printf("pread() failed: %m");
+		if (errno == ESTALE) {
+			/* log file was deleted in NFS server, fail silently */
+			return 0;
+		}
+		log_file_set_syscall_error(file, "pread()");
+		return -1;
+	}
+	return 1;
+}
+
+static bool
+mail_transaction_log_file_need_nfs_flush(struct mail_transaction_log_file *file)
+{
+	const struct mail_index_header *hdr = &file->log->index->map->hdr;
+	uoff_t max_offset = file->last_size;
+
+	if (file->next != NULL &&
+	    file->hdr.file_seq == file->next->hdr.prev_file_seq &&
+	    file->next->hdr.prev_file_offset != max_offset) {
+		/* we already have a newer log file which says that we haven't
+		   synced the entire file. */
+		return TRUE;
+	}
+
+	if (file->hdr.file_seq == hdr->log_file_seq &&
+	    max_offset < hdr->log_file_head_offset)
+		return TRUE;
+
+	return FALSE;
+}
+
+static int
+mail_transaction_log_file_read(struct mail_transaction_log_file *file,
+			       uoff_t start_offset, bool nfs_flush,
+			       const char **reason_r)
+{
+	bool retry;
+	int ret;
+
+	i_assert(file->mmap_base == NULL);
+
+	/* NFS: if file isn't locked, we're optimistic that we can read enough
+	   data without flushing attribute cache. if after reading we notice
+	   that we really should have read more, flush the cache and try again.
+	   if file is locked, the attribute cache was already flushed when
+	   refreshing the log. */
+	if (nfs_flush &&
+	    (file->log->index->flags & MAIL_INDEX_OPEN_FLAG_NFS_FLUSH) != 0) {
+		if (!file->locked)
+			nfs_flush_attr_cache_unlocked(file->filepath);
+		else
+			nfs_flush_attr_cache_fd_locked(file->filepath, file->fd);
+	}
+
+	if (file->buffer != NULL && file->buffer_offset > start_offset) {
+		/* we have to insert missing data to beginning of buffer */
+		ret = mail_transaction_log_file_insert_read(file, start_offset, reason_r);
+		if (ret <= 0)
+			return ret;
+	}
+
+	if (file->buffer == NULL) {
+		file->buffer =
+			buffer_create_dynamic(default_pool, LOG_PREFETCH);
+		file->buffer_offset = start_offset;
+	}
+
+	if ((ret = mail_transaction_log_file_read_more(file, reason_r)) <= 0)
+		;
+	else if (!nfs_flush &&
+		 (file->log->index->flags & MAIL_INDEX_OPEN_FLAG_NFS_FLUSH) != 0 &&
+		 mail_transaction_log_file_need_nfs_flush(file)) {
+		/* we didn't read enough data. flush and try again. */
+		return mail_transaction_log_file_read(file, start_offset, TRUE, reason_r);
+	} else if ((ret = mail_transaction_log_file_sync(file, &retry, reason_r)) == 0) {
+		i_assert(!retry); /* retry happens only with mmap */
+	}
+	i_assert(file->sync_offset >= file->buffer_offset);
+	buffer_set_used_size(file->buffer,
+			     file->sync_offset - file->buffer_offset);
+	return ret;
+}
+
+static bool
+log_file_map_check_offsets(struct mail_transaction_log_file *file,
+			   uoff_t start_offset, uoff_t end_offset,
+			   const char **reason_r)
+{
+	struct stat st, st2;
+
+	if (start_offset > file->sync_offset) {
+		/* broken start offset */
+		if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) {
+			*reason_r = t_strdup_printf(
+				"%s: start_offset (%"PRIuUOFF_T") > "
+				"current sync_offset (%"PRIuUOFF_T")",
+				file->filepath, start_offset, file->sync_offset);
+			return FALSE;
+		}
+
+		if (fstat(file->fd, &st) < 0) {
+			log_file_set_syscall_error(file, "fstat()");
+			st.st_size = -1;
+		}
+		*reason_r = t_strdup_printf(
+			"%s: start_offset (%"PRIuUOFF_T") > "
+			"current sync_offset (%"PRIuUOFF_T"), file size=%"PRIuUOFF_T,
+			file->filepath, start_offset, file->sync_offset,
+			st.st_size);
+		if (stat(file->filepath, &st2) == 0) {
+			if (st.st_ino != st2.st_ino) {
+				*reason_r = t_strdup_printf(
+					"%s, file unexpectedly replaced", *reason_r);
+			}
+		} else if (errno == ENOENT) {
+			*reason_r = t_strdup_printf(
+				"%s, file unexpectedly deleted", *reason_r);
+		} else {
+			log_file_set_syscall_error(file, "stat()");
+		}
+		return FALSE;
+	}
+	if (end_offset != UOFF_T_MAX && end_offset > file->sync_offset) {
+		*reason_r = t_strdup_printf(
+			"%s: end_offset (%"PRIuUOFF_T") > "
+			"current sync_offset (%"PRIuUOFF_T")",
+			file->filepath, start_offset, file->sync_offset);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+static int
+mail_transaction_log_file_mmap(struct mail_transaction_log_file *file,
+			       const char **reason_r)
+{
+	/* we may have switched to mmaping */
+	buffer_free(&file->buffer);
+
+	file->mmap_size = file->last_size;
+	file->mmap_base = mmap(NULL, file->mmap_size, PROT_READ, MAP_SHARED,
+			       file->fd, 0);
+	if (file->mmap_base == MAP_FAILED) {
+		file->mmap_base = NULL;
+		if (ioloop_time != file->last_mmap_error_time) {
+			file->last_mmap_error_time = ioloop_time;
+			log_file_set_syscall_error(file, t_strdup_printf(
+				"mmap(size=%zu)", file->mmap_size));
+		}
+		*reason_r = t_strdup_printf("mmap(size=%zu) failed: %m",
+					    file->mmap_size);
+		file->mmap_size = 0;
+		return -1;
+	}
+
+	if (file->mmap_size > mmap_get_page_size()) {
+		if (madvise(file->mmap_base, file->mmap_size,
+			    MADV_SEQUENTIAL) < 0)
+			log_file_set_syscall_error(file, "madvise()");
+	}
+
+	buffer_create_from_const_data(&file->mmap_buffer,
+				      file->mmap_base, file->mmap_size);
+	file->buffer = &file->mmap_buffer;
+	file->buffer_offset = 0;
+	return 0;
+}
+
+static void
+mail_transaction_log_file_munmap(struct mail_transaction_log_file *file)
+{
+	if (file->mmap_base == NULL)
+		return;
+
+	i_assert(file->buffer != NULL);
+	if (munmap(file->mmap_base, file->mmap_size) < 0)
+		log_file_set_syscall_error(file, "munmap()");
+	file->mmap_base = NULL;
+	file->mmap_size = 0;
+	buffer_free(&file->buffer);
+}
+
+static int
+mail_transaction_log_file_map_mmap(struct mail_transaction_log_file *file,
+				   uoff_t start_offset, const char **reason_r)
+{
+	struct stat st;
+	bool retry;
+	int ret;
+
+	/* we are going to mmap() this file, but it's not necessarily
+	   mmaped currently. */
+	i_assert(file->buffer_offset == 0 || file->mmap_base == NULL);
+	i_assert(file->mmap_size == 0 || file->mmap_base != NULL);
+
+	if (fstat(file->fd, &st) < 0) {
+		log_file_set_syscall_error(file, "fstat()");
+		*reason_r = t_strdup_printf("fstat() failed: %m");
+		return -1;
+	}
+	file->last_size = st.st_size;
+
+	if ((uoff_t)st.st_size < file->sync_offset) {
+		*reason_r = t_strdup_printf(
+			"file size shrank (%"PRIuUOFF_T" < %"PRIuUOFF_T")",
+			(uoff_t)st.st_size, file->sync_offset);
+		mail_transaction_log_file_set_corrupted(file, "%s", *reason_r);
+		return 0;
+	}
+
+	if (file->buffer != NULL && file->buffer_offset <= start_offset &&
+	    (uoff_t)st.st_size == file->buffer_offset + file->buffer->used) {
+		/* we already have the whole file mapped */
+		if ((ret = mail_transaction_log_file_sync(file, &retry, reason_r)) != 0 ||
+		    !retry)
+			return ret;
+		/* size changed, re-mmap */
+	}
+
+	do {
+		mail_transaction_log_file_munmap(file);
+
+		if (file->last_size - start_offset < mmap_get_page_size()) {
+			/* just reading the file is probably faster */
+			return mail_transaction_log_file_read(file,
+							      start_offset,
+							      FALSE, reason_r);
+		}
+
+		if (mail_transaction_log_file_mmap(file, reason_r) < 0)
+			return -1;
+		ret = mail_transaction_log_file_sync(file, &retry, reason_r);
+	} while (retry);
+
+	return ret;
+}
+
+int mail_transaction_log_file_map(struct mail_transaction_log_file *file,
+				  uoff_t start_offset, uoff_t end_offset,
+				  const char **reason_r)
+{
+	uoff_t map_start_offset = start_offset;
+	size_t size;
+	int ret;
+
+	if (file->hdr.indexid == 0) {
+		/* corrupted */
+		*reason_r = "corrupted, indexid=0";
+		return 0;
+	}
+
+	i_assert(start_offset >= file->hdr.hdr_size);
+	i_assert(start_offset <= end_offset);
+	i_assert(file->buffer == NULL || file->mmap_base != NULL ||
+		 file->sync_offset >= file->buffer_offset + file->buffer->used);
+
+	if (file->locked_sync_offset_updated && file == file->log->head &&
+	    end_offset == UOFF_T_MAX) {
+		/* we're not interested of going further than sync_offset */
+		if (!log_file_map_check_offsets(file, start_offset,
+						end_offset, reason_r))
+			return 0;
+		i_assert(start_offset <= file->sync_offset);
+		end_offset = file->sync_offset;
+	}
+
+	if (file->buffer != NULL && file->buffer_offset <= start_offset) {
+		/* see if we already have it */
+		size = file->buffer->used;
+		if (file->buffer_offset + size >= end_offset)
+			return 1;
+	}
+
+	if (file->locked) {
+		/* set this only when we've synced to end of file while locked
+		   (either end_offset=UOFF_T_MAX or we had to read anyway) */
+		file->locked_sync_offset_updated = TRUE;
+	}
+
+	if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) {
+		if (start_offset < file->buffer_offset || file->buffer == NULL) {
+			/* we had moved the log to memory but failed to read
+			   the beginning of the log file */
+			*reason_r = "Beginning of the log isn't available";
+			return 0;
+		}
+		return log_file_map_check_offsets(file, start_offset,
+						  end_offset, reason_r) ? 1 : 0;
+	}
+
+	if (start_offset > file->sync_offset)
+		mail_transaction_log_file_skip_to_head(file);
+	if (start_offset > file->sync_offset) {
+		/* although we could just skip over the unwanted data, we have
+		   to sync everything so that modseqs are calculated
+		   correctly */
+		map_start_offset = file->sync_offset;
+	}
+
+	if ((file->log->index->flags & MAIL_INDEX_OPEN_FLAG_MMAP_DISABLE) == 0)
+		ret = mail_transaction_log_file_map_mmap(file, map_start_offset, reason_r);
+	else {
+		mail_transaction_log_file_munmap(file);
+		ret = mail_transaction_log_file_read(file, map_start_offset, FALSE, reason_r);
+	}
+
+	i_assert(file->buffer == NULL || file->mmap_base != NULL ||
+		 file->sync_offset >= file->buffer_offset + file->buffer->used);
+	if (ret <= 0)
+		return ret;
+
+	i_assert(file->buffer != NULL);
+	return log_file_map_check_offsets(file, start_offset, end_offset,
+					  reason_r) ? 1 : 0;
+}
+
+int mail_transaction_log_file_move_to_memory(struct mail_transaction_log_file *file)
+{
+	const char *error;
+	buffer_t *buf;
+	int ret = 0;
+
+	if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file))
+		return 0;
+
+	if (file->mmap_base != NULL) {
+		/* just copy to memory */
+		i_assert(file->buffer_offset == 0);
+
+		buf = buffer_create_dynamic(default_pool, file->mmap_size);
+		buffer_append(buf, file->mmap_base, file->mmap_size);
+		buffer_free(&file->buffer);
+		file->buffer = buf;
+
+		/* and lose the mmap */
+		if (munmap(file->mmap_base, file->mmap_size) < 0)
+			log_file_set_syscall_error(file, "munmap()");
+		file->mmap_base = NULL;
+	} else if (file->buffer_offset != 0) {
+		/* we don't have the full log in the memory. read it. */
+		ret = mail_transaction_log_file_read(file, 0, FALSE, &error);
+		if (ret <= 0) {
+			mail_index_set_error(file->log->index,
+				"%s: Failed to read into memory: %s", file->filepath, error);
+		}
+	}
+	file->last_size = 0;
+
+	if (close(file->fd) < 0)
+		log_file_set_syscall_error(file, "close()");
+	file->fd = -1;
+
+	i_free(file->filepath);
+	file->filepath = i_strdup(file->log->filepath);
+	return ret < 0 ? -1 : 0;
+}