diff options
Diffstat (limited to 'src/lib-index/mail-transaction-log-file.c')
-rw-r--r-- | src/lib-index/mail-transaction-log-file.c | 1685 |
1 files changed, 1685 insertions, 0 deletions
diff --git a/src/lib-index/mail-transaction-log-file.c b/src/lib-index/mail-transaction-log-file.c new file mode 100644 index 0000000..1820169 --- /dev/null +++ b/src/lib-index/mail-transaction-log-file.c @@ -0,0 +1,1685 @@ +/* Copyright (c) 2003-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "array.h" +#include "ioloop.h" +#include "file-dotlock.h" +#include "nfs-workarounds.h" +#include "read-full.h" +#include "write-full.h" +#include "mmap-util.h" +#include "mail-index-private.h" +#include "mail-index-modseq.h" +#include "mail-transaction-log-private.h" + +#define LOG_PREFETCH IO_BLOCK_SIZE +#define MEMORY_LOG_NAME "(in-memory transaction log file)" +#define LOG_NEW_DOTLOCK_SUFFIX ".newlock" + +static int +mail_transaction_log_file_sync(struct mail_transaction_log_file *file, + bool *retry_r, const char **reason_r); + +static void +log_file_set_syscall_error(struct mail_transaction_log_file *file, + const char *function) +{ + mail_index_file_set_syscall_error(file->log->index, + file->filepath, function); +} + +static void +mail_transaction_log_mark_corrupted(struct mail_transaction_log_file *file) +{ + unsigned int offset = + offsetof(struct mail_transaction_log_header, indexid); + int flags; + + if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file) || + file->log->index->readonly) + return; + + /* indexid=0 marks the log file as corrupted. we opened the file with + O_APPEND, and now we need to drop it for pwrite() to work (at least + in Linux) */ + flags = fcntl(file->fd, F_GETFL, 0); + if (flags < 0) { + mail_index_file_set_syscall_error(file->log->index, + file->filepath, "fcntl(F_GETFL)"); + return; + } + if (fcntl(file->fd, F_SETFL, flags & ~O_APPEND) < 0) { + mail_index_file_set_syscall_error(file->log->index, + file->filepath, "fcntl(F_SETFL)"); + return; + } + if (pwrite_full(file->fd, &file->hdr.indexid, + sizeof(file->hdr.indexid), offset) < 0) { + mail_index_file_set_syscall_error(file->log->index, + file->filepath, "pwrite()"); + } +} + +void +mail_transaction_log_file_set_corrupted(struct mail_transaction_log_file *file, + const char *fmt, ...) +{ + va_list va; + + file->corrupted = TRUE; + file->hdr.indexid = 0; + mail_transaction_log_mark_corrupted(file); + + va_start(va, fmt); + T_BEGIN { + mail_index_set_error(file->log->index, + "Corrupted transaction log file %s seq %u: %s " + "(sync_offset=%"PRIuUOFF_T")", + file->filepath, file->hdr.file_seq, + t_strdup_vprintf(fmt, va), file->sync_offset); + } T_END; + va_end(va); +} + +struct mail_transaction_log_file * +mail_transaction_log_file_alloc(struct mail_transaction_log *log, + const char *path) +{ + struct mail_transaction_log_file *file; + + file = i_new(struct mail_transaction_log_file, 1); + file->log = log; + file->filepath = i_strdup(path); + file->fd = -1; + return file; +} + +void mail_transaction_log_file_free(struct mail_transaction_log_file **_file) +{ + struct mail_transaction_log_file *file = *_file; + struct mail_transaction_log_file **p; + int old_errno = errno; + + *_file = NULL; + + i_assert(!file->locked); + i_assert(file->refcount == 0); + + for (p = &file->log->files; *p != NULL; p = &(*p)->next) { + if (*p == file) { + *p = file->next; + break; + } + } + + if (file == file->log->head) + file->log->head = NULL; + + buffer_free(&file->buffer); + + if (file->mmap_base != NULL) { + if (munmap(file->mmap_base, file->mmap_size) < 0) + log_file_set_syscall_error(file, "munmap()"); + } + + if (file->fd != -1) { + if (close(file->fd) < 0) + log_file_set_syscall_error(file, "close()"); + } + + i_free(file->filepath); + i_free(file->need_rotate); + i_free(file); + + errno = old_errno; +} + +static void +mail_transaction_log_file_skip_to_head(struct mail_transaction_log_file *file) +{ + struct mail_transaction_log *log = file->log; + struct mail_index_map *map = log->index->map; + const struct mail_index_modseq_header *modseq_hdr; + uoff_t head_offset; + + if (map == NULL || file->hdr.file_seq != map->hdr.log_file_seq || + map->hdr.log_file_head_offset == 0) + return; + + /* we can get a valid log offset from index file. initialize + sync_offset from it so we don't have to read the whole log + file from beginning. */ + head_offset = map->hdr.log_file_head_offset; + + modseq_hdr = mail_index_map_get_modseq_header(map); + if (head_offset < file->hdr.hdr_size) { + mail_index_set_error(log->index, + "%s: log_file_head_offset too small", + log->index->filepath); + file->sync_offset = file->hdr.hdr_size; + file->sync_highest_modseq = file->hdr.initial_modseq; + } else if (modseq_hdr == NULL && file->hdr.initial_modseq == 0) { + /* modseqs not used yet */ + file->sync_offset = head_offset; + file->sync_highest_modseq = 0; + } else if (modseq_hdr == NULL || + modseq_hdr->log_seq != file->hdr.file_seq) { + /* highest_modseq not synced, start from beginning */ + file->sync_offset = file->hdr.hdr_size; + file->sync_highest_modseq = file->hdr.initial_modseq; + } else if (modseq_hdr->log_offset > head_offset) { + mail_index_set_error(log->index, + "%s: modseq_hdr.log_offset too large", + log->index->filepath); + file->sync_offset = file->hdr.hdr_size; + file->sync_highest_modseq = file->hdr.initial_modseq; + } else { + /* start from where we last stopped tracking modseqs */ + file->sync_offset = modseq_hdr->log_offset; + file->sync_highest_modseq = modseq_hdr->highest_modseq; + } + if (file->hdr.file_seq == log->index->map->hdr.log_file_seq) { + file->last_read_hdr_tail_offset = + log->index->map->hdr.log_file_tail_offset; + } + if (file->last_read_hdr_tail_offset > file->max_tail_offset) + file->max_tail_offset = file->last_read_hdr_tail_offset; +} + +static void +mail_transaction_log_file_add_to_list(struct mail_transaction_log_file *file) +{ + struct mail_transaction_log_file **p; + const char *reason; + bool retry; + + file->sync_offset = file->hdr.hdr_size; + file->sync_highest_modseq = file->hdr.initial_modseq; + mail_transaction_log_file_skip_to_head(file); + + /* insert it to correct position */ + for (p = &file->log->files; *p != NULL; p = &(*p)->next) { + if ((*p)->hdr.file_seq > file->hdr.file_seq) + break; + i_assert((*p)->hdr.file_seq < file->hdr.file_seq); + } + + file->next = *p; + *p = file; + + if (file->buffer != NULL) { + /* if we read any unfinished data, make sure the buffer gets + truncated. */ + (void)mail_transaction_log_file_sync(file, &retry, &reason); + buffer_set_used_size(file->buffer, + file->sync_offset - file->buffer_offset); + } +} + +static int +mail_transaction_log_init_hdr(struct mail_transaction_log *log, + struct mail_transaction_log_header *hdr) +{ + struct mail_index *index = log->index; + struct mail_transaction_log_file *file; + + i_assert(index->indexid != 0); + + i_zero(hdr); + hdr->major_version = MAIL_TRANSACTION_LOG_MAJOR_VERSION; + hdr->minor_version = MAIL_TRANSACTION_LOG_MINOR_VERSION; + hdr->hdr_size = sizeof(struct mail_transaction_log_header); + hdr->indexid = log->index->indexid; + hdr->create_stamp = ioloop_time; +#ifndef WORDS_BIGENDIAN + hdr->compat_flags |= MAIL_INDEX_COMPAT_LITTLE_ENDIAN; +#endif + + if (index->fd != -1) { + /* not creating index - make sure we have latest header */ + if (!index->mapping) { + if (mail_index_map(index, + MAIL_INDEX_SYNC_HANDLER_HEAD) <= 0) + return -1; + } else { + /* if we got here from mapping, the .log file is + corrupted. use whatever values we got from index + file */ + } + } + if (index->map != NULL) { + hdr->prev_file_seq = index->map->hdr.log_file_seq; + hdr->prev_file_offset = index->map->hdr.log_file_head_offset; + hdr->file_seq = index->map->hdr.log_file_seq + 1; + hdr->initial_modseq = + mail_index_map_modseq_get_highest(index->map); + } else { + hdr->file_seq = 1; + } + if (hdr->initial_modseq == 0) { + /* modseq tracking in log files is required for many reasons + nowadays, even if per-message modseqs aren't enabled in + dovecot.index. */ + hdr->initial_modseq = 1; + } + + if (log->head != NULL) { + /* make sure the sequence always increases to avoid crashes + later. this catches the buggy case where two processes + happen to replace the same log file. */ + for (file = log->head->next; file != NULL; file = file->next) { + if (hdr->file_seq <= file->hdr.file_seq) + hdr->file_seq = file->hdr.file_seq + 1; + } + + if (hdr->file_seq <= log->head->hdr.file_seq) { + /* make sure the sequence grows */ + hdr->file_seq = log->head->hdr.file_seq+1; + } + if (hdr->initial_modseq < log->head->sync_highest_modseq) { + /* this should be always up-to-date */ + hdr->initial_modseq = log->head->sync_highest_modseq; + } + } + return 0; +} + +struct mail_transaction_log_file * +mail_transaction_log_file_alloc_in_memory(struct mail_transaction_log *log) +{ + struct mail_transaction_log_file *file; + + file = mail_transaction_log_file_alloc(log, MEMORY_LOG_NAME); + if (mail_transaction_log_init_hdr(log, &file->hdr) < 0) { + i_free(file); + return NULL; + } + + file->buffer = buffer_create_dynamic(default_pool, 4096); + file->buffer_offset = sizeof(file->hdr); + + mail_transaction_log_file_add_to_list(file); + return file; +} + +static int +mail_transaction_log_file_dotlock(struct mail_transaction_log_file *file) +{ + struct dotlock_settings dotlock_set; + int ret; + + if (file->log->dotlock_refcount > 0) + ret = 1; + else { + i_assert(file->log->dotlock_refcount == 0); + mail_transaction_log_get_dotlock_set(file->log, &dotlock_set); + ret = file_dotlock_create(&dotlock_set, file->filepath, 0, + &file->log->dotlock); + } + if (ret > 0) { + file->log->dotlock_refcount++; + file->locked = TRUE; + file->lock_create_time = time(NULL); + return 0; + } + if (ret < 0) { + log_file_set_syscall_error(file, "file_dotlock_create()"); + return -1; + } + + mail_index_set_error(file->log->index, + "Timeout (%us) while waiting for " + "dotlock for transaction log file %s", + dotlock_set.timeout, file->filepath); + return -1; +} + +static int +mail_transaction_log_file_undotlock(struct mail_transaction_log_file *file) +{ + int ret; + + i_assert(file->log->dotlock_refcount >= 0); + if (--file->log->dotlock_refcount > 0) + return 0; + + ret = file_dotlock_delete(&file->log->dotlock); + if (ret < 0) { + log_file_set_syscall_error(file, "file_dotlock_delete()"); + return -1; + } + + if (ret == 0) { + mail_index_set_error(file->log->index, + "Dotlock was lost for transaction log file %s", + file->filepath); + return -1; + } + return 0; +} + +int mail_transaction_log_file_lock(struct mail_transaction_log_file *file) +{ + unsigned int lock_timeout_secs; + int ret; + + if (file->locked) + return 0; + + if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) { + file->locked = TRUE; + return 0; + } + + if (file->log->index->set.lock_method == FILE_LOCK_METHOD_DOTLOCK) + return mail_transaction_log_file_dotlock(file); + + if (file->log->index->readonly) { + mail_index_set_error(file->log->index, + "Index is read-only, can't write-lock %s", + file->filepath); + return -1; + } + + i_assert(file->file_lock == NULL); + lock_timeout_secs = I_MIN(MAIL_TRANSACTION_LOG_LOCK_TIMEOUT, + file->log->index->set.max_lock_timeout_secs); + ret = mail_index_lock_fd(file->log->index, file->filepath, file->fd, + F_WRLCK, lock_timeout_secs, + &file->file_lock); + if (ret > 0) { + file->locked = TRUE; + file->lock_create_time = time(NULL); + return 0; + } + if (ret < 0) { + log_file_set_syscall_error(file, "mail_index_wait_lock_fd()"); + return -1; + } + + mail_index_set_error(file->log->index, + "Timeout (%us) while waiting for lock for " + "transaction log file %s%s", + lock_timeout_secs, file->filepath, + file_lock_find(file->fd, file->log->index->set.lock_method, F_WRLCK)); + return -1; +} + +void mail_transaction_log_file_unlock(struct mail_transaction_log_file *file, + const char *lock_reason) +{ + unsigned int lock_time; + + if (!file->locked) + return; + + file->locked = FALSE; + file->locked_sync_offset_updated = FALSE; + + if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) + return; + + lock_time = time(NULL) - file->lock_create_time; + if (lock_time >= MAIL_TRANSACTION_LOG_LOCK_WARN_SECS && lock_reason != NULL) { + i_warning("Transaction log file %s was locked for %u seconds (%s)", + file->filepath, lock_time, lock_reason); + } + + if (file->log->index->set.lock_method == FILE_LOCK_METHOD_DOTLOCK) { + (void)mail_transaction_log_file_undotlock(file); + return; + } + + file_unlock(&file->file_lock); +} + +static ssize_t +mail_transaction_log_file_read_header(struct mail_transaction_log_file *file) +{ + void *dest; + size_t pos, dest_size; + ssize_t ret; + + i_assert(file->buffer == NULL && file->mmap_base == NULL); + + i_zero(&file->hdr); + if (file->last_size < mmap_get_page_size() && file->last_size > 0) { + /* just read the entire transaction log to memory. + note that if some of the data hasn't been fully committed + yet (hdr.size=0), the buffer must be truncated later */ + file->buffer = buffer_create_dynamic(default_pool, 4096); + file->buffer_offset = 0; + dest_size = file->last_size; + dest = buffer_append_space_unsafe(file->buffer, dest_size); + } else { + /* read only the header */ + dest = &file->hdr; + dest_size = sizeof(file->hdr); + } + + /* it's not necessarily an error to read less than wanted header size, + since older versions of the log format used smaller headers. */ + pos = 0; + do { + ret = pread(file->fd, PTR_OFFSET(dest, pos), + dest_size - pos, pos); + if (ret > 0) + pos += ret; + } while (ret > 0 && pos < dest_size); + + if (file->buffer != NULL) { + buffer_set_used_size(file->buffer, pos); + memcpy(&file->hdr, file->buffer->data, + I_MIN(pos, sizeof(file->hdr))); + } + + return ret < 0 ? -1 : (ssize_t)pos; +} + +static int +mail_transaction_log_file_fail_dupe(struct mail_transaction_log_file *file) +{ + int ret; + + /* mark the old file corrupted. we can't safely remove + it from the list however, so return failure. */ + file->hdr.indexid = 0; + if (strcmp(file->filepath, file->log->head->filepath) != 0) { + /* only mark .2 corrupted, just to make sure we don't lose any + changes from .log in case we're somehow wrong */ + mail_transaction_log_mark_corrupted(file); + ret = 0; + } else { + ret = -1; + } + if (!file->corrupted) { + file->corrupted = TRUE; + mail_index_set_error(file->log->index, + "Transaction log %s: " + "duplicate transaction log sequence (%u)", + file->filepath, file->hdr.file_seq); + } + return ret; +} + +static int +mail_transaction_log_file_read_hdr(struct mail_transaction_log_file *file, + bool ignore_estale) +{ + struct mail_transaction_log_file *f; + int ret; + + i_assert(!MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)); + + if (file->corrupted) + return 0; + + ret = mail_transaction_log_file_read_header(file); + if (ret < 0) { + if (errno != ESTALE || !ignore_estale) + log_file_set_syscall_error(file, "pread()"); + return -1; + } + if (file->hdr.major_version != MAIL_TRANSACTION_LOG_MAJOR_VERSION) { + /* incompatible version - fix silently */ + return 0; + } + if (ret < MAIL_TRANSACTION_LOG_HEADER_MIN_SIZE) { + mail_transaction_log_file_set_corrupted(file, + "unexpected end of file while reading header"); + return 0; + } + + const unsigned int hdr_version = + MAIL_TRANSACTION_LOG_HDR_VERSION(&file->hdr); + if (MAIL_TRANSACTION_LOG_VERSION_HAVE(hdr_version, COMPAT_FLAGS)) { + /* we have compatibility flags */ + enum mail_index_header_compat_flags compat_flags = 0; + +#ifndef WORDS_BIGENDIAN + compat_flags |= MAIL_INDEX_COMPAT_LITTLE_ENDIAN; +#endif + if (file->hdr.compat_flags != compat_flags) { + /* architecture change */ + mail_index_set_error(file->log->index, + "Rebuilding index file %s: " + "CPU architecture changed", + file->log->index->filepath); + return 0; + } + } + if (file->hdr.hdr_size < MAIL_TRANSACTION_LOG_HEADER_MIN_SIZE) { + mail_transaction_log_file_set_corrupted(file, + "Header size too small"); + return 0; + } + if (file->hdr.hdr_size < sizeof(file->hdr)) { + /* @UNSAFE: smaller than we expected - zero out the fields we + shouldn't have filled */ + memset(PTR_OFFSET(&file->hdr, file->hdr.hdr_size), 0, + sizeof(file->hdr) - file->hdr.hdr_size); + } + + if (file->hdr.indexid == 0) { + /* corrupted */ + file->corrupted = TRUE; + mail_index_set_error(file->log->index, + "Transaction log file %s: marked corrupted", + file->filepath); + return 0; + } + if (file->hdr.indexid != file->log->index->indexid) { + if (file->log->index->indexid != 0 && + !file->log->index->initial_create) { + /* index file was probably just rebuilt and we don't + know about it yet */ + mail_transaction_log_file_set_corrupted(file, + "indexid changed: %u -> %u", + file->log->index->indexid, file->hdr.indexid); + return 0; + } + + /* creating index file. since transaction log is created + first, use the indexid in it to create the main index + to avoid races. */ + file->log->index->indexid = file->hdr.indexid; + } + + /* make sure we already don't have a file with the same sequence + opened. it shouldn't happen unless the old log file was + corrupted. */ + for (f = file->log->files; f != NULL; f = f->next) { + if (f->hdr.file_seq == file->hdr.file_seq) { + if (strcmp(f->filepath, f->log->head->filepath) != 0) { + /* old "f" is the .log.2 */ + return mail_transaction_log_file_fail_dupe(f); + } else { + /* new "file" is probably the .log.2 */ + return mail_transaction_log_file_fail_dupe(file); + } + } + } + + file->sync_highest_modseq = file->hdr.initial_modseq; + return 1; +} + +static int +mail_transaction_log_file_stat(struct mail_transaction_log_file *file, + bool ignore_estale) +{ + struct stat st; + + if (fstat(file->fd, &st) < 0) { + if (!ESTALE_FSTAT(errno) || !ignore_estale) + log_file_set_syscall_error(file, "fstat()"); + return -1; + } + + file->st_dev = st.st_dev; + file->st_ino = st.st_ino; + file->last_mtime = st.st_mtime; + file->last_size = st.st_size; + return 0; +} + +static bool +mail_transaction_log_file_is_dupe(struct mail_transaction_log_file *file) +{ + struct mail_transaction_log_file *tmp; + + for (tmp = file->log->files; tmp != NULL; tmp = tmp->next) { + if (tmp->st_ino == file->st_ino && + CMP_DEV_T(tmp->st_dev, file->st_dev)) + return TRUE; + } + return FALSE; +} + +static void log_write_ext_hdr_init_data(struct mail_index *index, buffer_t *buf) +{ + const struct mail_index_registered_ext *rext; + struct mail_transaction_header *hdr; + struct mail_transaction_ext_intro *intro; + struct mail_transaction_ext_hdr_update *ext_hdr; + unsigned int hdr_offset; + + rext = array_idx(&index->extensions, index->set.ext_hdr_init_id); + + /* introduce the extension */ + hdr_offset = buf->used; + hdr = buffer_append_space_unsafe(buf, sizeof(*hdr)); + hdr->type = MAIL_TRANSACTION_EXT_INTRO; + + intro = buffer_append_space_unsafe(buf, sizeof(*intro)); + intro->ext_id = (uint32_t)-1; + intro->hdr_size = rext->hdr_size; + intro->record_size = rext->record_size; + intro->record_align = rext->record_align; + intro->name_size = strlen(rext->name); + buffer_append(buf, rext->name, intro->name_size); + if (buf->used % 4 != 0) + buffer_append_zero(buf, 4 - buf->used % 4); + + hdr = buffer_get_space_unsafe(buf, hdr_offset, sizeof(*hdr)); + hdr->size = mail_index_uint32_to_offset(buf->used - hdr_offset); + + /* add the extension header data */ + hdr_offset = buf->used; + hdr = buffer_append_space_unsafe(buf, sizeof(*hdr)); + hdr->type = MAIL_TRANSACTION_EXT_HDR_UPDATE; + + ext_hdr = buffer_append_space_unsafe(buf, sizeof(*ext_hdr)); + ext_hdr->size = rext->hdr_size; + buffer_append(buf, index->set.ext_hdr_init_data, rext->hdr_size); + + hdr = buffer_get_space_unsafe(buf, hdr_offset, sizeof(*hdr)); + hdr->size = mail_index_uint32_to_offset(buf->used - hdr_offset); +} + +static int +mail_transaction_log_file_create2(struct mail_transaction_log_file *file, + int new_fd, bool reset, + struct dotlock **dotlock) +{ + struct mail_index *index = file->log->index; + struct stat st; + const char *path2; + buffer_t *writebuf; + int fd, ret; + bool rename_existing, need_lock; + + need_lock = file->log->head != NULL && file->log->head->locked; + + if (fcntl(new_fd, F_SETFL, O_APPEND) < 0) { + log_file_set_syscall_error(file, "fcntl(O_APPEND)"); + return -1; + } + + if ((index->flags & MAIL_INDEX_OPEN_FLAG_NFS_FLUSH) != 0) { + /* although we check also mtime and file size below, it's done + only to fix broken log files. we don't bother flushing + attribute cache just for that. */ + nfs_flush_file_handle_cache(file->filepath); + } + + /* log creation is locked now - see if someone already created it. + note that if we're rotating, we need to keep the log locked until + the file has been rewritten. and because fcntl() locks are stupid, + if we go and open()+close() the file and we had it already opened, + its locks are lost. so we use stat() to check if the file has been + recreated, although it almost never is. */ + if (reset) + rename_existing = FALSE; + else if (nfs_safe_stat(file->filepath, &st) < 0) { + if (errno != ENOENT) { + log_file_set_syscall_error(file, "stat()"); + return -1; + } + rename_existing = FALSE; + } else if (st.st_ino == file->st_ino && + CMP_DEV_T(st.st_dev, file->st_dev) && + /* inode/dev checks are enough when we're rotating the file, + but not when we're replacing a broken log file */ + st.st_mtime == file->last_mtime && + (uoff_t)st.st_size == file->last_size) { + /* no-one else recreated the file */ + rename_existing = TRUE; + } else { + /* recreated. use the file if its header is ok */ + fd = nfs_safe_open(file->filepath, O_RDWR | O_APPEND); + if (fd == -1) { + if (errno != ENOENT) { + log_file_set_syscall_error(file, "open()"); + return -1; + } + } else { + file->fd = fd; + file->last_size = 0; + if (mail_transaction_log_file_read_hdr(file, + FALSE) > 0 && + mail_transaction_log_file_stat(file, FALSE) == 0) { + /* yes, it was ok */ + file_dotlock_delete(dotlock); + mail_transaction_log_file_add_to_list(file); + return 0; + } + file->fd = -1; + if (close(fd) < 0) + log_file_set_syscall_error(file, "close()"); + } + rename_existing = FALSE; + } + + if (index->fd == -1 && !rename_existing) { + /* creating the initial index */ + reset = TRUE; + } + + if (mail_transaction_log_init_hdr(file->log, &file->hdr) < 0) + return -1; + + if (reset) { + /* don't reset modseqs. if we're reseting due to rebuilding + indexes we'll probably want to keep uidvalidity and in such + cases we really don't want to shrink modseqs. */ + file->hdr.prev_file_seq = 0; + file->hdr.prev_file_offset = 0; + } + + writebuf = t_buffer_create(128); + buffer_append(writebuf, &file->hdr, sizeof(file->hdr)); + + if (index->set.ext_hdr_init_data != NULL && reset) + log_write_ext_hdr_init_data(index, writebuf); + if (write_full(new_fd, writebuf->data, writebuf->used) < 0) { + log_file_set_syscall_error(file, "write_full()"); + return -1; + } + + if (file->log->index->set.fsync_mode == FSYNC_MODE_ALWAYS) { + /* the header isn't important, so don't bother calling + fdatasync() unless it's required */ + if (fdatasync(new_fd) < 0) { + log_file_set_syscall_error(file, "fdatasync()"); + return -1; + } + } + + file->fd = new_fd; + ret = mail_transaction_log_file_stat(file, FALSE); + + if (need_lock && ret == 0) { + /* we'll need to preserve the lock */ + if (mail_transaction_log_file_lock(file) < 0) + ret = -1; + } + + /* if we return -1 the dotlock deletion code closes the fd */ + file->fd = -1; + if (ret < 0) + return -1; + + /* keep two log files */ + if (rename_existing) { + /* rename() would be nice and easy way to do this, except then + there's a race condition between the rename and + file_dotlock_replace(). during that time the log file + doesn't exist, which could cause problems. */ + path2 = t_strconcat(file->filepath, ".2", NULL); + if (i_unlink_if_exists(path2) < 0) { + /* try to link() anyway */ + } + if (nfs_safe_link(file->filepath, path2, FALSE) < 0 && + errno != ENOENT && errno != EEXIST) { + mail_index_set_error(index, "link(%s, %s) failed: %m", + file->filepath, path2); + /* ignore the error. we don't care that much about the + second log file and we're going to overwrite this + first one. */ + } + /* NOTE: here's a race condition where both .log and .log.2 + point to the same file. our reading code should ignore that + though by comparing the inodes. */ + } + + if (file_dotlock_replace(dotlock, + DOTLOCK_REPLACE_FLAG_DONT_CLOSE_FD) <= 0) { + /* need to unlock to avoid assert-crash in + mail_transaction_log_file_free() */ + mail_transaction_log_file_unlock(file, "creation failed"); + return -1; + } + + /* success */ + file->fd = new_fd; + mail_transaction_log_file_add_to_list(file); + + i_assert(!need_lock || file->locked); + return 1; +} + +int mail_transaction_log_file_create(struct mail_transaction_log_file *file, + bool reset) +{ + struct mail_index *index = file->log->index; + struct dotlock_settings new_dotlock_set; + struct dotlock *dotlock; + mode_t old_mask; + int fd, ret; + + i_assert(!MAIL_INDEX_IS_IN_MEMORY(index)); + + if (file->log->index->readonly) { + mail_index_set_error(index, + "Can't create log file %s: Index is read-only", + file->filepath); + return -1; + } + + if (index->indexid == 0) { + mail_index_set_error(index, + "Can't create log file %s: Index is marked corrupted", + file->filepath); + return -1; + } + + mail_transaction_log_get_dotlock_set(file->log, &new_dotlock_set); + new_dotlock_set.lock_suffix = LOG_NEW_DOTLOCK_SUFFIX; + + /* With dotlocking we might already have path.lock created, so this + filename has to be different. */ + old_mask = umask(index->set.mode ^ 0666); + fd = file_dotlock_open(&new_dotlock_set, file->filepath, 0, &dotlock); + umask(old_mask); + + if (fd == -1) { + log_file_set_syscall_error(file, "file_dotlock_open()"); + return -1; + } + mail_index_fchown(index, fd, file_dotlock_get_lock_path(dotlock)); + + /* either fd gets used or the dotlock gets deleted and returned fd + is for the existing file */ + ret = mail_transaction_log_file_create2(file, fd, reset, &dotlock); + if (ret < 0) { + if (dotlock != NULL) + file_dotlock_delete(&dotlock); + return -1; + } + return ret; +} + +int mail_transaction_log_file_open(struct mail_transaction_log_file *file, + const char **reason_r) +{ + struct mail_index *index = file->log->index; + unsigned int i; + bool ignore_estale; + int ret; + + for (i = 0;; i++) { + if (!index->readonly) { + file->fd = nfs_safe_open(file->filepath, + O_RDWR | O_APPEND); + } else { + file->fd = nfs_safe_open(file->filepath, O_RDONLY); + } + if (file->fd == -1 && errno == EACCES) { + file->fd = nfs_safe_open(file->filepath, O_RDONLY); + index->readonly = TRUE; + } + if (file->fd == -1) { + if (errno == ENOENT) { + *reason_r = "File doesn't exist"; + return 0; + } + + log_file_set_syscall_error(file, "open()"); + *reason_r = t_strdup_printf("open() failed: %m"); + return -1; + } + + ignore_estale = i < MAIL_INDEX_ESTALE_RETRY_COUNT; + if (mail_transaction_log_file_stat(file, ignore_estale) < 0) + ret = -1; + else if (mail_transaction_log_file_is_dupe(file)) { + /* probably our already opened .log file has been + renamed to .log.2 and we're trying to reopen it. + also possible that hit a race condition where .log + and .log.2 are linked. */ + *reason_r = "File is already open"; + return 0; + } else { + ret = mail_transaction_log_file_read_hdr(file, + ignore_estale); + } + if (ret > 0) { + /* success */ + break; + } + + if (ret == 0) { + /* corrupted */ + if (index->readonly) { + /* don't delete */ + } else { + i_unlink_if_exists(file->filepath); + } + *reason_r = "File is corrupted"; + return 0; + } + if (errno != ESTALE || + i == MAIL_INDEX_ESTALE_RETRY_COUNT) { + /* syscall error */ + *reason_r = t_strdup_printf("fstat() failed: %m"); + return -1; + } + + /* ESTALE - try again */ + buffer_free(&file->buffer); + } + + mail_transaction_log_file_add_to_list(file); + return 1; +} + +static int +log_file_track_mailbox_sync_offset_hdr(struct mail_transaction_log_file *file, + const void *data, unsigned int trans_size, + const char **error_r) +{ + const struct mail_transaction_header_update *u = data; + const struct mail_index_header *ihdr; + const unsigned int size = trans_size - sizeof(struct mail_transaction_header); + const unsigned int offset_pos = + offsetof(struct mail_index_header, log_file_tail_offset); + const unsigned int offset_size = sizeof(ihdr->log_file_tail_offset); + uint32_t tail_offset; + + i_assert(offset_size == sizeof(tail_offset)); + + if (size < sizeof(*u) || size < sizeof(*u) + u->size) { + *error_r = "header update extends beyond record size"; + mail_transaction_log_file_set_corrupted(file, "%s", *error_r); + return -1; + } + + if (u->offset <= offset_pos && + u->offset + u->size >= offset_pos + offset_size) { + memcpy(&tail_offset, + CONST_PTR_OFFSET(u + 1, offset_pos - u->offset), + sizeof(tail_offset)); + + if (tail_offset < file->last_read_hdr_tail_offset) { + /* ignore shrinking tail offsets */ + return 1; + } else if (tail_offset > file->sync_offset + trans_size) { + mail_transaction_log_file_set_corrupted(file, + "log_file_tail_offset %u goes past sync offset %"PRIuUOFF_T, + tail_offset, file->sync_offset + trans_size); + } else { + file->last_read_hdr_tail_offset = tail_offset; + if (tail_offset > file->max_tail_offset) + file->max_tail_offset = tail_offset; + return 1; + } + } + return 0; +} + +static bool +flag_updates_have_non_internal(const struct mail_transaction_flag_update *u, + unsigned int count, unsigned int version) +{ + /* Hide internal flags from modseqs if the log file's version + is new enough. This allows upgrading without the modseqs suddenly + shrinking. */ + if (!MAIL_TRANSACTION_LOG_VERSION_HAVE(version, HIDE_INTERNAL_MODSEQS)) + return TRUE; + + for (unsigned int i = 0; i < count; i++) { + if (!MAIL_TRANSACTION_FLAG_UPDATE_IS_INTERNAL(&u[i])) + return TRUE; + } + return FALSE; +} + +void mail_transaction_update_modseq(const struct mail_transaction_header *hdr, + const void *data, uint64_t *cur_modseq, + unsigned int version) +{ + uint32_t trans_size; + + trans_size = mail_index_offset_to_uint32(hdr->size); + i_assert(trans_size != 0); + + if (*cur_modseq != 0) { + /* tracking modseqs */ + } else if ((hdr->type & MAIL_TRANSACTION_TYPE_MASK) == + MAIL_TRANSACTION_EXT_INTRO) { + /* modseqs not tracked yet. see if this is a modseq + extension introduction. */ + const struct mail_transaction_ext_intro *intro = data; + const unsigned int modseq_ext_len = + strlen(MAIL_INDEX_MODSEQ_EXT_NAME); + + if (intro->name_size == modseq_ext_len && + memcmp(intro + 1, MAIL_INDEX_MODSEQ_EXT_NAME, + modseq_ext_len) == 0) { + /* modseq tracking started */ + *cur_modseq += 1; + } + return; + } else { + /* not tracking modseqs */ + return; + } + + switch (hdr->type & MAIL_TRANSACTION_TYPE_MASK) { + case MAIL_TRANSACTION_EXPUNGE | MAIL_TRANSACTION_EXPUNGE_PROT: + case MAIL_TRANSACTION_EXPUNGE_GUID | MAIL_TRANSACTION_EXPUNGE_PROT: + if ((hdr->type & MAIL_TRANSACTION_EXTERNAL) == 0) { + /* ignore expunge requests */ + break; + } + /* fall through */ + case MAIL_TRANSACTION_APPEND: + case MAIL_TRANSACTION_KEYWORD_UPDATE: + case MAIL_TRANSACTION_KEYWORD_RESET: + case MAIL_TRANSACTION_ATTRIBUTE_UPDATE: + /* these changes increase modseq */ + *cur_modseq += 1; + break; + case MAIL_TRANSACTION_FLAG_UPDATE: { + const struct mail_transaction_flag_update *rec = data; + unsigned int count; + + count = (trans_size - sizeof(*hdr)) / sizeof(*rec); + if (flag_updates_have_non_internal(rec, count, version)) + *cur_modseq += 1; + break; + } + case MAIL_TRANSACTION_MODSEQ_UPDATE: { + const struct mail_transaction_modseq_update *rec, *end; + + end = CONST_PTR_OFFSET(data, trans_size - sizeof(*hdr)); + for (rec = data; rec < end; rec++) { + uint64_t modseq = ((uint64_t)rec->modseq_high32 << 32) | + rec->modseq_low32; + if (*cur_modseq < modseq) + *cur_modseq = modseq; + } + } + } +} + +static int +log_file_track_sync(struct mail_transaction_log_file *file, + const struct mail_transaction_header *hdr, + unsigned int trans_size, const char **error_r) +{ + const void *data = hdr + 1; + int ret; + + mail_transaction_update_modseq(hdr, hdr + 1, &file->sync_highest_modseq, + MAIL_TRANSACTION_LOG_HDR_VERSION(&file->hdr)); + if ((hdr->type & MAIL_TRANSACTION_EXTERNAL) == 0) + return 1; + + /* external transactions: */ + switch (hdr->type & MAIL_TRANSACTION_TYPE_MASK) { + case MAIL_TRANSACTION_HEADER_UPDATE: + /* see if this updates mailbox_sync_offset */ + ret = log_file_track_mailbox_sync_offset_hdr(file, data, + trans_size, error_r); + if (ret != 0) + return ret < 0 ? -1 : 1; + break; + case MAIL_TRANSACTION_INDEX_DELETED: + if (file->sync_offset < file->index_undeleted_offset || + file->hdr.file_seq < file->log->index->index_delete_changed_file_seq) + break; + file->log->index->index_deleted = TRUE; + file->log->index->index_delete_requested = FALSE; + file->log->index->index_delete_changed_file_seq = file->hdr.file_seq; + file->index_deleted_offset = file->sync_offset + trans_size; + break; + case MAIL_TRANSACTION_INDEX_UNDELETED: + if (file->sync_offset < file->index_deleted_offset || + file->hdr.file_seq < file->log->index->index_delete_changed_file_seq) + break; + file->log->index->index_deleted = FALSE; + file->log->index->index_delete_requested = FALSE; + file->log->index->index_delete_changed_file_seq = file->hdr.file_seq; + file->index_undeleted_offset = file->sync_offset + trans_size; + break; + case MAIL_TRANSACTION_BOUNDARY: { + const struct mail_transaction_boundary *boundary = + (const void *)(hdr + 1); + size_t wanted_buffer_size; + + wanted_buffer_size = file->sync_offset - file->buffer_offset + + boundary->size; + if (wanted_buffer_size > file->buffer->used) { + /* the full transaction hasn't been written yet */ + return 0; + } + break; + } + } + + if (file->max_tail_offset == file->sync_offset) { + /* external transactions aren't synced to mailbox. we can + update mailbox sync offset to skip this transaction to + avoid re-reading it at the next sync. */ + file->max_tail_offset += trans_size; + } + return 1; +} + +static int +mail_transaction_log_file_sync(struct mail_transaction_log_file *file, + bool *retry_r, const char **reason_r) +{ + const struct mail_transaction_header *hdr; + const void *data; + struct stat st; + size_t size, avail; + uint32_t trans_size = 0; + int ret; + + i_assert(file->sync_offset >= file->buffer_offset); + + *retry_r = FALSE; + + data = buffer_get_data(file->buffer, &size); + if (file->buffer_offset + size < file->sync_offset) { + *reason_r = t_strdup_printf( + "log file shrank (%"PRIuUOFF_T" < %"PRIuUOFF_T")", + file->buffer_offset + (uoff_t)size, file->sync_offset); + mail_transaction_log_file_set_corrupted(file, "%s", *reason_r); + /* fix the sync_offset to avoid crashes later on */ + file->sync_offset = file->buffer_offset + size; + return 0; + } + while (file->sync_offset - file->buffer_offset + sizeof(*hdr) <= size) { + hdr = CONST_PTR_OFFSET(data, file->sync_offset - + file->buffer_offset); + trans_size = mail_index_offset_to_uint32(hdr->size); + if (trans_size == 0) { + /* unfinished or corrupted */ + break; + } + if (trans_size < sizeof(*hdr)) { + *reason_r = t_strdup_printf( + "hdr.size too small (%u)", trans_size); + mail_transaction_log_file_set_corrupted(file, "%s", *reason_r); + return 0; + } + + if (file->sync_offset - file->buffer_offset + trans_size > size) + break; + + /* transaction has been fully written */ + if ((ret = log_file_track_sync(file, hdr, trans_size, reason_r)) <= 0) { + if (ret < 0) + return 0; + break; + } + + file->sync_offset += trans_size; + } + + if (file->mmap_base != NULL && !file->locked) { + /* Now that all the mmaped pages have page faulted, check if + the file had changed while doing that. Only after the last + page has faulted, the size returned by fstat() can be + trusted. Otherwise it might point to a page boundary while + the next page is still being written. + + Without this check we might see partial transactions, + sometimes causing "Extension record updated without intro + prefix" errors. */ + if (fstat(file->fd, &st) < 0) { + log_file_set_syscall_error(file, "fstat()"); + *reason_r = t_strdup_printf("fstat() failed: %m"); + return -1; + } + if ((uoff_t)st.st_size != file->last_size) { + file->last_size = st.st_size; + *retry_r = TRUE; + *reason_r = "File size changed - retrying"; + return 0; + } + } + + avail = file->sync_offset - file->buffer_offset; + if (avail != size) { + /* There's more data than we could sync at the moment. If the + last record's size wasn't valid, we can't know if it will + be updated unless we've locked the log. */ + if (file->locked) { + *reason_r = "Unexpected garbage at EOF"; + mail_transaction_log_file_set_corrupted(file, "%s", *reason_r); + return 0; + } + /* The size field will be updated soon */ + mail_index_flush_read_cache(file->log->index, file->filepath, + file->fd, file->locked); + } + + if (file->next != NULL && + file->hdr.file_seq == file->next->hdr.prev_file_seq && + file->next->hdr.prev_file_offset != file->sync_offset) { + *reason_r = t_strdup_printf( + "Invalid transaction log size " + "(%"PRIuUOFF_T" vs %u): %s", file->sync_offset, + file->log->head->hdr.prev_file_offset, file->filepath); + mail_transaction_log_file_set_corrupted(file, "%s", *reason_r); + return 0; + } + + return 1; +} + +static int +mail_transaction_log_file_insert_read(struct mail_transaction_log_file *file, + uoff_t offset, const char **reason_r) +{ + void *data; + size_t size; + ssize_t ret; + + size = file->buffer_offset - offset; + buffer_copy(file->buffer, size, file->buffer, 0, SIZE_MAX); + + data = buffer_get_space_unsafe(file->buffer, 0, size); + ret = pread_full(file->fd, data, size, offset); + if (ret > 0) { + /* success */ + file->buffer_offset -= size; + return 1; + } + + /* failure. don't leave ourself to inconsistent state */ + buffer_copy(file->buffer, 0, file->buffer, size, SIZE_MAX); + buffer_set_used_size(file->buffer, file->buffer->used - size); + + if (ret == 0) { + *reason_r = "file shrank unexpectedly"; + mail_transaction_log_file_set_corrupted(file, "%s", *reason_r); + return 0; + } else if (errno == ESTALE) { + /* log file was deleted in NFS server, fail silently */ + *reason_r = t_strdup_printf("read() failed: %m"); + return 0; + } else { + log_file_set_syscall_error(file, "pread()"); + *reason_r = t_strdup_printf("read() failed: %m"); + return -1; + } +} + +static int +mail_transaction_log_file_read_more(struct mail_transaction_log_file *file, + const char **reason_r) +{ + void *data; + size_t size; + uint32_t read_offset; + ssize_t ret; + + read_offset = file->buffer_offset + file->buffer->used; + + do { + data = buffer_append_space_unsafe(file->buffer, LOG_PREFETCH); + ret = pread(file->fd, data, LOG_PREFETCH, read_offset); + if (ret > 0) + read_offset += ret; + + size = read_offset - file->buffer_offset; + buffer_set_used_size(file->buffer, size); + } while (ret > 0 || (ret < 0 && errno == EINTR)); + + file->last_size = read_offset; + + if (ret < 0) { + *reason_r = t_strdup_printf("pread() failed: %m"); + if (errno == ESTALE) { + /* log file was deleted in NFS server, fail silently */ + return 0; + } + log_file_set_syscall_error(file, "pread()"); + return -1; + } + return 1; +} + +static bool +mail_transaction_log_file_need_nfs_flush(struct mail_transaction_log_file *file) +{ + const struct mail_index_header *hdr = &file->log->index->map->hdr; + uoff_t max_offset = file->last_size; + + if (file->next != NULL && + file->hdr.file_seq == file->next->hdr.prev_file_seq && + file->next->hdr.prev_file_offset != max_offset) { + /* we already have a newer log file which says that we haven't + synced the entire file. */ + return TRUE; + } + + if (file->hdr.file_seq == hdr->log_file_seq && + max_offset < hdr->log_file_head_offset) + return TRUE; + + return FALSE; +} + +static int +mail_transaction_log_file_read(struct mail_transaction_log_file *file, + uoff_t start_offset, bool nfs_flush, + const char **reason_r) +{ + bool retry; + int ret; + + i_assert(file->mmap_base == NULL); + + /* NFS: if file isn't locked, we're optimistic that we can read enough + data without flushing attribute cache. if after reading we notice + that we really should have read more, flush the cache and try again. + if file is locked, the attribute cache was already flushed when + refreshing the log. */ + if (nfs_flush && + (file->log->index->flags & MAIL_INDEX_OPEN_FLAG_NFS_FLUSH) != 0) { + if (!file->locked) + nfs_flush_attr_cache_unlocked(file->filepath); + else + nfs_flush_attr_cache_fd_locked(file->filepath, file->fd); + } + + if (file->buffer != NULL && file->buffer_offset > start_offset) { + /* we have to insert missing data to beginning of buffer */ + ret = mail_transaction_log_file_insert_read(file, start_offset, reason_r); + if (ret <= 0) + return ret; + } + + if (file->buffer == NULL) { + file->buffer = + buffer_create_dynamic(default_pool, LOG_PREFETCH); + file->buffer_offset = start_offset; + } + + if ((ret = mail_transaction_log_file_read_more(file, reason_r)) <= 0) + ; + else if (!nfs_flush && + (file->log->index->flags & MAIL_INDEX_OPEN_FLAG_NFS_FLUSH) != 0 && + mail_transaction_log_file_need_nfs_flush(file)) { + /* we didn't read enough data. flush and try again. */ + return mail_transaction_log_file_read(file, start_offset, TRUE, reason_r); + } else if ((ret = mail_transaction_log_file_sync(file, &retry, reason_r)) == 0) { + i_assert(!retry); /* retry happens only with mmap */ + } + i_assert(file->sync_offset >= file->buffer_offset); + buffer_set_used_size(file->buffer, + file->sync_offset - file->buffer_offset); + return ret; +} + +static bool +log_file_map_check_offsets(struct mail_transaction_log_file *file, + uoff_t start_offset, uoff_t end_offset, + const char **reason_r) +{ + struct stat st, st2; + + if (start_offset > file->sync_offset) { + /* broken start offset */ + if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) { + *reason_r = t_strdup_printf( + "%s: start_offset (%"PRIuUOFF_T") > " + "current sync_offset (%"PRIuUOFF_T")", + file->filepath, start_offset, file->sync_offset); + return FALSE; + } + + if (fstat(file->fd, &st) < 0) { + log_file_set_syscall_error(file, "fstat()"); + st.st_size = -1; + } + *reason_r = t_strdup_printf( + "%s: start_offset (%"PRIuUOFF_T") > " + "current sync_offset (%"PRIuUOFF_T"), file size=%"PRIuUOFF_T, + file->filepath, start_offset, file->sync_offset, + st.st_size); + if (stat(file->filepath, &st2) == 0) { + if (st.st_ino != st2.st_ino) { + *reason_r = t_strdup_printf( + "%s, file unexpectedly replaced", *reason_r); + } + } else if (errno == ENOENT) { + *reason_r = t_strdup_printf( + "%s, file unexpectedly deleted", *reason_r); + } else { + log_file_set_syscall_error(file, "stat()"); + } + return FALSE; + } + if (end_offset != UOFF_T_MAX && end_offset > file->sync_offset) { + *reason_r = t_strdup_printf( + "%s: end_offset (%"PRIuUOFF_T") > " + "current sync_offset (%"PRIuUOFF_T")", + file->filepath, start_offset, file->sync_offset); + return FALSE; + } + + return TRUE; +} + +static int +mail_transaction_log_file_mmap(struct mail_transaction_log_file *file, + const char **reason_r) +{ + /* we may have switched to mmaping */ + buffer_free(&file->buffer); + + file->mmap_size = file->last_size; + file->mmap_base = mmap(NULL, file->mmap_size, PROT_READ, MAP_SHARED, + file->fd, 0); + if (file->mmap_base == MAP_FAILED) { + file->mmap_base = NULL; + if (ioloop_time != file->last_mmap_error_time) { + file->last_mmap_error_time = ioloop_time; + log_file_set_syscall_error(file, t_strdup_printf( + "mmap(size=%zu)", file->mmap_size)); + } + *reason_r = t_strdup_printf("mmap(size=%zu) failed: %m", + file->mmap_size); + file->mmap_size = 0; + return -1; + } + + if (file->mmap_size > mmap_get_page_size()) { + if (madvise(file->mmap_base, file->mmap_size, + MADV_SEQUENTIAL) < 0) + log_file_set_syscall_error(file, "madvise()"); + } + + buffer_create_from_const_data(&file->mmap_buffer, + file->mmap_base, file->mmap_size); + file->buffer = &file->mmap_buffer; + file->buffer_offset = 0; + return 0; +} + +static void +mail_transaction_log_file_munmap(struct mail_transaction_log_file *file) +{ + if (file->mmap_base == NULL) + return; + + i_assert(file->buffer != NULL); + if (munmap(file->mmap_base, file->mmap_size) < 0) + log_file_set_syscall_error(file, "munmap()"); + file->mmap_base = NULL; + file->mmap_size = 0; + buffer_free(&file->buffer); +} + +static int +mail_transaction_log_file_map_mmap(struct mail_transaction_log_file *file, + uoff_t start_offset, const char **reason_r) +{ + struct stat st; + bool retry; + int ret; + + /* we are going to mmap() this file, but it's not necessarily + mmaped currently. */ + i_assert(file->buffer_offset == 0 || file->mmap_base == NULL); + i_assert(file->mmap_size == 0 || file->mmap_base != NULL); + + if (fstat(file->fd, &st) < 0) { + log_file_set_syscall_error(file, "fstat()"); + *reason_r = t_strdup_printf("fstat() failed: %m"); + return -1; + } + file->last_size = st.st_size; + + if ((uoff_t)st.st_size < file->sync_offset) { + *reason_r = t_strdup_printf( + "file size shrank (%"PRIuUOFF_T" < %"PRIuUOFF_T")", + (uoff_t)st.st_size, file->sync_offset); + mail_transaction_log_file_set_corrupted(file, "%s", *reason_r); + return 0; + } + + if (file->buffer != NULL && file->buffer_offset <= start_offset && + (uoff_t)st.st_size == file->buffer_offset + file->buffer->used) { + /* we already have the whole file mapped */ + if ((ret = mail_transaction_log_file_sync(file, &retry, reason_r)) != 0 || + !retry) + return ret; + /* size changed, re-mmap */ + } + + do { + mail_transaction_log_file_munmap(file); + + if (file->last_size - start_offset < mmap_get_page_size()) { + /* just reading the file is probably faster */ + return mail_transaction_log_file_read(file, + start_offset, + FALSE, reason_r); + } + + if (mail_transaction_log_file_mmap(file, reason_r) < 0) + return -1; + ret = mail_transaction_log_file_sync(file, &retry, reason_r); + } while (retry); + + return ret; +} + +int mail_transaction_log_file_map(struct mail_transaction_log_file *file, + uoff_t start_offset, uoff_t end_offset, + const char **reason_r) +{ + uoff_t map_start_offset = start_offset; + size_t size; + int ret; + + if (file->hdr.indexid == 0) { + /* corrupted */ + *reason_r = "corrupted, indexid=0"; + return 0; + } + + i_assert(start_offset >= file->hdr.hdr_size); + i_assert(start_offset <= end_offset); + i_assert(file->buffer == NULL || file->mmap_base != NULL || + file->sync_offset >= file->buffer_offset + file->buffer->used); + + if (file->locked_sync_offset_updated && file == file->log->head && + end_offset == UOFF_T_MAX) { + /* we're not interested of going further than sync_offset */ + if (!log_file_map_check_offsets(file, start_offset, + end_offset, reason_r)) + return 0; + i_assert(start_offset <= file->sync_offset); + end_offset = file->sync_offset; + } + + if (file->buffer != NULL && file->buffer_offset <= start_offset) { + /* see if we already have it */ + size = file->buffer->used; + if (file->buffer_offset + size >= end_offset) + return 1; + } + + if (file->locked) { + /* set this only when we've synced to end of file while locked + (either end_offset=UOFF_T_MAX or we had to read anyway) */ + file->locked_sync_offset_updated = TRUE; + } + + if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) { + if (start_offset < file->buffer_offset || file->buffer == NULL) { + /* we had moved the log to memory but failed to read + the beginning of the log file */ + *reason_r = "Beginning of the log isn't available"; + return 0; + } + return log_file_map_check_offsets(file, start_offset, + end_offset, reason_r) ? 1 : 0; + } + + if (start_offset > file->sync_offset) + mail_transaction_log_file_skip_to_head(file); + if (start_offset > file->sync_offset) { + /* although we could just skip over the unwanted data, we have + to sync everything so that modseqs are calculated + correctly */ + map_start_offset = file->sync_offset; + } + + if ((file->log->index->flags & MAIL_INDEX_OPEN_FLAG_MMAP_DISABLE) == 0) + ret = mail_transaction_log_file_map_mmap(file, map_start_offset, reason_r); + else { + mail_transaction_log_file_munmap(file); + ret = mail_transaction_log_file_read(file, map_start_offset, FALSE, reason_r); + } + + i_assert(file->buffer == NULL || file->mmap_base != NULL || + file->sync_offset >= file->buffer_offset + file->buffer->used); + if (ret <= 0) + return ret; + + i_assert(file->buffer != NULL); + return log_file_map_check_offsets(file, start_offset, end_offset, + reason_r) ? 1 : 0; +} + +int mail_transaction_log_file_move_to_memory(struct mail_transaction_log_file *file) +{ + const char *error; + buffer_t *buf; + int ret = 0; + + if (MAIL_TRANSACTION_LOG_FILE_IN_MEMORY(file)) + return 0; + + if (file->mmap_base != NULL) { + /* just copy to memory */ + i_assert(file->buffer_offset == 0); + + buf = buffer_create_dynamic(default_pool, file->mmap_size); + buffer_append(buf, file->mmap_base, file->mmap_size); + buffer_free(&file->buffer); + file->buffer = buf; + + /* and lose the mmap */ + if (munmap(file->mmap_base, file->mmap_size) < 0) + log_file_set_syscall_error(file, "munmap()"); + file->mmap_base = NULL; + } else if (file->buffer_offset != 0) { + /* we don't have the full log in the memory. read it. */ + ret = mail_transaction_log_file_read(file, 0, FALSE, &error); + if (ret <= 0) { + mail_index_set_error(file->log->index, + "%s: Failed to read into memory: %s", file->filepath, error); + } + } + file->last_size = 0; + + if (close(file->fd) < 0) + log_file_set_syscall_error(file, "close()"); + file->fd = -1; + + i_free(file->filepath); + file->filepath = i_strdup(file->log->filepath); + return ret < 0 ? -1 : 0; +} |