From 55944e5e40b1be2afc4855d8d2baf4b73d1876b5 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 10 Apr 2024 22:49:52 +0200 Subject: Adding upstream version 255.4. Signed-off-by: Daniel Baumann --- src/libsystemd/sd-journal/mmap-cache.c | 562 +++++++++++++++++++++++++++++++++ 1 file changed, 562 insertions(+) create mode 100644 src/libsystemd/sd-journal/mmap-cache.c (limited to 'src/libsystemd/sd-journal/mmap-cache.c') diff --git a/src/libsystemd/sd-journal/mmap-cache.c b/src/libsystemd/sd-journal/mmap-cache.c new file mode 100644 index 0000000..973ade6 --- /dev/null +++ b/src/libsystemd/sd-journal/mmap-cache.c @@ -0,0 +1,562 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "hashmap.h" +#include "list.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "mmap-cache.h" +#include "sigbus.h" + +typedef struct Window Window; + +typedef enum WindowFlags { + WINDOW_KEEP_ALWAYS = 1u << (_MMAP_CACHE_CATEGORY_MAX + 0), + WINDOW_IN_UNUSED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 1), + WINDOW_INVALIDATED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 2), + + _WINDOW_USED_MASK = WINDOW_IN_UNUSED - 1, /* The mask contains all bits that indicate the windows + * is currently in use. Covers the all the object types + * and the additional WINDOW_KEEP_ALWAYS flag. */ +} WindowFlags; + +#define WINDOW_IS_UNUSED(w) (((w)->flags & _WINDOW_USED_MASK) == 0) + +struct Window { + MMapFileDescriptor *fd; + + WindowFlags flags; + + void *ptr; + uint64_t offset; + size_t size; + + LIST_FIELDS(Window, windows); + LIST_FIELDS(Window, unused); +}; + +struct MMapFileDescriptor { + MMapCache *cache; + + int fd; + int prot; + bool sigbus; + + LIST_HEAD(Window, windows); +}; + +struct MMapCache { + unsigned n_ref; + unsigned n_windows; + + unsigned n_category_cache_hit; + unsigned n_window_list_hit; + unsigned n_missed; + + Hashmap *fds; + + LIST_HEAD(Window, unused); + Window *last_unused; + + Window *windows_by_category[_MMAP_CACHE_CATEGORY_MAX]; +}; + +#define WINDOWS_MIN 64 + +#if ENABLE_DEBUG_MMAP_CACHE +/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */ +# define WINDOW_SIZE (page_size()) +#else +# define WINDOW_SIZE ((size_t) (UINT64_C(8) * UINT64_C(1024) * UINT64_C(1024))) +#endif + +MMapCache* mmap_cache_new(void) { + MMapCache *m; + + m = new(MMapCache, 1); + if (!m) + return NULL; + + *m = (MMapCache) { + .n_ref = 1, + }; + + return m; +} + +static Window* window_unlink(Window *w) { + assert(w); + + MMapCache *m = mmap_cache_fd_cache(w->fd); + + if (w->ptr) + munmap(w->ptr, w->size); + + if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) { + if (m->last_unused == w) + m->last_unused = w->unused_prev; + LIST_REMOVE(unused, m->unused, w); + } + + for (unsigned i = 0; i < _MMAP_CACHE_CATEGORY_MAX; i++) + if (FLAGS_SET(w->flags, 1u << i)) + assert_se(TAKE_PTR(m->windows_by_category[i]) == w); + + return LIST_REMOVE(windows, w->fd->windows, w); +} + +static void window_invalidate(Window *w) { + assert(w); + assert(w->fd); + + if (FLAGS_SET(w->flags, WINDOW_INVALIDATED)) + return; + + /* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure + * the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. */ + + assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr); + w->flags |= WINDOW_INVALIDATED; +} + +static Window* window_free(Window *w) { + if (!w) + return NULL; + + window_unlink(w); + w->fd->cache->n_windows--; + + return mfree(w); +} + +static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, size_t size) { + assert(size > 0); + + return + w && + f == w->fd && + offset >= w->offset && + offset + size <= w->offset + w->size; +} + +static bool window_matches_by_addr(Window *w, MMapFileDescriptor *f, void *addr, size_t size) { + assert(size > 0); + + return + w && + f == w->fd && + (uint8_t*) addr >= (uint8_t*) w->ptr && + (uint8_t*) addr + size <= (uint8_t*) w->ptr + w->size; +} + +static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) { + MMapCache *m = mmap_cache_fd_cache(f); + Window *w; + + if (!m->last_unused || m->n_windows <= WINDOWS_MIN) { + /* Allocate a new window */ + w = new(Window, 1); + if (!w) + return NULL; + m->n_windows++; + } else + /* Reuse an existing one */ + w = window_unlink(m->last_unused); + + *w = (Window) { + .fd = f, + .offset = offset, + .size = size, + .ptr = ptr, + }; + + return LIST_PREPEND(windows, f->windows, w); +} + +static void category_detach_window(MMapCache *m, MMapCacheCategory c) { + Window *w; + + assert(m); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + + w = TAKE_PTR(m->windows_by_category[c]); + if (!w) + return; /* Nothing attached. */ + + assert(FLAGS_SET(w->flags, 1u << c)); + w->flags &= ~(1u << c); + + if (WINDOW_IS_UNUSED(w)) { + /* Not used anymore? */ +#if ENABLE_DEBUG_MMAP_CACHE + /* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. */ + window_free(w); +#else + LIST_PREPEND(unused, m->unused, w); + if (!m->last_unused) + m->last_unused = w; + w->flags |= WINDOW_IN_UNUSED; +#endif + } +} + +static void category_attach_window(MMapCache *m, MMapCacheCategory c, Window *w) { + assert(m); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + assert(w); + + if (m->windows_by_category[c] == w) + return; /* Already attached. */ + + category_detach_window(m, c); + + if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) { + /* Used again? */ + if (m->last_unused == w) + m->last_unused = w->unused_prev; + LIST_REMOVE(unused, m->unused, w); + w->flags &= ~WINDOW_IN_UNUSED; + } + + m->windows_by_category[c] = w; + w->flags |= (1u << c); +} + +static MMapCache* mmap_cache_free(MMapCache *m) { + if (!m) + return NULL; + + /* All windows are owned by fds, and each fd takes a reference of MMapCache. So, when this is called, + * all fds are already freed, and hence there is no window. */ + + assert(hashmap_isempty(m->fds)); + hashmap_free(m->fds); + + assert(!m->unused); + assert(m->n_windows == 0); + + return mfree(m); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free); + +static int mmap_try_harder(MMapFileDescriptor *f, void *addr, int flags, uint64_t offset, size_t size, void **ret) { + MMapCache *m = mmap_cache_fd_cache(f); + + assert(ret); + + for (;;) { + void *ptr; + + ptr = mmap(addr, size, f->prot, flags, f->fd, offset); + if (ptr != MAP_FAILED) { + *ret = ptr; + return 0; + } + if (errno != ENOMEM) + return negative_errno(); + + /* When failed with ENOMEM, try again after making a room by freeing an unused window. */ + + if (!m->last_unused) + return -ENOMEM; /* no free window, propagate the original error. */ + + window_free(m->last_unused); + } +} + +static int add_mmap( + MMapFileDescriptor *f, + uint64_t offset, + size_t size, + struct stat *st, + Window **ret) { + + Window *w; + void *d; + int r; + + assert(f); + assert(size > 0); + assert(ret); + + /* overflow check */ + if (size > SIZE_MAX - PAGE_OFFSET_U64(offset)) + return -EADDRNOTAVAIL; + + size = PAGE_ALIGN(size + PAGE_OFFSET_U64(offset)); + offset = PAGE_ALIGN_DOWN_U64(offset); + + if (size < WINDOW_SIZE) { + uint64_t delta; + + delta = PAGE_ALIGN((WINDOW_SIZE - size) / 2); + offset = LESS_BY(offset, delta); + size = WINDOW_SIZE; + } + + if (st) { + /* Memory maps that are larger then the files underneath have undefined behavior. Hence, + * clamp things to the file size if we know it */ + + if (offset >= (uint64_t) st->st_size) + return -EADDRNOTAVAIL; + + if (size > (uint64_t) st->st_size - offset) + size = PAGE_ALIGN((uint64_t) st->st_size - offset); + } + + if (size >= SIZE_MAX) + return -EADDRNOTAVAIL; + + r = mmap_try_harder(f, NULL, MAP_SHARED, offset, size, &d); + if (r < 0) + return r; + + w = window_add(f, offset, size, d); + if (!w) { + (void) munmap(d, size); + return -ENOMEM; + } + + *ret = w; + return 0; +} + +int mmap_cache_fd_get( + MMapFileDescriptor *f, + MMapCacheCategory c, + bool keep_always, + uint64_t offset, + size_t size, + struct stat *st, + void **ret) { + + MMapCache *m = mmap_cache_fd_cache(f); + Window *w; + int r; + + assert(size > 0); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + assert(ret); + + if (f->sigbus) + return -EIO; + + /* Check whether the current category is the right one already */ + if (window_matches(m->windows_by_category[c], f, offset, size)) { + m->n_category_cache_hit++; + w = m->windows_by_category[c]; + goto found; + } + + /* Drop the reference to the window, since it's unnecessary now */ + category_detach_window(m, c); + + /* Search for a matching mmap */ + LIST_FOREACH(windows, i, f->windows) + if (window_matches(i, f, offset, size)) { + m->n_window_list_hit++; + w = i; + goto found; + } + + m->n_missed++; + + /* Create a new mmap */ + r = add_mmap(f, offset, size, st, &w); + if (r < 0) + return r; + +found: + if (keep_always) + w->flags |= WINDOW_KEEP_ALWAYS; + + category_attach_window(m, c, w); + *ret = (uint8_t*) w->ptr + (offset - w->offset); + return 0; +} + +int mmap_cache_fd_pin( + MMapFileDescriptor *f, + MMapCacheCategory c, + void *addr, + size_t size) { + + MMapCache *m = mmap_cache_fd_cache(f); + Window *w; + + assert(addr); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + assert(size > 0); + + if (f->sigbus) + return -EIO; + + /* Check if the current category is the right one. */ + if (window_matches_by_addr(m->windows_by_category[c], f, addr, size)) { + m->n_category_cache_hit++; + w = m->windows_by_category[c]; + goto found; + } + + /* Search for a matching mmap. */ + LIST_FOREACH(windows, i, f->windows) + if (window_matches_by_addr(i, f, addr, size)) { + m->n_window_list_hit++; + w = i; + goto found; + } + + m->n_missed++; + return -EADDRNOTAVAIL; /* Not found. */ + +found: + if (FLAGS_SET(w->flags, WINDOW_KEEP_ALWAYS)) + return 0; /* The window will never unmapped. */ + + /* Attach the window to the 'pinning' category. */ + category_attach_window(m, MMAP_CACHE_CATEGORY_PIN, w); + return 1; +} + +void mmap_cache_stats_log_debug(MMapCache *m) { + assert(m); + + log_debug("mmap cache statistics: %u category cache hit, %u window list hit, %u miss", + m->n_category_cache_hit, m->n_window_list_hit, m->n_missed); +} + +static void mmap_cache_process_sigbus(MMapCache *m) { + bool found = false; + MMapFileDescriptor *f; + int r; + + assert(m); + + /* Iterate through all triggered pages and mark their files as invalidated. */ + for (;;) { + bool ours; + void *addr; + + r = sigbus_pop(&addr); + if (_likely_(r == 0)) + break; + if (r < 0) { + log_error_errno(r, "SIGBUS handling failed: %m"); + abort(); + } + + ours = false; + HASHMAP_FOREACH(f, m->fds) { + LIST_FOREACH(windows, w, f->windows) + if (window_matches_by_addr(w, f, addr, 1)) { + found = ours = f->sigbus = true; + break; + } + + if (ours) + break; + } + + /* Didn't find a matching window, give up. */ + if (!ours) { + log_error("Unknown SIGBUS page, aborting."); + abort(); + } + } + + /* The list of triggered pages is now empty. Now, let's remap all windows of the triggered file to + * anonymous maps, so that no page of the file in question is triggered again, so that we can be sure + * not to hit the queue size limit. */ + if (_likely_(!found)) + return; + + HASHMAP_FOREACH(f, m->fds) { + if (!f->sigbus) + continue; + + LIST_FOREACH(windows, w, f->windows) + window_invalidate(w); + } +} + +bool mmap_cache_fd_got_sigbus(MMapFileDescriptor *f) { + assert(f); + + mmap_cache_process_sigbus(f->cache); + + return f->sigbus; +} + +int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret) { + _cleanup_free_ MMapFileDescriptor *f = NULL; + MMapFileDescriptor *existing; + int r; + + assert(m); + assert(fd >= 0); + + existing = hashmap_get(m->fds, FD_TO_PTR(fd)); + if (existing) { + if (existing->prot != prot) + return -EEXIST; + if (ret) + *ret = existing; + return 0; + } + + f = new(MMapFileDescriptor, 1); + if (!f) + return -ENOMEM; + + *f = (MMapFileDescriptor) { + .fd = fd, + .prot = prot, + }; + + r = hashmap_ensure_put(&m->fds, NULL, FD_TO_PTR(fd), f); + if (r < 0) + return r; + assert(r > 0); + + f->cache = mmap_cache_ref(m); + + if (ret) + *ret = f; + + TAKE_PTR(f); + return 1; +} + +MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f) { + if (!f) + return NULL; + + /* Make sure that any queued SIGBUS are first dispatched, so that we don't end up with a SIGBUS entry + * we cannot relate to any existing memory map. */ + + mmap_cache_process_sigbus(f->cache); + + while (f->windows) + window_free(f->windows); + + assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)) == f); + + /* Unref the cache at the end. Otherwise, the assertions in mmap_cache_free() may be triggered. */ + f->cache = mmap_cache_unref(f->cache); + + return mfree(f); +} + +MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f) { + assert(f); + return ASSERT_PTR(f->cache); +} -- cgit v1.2.3