1 files changed, 1023 insertions, 0 deletions
diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c
new file mode 100644
index 0000000..6bcfb68
--- /dev/null
+++ b/src/core/exec-credential.c
@@ -0,0 +1,1023 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+
+#include "acl-util.h"
+#include "creds-util.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fileio.h"
+#include "glob-util.h"
+#include "io-util.h"
+#include "label-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "tmpfile-util.h"
+
+ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
+        if (!sc)
+                return NULL;
+
+        free(sc->id);
+        free(sc->data);
+        return mfree(sc);
+}
+
+ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
+        if (!lc)
+                return NULL;
+
+        free(lc->id);
+        free(lc->path);
+        return mfree(lc);
+}
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+        exec_set_credential_hash_ops,
+        char, string_hash_func, string_compare_func,
+        ExecSetCredential, exec_set_credential_free);
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+        exec_load_credential_hash_ops,
+        char, string_hash_func, string_compare_func,
+        ExecLoadCredential, exec_load_credential_free);
+
+bool exec_context_has_credentials(const ExecContext *c) {
+        assert(c);
+
+        return !hashmap_isempty(c->set_credentials) ||
+                !hashmap_isempty(c->load_credentials) ||
+                !set_isempty(c->import_credentials);
+}
+
+bool exec_context_has_encrypted_credentials(ExecContext *c) {
+        ExecLoadCredential *load_cred;
+        ExecSetCredential *set_cred;
+
+        assert(c);
+
+        HASHMAP_FOREACH(load_cred, c->load_credentials)
+                if (load_cred->encrypted)
+                        return true;
+
+        HASHMAP_FOREACH(set_cred, c->set_credentials)
+                if (set_cred->encrypted)
+                        return true;
+
+        return false;
+}
+
+static int get_credential_directory(
+                const char *runtime_prefix,
+                const char *unit,
+                char **ret) {
+
+        char *p;
+
+        assert(ret);
+
+        if (!runtime_prefix || !unit) {
+                *ret = NULL;
+                return 0;
+        }
+
+        p = path_join(runtime_prefix, "credentials", unit);
+        if (!p)
+                return -ENOMEM;
+
+        *ret = p;
+        return 1;
+}
+
+int exec_context_get_credential_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                char **ret) {
+
+        assert(context);
+        assert(params);
+        assert(unit);
+        assert(ret);
+
+        if (!exec_context_has_credentials(context)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        return get_credential_directory(params->prefix[EXEC_DIRECTORY_RUNTIME], unit, ret);
+}
+
+int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c) {
+        _cleanup_free_ char *p = NULL, *m = NULL;
+        int r;
+
+        assert(u);
+        assert(c);
+
+        if (!exec_context_has_credentials(c))
+                return 0;
+
+        /* Let's make sure the credentials directory of this service is unmounted *after* the service itself
+         * shuts down. This only matters if mount namespacing is not used for the service, and hence the
+         * credentials mount appears on the host. */
+
+        r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &p);
+        if (r <= 0)
+                return r;
+
+        r = unit_name_from_path(p, ".mount", &m);
+        if (r < 0)
+                return r;
+
+        return unit_add_dependency_by_name(u, UNIT_AFTER, m, /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
+}
+
+int exec_context_destroy_credentials(Unit *u) {
+        _cleanup_free_ char *p = NULL;
+        int r;
+
+        assert(u);
+
+        r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &p);
+        if (r <= 0)
+                return r;
+
+        /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
+         * unmount it, and afterwards remove the mount point */
+        if (umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW) >= 0)
+                (void) mount_invalidate_state_by_path(u->manager, p);
+
+        (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
+
+        return 0;
+}
+
+static int write_credential(
+                int dfd,
+                const char *id,
+                const void *data,
+                size_t size,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok) {
+
+        _cleanup_(unlink_and_freep) char *tmp = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        r = tempfn_random_child("", "cred", &tmp);
+        if (r < 0)
+                return r;
+
+        fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
+        if (fd < 0) {
+                tmp = mfree(tmp);
+                return -errno;
+        }
+
+        r = loop_write(fd, data, size);
+        if (r < 0)
+                return r;
+
+        if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
+                return -errno;
+
+        if (uid_is_valid(uid) && uid != getuid()) {
+                r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
+                if (r < 0) {
+                        if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+                                return r;
+
+                        if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
+                                            * to express: that the user gets read access and nothing
+                                            * else. But if the backing fs can't support that (e.g. ramfs)
+                                            * then we can use file ownership instead. But that's only safe if
+                                            * we can then re-mount the whole thing read-only, so that the
+                                            * user can no longer chmod() the file to gain write access. */
+                                return r;
+
+                        if (fchown(fd, uid, gid) < 0)
+                                return -errno;
+                }
+        }
+
+        if (renameat(dfd, tmp, dfd, id) < 0)
+                return -errno;
+
+        tmp = mfree(tmp);
+        return 0;
+}
+
+typedef enum CredentialSearchPath {
+        CREDENTIAL_SEARCH_PATH_TRUSTED,
+        CREDENTIAL_SEARCH_PATH_ENCRYPTED,
+        CREDENTIAL_SEARCH_PATH_ALL,
+        _CREDENTIAL_SEARCH_PATH_MAX,
+        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
+} CredentialSearchPath;
+
+static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
+
+        _cleanup_strv_free_ char **l = NULL;
+
+        assert(params);
+        assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
+
+        /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
+         * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
+         * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
+
+        if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
+                if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
+                        return NULL;
+
+                if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
+                        return NULL;
+        }
+
+        if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
+                if (params->received_credentials_directory)
+                        if (strv_extend(&l, params->received_credentials_directory) < 0)
+                                return NULL;
+
+                if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
+                        return NULL;
+        }
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *t = strv_join(l, ":");
+
+                log_debug("Credential search path is: %s", strempty(t));
+        }
+
+        return TAKE_PTR(l);
+}
+
+static int maybe_decrypt_and_write_credential(
+                int dir_fd,
+                const char *id,
+                bool encrypted,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok,
+                const char *data,
+                size_t size,
+                uint64_t *left) {
+
+        _cleanup_free_ void *plaintext = NULL;
+        size_t add;
+        int r;
+
+        if (encrypted) {
+                size_t plaintext_size = 0;
+
+                r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
+                                                &plaintext, &plaintext_size);
+                if (r < 0)
+                        return r;
+
+                data = plaintext;
+                size = plaintext_size;
+        }
+
+        add = strlen(id) + size;
+        if (add > *left)
+                return -E2BIG;
+
+        r = write_credential(dir_fd, id, data, size, uid, gid, ownership_ok);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write credential '%s': %m", id);
+
+        *left -= add;
+        return 0;
+}
+
+static int load_credential_glob(
+                const char *path,
+                bool encrypted,
+                char **search_path,
+                ReadFullFileFlags flags,
+                int write_dfd,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok,
+                uint64_t *left) {
+
+        int r;
+
+        STRV_FOREACH(d, search_path) {
+                _cleanup_globfree_ glob_t pglob = {};
+                _cleanup_free_ char *j = NULL;
+
+                j = path_join(*d, path);
+                if (!j)
+                        return -ENOMEM;
+
+                r = safe_glob(j, 0, &pglob);
+                if (r == -ENOENT)
+                        continue;
+                if (r < 0)
+                        return r;
+
+                for (size_t n = 0; n < pglob.gl_pathc; n++) {
+                        _cleanup_free_ char *fn = NULL;
+                        _cleanup_(erase_and_freep) char *data = NULL;
+                        size_t size;
+
+                        /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
+                        r = read_full_file_full(
+                                AT_FDCWD,
+                                pglob.gl_pathv[n],
+                                UINT64_MAX,
+                                encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
+                                flags,
+                                NULL,
+                                &data, &size);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to read credential '%s': %m",
+                                                        pglob.gl_pathv[n]);
+
+                        r = path_extract_filename(pglob.gl_pathv[n], &fn);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to extract filename from '%s': %m",
+                                                        pglob.gl_pathv[n]);
+
+                        r = maybe_decrypt_and_write_credential(
+                                write_dfd,
+                                fn,
+                                encrypted,
+                                uid,
+                                gid,
+                                ownership_ok,
+                                data, size,
+                                left);
+                        if (r == -EEXIST)
+                                continue;
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+static int load_credential(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *id,
+                const char *path,
+                bool encrypted,
+                const char *unit,
+                int read_dfd,
+                int write_dfd,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok,
+                uint64_t *left) {
+
+        ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
+        _cleanup_strv_free_ char **search_path = NULL;
+        _cleanup_(erase_and_freep) char *data = NULL;
+        _cleanup_free_ char *bindname = NULL;
+        const char *source = NULL;
+        bool missing_ok = true;
+        size_t size, maxsz;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(id);
+        assert(path);
+        assert(unit);
+        assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
+        assert(write_dfd >= 0);
+        assert(left);
+
+        if (read_dfd >= 0) {
+                /* If a directory fd is specified, then read the file directly from that dir. In this case we
+                 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
+                 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
+                 * open it. */
+
+                if (!filename_is_valid(path)) /* safety check */
+                        return -EINVAL;
+
+                missing_ok = true;
+                source = path;
+
+        } else if (path_is_absolute(path)) {
+                /* If this is an absolute path, read the data directly from it, and support AF_UNIX
+                 * sockets */
+
+                if (!path_is_valid(path)) /* safety check */
+                        return -EINVAL;
+
+                flags |= READ_FULL_FILE_CONNECT_SOCKET;
+
+                /* Pass some minimal info about the unit and the credential name we are looking to acquire
+                 * via the source socket address in case we read off an AF_UNIX socket. */
+                if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
+                        return -ENOMEM;
+
+                missing_ok = false;
+                source = path;
+
+        } else if (credential_name_valid(path)) {
+                /* If this is a relative path, take it as credential name relative to the credentials
+                 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
+                 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
+
+                search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
+                if (!search_path)
+                        return -ENOMEM;
+
+                missing_ok = true;
+        } else
+                source = NULL;
+
+        if (encrypted)
+                flags |= READ_FULL_FILE_UNBASE64;
+
+        maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
+
+        if (search_path) {
+                STRV_FOREACH(d, search_path) {
+                        _cleanup_free_ char *j = NULL;
+
+                        j = path_join(*d, path);
+                        if (!j)
+                                return -ENOMEM;
+
+                        r = read_full_file_full(
+                                        AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
+                                        UINT64_MAX,
+                                        maxsz,
+                                        flags,
+                                        NULL,
+                                        &data, &size);
+                        if (r != -ENOENT)
+                                break;
+                }
+        } else if (source)
+                r = read_full_file_full(
+                                read_dfd, source,
+                                UINT64_MAX,
+                                maxsz,
+                                flags,
+                                bindname,
+                                &data, &size);
+        else
+                r = -ENOENT;
+
+        if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
+                /* Make a missing inherited credential non-fatal, let's just continue. After all apps
+                 * will get clear errors if we don't pass such a missing credential on as they
+                 * themselves will get ENOENT when trying to read them, which should not be much
+                 * worse than when we handle the error here and make it fatal.
+                 *
+                 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
+                 * we are fine, too. */
+                log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
+                return 0;
+        }
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read credential '%s': %m", path);
+
+        return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, gid, ownership_ok, data, size, left);
+}
+
+struct load_cred_args {
+        const ExecContext *context;
+        const ExecParameters *params;
+        bool encrypted;
+        const char *unit;
+        int dfd;
+        uid_t uid;
+        gid_t gid;
+        bool ownership_ok;
+        uint64_t *left;
+};
+
+static int load_cred_recurse_dir_cb(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        struct load_cred_args *args = ASSERT_PTR(userdata);
+        _cleanup_free_ char *sub_id = NULL;
+        int r;
+
+        if (event != RECURSE_DIR_ENTRY)
+                return RECURSE_DIR_CONTINUE;
+
+        if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
+                return RECURSE_DIR_CONTINUE;
+
+        sub_id = strreplace(path, "/", "_");
+        if (!sub_id)
+                return -ENOMEM;
+
+        if (!credential_name_valid(sub_id))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
+
+        if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
+                log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
+                return RECURSE_DIR_CONTINUE;
+        }
+        if (errno != ENOENT)
+                return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
+
+        r = load_credential(
+                        args->context,
+                        args->params,
+                        sub_id,
+                        de->d_name,
+                        args->encrypted,
+                        args->unit,
+                        dir_fd,
+                        args->dfd,
+                        args->uid,
+                        args->gid,
+                        args->ownership_ok,
+                        args->left);
+        if (r < 0)
+                return r;
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+static int acquire_credentials(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                const char *p,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok) {
+
+        uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
+        _cleanup_close_ int dfd = -EBADF;
+        const char *ic;
+        ExecLoadCredential *lc;
+        ExecSetCredential *sc;
+        int r;
+
+        assert(context);
+        assert(p);
+
+        dfd = open(p, O_DIRECTORY|O_CLOEXEC);
+        if (dfd < 0)
+                return -errno;
+
+        r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
+        if (r < 0)
+                return r;
+
+        /* First, load credentials off disk (or acquire via AF_UNIX socket) */
+        HASHMAP_FOREACH(lc, context->load_credentials) {
+                _cleanup_close_ int sub_fd = -EBADF;
+
+                /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
+                 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
+                 * a regular file. Finally, if it's a relative path we will use it as a credential name to
+                 * propagate a credential passed to us from further up. */
+
+                if (path_is_absolute(lc->path)) {
+                        sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+                        if (sub_fd < 0 && !IN_SET(errno,
+                                                  ENOTDIR,  /* Not a directory */
+                                                  ENOENT))  /* Doesn't exist? */
+                                return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
+                }
+
+                if (sub_fd < 0)
+                        /* Regular file (incl. a credential passed in from higher up) */
+                        r = load_credential(
+                                        context,
+                                        params,
+                                        lc->id,
+                                        lc->path,
+                                        lc->encrypted,
+                                        unit,
+                                        AT_FDCWD,
+                                        dfd,
+                                        uid,
+                                        gid,
+                                        ownership_ok,
+                                        &left);
+                else
+                        /* Directory */
+                        r = recurse_dir(
+                                        sub_fd,
+                                        /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
+                                        /* statx_mask= */ 0,
+                                        /* n_depth_max= */ UINT_MAX,
+                                        RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
+                                        load_cred_recurse_dir_cb,
+                                        &(struct load_cred_args) {
+                                                .context = context,
+                                                .params = params,
+                                                .encrypted = lc->encrypted,
+                                                .unit = unit,
+                                                .dfd = dfd,
+                                                .uid = uid,
+                                                .gid = gid,
+                                                .ownership_ok = ownership_ok,
+                                                .left = &left,
+                                        });
+                if (r < 0)
+                        return r;
+        }
+
+        /* Next, look for system credentials and credentials in the credentials store. Note that these do not
+         * override any credentials found earlier. */
+        SET_FOREACH(ic, context->import_credentials) {
+                _cleanup_free_ char **search_path = NULL;
+
+                search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
+                if (!search_path)
+                        return -ENOMEM;
+
+                r = load_credential_glob(
+                                ic,
+                                /* encrypted = */ false,
+                                search_path,
+                                READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
+                                dfd,
+                                uid,
+                                gid,
+                                ownership_ok,
+                                &left);
+                if (r < 0)
+                        return r;
+
+                search_path = strv_free(search_path);
+                search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
+                if (!search_path)
+                        return -ENOMEM;
+
+                r = load_credential_glob(
+                                ic,
+                                /* encrypted = */ true,
+                                search_path,
+                                READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
+                                dfd,
+                                uid,
+                                gid,
+                                ownership_ok,
+                                &left);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
+         * add them, so that they can act as a "default" if the same credential is specified multiple times. */
+        HASHMAP_FOREACH(sc, context->set_credentials) {
+                _cleanup_(erase_and_freep) void *plaintext = NULL;
+                const char *data;
+                size_t size, add;
+
+                /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
+                 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
+                 * slow and involved, hence it's nice to be able to skip that if the credential already
+                 * exists anyway. */
+                if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                        continue;
+                if (errno != ENOENT)
+                        return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
+
+                if (sc->encrypted) {
+                        r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
+                        if (r < 0)
+                                return r;
+
+                        data = plaintext;
+                } else {
+                        data = sc->data;
+                        size = sc->size;
+                }
+
+                add = strlen(sc->id) + size;
+                if (add > left)
+                        return -E2BIG;
+
+                r = write_credential(dfd, sc->id, data, size, uid, gid, ownership_ok);
+                if (r < 0)
+                        return r;
+
+                left -= add;
+        }
+
+        r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
+        if (r < 0)
+                return r;
+
+        /* After we created all keys with the right perms, also make sure the credential store as a whole is
+         * accessible */
+
+        if (uid_is_valid(uid) && uid != getuid()) {
+                r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
+                if (r < 0) {
+                        if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+                                return r;
+
+                        if (!ownership_ok)
+                                return r;
+
+                        if (fchown(dfd, uid, gid) < 0)
+                                return -errno;
+                }
+        }
+
+        return 0;
+}
+
+static int setup_credentials_internal(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                const char *final,        /* This is where the credential store shall eventually end up at */
+                const char *workspace,    /* This is where we can prepare it before moving it to the final place */
+                bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
+                bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
+                uid_t uid,
+                gid_t gid) {
+
+        int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
+                                   * if we mounted something; false if we definitely can't mount anything */
+        bool final_mounted;
+        const char *where;
+
+        assert(context);
+        assert(final);
+        assert(workspace);
+
+        if (reuse_workspace) {
+                r = path_is_mount_point(workspace, NULL, 0);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
+                                                   * it, let's keep this in mind */
+                else
+                        workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
+        } else
+                workspace_mounted = -1; /* ditto */
+
+        r = path_is_mount_point(final, NULL, 0);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                /* If the final place already has something mounted, we use that. If the workspace also has
+                 * something mounted we assume it's actually the same mount (but with MS_RDONLY
+                 * different). */
+                final_mounted = true;
+
+                if (workspace_mounted < 0) {
+                        /* If the final place is mounted, but the workspace isn't, then let's bind mount
+                         * the final version to the workspace, and make it writable, so that we can make
+                         * changes */
+
+                        r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+                        if (r < 0)
+                                return r;
+
+                        workspace_mounted = true;
+                }
+        } else
+                final_mounted = false;
+
+        if (workspace_mounted < 0) {
+                /* Nothing is mounted on the workspace yet, let's try to mount something now */
+
+                r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
+                if (r < 0) {
+                        /* If that didn't work, try to make a bind mount from the final to the workspace, so
+                         * that we can make it writable there. */
+                        r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+                        if (r < 0) {
+                                if (!ERRNO_IS_PRIVILEGE(r))
+                                        /* Propagate anything that isn't a permission problem. */
+                                        return r;
+
+                                if (must_mount)
+                                        /* If it's not OK to use the plain directory fallback, propagate all
+                                         * errors too. */
+                                        return r;
+
+                                /* If we lack privileges to bind mount stuff, then let's gracefully proceed
+                                 * for compat with container envs, and just use the final dir as is. */
+
+                                workspace_mounted = false;
+                        } else {
+                                /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
+                                r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+                                if (r < 0)
+                                        return r;
+
+                                workspace_mounted = true;
+                        }
+                } else
+                        workspace_mounted = true;
+        }
+
+        assert(!must_mount || workspace_mounted > 0);
+        where = workspace_mounted ? workspace : final;
+
+        (void) label_fix_full(AT_FDCWD, where, final, 0);
+
+        r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted);
+        if (r < 0)
+                return r;
+
+        if (workspace_mounted) {
+                bool install;
+
+                /* Determine if we should actually install the prepared mount in the final location by bind
+                 * mounting it there. We do so only if the mount is not established there already, and if the
+                 * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
+                 * case we are doing all this in a mount namespace, thus no one else will see that we
+                 * allocated a file system we are getting rid of again here. */
+                if (final_mounted)
+                        install = false; /* already installed */
+                else {
+                        r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
+                        if (r < 0)
+                                return r;
+
+                        install = r == 0; /* install only if non-empty */
+                }
+
+                if (install) {
+                        /* Make workspace read-only now, so that any bind mount we make from it defaults to
+                         * read-only too */
+                        r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
+                        if (r < 0)
+                                return r;
+
+                        /* And mount it to the final place, read-only */
+                        r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
+                } else
+                        /* Otherwise get rid of it */
+                        r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
+                if (r < 0)
+                        return r;
+        } else {
+                _cleanup_free_ char *parent = NULL;
+
+                /* If we do not have our own mount put used the plain directory fallback, then we need to
+                 * open access to the top-level credential directory and the per-service directory now */
+
+                r = path_extract_directory(final, &parent);
+                if (r < 0)
+                        return r;
+                if (chmod(parent, 0755) < 0)
+                        return -errno;
+        }
+
+        return 0;
+}
+
+int exec_setup_credentials(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                uid_t uid,
+                gid_t gid) {
+
+        _cleanup_free_ char *p = NULL, *q = NULL;
+        int r;
+
+        assert(context);
+        assert(params);
+
+        if (!exec_context_has_credentials(context))
+                return 0;
+
+        if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
+                return -EINVAL;
+
+        /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
+         * and the subdir we mount over with a read-only file system readable by the service's user */
+        q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
+        if (!q)
+                return -ENOMEM;
+
+        r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
+        if (r < 0 && r != -EEXIST)
+                return r;
+
+        p = path_join(q, unit);
+        if (!p)
+                return -ENOMEM;
+
+        r = mkdir_label(p, 0700); /* per-unit dir: private to user */
+        if (r < 0 && r != -EEXIST)
+                return r;
+
+        r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
+        if (r < 0) {
+                _cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */
+                _cleanup_free_ char *t = NULL;
+
+                /* If this is not a privilege or support issue then propagate the error */
+                if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+                        return r;
+
+                /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
+                 * it into place, so that users can't access half-initialized credential stores. */
+                t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
+                if (!t)
+                        return -ENOMEM;
+
+                /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
+                 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
+                 * after it is fully set up */
+                u = path_join(t, unit);
+                if (!u)
+                        return -ENOMEM;
+
+                FOREACH_STRING(i, t, u) {
+                        r = mkdir_label(i, 0700);
+                        if (r < 0 && r != -EEXIST)
+                                return r;
+                }
+
+                r = setup_credentials_internal(
+                                context,
+                                params,
+                                unit,
+                                p,       /* final mount point */
+                                u,       /* temporary workspace to overmount */
+                                true,    /* reuse the workspace if it is already a mount */
+                                false,   /* it's OK to fall back to a plain directory if we can't mount anything */
+                                uid,
+                                gid);
+                if (r < 0)
+                        return r;
+
+        } else if (r == 0) {
+
+                /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
+                 * we can use the same directory for all cases, after turning off propagation. Question
+                 * though is: where do we turn off propagation exactly, and where do we place the workspace
+                 * directory? We need some place that is guaranteed to be a mount point in the host, and
+                 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
+                 * since we ultimately want to move the resulting file system there, i.e. we need propagation
+                 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
+                 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
+                 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
+                 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
+                 * propagation on the former, and then overmount the latter.
+                 *
+                 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
+                 * for this purpose, but there are few other candidates that work equally well for us, and
+                 * given that we do this in a privately namespaced short-lived single-threaded process that
+                 * no one else sees this should be OK to do. */
+
+                /* Turn off propagation from our namespace to host */
+                r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
+                if (r < 0)
+                        goto child_fail;
+
+                r = setup_credentials_internal(
+                                context,
+                                params,
+                                unit,
+                                p,           /* final mount point */
+                                "/dev/shm",  /* temporary workspace to overmount */
+                                false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
+                                true,        /* insist that something is mounted, do not allow fallback to plain directory */
+                                uid,
+                                gid);
+                if (r < 0)
+                        goto child_fail;
+
+                _exit(EXIT_SUCCESS);
+
+        child_fail:
+                _exit(EXIT_FAILURE);
+        }
+
+        /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
+         * try to remove it. This matters in particular if we created the dir as mount point but then didn't
+         * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
+         * seen by users when trying access this inode. */
+        (void) rmdir(p);
+        return 0;
+}