diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /fs/kernfs | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | fs/kernfs/Kconfig | 8 | ||||
-rw-r--r-- | fs/kernfs/Makefile | 6 | ||||
-rw-r--r-- | fs/kernfs/dir.c | 1866 | ||||
-rw-r--r-- | fs/kernfs/file.c | 1082 | ||||
-rw-r--r-- | fs/kernfs/inode.c | 453 | ||||
-rw-r--r-- | fs/kernfs/kernfs-internal.h | 174 | ||||
-rw-r--r-- | fs/kernfs/mount.c | 434 | ||||
-rw-r--r-- | fs/kernfs/symlink.c | 153 |
8 files changed, 4176 insertions, 0 deletions
diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig new file mode 100644 index 0000000000..e7f09105f6 --- /dev/null +++ b/fs/kernfs/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# KERNFS should be selected by its users +# + +config KERNFS + bool + default n diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile new file mode 100644 index 0000000000..4ca54ff54c --- /dev/null +++ b/fs/kernfs/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the kernfs pseudo filesystem +# + +obj-y := mount.o inode.o dir.o file.o symlink.o diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c new file mode 100644 index 0000000000..8b2bd65d70 --- /dev/null +++ b/fs/kernfs/dir.c @@ -0,0 +1,1866 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/kernfs/dir.c - kernfs directory implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + */ + +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/hash.h> + +#include "kernfs-internal.h" + +static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +/* + * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to + * call pr_cont() while holding rename_lock. Because sometimes pr_cont() + * will perform wakeups when releasing console_sem. Holding rename_lock + * will introduce deadlock if the scheduler reads the kernfs_name in the + * wakeup path. + */ +static DEFINE_SPINLOCK(kernfs_pr_cont_lock); +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ + +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) + +static bool __kernfs_active(struct kernfs_node *kn) +{ + return atomic_read(&kn->active) >= 0; +} + +static bool kernfs_active(struct kernfs_node *kn) +{ + lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); + return __kernfs_active(kn); +} + +static bool kernfs_lockdep(struct kernfs_node *kn) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + return kn->flags & KERNFS_LOCKDEP; +#else + return false; +#endif +} + +static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) +{ + if (!kn) + return strlcpy(buf, "(null)", buflen); + + return strlcpy(buf, kn->parent ? kn->name : "/", buflen); +} + +/* kernfs_node_depth - compute depth from @from to @to */ +static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) +{ + size_t depth = 0; + + while (to->parent && to != from) { + depth++; + to = to->parent; + } + return depth; +} + +static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, + struct kernfs_node *b) +{ + size_t da, db; + struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); + + if (ra != rb) + return NULL; + + da = kernfs_depth(ra->kn, a); + db = kernfs_depth(rb->kn, b); + + while (da > db) { + a = a->parent; + da--; + } + while (db > da) { + b = b->parent; + db--; + } + + /* worst case b and a will be the same at root */ + while (b != a) { + b = b->parent; + a = a->parent; + } + + return a; +} + +/** + * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, + * where kn_from is treated as root of the path. + * @kn_from: kernfs node which should be treated as root for the path + * @kn_to: kernfs node to which path is needed + * @buf: buffer to copy the path into + * @buflen: size of @buf + * + * We need to handle couple of scenarios here: + * [1] when @kn_from is an ancestor of @kn_to at some level + * kn_from: /n1/n2/n3 + * kn_to: /n1/n2/n3/n4/n5 + * result: /n4/n5 + * + * [2] when @kn_from is on a different hierarchy and we need to find common + * ancestor between @kn_from and @kn_to. + * kn_from: /n1/n2/n3/n4 + * kn_to: /n1/n2/n5 + * result: /../../n5 + * OR + * kn_from: /n1/n2/n3/n4/n5 [depth=5] + * kn_to: /n1/n2/n3 [depth=3] + * result: /../.. + * + * [3] when @kn_to is %NULL result will be "(null)" + * + * Return: the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. + */ +static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, + struct kernfs_node *kn_from, + char *buf, size_t buflen) +{ + struct kernfs_node *kn, *common; + const char parent_str[] = "/.."; + size_t depth_from, depth_to, len = 0; + int i, j; + + if (!kn_to) + return strlcpy(buf, "(null)", buflen); + + if (!kn_from) + kn_from = kernfs_root(kn_to)->kn; + + if (kn_from == kn_to) + return strlcpy(buf, "/", buflen); + + common = kernfs_common_ancestor(kn_from, kn_to); + if (WARN_ON(!common)) + return -EINVAL; + + depth_to = kernfs_depth(common, kn_to); + depth_from = kernfs_depth(common, kn_from); + + buf[0] = '\0'; + + for (i = 0; i < depth_from; i++) + len += strlcpy(buf + len, parent_str, + len < buflen ? buflen - len : 0); + + /* Calculate how many bytes we need for the rest */ + for (i = depth_to - 1; i >= 0; i--) { + for (kn = kn_to, j = 0; j < i; j++) + kn = kn->parent; + len += strlcpy(buf + len, "/", + len < buflen ? buflen - len : 0); + len += strlcpy(buf + len, kn->name, + len < buflen ? buflen - len : 0); + } + + return len; +} + +/** + * kernfs_name - obtain the name of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Copies the name of @kn into @buf of @buflen bytes. The behavior is + * similar to strlcpy(). + * + * Fills buffer with "(null)" if @kn is %NULL. + * + * Return: the length of @kn's name and if @buf isn't long enough, + * it's filled up to @buflen-1 and nul terminated. + * + * This function can be called from any context. + */ +int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + int ret; + + read_lock_irqsave(&kernfs_rename_lock, flags); + ret = kernfs_name_locked(kn, buf, buflen); + read_unlock_irqrestore(&kernfs_rename_lock, flags); + return ret; +} + +/** + * kernfs_path_from_node - build path of node @to relative to @from. + * @from: parent kernfs_node relative to which we need to build the path + * @to: kernfs_node of interest + * @buf: buffer to copy @to's path into + * @buflen: size of @buf + * + * Builds @to's path relative to @from in @buf. @from and @to must + * be on the same kernfs-root. If @from is not parent of @to, then a relative + * path (which includes '..'s) as needed to reach from @from to @to is + * returned. + * + * Return: the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. + */ +int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, + char *buf, size_t buflen) +{ + unsigned long flags; + int ret; + + read_lock_irqsave(&kernfs_rename_lock, flags); + ret = kernfs_path_from_node_locked(to, from, buf, buflen); + read_unlock_irqrestore(&kernfs_rename_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(kernfs_path_from_node); + +/** + * pr_cont_kernfs_name - pr_cont name of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_name(struct kernfs_node *kn) +{ + unsigned long flags; + + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); + + kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + pr_cont("%s", kernfs_pr_cont_buf); + + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); +} + +/** + * pr_cont_kernfs_path - pr_cont path of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_path(struct kernfs_node *kn) +{ + unsigned long flags; + int sz; + + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); + + sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); + if (sz < 0) { + pr_cont("(error)"); + goto out; + } + + if (sz >= sizeof(kernfs_pr_cont_buf)) { + pr_cont("(name too long)"); + goto out; + } + + pr_cont("%s", kernfs_pr_cont_buf); + +out: + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); +} + +/** + * kernfs_get_parent - determine the parent node and pin it + * @kn: kernfs_node of interest + * + * Determines @kn's parent, pins and returns it. This function can be + * called from any context. + * + * Return: parent node of @kn + */ +struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + unsigned long flags; + + read_lock_irqsave(&kernfs_rename_lock, flags); + parent = kn->parent; + kernfs_get(parent); + read_unlock_irqrestore(&kernfs_rename_lock, flags); + + return parent; +} + +/** + * kernfs_name_hash - calculate hash of @ns + @name + * @name: Null terminated string to hash + * @ns: Namespace tag to hash + * + * Return: 31-bit hash of ns + name (so it fits in an off_t) + */ +static unsigned int kernfs_name_hash(const char *name, const void *ns) +{ + unsigned long hash = init_name_hash(ns); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = end_name_hash(hash); + hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 2) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int kernfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct kernfs_node *kn) +{ + if (hash < kn->hash) + return -1; + if (hash > kn->hash) + return 1; + if (ns < kn->ns) + return -1; + if (ns > kn->ns) + return 1; + return strcmp(name, kn->name); +} + +static int kernfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) +{ + return kernfs_name_compare(left->hash, left->name, left->ns, right); +} + +/** + * kernfs_link_sibling - link kernfs_node into sibling rbtree + * @kn: kernfs_node of interest + * + * Link @kn into its sibling rbtree which starts from + * @kn->parent->dir.children. + * + * Locking: + * kernfs_rwsem held exclusive + * + * Return: + * %0 on success, -EEXIST on failure. + */ +static int kernfs_link_sibling(struct kernfs_node *kn) +{ + struct rb_node **node = &kn->parent->dir.children.rb_node; + struct rb_node *parent = NULL; + + while (*node) { + struct kernfs_node *pos; + int result; + + pos = rb_to_kn(*node); + parent = *node; + result = kernfs_sd_compare(kn, pos); + if (result < 0) + node = &pos->rb.rb_left; + else if (result > 0) + node = &pos->rb.rb_right; + else + return -EEXIST; + } + + /* add new node and rebalance the tree */ + rb_link_node(&kn->rb, parent, node); + rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + kernfs_inc_rev(kn->parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + + return 0; +} + +/** + * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * @kn: kernfs_node of interest + * + * Try to unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. + * + * Return: %true if @kn was actually removed, + * %false if @kn wasn't on the rbtree. + * + * Locking: + * kernfs_rwsem held exclusive + */ +static bool kernfs_unlink_sibling(struct kernfs_node *kn) +{ + if (RB_EMPTY_NODE(&kn->rb)) + return false; + + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs--; + kernfs_inc_rev(kn->parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + + rb_erase(&kn->rb, &kn->parent->dir.children); + RB_CLEAR_NODE(&kn->rb); + return true; +} + +/** + * kernfs_get_active - get an active reference to kernfs_node + * @kn: kernfs_node to get an active reference to + * + * Get an active reference of @kn. This function is noop if @kn + * is %NULL. + * + * Return: + * Pointer to @kn on success, %NULL on failure. + */ +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) +{ + if (unlikely(!kn)) + return NULL; + + if (!atomic_inc_unless_negative(&kn->active)) + return NULL; + + if (kernfs_lockdep(kn)) + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; +} + +/** + * kernfs_put_active - put an active reference to kernfs_node + * @kn: kernfs_node to put an active reference to + * + * Put an active reference to @kn. This function is noop if @kn + * is %NULL. + */ +void kernfs_put_active(struct kernfs_node *kn) +{ + int v; + + if (unlikely(!kn)) + return; + + if (kernfs_lockdep(kn)) + rwsem_release(&kn->dep_map, _RET_IP_); + v = atomic_dec_return(&kn->active); + if (likely(v != KN_DEACTIVATED_BIAS)) + return; + + wake_up_all(&kernfs_root(kn)->deactivate_waitq); +} + +/** + * kernfs_drain - drain kernfs_node + * @kn: kernfs_node to drain + * + * Drain existing usages and nuke all existing mmaps of @kn. Multiple + * removers may invoke this function concurrently on @kn and all will + * return after draining is complete. + */ +static void kernfs_drain(struct kernfs_node *kn) + __releases(&kernfs_root(kn)->kernfs_rwsem) + __acquires(&kernfs_root(kn)->kernfs_rwsem) +{ + struct kernfs_root *root = kernfs_root(kn); + + lockdep_assert_held_write(&root->kernfs_rwsem); + WARN_ON_ONCE(kernfs_active(kn)); + + /* + * Skip draining if already fully drained. This avoids draining and its + * lockdep annotations for nodes which have never been activated + * allowing embedding kernfs_remove() in create error paths without + * worrying about draining. + */ + if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && + !kernfs_should_drain_open_files(kn)) + return; + + up_write(&root->kernfs_rwsem); + + if (kernfs_lockdep(kn)) { + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) + lock_contended(&kn->dep_map, _RET_IP_); + } + + wait_event(root->deactivate_waitq, + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); + + if (kernfs_lockdep(kn)) { + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); + } + + if (kernfs_should_drain_open_files(kn)) + kernfs_drain_open_files(kn); + + down_write(&root->kernfs_rwsem); +} + +/** + * kernfs_get - get a reference count on a kernfs_node + * @kn: the target kernfs_node + */ +void kernfs_get(struct kernfs_node *kn) +{ + if (kn) { + WARN_ON(!atomic_read(&kn->count)); + atomic_inc(&kn->count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a kernfs_node + * @kn: the target kernfs_node + * + * Put a reference count of @kn and destroy it if it reached zero. + */ +void kernfs_put(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + struct kernfs_root *root; + + if (!kn || !atomic_dec_and_test(&kn->count)) + return; + root = kernfs_root(kn); + repeat: + /* + * Moving/renaming is always done while holding reference. + * kn->parent won't change beneath us. + */ + parent = kn->parent; + + WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, + "kernfs_put: %s/%s: released with incorrect active_ref %d\n", + parent ? parent->name : "", kn->name, atomic_read(&kn->active)); + + if (kernfs_type(kn) == KERNFS_LINK) + kernfs_put(kn->symlink.target_kn); + + kfree_const(kn->name); + + if (kn->iattr) { + simple_xattrs_free(&kn->iattr->xattrs, NULL); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); + } + spin_lock(&kernfs_idr_lock); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); + spin_unlock(&kernfs_idr_lock); + kmem_cache_free(kernfs_node_cache, kn); + + kn = parent; + if (kn) { + if (atomic_dec_and_test(&kn->count)) + goto repeat; + } else { + /* just released the root kn, free @root too */ + idr_destroy(&root->ino_idr); + kfree(root); + } +} +EXPORT_SYMBOL_GPL(kernfs_put); + +/** + * kernfs_node_from_dentry - determine kernfs_node associated with a dentry + * @dentry: the dentry in question + * + * Return: the kernfs_node associated with @dentry. If @dentry is not a + * kernfs one, %NULL is returned. + * + * While the returned kernfs_node will stay accessible as long as @dentry + * is accessible, the returned node can be in any state and the caller is + * fully responsible for determining what's accessible. + */ +struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) +{ + if (dentry->d_sb->s_op == &kernfs_sops) + return kernfs_dentry_node(dentry); + return NULL; +} + +static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + struct kernfs_node *parent, + const char *name, umode_t mode, + kuid_t uid, kgid_t gid, + unsigned flags) +{ + struct kernfs_node *kn; + u32 id_highbits; + int ret; + + name = kstrdup_const(name, GFP_KERNEL); + if (!name) + return NULL; + + kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); + if (!kn) + goto err_out1; + + idr_preload(GFP_KERNEL); + spin_lock(&kernfs_idr_lock); + ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < root->last_id_lowbits) + root->id_highbits++; + id_highbits = root->id_highbits; + root->last_id_lowbits = ret; + spin_unlock(&kernfs_idr_lock); + idr_preload_end(); + if (ret < 0) + goto err_out2; + + kn->id = (u64)id_highbits << 32 | ret; + + atomic_set(&kn->count, 1); + atomic_set(&kn->active, KN_DEACTIVATED_BIAS); + RB_CLEAR_NODE(&kn->rb); + + kn->name = name; + kn->mode = mode; + kn->flags = flags; + + if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { + struct iattr iattr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = uid, + .ia_gid = gid, + }; + + ret = __kernfs_setattr(kn, &iattr); + if (ret < 0) + goto err_out3; + } + + if (parent) { + ret = security_kernfs_init_security(parent, kn); + if (ret) + goto err_out3; + } + + return kn; + + err_out3: + spin_lock(&kernfs_idr_lock); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); + spin_unlock(&kernfs_idr_lock); + err_out2: + kmem_cache_free(kernfs_node_cache, kn); + err_out1: + kfree_const(name); + return NULL; +} + +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + kuid_t uid, kgid_t gid, + unsigned flags) +{ + struct kernfs_node *kn; + + kn = __kernfs_new_node(kernfs_root(parent), parent, + name, mode, uid, gid, flags); + if (kn) { + kernfs_get(parent); + kn->parent = parent; + } + return kn; +} + +/* + * kernfs_find_and_get_node_by_id - get kernfs_node from node id + * @root: the kernfs root + * @id: the target node id + * + * @id's lower 32bits encode ino and upper gen. If the gen portion is + * zero, all generations are matched. + * + * Return: %NULL on failure, + * otherwise a kernfs node with reference counter incremented. + */ +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id) +{ + struct kernfs_node *kn; + ino_t ino = kernfs_id_ino(id); + u32 gen = kernfs_id_gen(id); + + spin_lock(&kernfs_idr_lock); + + kn = idr_find(&root->ino_idr, (u32)ino); + if (!kn) + goto err_unlock; + + if (sizeof(ino_t) >= sizeof(u64)) { + /* we looked up with the low 32bits, compare the whole */ + if (kernfs_ino(kn) != ino) + goto err_unlock; + } else { + /* 0 matches all generations */ + if (unlikely(gen && kernfs_gen(kn) != gen)) + goto err_unlock; + } + + /* + * We should fail if @kn has never been activated and guarantee success + * if the caller knows that @kn is active. Both can be achieved by + * __kernfs_active() which tests @kn->active without kernfs_rwsem. + */ + if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) + goto err_unlock; + + spin_unlock(&kernfs_idr_lock); + return kn; +err_unlock: + spin_unlock(&kernfs_idr_lock); + return NULL; +} + +/** + * kernfs_add_one - add kernfs_node to parent without warning + * @kn: kernfs_node to be added + * + * The caller must already have initialized @kn->parent. This + * function increments nlink of the parent's inode if @kn is a + * directory and link into the children list of the parent. + * + * Return: + * %0 on success, -EEXIST if entry with the given name already + * exists. + */ +int kernfs_add_one(struct kernfs_node *kn) +{ + struct kernfs_node *parent = kn->parent; + struct kernfs_root *root = kernfs_root(parent); + struct kernfs_iattrs *ps_iattr; + bool has_ns; + int ret; + + down_write(&root->kernfs_rwsem); + + ret = -EINVAL; + has_ns = kernfs_ns_enabled(parent); + if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name)) + goto out_unlock; + + if (kernfs_type(parent) != KERNFS_DIR) + goto out_unlock; + + ret = -ENOENT; + if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) + goto out_unlock; + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + + ret = kernfs_link_sibling(kn); + if (ret) + goto out_unlock; + + /* Update timestamps on the parent */ + down_write(&root->kernfs_iattr_rwsem); + + ps_iattr = parent->iattr; + if (ps_iattr) { + ktime_get_real_ts64(&ps_iattr->ia_ctime); + ps_iattr->ia_mtime = ps_iattr->ia_ctime; + } + + up_write(&root->kernfs_iattr_rwsem); + up_write(&root->kernfs_rwsem); + + /* + * Activate the new node unless CREATE_DEACTIVATED is requested. + * If not activated here, the kernfs user is responsible for + * activating the node with kernfs_activate(). A node which hasn't + * been activated is not visible to userland and its removal won't + * trigger deactivation. + */ + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + return 0; + +out_unlock: + up_write(&root->kernfs_rwsem); + return ret; +} + +/** + * kernfs_find_ns - find kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. + */ +static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, + const unsigned char *name, + const void *ns) +{ + struct rb_node *node = parent->dir.children.rb_node; + bool has_ns = kernfs_ns_enabled(parent); + unsigned int hash; + + lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); + + if (has_ns != (bool)ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, name); + return NULL; + } + + hash = kernfs_name_hash(name, ns); + while (node) { + struct kernfs_node *kn; + int result; + + kn = rb_to_kn(node); + result = kernfs_name_compare(hash, name, ns, kn); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return kn; + } + return NULL; +} + +static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, + const unsigned char *path, + const void *ns) +{ + size_t len; + char *p, *name; + + lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); + + spin_lock_irq(&kernfs_pr_cont_lock); + + len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); + + if (len >= sizeof(kernfs_pr_cont_buf)) { + spin_unlock_irq(&kernfs_pr_cont_lock); + return NULL; + } + + p = kernfs_pr_cont_buf; + + while ((name = strsep(&p, "/")) && parent) { + if (*name == '\0') + continue; + parent = kernfs_find_ns(parent, name, ns); + } + + spin_unlock_irq(&kernfs_pr_cont_lock); + + return parent; +} + +/** + * kernfs_find_and_get_ns - find and get kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent and get a reference + * if found. This function may sleep. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns) +{ + struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); + + down_read(&root->kernfs_rwsem); + kn = kernfs_find_ns(parent, name, ns); + kernfs_get(kn); + up_read(&root->kernfs_rwsem); + + return kn; +} +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); + +/** + * kernfs_walk_and_get_ns - find and get kernfs_node with the given path + * @parent: kernfs_node to search under + * @path: path to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with path @path under @parent and get a reference + * if found. This function may sleep. + * + * Return: pointer to the found kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns) +{ + struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); + + down_read(&root->kernfs_rwsem); + kn = kernfs_walk_ns(parent, path, ns); + kernfs_get(kn); + up_read(&root->kernfs_rwsem); + + return kn; +} + +/** + * kernfs_create_root - create a new kernfs hierarchy + * @scops: optional syscall operations for the hierarchy + * @flags: KERNFS_ROOT_* flags + * @priv: opaque data associated with the new directory + * + * Return: the root of the new hierarchy on success, ERR_PTR() value on + * failure. + */ +struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, + unsigned int flags, void *priv) +{ + struct kernfs_root *root; + struct kernfs_node *kn; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + idr_init(&root->ino_idr); + init_rwsem(&root->kernfs_rwsem); + init_rwsem(&root->kernfs_iattr_rwsem); + init_rwsem(&root->kernfs_supers_rwsem); + INIT_LIST_HEAD(&root->supers); + + /* + * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. + * High bits generation. The starting value for both ino and + * genenration is 1. Initialize upper 32bit allocation + * accordingly. + */ + if (sizeof(ino_t) >= sizeof(u64)) + root->id_highbits = 0; + else + root->id_highbits = 1; + + kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + KERNFS_DIR); + if (!kn) { + idr_destroy(&root->ino_idr); + kfree(root); + return ERR_PTR(-ENOMEM); + } + + kn->priv = priv; + kn->dir.root = root; + + root->syscall_ops = scops; + root->flags = flags; + root->kn = kn; + init_waitqueue_head(&root->deactivate_waitq); + + if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + + return root; +} + +/** + * kernfs_destroy_root - destroy a kernfs hierarchy + * @root: root of the hierarchy to destroy + * + * Destroy the hierarchy anchored at @root by removing all existing + * directories and destroying @root. + */ +void kernfs_destroy_root(struct kernfs_root *root) +{ + /* + * kernfs_remove holds kernfs_rwsem from the root so the root + * shouldn't be freed during the operation. + */ + kernfs_get(root->kn); + kernfs_remove(root->kn); + kernfs_put(root->kn); /* will also free @root */ +} + +/** + * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root + * @root: root to use to lookup + * + * Return: @root's kernfs_node + */ +struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) +{ + return root->kn; +} + +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @mode: mode of the new directory + * @uid: uid of the new directory + * @gid: gid of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Return: the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, umode_t mode, + kuid_t uid, kgid_t gid, + void *priv, const void *ns) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, mode | S_IFDIR, + uid, gid, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->dir.root = parent->dir.root; + kn->ns = ns; + kn->priv = priv; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +/** + * kernfs_create_empty_dir - create an always empty directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * + * Return: the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, + const char *name) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->flags |= KERNFS_EMPTY_DIR; + kn->dir.root = parent->dir.root; + kn->ns = NULL; + kn->priv = NULL; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + struct kernfs_root *root; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Negative hashed dentry? */ + if (d_really_is_negative(dentry)) { + struct kernfs_node *parent; + + /* If the kernfs parent node has changed discard and + * proceed to ->lookup. + * + * There's nothing special needed here when getting the + * dentry parent, even if a concurrent rename is in + * progress. That's because the dentry is negative so + * it can only be the target of the rename and it will + * be doing a d_move() not a replace. Consequently the + * dentry d_parent won't change over the d_move(). + * + * Also kernfs negative dentries transitioning from + * negative to positive during revalidate won't happen + * because they are invalidated on containing directory + * changes and the lookup re-done so that a new positive + * dentry can be properly created. + */ + root = kernfs_root_from_sb(dentry->d_sb); + down_read(&root->kernfs_rwsem); + parent = kernfs_dentry_node(dentry->d_parent); + if (parent) { + if (kernfs_dir_changed(parent, dentry)) { + up_read(&root->kernfs_rwsem); + return 0; + } + } + up_read(&root->kernfs_rwsem); + + /* The kernfs parent node hasn't changed, leave the + * dentry negative and return success. + */ + return 1; + } + + kn = kernfs_dentry_node(dentry); + root = kernfs_root(kn); + down_read(&root->kernfs_rwsem); + + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) + goto out_bad; + + /* The kernfs node has been moved? */ + if (kernfs_dentry_node(dentry->d_parent) != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + up_read(&root->kernfs_rwsem); + return 1; +out_bad: + up_read(&root->kernfs_rwsem); + return 0; +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, +}; + +static struct dentry *kernfs_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_node *kn; + struct kernfs_root *root; + struct inode *inode = NULL; + const void *ns = NULL; + + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dir->i_sb)->ns; + + kn = kernfs_find_ns(parent, dentry->d_name.name, ns); + /* attach dentry and inode */ + if (kn) { + /* Inactive nodes are invisible to the VFS so don't + * create a negative. + */ + if (!kernfs_active(kn)) { + up_read(&root->kernfs_rwsem); + return NULL; + } + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) + inode = ERR_PTR(-ENOMEM); + } + /* + * Needed for negative dentry validation. + * The negative dentry can be created in kernfs_iop_lookup() + * or transforms from positive dentry in dentry_unlink_inode() + * called from vfs_rmdir(). + */ + if (!IS_ERR(inode)) + kernfs_set_rev(parent, dentry); + up_read(&root->kernfs_rwsem); + + /* instantiate and hash (possibly negative) dentry */ + return d_splice_alias(inode, dentry); +} + +static int kernfs_iop_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; + int ret; + + if (!scops || !scops->mkdir) + return -EPERM; + + if (!kernfs_get_active(parent)) + return -ENODEV; + + ret = scops->mkdir(parent, dentry->d_name.name, mode); + + kernfs_put_active(parent); + return ret; +} + +static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *kn = kernfs_dentry_node(dentry); + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; + + if (!scops || !scops->rmdir) + return -EPERM; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ret = scops->rmdir(kn); + + kernfs_put_active(kn); + return ret; +} + +static int kernfs_iop_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct kernfs_node *kn = kernfs_dentry_node(old_dentry); + struct kernfs_node *new_parent = new_dir->i_private; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; + + if (flags) + return -EINVAL; + + if (!scops || !scops->rename) + return -EPERM; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + if (!kernfs_get_active(new_parent)) { + kernfs_put_active(kn); + return -ENODEV; + } + + ret = scops->rename(kn, new_parent, new_dentry->d_name.name); + + kernfs_put_active(new_parent); + kernfs_put_active(kn); + return ret; +} + +const struct inode_operations kernfs_dir_iops = { + .lookup = kernfs_iop_lookup, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .listxattr = kernfs_iop_listxattr, + + .mkdir = kernfs_iop_mkdir, + .rmdir = kernfs_iop_rmdir, + .rename = kernfs_iop_rename, +}; + +static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) +{ + struct kernfs_node *last; + + while (true) { + struct rb_node *rbn; + + last = pos; + + if (kernfs_type(pos) != KERNFS_DIR) + break; + + rbn = rb_first(&pos->dir.children); + if (!rbn) + break; + + pos = rb_to_kn(rbn); + } + + return last; +} + +/** + * kernfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: kernfs_node whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + * + * Return: the next descendant to visit or %NULL when done. + */ +static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) +{ + struct rb_node *rbn; + + lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return kernfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->rb); + if (rbn) + return kernfs_leftmost_descendant(rb_to_kn(rbn)); + + /* no sibling left, visit parent */ + return pos->parent; +} + +static void kernfs_activate_one(struct kernfs_node *kn) +{ + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); + + kn->flags |= KERNFS_ACTIVATED; + + if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) + return; + + WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); +} + +/** + * kernfs_activate - activate a node which started deactivated + * @kn: kernfs_node whose subtree is to be activated + * + * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node + * needs to be explicitly activated. A node which hasn't been activated + * isn't visible to userland and deactivation is skipped during its + * removal. This is useful to construct atomic init sequences where + * creation of multiple nodes should either succeed or fail atomically. + * + * The caller is responsible for ensuring that this function is not called + * after kernfs_remove*() is invoked on @kn. + */ +void kernfs_activate(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + struct kernfs_root *root = kernfs_root(kn); + + down_write(&root->kernfs_rwsem); + + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) + kernfs_activate_one(pos); + + up_write(&root->kernfs_rwsem); +} + +/** + * kernfs_show - show or hide a node + * @kn: kernfs_node to show or hide + * @show: whether to show or hide + * + * If @show is %false, @kn is marked hidden and deactivated. A hidden node is + * ignored in future activaitons. If %true, the mark is removed and activation + * state is restored. This function won't implicitly activate a new node in a + * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. + * + * To avoid recursion complexities, directories aren't supported for now. + */ +void kernfs_show(struct kernfs_node *kn, bool show) +{ + struct kernfs_root *root = kernfs_root(kn); + + if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) + return; + + down_write(&root->kernfs_rwsem); + + if (show) { + kn->flags &= ~KERNFS_HIDDEN; + if (kn->flags & KERNFS_ACTIVATED) + kernfs_activate_one(kn); + } else { + kn->flags |= KERNFS_HIDDEN; + if (kernfs_active(kn)) + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + kernfs_drain(kn); + } + + up_write(&root->kernfs_rwsem); +} + +static void __kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + /* Short-circuit if non-root @kn has already finished removal. */ + if (!kn) + return; + + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); + + /* + * This is for kernfs_remove_self() which plays with active ref + * after removal. + */ + if (kn->parent && RB_EMPTY_NODE(&kn->rb)) + return; + + pr_debug("kernfs %s: removing\n", kn->name); + + /* prevent new usage by marking all nodes removing and deactivating */ + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + pos->flags |= KERNFS_REMOVING; + if (kernfs_active(pos)) + atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + } + + /* deactivate and unlink the subtree node-by-node */ + do { + pos = kernfs_leftmost_descendant(kn); + + /* + * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's + * base ref could have been put by someone else by the time + * the function returns. Make sure it doesn't go away + * underneath us. + */ + kernfs_get(pos); + + kernfs_drain(pos); + + /* + * kernfs_unlink_sibling() succeeds once per node. Use it + * to decide who's responsible for cleanups. + */ + if (!pos->parent || kernfs_unlink_sibling(pos)) { + struct kernfs_iattrs *ps_iattr = + pos->parent ? pos->parent->iattr : NULL; + + /* update timestamps on the parent */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + + if (ps_iattr) { + ktime_get_real_ts64(&ps_iattr->ia_ctime); + ps_iattr->ia_mtime = ps_iattr->ia_ctime; + } + + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + kernfs_put(pos); + } + + kernfs_put(pos); + } while (pos != kn); +} + +/** + * kernfs_remove - remove a kernfs_node recursively + * @kn: the kernfs_node to remove + * + * Remove @kn along with all its subdirectories and files. + */ +void kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_root *root; + + if (!kn) + return; + + root = kernfs_root(kn); + + down_write(&root->kernfs_rwsem); + __kernfs_remove(kn); + up_write(&root->kernfs_rwsem); +} + +/** + * kernfs_break_active_protection - break out of active protection + * @kn: the self kernfs_node + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. Each invocation of + * this function must also be matched with an invocation of + * kernfs_unbreak_active_protection(). + * + * This function releases the active reference of @kn the caller is + * holding. Once this function is called, @kn may be removed at any point + * and the caller is solely responsible for ensuring that the objects it + * dereferences are accessible. + */ +void kernfs_break_active_protection(struct kernfs_node *kn) +{ + /* + * Take out ourself out of the active ref dependency chain. If + * we're called without an active ref, lockdep will complain. + */ + kernfs_put_active(kn); +} + +/** + * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() + * @kn: the self kernfs_node + * + * If kernfs_break_active_protection() was called, this function must be + * invoked before finishing the kernfs operation. Note that while this + * function restores the active reference, it doesn't and can't actually + * restore the active protection - @kn may already or be in the process of + * being removed. Once kernfs_break_active_protection() is invoked, that + * protection is irreversibly gone for the kernfs operation instance. + * + * While this function may be called at any point after + * kernfs_break_active_protection() is invoked, its most useful location + * would be right before the enclosing kernfs operation returns. + */ +void kernfs_unbreak_active_protection(struct kernfs_node *kn) +{ + /* + * @kn->active could be in any state; however, the increment we do + * here will be undone as soon as the enclosing kernfs operation + * finishes and this temporary bump can't break anything. If @kn + * is alive, nothing changes. If @kn is being deactivated, the + * soon-to-follow put will either finish deactivation or restore + * deactivated state. If @kn is already removed, the temporary + * bump is guaranteed to be gone before @kn is released. + */ + atomic_inc(&kn->active); + if (kernfs_lockdep(kn)) + rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); +} + +/** + * kernfs_remove_self - remove a kernfs_node from its own method + * @kn: the self kernfs_node to remove + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. This can be used to + * implement a file operation which deletes itself. + * + * For example, the "delete" file for a sysfs device directory can be + * implemented by invoking kernfs_remove_self() on the "delete" file + * itself. This function breaks the circular dependency of trying to + * deactivate self while holding an active ref itself. It isn't necessary + * to modify the usual removal path to use kernfs_remove_self(). The + * "delete" implementation can simply invoke kernfs_remove_self() on self + * before proceeding with the usual removal path. kernfs will ignore later + * kernfs_remove() on self. + * + * kernfs_remove_self() can be called multiple times concurrently on the + * same kernfs_node. Only the first one actually performs removal and + * returns %true. All others will wait until the kernfs operation which + * won self-removal finishes and return %false. Note that the losers wait + * for the completion of not only the winning kernfs_remove_self() but also + * the whole kernfs_ops which won the arbitration. This can be used to + * guarantee, for example, all concurrent writes to a "delete" file to + * finish only after the whole operation is complete. + * + * Return: %true if @kn is removed by this call, otherwise %false. + */ +bool kernfs_remove_self(struct kernfs_node *kn) +{ + bool ret; + struct kernfs_root *root = kernfs_root(kn); + + down_write(&root->kernfs_rwsem); + kernfs_break_active_protection(kn); + + /* + * SUICIDAL is used to arbitrate among competing invocations. Only + * the first one will actually perform removal. When the removal + * is complete, SUICIDED is set and the active ref is restored + * while kernfs_rwsem for held exclusive. The ones which lost + * arbitration waits for SUICIDED && drained which can happen only + * after the enclosing kernfs operation which executed the winning + * instance of kernfs_remove_self() finished. + */ + if (!(kn->flags & KERNFS_SUICIDAL)) { + kn->flags |= KERNFS_SUICIDAL; + __kernfs_remove(kn); + kn->flags |= KERNFS_SUICIDED; + ret = true; + } else { + wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; + DEFINE_WAIT(wait); + + while (true) { + prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); + + if ((kn->flags & KERNFS_SUICIDED) && + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) + break; + + up_write(&root->kernfs_rwsem); + schedule(); + down_write(&root->kernfs_rwsem); + } + finish_wait(waitq, &wait); + WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); + ret = false; + } + + /* + * This must be done while kernfs_rwsem held exclusive; otherwise, + * waiting for SUICIDED && deactivated could finish prematurely. + */ + kernfs_unbreak_active_protection(kn); + + up_write(&root->kernfs_rwsem); + return ret; +} + +/** + * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it + * @parent: parent of the target + * @name: name of the kernfs_node to remove + * @ns: namespace tag of the kernfs_node to remove + * + * Look for the kernfs_node with @name and @ns under @parent and remove it. + * + * Return: %0 on success, -ENOENT if such entry doesn't exist. + */ +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, + const void *ns) +{ + struct kernfs_node *kn; + struct kernfs_root *root; + + if (!parent) { + WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", + name); + return -ENOENT; + } + + root = kernfs_root(parent); + down_write(&root->kernfs_rwsem); + + kn = kernfs_find_ns(parent, name, ns); + if (kn) { + kernfs_get(kn); + __kernfs_remove(kn); + kernfs_put(kn); + } + + up_write(&root->kernfs_rwsem); + + if (kn) + return 0; + else + return -ENOENT; +} + +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @kn: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + * + * Return: %0 on success, -errno on failure. + */ +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name, const void *new_ns) +{ + struct kernfs_node *old_parent; + struct kernfs_root *root; + const char *old_name = NULL; + int error; + + /* can't move or rename root */ + if (!kn->parent) + return -EINVAL; + + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem); + + error = -ENOENT; + if (!kernfs_active(kn) || !kernfs_active(new_parent) || + (new_parent->flags & KERNFS_EMPTY_DIR)) + goto out; + + error = 0; + if ((kn->parent == new_parent) && (kn->ns == new_ns) && + (strcmp(kn->name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (kernfs_find_ns(new_parent, new_name, new_ns)) + goto out; + + /* rename kernfs_node */ + if (strcmp(kn->name, new_name) != 0) { + error = -ENOMEM; + new_name = kstrdup_const(new_name, GFP_KERNEL); + if (!new_name) + goto out; + } else { + new_name = NULL; + } + + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ + kernfs_unlink_sibling(kn); + kernfs_get(new_parent); + + /* rename_lock protects ->parent and ->name accessors */ + write_lock_irq(&kernfs_rename_lock); + + old_parent = kn->parent; + kn->parent = new_parent; + + kn->ns = new_ns; + if (new_name) { + old_name = kn->name; + kn->name = new_name; + } + + write_unlock_irq(&kernfs_rename_lock); + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + kernfs_link_sibling(kn); + + kernfs_put(old_parent); + kfree_const(old_name); + + error = 0; + out: + up_write(&root->kernfs_rwsem); + return error; +} + +static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) +{ + kernfs_put(filp->private_data); + return 0; +} + +static struct kernfs_node *kernfs_dir_pos(const void *ns, + struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) +{ + if (pos) { + int valid = kernfs_active(pos) && + pos->parent == parent && hash == pos->hash; + kernfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent->dir.children.rb_node; + while (node) { + pos = rb_to_kn(node); + + if (hash < pos->hash) + node = node->rb_left; + else if (hash > pos->hash) + node = node->rb_right; + else + break; + } + } + /* Skip over entries which are dying/dead or in the wrong namespace */ + while (pos && (!kernfs_active(pos) || pos->ns != ns)) { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } + return pos; +} + +static struct kernfs_node *kernfs_dir_next_pos(const void *ns, + struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) +{ + pos = kernfs_dir_pos(ns, parent, ino, pos); + if (pos) { + do { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } while (pos && (!kernfs_active(pos) || pos->ns != ns)); + } + return pos; +} + +static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct kernfs_node *parent = kernfs_dentry_node(dentry); + struct kernfs_node *pos = file->private_data; + struct kernfs_root *root; + const void *ns = NULL; + + if (!dir_emit_dots(file, ctx)) + return 0; + + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dentry->d_sb)->ns; + + for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); + pos; + pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { + const char *name = pos->name; + unsigned int type = fs_umode_to_dtype(pos->mode); + int len = strlen(name); + ino_t ino = kernfs_ino(pos); + + ctx->pos = pos->hash; + file->private_data = pos; + kernfs_get(pos); + + up_read(&root->kernfs_rwsem); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; + down_read(&root->kernfs_rwsem); + } + up_read(&root->kernfs_rwsem); + file->private_data = NULL; + ctx->pos = INT_MAX; + return 0; +} + +const struct file_operations kernfs_dir_fops = { + .read = generic_read_dir, + .iterate_shared = kernfs_fop_readdir, + .release = kernfs_dir_fop_release, + .llseek = generic_file_llseek, +}; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c new file mode 100644 index 0000000000..180906c36f --- /dev/null +++ b/fs/kernfs/file.c @@ -0,0 +1,1082 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/kernfs/file.c - kernfs file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/pagemap.h> +#include <linux/sched/mm.h> +#include <linux/fsnotify.h> +#include <linux/uio.h> + +#include "kernfs-internal.h" + +struct kernfs_open_node { + struct rcu_head rcu_head; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through kernfs_open_file.list */ + unsigned int nr_mmapped; + unsigned int nr_to_release; +}; + +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for %NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + +static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) +{ + int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); + + return &kernfs_locks->open_file_mutex[idx]; +} + +static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) +{ + struct mutex *lock; + + lock = kernfs_open_file_mutex_ptr(kn); + + mutex_lock(lock); + + return lock; +} + +/** + * of_on - Get the kernfs_open_node of the specified kernfs_open_file + * @of: target kernfs_open_file + * + * Return: the kernfs_open_node of the kernfs_open_file + */ +static struct kernfs_open_node *of_on(struct kernfs_open_file *of) +{ + return rcu_dereference_protected(of->kn->attr.open, + !list_empty(&of->list)); +} + +/** + * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn + * + * @kn: target kernfs_node. + * + * Fetch and return ->attr.open of @kn when caller holds the + * kernfs_open_file_mutex_ptr(kn). + * + * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when + * the caller guarantees that this mutex is being held, other updaters can't + * change ->attr.open and this means that we can safely deref ->attr.open + * outside RCU read-side critical section. + * + * The caller needs to make sure that kernfs_open_file_mutex is held. + * + * Return: @kn->attr.open when kernfs_open_file_mutex is held. + */ +static struct kernfs_open_node * +kernfs_deref_open_node_locked(struct kernfs_node *kn) +{ + return rcu_dereference_protected(kn->attr.open, + lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); +} + +static struct kernfs_open_file *kernfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine the kernfs_ops for the given kernfs_node. This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) +{ + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kn->attr.ops; +} + +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. + */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops; + + /* + * @of->mutex nests outside active ref and is primarily to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) + return ERR_PTR(-ENODEV); + + ops = kernfs_ops(of->kn); + if (ops->seq_start) { + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } + return single_start(sf, ppos); +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_next) { + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + of->event = atomic_read(&of_on(of)->event); + + return of->kn->attr.ops->seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. + */ +static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); + ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = of->prealloc_buf; + if (buf) + mutex_lock(&of->prealloc_mutex); + else + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + of->event = atomic_read(&of_on(of)->event); + + ops = kernfs_ops(of->kn); + if (ops->read) + len = ops->read(of, buf, len, iocb->ki_pos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len < 0) + goto out_free; + + if (copy_to_iter(buf, len, iter) != len) { + len = -EFAULT; + goto out_free; + } + + iocb->ki_pos += len; + + out_free: + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else + kfree(buf); + return len; +} + +static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read_iter(iocb, iter); + return kernfs_file_read_iter(iocb, iter); +} + +/* + * Copy data in from userland and pass it to the matching kernfs write + * operation. + * + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the value you're changing, then write entire buffer + * back. + */ +static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); + ssize_t len = iov_iter_count(iter); + const struct kernfs_ops *ops; + char *buf; + + if (of->atomic_write_len) { + if (len > of->atomic_write_len) + return -E2BIG; + } else { + len = min_t(size_t, len, PAGE_SIZE); + } + + buf = of->prealloc_buf; + if (buf) + mutex_lock(&of->prealloc_mutex); + else + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_iter(buf, len, iter) != len) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + + /* + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->write) + len = ops->write(of, buf, len, iocb->ki_pos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len > 0) + iocb->ki_pos += len; + +out_free: + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else + kfree(buf); + return len; +} + +static void kernfs_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + + if (!of->vm_ops) + return; + + if (!kernfs_get_active(of->kn)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + kernfs_put_active(of->kn); +} + +static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + vm_fault_t ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vmf); + + kernfs_put_active(of->kn); + return ret; +} + +static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + vm_fault_t ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vmf); + else + file_update_time(file); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + kernfs_put_active(of->kn); + return ret; +} + +#ifdef CONFIG_NUMA +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + kernfs_put_active(of->kn); + return ret; +} + +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!kernfs_get_active(of->kn)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + kernfs_put_active(of->kn); + return pol; +} + +#endif + +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, + .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, +#ifdef CONFIG_NUMA + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, +#endif +}; + +static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + int rc; + + /* + * mmap path and of->mutex are prone to triggering spurious lockdep + * warnings and we don't want to add spurious locking dependency + * between the two. Check whether mmap is actually implemented + * without grabbing @of->mutex by testing HAS_MMAP flag. See the + * comment in kernfs_file_open() for more details. + */ + if (!(of->kn->flags & KERNFS_HAS_MMAP)) + return -ENODEV; + + mutex_lock(&of->mutex); + + rc = -ENODEV; + if (!kernfs_get_active(of->kn)) + goto out_unlock; + + ops = kernfs_ops(of->kn); + rc = ops->mmap(of, vma); + if (rc) + goto out_put; + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = true; + of_on(of)->nr_mmapped++; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &kernfs_vm_ops; +out_put: + kernfs_put_active(of->kn); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + +/** + * kernfs_get_open_node - get or create kernfs_open_node + * @kn: target kernfs_node + * @of: kernfs_open_file for this instance of open + * + * If @kn->attr.open exists, increment its reference count; otherwise, + * create one. @of is chained to the files list. + * + * Locking: + * Kernel thread context (may sleep). + * + * Return: + * %0 on success, -errno on failure. + */ +static int kernfs_get_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on; + struct mutex *mutex; + + mutex = kernfs_open_file_mutex_lock(kn); + on = kernfs_deref_open_node_locked(kn); + + if (!on) { + /* not there, initialize a new one */ + on = kzalloc(sizeof(*on), GFP_KERNEL); + if (!on) { + mutex_unlock(mutex); + return -ENOMEM; + } + atomic_set(&on->event, 1); + init_waitqueue_head(&on->poll); + INIT_LIST_HEAD(&on->files); + rcu_assign_pointer(kn->attr.open, on); + } + + list_add_tail(&of->list, &on->files); + if (kn->flags & KERNFS_HAS_RELEASE) + on->nr_to_release++; + + mutex_unlock(mutex); + return 0; +} + +/** + * kernfs_unlink_open_file - Unlink @of from @kn. + * + * @kn: target kernfs_node + * @of: associated kernfs_open_file + * @open_failed: ->open() failed, cancel ->release() + * + * Unlink @of from list of @kn's associated open files. If list of + * associated open files becomes empty, disassociate and free + * kernfs_open_node. + * + * LOCKING: + * None. + */ +static void kernfs_unlink_open_file(struct kernfs_node *kn, + struct kernfs_open_file *of, + bool open_failed) +{ + struct kernfs_open_node *on; + struct mutex *mutex; + + mutex = kernfs_open_file_mutex_lock(kn); + + on = kernfs_deref_open_node_locked(kn); + if (!on) { + mutex_unlock(mutex); + return; + } + + if (of) { + if (kn->flags & KERNFS_HAS_RELEASE) { + WARN_ON_ONCE(of->released == open_failed); + if (open_failed) + on->nr_to_release--; + } + if (of->mmapped) + on->nr_mmapped--; + list_del(&of->list); + } + + if (list_empty(&on->files)) { + rcu_assign_pointer(kn->attr.open, NULL); + kfree_rcu(on, rcu_head); + } + + mutex_unlock(mutex); +} + +static int kernfs_fop_open(struct inode *inode, struct file *file) +{ + struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root = kernfs_root(kn); + const struct kernfs_ops *ops; + struct kernfs_open_file *of; + bool has_read, has_write, has_mmap; + int error = -EACCES; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ops = kernfs_ops(kn); + + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; + + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } + + /* allocate a kernfs_open_file for the file */ + error = -ENOMEM; + of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); + if (!of) + goto err_out; + + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_lock nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + * + * Both paths of the branch look the same. They're supposed to + * look that way and give @of->mutex different static lockdep keys. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + + of->kn = kn; + of->file = file; + + /* + * Write path needs to atomic_write_len outside active reference. + * Cache it in open_file. See kernfs_fop_write_iter() for details. + */ + of->atomic_write_len = ops->atomic_write_len; + + error = -EINVAL; + /* + * ->seq_show is incompatible with ->prealloc, + * as seq_read does its own allocation. + * ->read must be used instead. + */ + if (ops->prealloc && ops->seq_show) + goto err_free; + if (ops->prealloc) { + int len = of->atomic_write_len ?: PAGE_SIZE; + of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); + error = -ENOMEM; + if (!of->prealloc_buf) + goto err_free; + mutex_init(&of->prealloc_mutex); + } + + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. + */ + if (ops->seq_show) + error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); + if (error) + goto err_free; + + of->seq_file = file->private_data; + of->seq_file->private = of; + + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open node struct */ + error = kernfs_get_open_node(kn, of); + if (error) + goto err_seq_release; + + if (ops->open) { + /* nobody has access to @of yet, skip @of->mutex */ + error = ops->open(of); + if (error) + goto err_put_node; + } + + /* open succeeded, put active references */ + kernfs_put_active(kn); + return 0; + +err_put_node: + kernfs_unlink_open_file(kn, of, true); +err_seq_release: + seq_release(inode, file); +err_free: + kfree(of->prealloc_buf); + kfree(of); +err_out: + kernfs_put_active(kn); + return error; +} + +/* used from release/drain to ensure that ->release() is called exactly once */ +static void kernfs_release_file(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + /* + * @of is guaranteed to have no other file operations in flight and + * we just want to synchronize release and drain paths. + * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used + * here because drain path may be called from places which can + * cause circular dependency. + */ + lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); + + if (!of->released) { + /* + * A file is never detached without being released and we + * need to be able to release files which are deactivated + * and being drained. Don't use kernfs_ops(). + */ + kn->attr.ops->release(of); + of->released = true; + of_on(of)->nr_to_release--; + } +} + +static int kernfs_fop_release(struct inode *inode, struct file *filp) +{ + struct kernfs_node *kn = inode->i_private; + struct kernfs_open_file *of = kernfs_of(filp); + + if (kn->flags & KERNFS_HAS_RELEASE) { + struct mutex *mutex; + + mutex = kernfs_open_file_mutex_lock(kn); + kernfs_release_file(kn, of); + mutex_unlock(mutex); + } + + kernfs_unlink_open_file(kn, of, false); + seq_release(inode, filp); + kfree(of->prealloc_buf); + kfree(of); + + return 0; +} + +bool kernfs_should_drain_open_files(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + bool ret; + + /* + * @kn being deactivated guarantees that @kn->attr.open can't change + * beneath us making the lockless test below safe. + */ + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + rcu_read_lock(); + on = rcu_dereference(kn->attr.open); + ret = on && (on->nr_mmapped || on->nr_to_release); + rcu_read_unlock(); + + return ret; +} + +void kernfs_drain_open_files(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + struct mutex *mutex; + + mutex = kernfs_open_file_mutex_lock(kn); + on = kernfs_deref_open_node_locked(kn); + if (!on) { + mutex_unlock(mutex); + return; + } + + list_for_each_entry(of, &on->files, list) { + struct inode *inode = file_inode(of->file); + + if (of->mmapped) { + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + of->mmapped = false; + on->nr_mmapped--; + } + + if (kn->flags & KERNFS_HAS_RELEASE) + kernfs_release_file(kn, of); + } + + WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); + mutex_unlock(mutex); +} + +/* + * Kernfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return EPOLLERR|EPOLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) +{ + struct kernfs_open_node *on = of_on(of); + + poll_wait(of->file, &on->poll, wait); + + if (of->event != atomic_read(&on->event)) + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + + return DEFAULT_POLLMASK; +} + +static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) +{ + struct kernfs_open_file *of = kernfs_of(filp); + struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); + __poll_t ret; + + if (!kernfs_get_active(kn)) + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + + if (kn->attr.ops->poll) + ret = kn->attr.ops->poll(of, wait); + else + ret = kernfs_generic_poll(of, wait); + + kernfs_put_active(kn); + return ret; +} + +static void kernfs_notify_workfn(struct work_struct *work) +{ + struct kernfs_node *kn; + struct kernfs_super_info *info; + struct kernfs_root *root; +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); + return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); + + root = kernfs_root(kn); + /* kick fsnotify */ + + down_read(&root->kernfs_supers_rwsem); + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct kernfs_node *parent; + struct inode *p_inode = NULL; + struct inode *inode; + struct qstr name; + + /* + * We want fsnotify_modify() on @kn but as the + * modifications aren't originating from userland don't + * have the matching @file available. Look up the inodes + * and generate the events manually. + */ + inode = ilookup(info->sb, kernfs_ino(kn)); + if (!inode) + continue; + + name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); + parent = kernfs_get_parent(kn); + if (parent) { + p_inode = ilookup(info->sb, kernfs_ino(parent)); + if (p_inode) { + fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, + inode, FSNOTIFY_EVENT_INODE, + p_inode, &name, inode, 0); + iput(p_inode); + } + + kernfs_put(parent); + } + + if (!p_inode) + fsnotify_inode(inode, FS_MODIFY); + + iput(inode); + } + + up_read(&root->kernfs_supers_rwsem); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any + * context. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + struct kernfs_open_node *on; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + /* kick poll immediately */ + rcu_read_lock(); + on = rcu_dereference(kn->attr.open); + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + rcu_read_unlock(); + + /* schedule work to kick fsnotify */ + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); +} +EXPORT_SYMBOL_GPL(kernfs_notify); + +const struct file_operations kernfs_file_fops = { + .read_iter = kernfs_fop_read_iter, + .write_iter = kernfs_fop_write_iter, + .llseek = generic_file_llseek, + .mmap = kernfs_fop_mmap, + .open = kernfs_fop_open, + .release = kernfs_fop_release, + .poll = kernfs_fop_poll, + .fsync = noop_fsync, + .splice_read = copy_splice_read, + .splice_write = iter_file_splice_write, +}; + +/** + * __kernfs_create_file - kernfs internal function to create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @uid: uid of the file + * @gid: gid of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep + * + * Return: the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + struct lock_class_key *key) +{ + struct kernfs_node *kn; + unsigned flags; + int rc; + + flags = KERNFS_FILE; + + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, + uid, gid, flags); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->attr.ops = ops; + kn->attr.size = size; + kn->ns = ns; + kn->priv = priv; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&kn->dep_map, "kn->active", key, 0); + kn->flags |= KERNFS_LOCKDEP; + } +#endif + + /* + * kn->attr.ops is accessible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + kn->flags |= KERNFS_HAS_SEQ_SHOW; + if (ops->mmap) + kn->flags |= KERNFS_HAS_MMAP; + if (ops->release) + kn->flags |= KERNFS_HAS_RELEASE; + + rc = kernfs_add_one(kn); + if (rc) { + kernfs_put(kn); + return ERR_PTR(rc); + } + return kn; +} diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c new file mode 100644 index 0000000000..922719a343 --- /dev/null +++ b/fs/kernfs/inode.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/kernfs/inode.c - kernfs inode implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + */ + +#include <linux/pagemap.h> +#include <linux/backing-dev.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/security.h> + +#include "kernfs-internal.h" + +static const struct inode_operations kernfs_iops = { + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .listxattr = kernfs_iop_listxattr, +}; + +static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) +{ + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; + + mutex_lock(&iattr_mutex); + + if (kn->iattr || !alloc) + goto out_unlock; + + kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); + if (!kn->iattr) + goto out_unlock; + + /* assign default attributes */ + kn->iattr->ia_uid = GLOBAL_ROOT_UID; + kn->iattr->ia_gid = GLOBAL_ROOT_GID; + + ktime_get_real_ts64(&kn->iattr->ia_atime); + kn->iattr->ia_mtime = kn->iattr->ia_atime; + kn->iattr->ia_ctime = kn->iattr->ia_atime; + + simple_xattrs_init(&kn->iattr->xattrs); + atomic_set(&kn->iattr->nr_user_xattrs, 0); + atomic_set(&kn->iattr->user_xattr_size, 0); +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; +} + +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +{ + return __kernfs_iattrs(kn, 1); +} + +static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) +{ + return __kernfs_iattrs(kn, 0); +} + +int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + struct kernfs_iattrs *attrs; + unsigned int ia_valid = iattr->ia_valid; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (ia_valid & ATTR_UID) + attrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + attrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + attrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + attrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + attrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) + kn->mode = iattr->ia_mode; + return 0; +} + +/** + * kernfs_setattr - set iattr on a node + * @kn: target node + * @iattr: iattr to set + * + * Return: %0 on success, -errno on failure. + */ +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + int ret; + struct kernfs_root *root = kernfs_root(kn); + + down_write(&root->kernfs_iattr_rwsem); + ret = __kernfs_setattr(kn, iattr); + up_write(&root->kernfs_iattr_rwsem); + return ret; +} + +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root; + int error; + + if (!kn) + return -EINVAL; + + root = kernfs_root(kn); + down_write(&root->kernfs_iattr_rwsem); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); + if (error) + goto out; + + error = __kernfs_setattr(kn, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(&nop_mnt_idmap, inode, iattr); + +out: + up_write(&root->kernfs_iattr_rwsem); + return error; +} + +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct kernfs_node *kn = kernfs_dentry_node(dentry); + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); +} + +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); +} + +static inline void set_inode_attr(struct inode *inode, + struct kernfs_iattrs *attrs) +{ + inode->i_uid = attrs->ia_uid; + inode->i_gid = attrs->ia_gid; + inode->i_atime = attrs->ia_atime; + inode->i_mtime = attrs->ia_mtime; + inode_set_ctime_to_ts(inode, attrs->ia_ctime); +} + +static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) +{ + struct kernfs_iattrs *attrs = kn->iattr; + + inode->i_mode = kn->mode; + if (attrs) + /* + * kernfs_node has non-default attributes get them from + * persistent copy in kernfs_node. + */ + set_inode_attr(inode, attrs); + + if (kernfs_type(kn) == KERNFS_DIR) + set_nlink(inode, kn->dir.subdirs + 2); +} + +int kernfs_iop_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root = kernfs_root(kn); + + down_read(&root->kernfs_iattr_rwsem); + kernfs_refresh_inode(kn, inode); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + up_read(&root->kernfs_iattr_rwsem); + + return 0; +} + +static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) +{ + kernfs_get(kn); + inode->i_private = kn; + inode->i_mapping->a_ops = &ram_aops; + inode->i_op = &kernfs_iops; + inode->i_generation = kernfs_gen(kn); + + set_default_inode_attr(inode, kn->mode); + kernfs_refresh_inode(kn, inode); + + /* initialize inode according to type */ + switch (kernfs_type(kn)) { + case KERNFS_DIR: + inode->i_op = &kernfs_dir_iops; + inode->i_fop = &kernfs_dir_fops; + if (kn->flags & KERNFS_EMPTY_DIR) + make_empty_dir_inode(inode); + break; + case KERNFS_FILE: + inode->i_size = kn->attr.size; + inode->i_fop = &kernfs_file_fops; + break; + case KERNFS_LINK: + inode->i_op = &kernfs_symlink_iops; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * kernfs_get_inode - get inode for kernfs_node + * @sb: super block + * @kn: kernfs_node to allocate inode for + * + * Get inode for @kn. If such inode doesn't exist, a new inode is + * allocated and basics are initialized. New inode is returned + * locked. + * + * Locking: + * Kernel thread context (may sleep). + * + * Return: + * Pointer to allocated inode on success, %NULL on failure. + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ + struct inode *inode; + + inode = iget_locked(sb, kernfs_ino(kn)); + if (inode && (inode->i_state & I_NEW)) + kernfs_init_inode(kn, inode); + + return inode; +} + +/* + * The kernfs_node serves as both an inode and a directory entry for + * kernfs. To prevent the kernfs inode numbers from being freed + * prematurely we take a reference to kernfs_node from the kernfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void kernfs_evict_inode(struct inode *inode) +{ + struct kernfs_node *kn = inode->i_private; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + kernfs_put(kn); +} + +int kernfs_iop_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + struct kernfs_node *kn; + struct kernfs_root *root; + int ret; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + kn = inode->i_private; + root = kernfs_root(kn); + + down_read(&root->kernfs_iattr_rwsem); + kernfs_refresh_inode(kn, inode); + ret = generic_permission(&nop_mnt_idmap, inode, mask); + up_read(&root->kernfs_iattr_rwsem); + + return ret; +} + +int kernfs_xattr_get(struct kernfs_node *kn, const char *name, + void *value, size_t size) +{ + struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); + if (!attrs) + return -ENODATA; + + return simple_xattr_get(&attrs->xattrs, name, value, size); +} + +int kernfs_xattr_set(struct kernfs_node *kn, const char *name, + const void *value, size_t size, int flags) +{ + struct simple_xattr *old_xattr; + struct kernfs_iattrs *attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; +} + +static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, void *value, size_t size) +{ + const char *name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + + return kernfs_xattr_get(kn, name, value, size); +} + +static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + const char *name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + + return kernfs_xattr_set(kn, name, value, size, flags); +} + +static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + struct simple_xattr *old_xattr; + int ret; + + if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { + ret = -ENOSPC; + goto dec_count_out; + } + + if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { + ret = -ENOSPC; + goto dec_size_out; + } + + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) + return 0; + + if (IS_ERR(old_xattr)) { + ret = PTR_ERR(old_xattr); + goto dec_size_out; + } + + ret = 0; + size = old_xattr->size; + simple_xattr_free(old_xattr); +dec_size_out: + atomic_sub(size, sz); +dec_count_out: + atomic_dec(nr); + return ret; +} + +static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + struct simple_xattr *old_xattr; + + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) + return 0; + + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + atomic_sub(old_xattr->size, sz); + atomic_dec(nr); + simple_xattr_free(old_xattr); + return 0; +} + +static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + const char *full_name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + struct kernfs_iattrs *attrs; + + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) + return -EOPNOTSUPP; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (value) + return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, + value, size, flags); + else + return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, + value, size, flags); + +} + +static const struct xattr_handler kernfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_xattr_set, +}; + +static const struct xattr_handler kernfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_xattr_set, +}; + +static const struct xattr_handler kernfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_user_xattr_set, +}; + +const struct xattr_handler *kernfs_xattr_handlers[] = { + &kernfs_trusted_xattr_handler, + &kernfs_security_xattr_handler, + &kernfs_user_xattr_handler, + NULL +}; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h new file mode 100644 index 0000000000..a9b854cdfd --- /dev/null +++ b/fs/kernfs/kernfs-internal.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * fs/kernfs/kernfs-internal.h - kernfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> + */ + +#ifndef __KERNFS_INTERNAL_H +#define __KERNFS_INTERNAL_H + +#include <linux/lockdep.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> + +#include <linux/kernfs.h> +#include <linux/fs_context.h> + +struct kernfs_iattrs { + kuid_t ia_uid; + kgid_t ia_gid; + struct timespec64 ia_atime; + struct timespec64 ia_mtime; + struct timespec64 ia_ctime; + + struct simple_xattrs xattrs; + atomic_t nr_user_xattrs; + atomic_t user_xattr_size; +}; + +struct kernfs_root { + /* published fields */ + struct kernfs_node *kn; + unsigned int flags; /* KERNFS_ROOT_* flags */ + + /* private fields, do not use outside kernfs proper */ + struct idr ino_idr; + u32 last_id_lowbits; + u32 id_highbits; + struct kernfs_syscall_ops *syscall_ops; + + /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ + struct list_head supers; + + wait_queue_head_t deactivate_waitq; + struct rw_semaphore kernfs_rwsem; + struct rw_semaphore kernfs_iattr_rwsem; + struct rw_semaphore kernfs_supers_rwsem; +}; + +/* +1 to avoid triggering overflow warning when negating it */ +#define KN_DEACTIVATED_BIAS (INT_MIN + 1) + +/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ + +/** + * kernfs_root - find out the kernfs_root a kernfs_node belongs to + * @kn: kernfs_node of interest + * + * Return: the kernfs_root @kn belongs to. + */ +static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) +{ + /* if parent exists, it's always a dir; otherwise, @sd is a dir */ + if (kn->parent) + kn = kn->parent; + return kn->dir.root; +} + +/* + * mount.c + */ +struct kernfs_super_info { + struct super_block *sb; + + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + + /* + * Each sb is associated with one namespace tag, currently the + * network namespace of the task which mounted this kernfs + * instance. If multiple tags become necessary, make the following + * an array and compare kernfs_node tag against every entry. + */ + const void *ns; + + /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ + struct list_head node; +}; +#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) + +static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) +{ + if (d_really_is_negative(dentry)) + return NULL; + return d_inode(dentry)->i_private; +} + +static inline void kernfs_set_rev(struct kernfs_node *parent, + struct dentry *dentry) +{ + dentry->d_time = parent->dir.rev; +} + +static inline void kernfs_inc_rev(struct kernfs_node *parent) +{ + parent->dir.rev++; +} + +static inline bool kernfs_dir_changed(struct kernfs_node *parent, + struct dentry *dentry) +{ + if (parent->dir.rev != dentry->d_time) + return true; + return false; +} + +extern const struct super_operations kernfs_sops; +extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; + +/* + * inode.c + */ +extern const struct xattr_handler *kernfs_xattr_handlers[]; +void kernfs_evict_inode(struct inode *inode); +int kernfs_iop_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask); +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *iattr); +int kernfs_iop_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); + +/* + * dir.c + */ +extern const struct dentry_operations kernfs_dops; +extern const struct file_operations kernfs_dir_fops; +extern const struct inode_operations kernfs_dir_iops; + +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); +void kernfs_put_active(struct kernfs_node *kn); +int kernfs_add_one(struct kernfs_node *kn); +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + kuid_t uid, kgid_t gid, + unsigned flags); + +/* + * file.c + */ +extern const struct file_operations kernfs_file_fops; + +bool kernfs_should_drain_open_files(struct kernfs_node *kn); +void kernfs_drain_open_files(struct kernfs_node *kn); + +/* + * symlink.c + */ +extern const struct inode_operations kernfs_symlink_iops; + +/* + * kernfs locks + */ +extern struct kernfs_global_locks *kernfs_locks; +#endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c new file mode 100644 index 0000000000..c4bf26142e --- /dev/null +++ b/fs/kernfs/mount.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/kernfs/mount.c - kernfs mount implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/init.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/namei.h> +#include <linux/seq_file.h> +#include <linux/exportfs.h> +#include <linux/uuid.h> +#include <linux/statfs.h> + +#include "kernfs-internal.h" + +struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; +struct kernfs_global_locks *kernfs_locks; + +static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_options) + return scops->show_options(sf, root); + return 0; +} + +static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_node *node = kernfs_dentry_node(dentry); + struct kernfs_root *root = kernfs_root(node); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_path) + return scops->show_path(sf, node, root); + + seq_dentry(sf, dentry, " \t\n\\"); + return 0; +} + +static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + simple_statfs(dentry, buf); + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; +} + +const struct super_operations kernfs_sops = { + .statfs = kernfs_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = kernfs_evict_inode, + + .show_options = kernfs_sop_show_options, + .show_path = kernfs_sop_show_path, +}; + +static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, + struct inode *parent) +{ + struct kernfs_node *kn = inode->i_private; + + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + *max_len = 2; + *(u64 *)fh = kn->id; + return FILEID_KERNFS; +} + +static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type, bool get_parent) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_node *kn; + struct inode *inode; + u64 id; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_KERNFS: + id = *(u64 *)fid; + break; + case FILEID_INO32_GEN: + case FILEID_INO32_GEN_PARENT: + /* + * blk_log_action() exposes "LOW32,HIGH32" pair without + * type and userland can call us with generic fid + * constructed from them. Combine it back to ID. See + * blk_log_action(). + */ + id = ((u64)fid->i32.gen << 32) | fid->i32.ino; + break; + default: + return NULL; + } + + kn = kernfs_find_and_get_node_by_id(info->root, id); + if (!kn) + return ERR_PTR(-ESTALE); + + if (get_parent) { + struct kernfs_node *parent; + + parent = kernfs_get_parent(kn); + kernfs_put(kn); + kn = parent; + if (!kn) + return ERR_PTR(-ESTALE); + } + + inode = kernfs_get_inode(sb, kn); + kernfs_put(kn); + if (!inode) + return ERR_PTR(-ESTALE); + + return d_obtain_alias(inode); +} + +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); +} + +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); +} + +static struct dentry *kernfs_get_parent_dentry(struct dentry *child) +{ + struct kernfs_node *kn = kernfs_dentry_node(child); + + return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); +} + +static const struct export_operations kernfs_export_ops = { + .encode_fh = kernfs_encode_fh, + .fh_to_dentry = kernfs_fh_to_dentry, + .fh_to_parent = kernfs_fh_to_parent, + .get_parent = kernfs_get_parent_dentry, +}; + +/** + * kernfs_root_from_sb - determine kernfs_root associated with a super_block + * @sb: the super_block in question + * + * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one, + * %NULL is returned. + */ +struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) +{ + if (sb->s_op == &kernfs_sops) + return kernfs_info(sb)->root; + return NULL; +} + +/* + * find the next ancestor in the path down to @child, where @parent was the + * ancestor whose descendant we want to find. + * + * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root + * node. If @parent is b, then we return the node for c. + * Passing in d as @parent is not ok. + */ +static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, + struct kernfs_node *parent) +{ + if (child == parent) { + pr_crit_once("BUG in find_next_ancestor: called with parent == child"); + return NULL; + } + + while (child->parent != parent) { + if (!child->parent) + return NULL; + child = child->parent; + } + + return child; +} + +/** + * kernfs_node_dentry - get a dentry for the given kernfs_node + * @kn: kernfs_node for which a dentry is needed + * @sb: the kernfs super_block + * + * Return: the dentry pointer + */ +struct dentry *kernfs_node_dentry(struct kernfs_node *kn, + struct super_block *sb) +{ + struct dentry *dentry; + struct kernfs_node *knparent = NULL; + + BUG_ON(sb->s_op != &kernfs_sops); + + dentry = dget(sb->s_root); + + /* Check if this is the root kernfs_node */ + if (!kn->parent) + return dentry; + + knparent = find_next_ancestor(kn, NULL); + if (WARN_ON(!knparent)) { + dput(dentry); + return ERR_PTR(-EINVAL); + } + + do { + struct dentry *dtmp; + struct kernfs_node *kntmp; + + if (kn == knparent) + return dentry; + kntmp = find_next_ancestor(kn, knparent); + if (WARN_ON(!kntmp)) { + dput(dentry); + return ERR_PTR(-EINVAL); + } + dtmp = lookup_positive_unlocked(kntmp->name, dentry, + strlen(kntmp->name)); + dput(dentry); + if (IS_ERR(dtmp)) + return dtmp; + knparent = kntmp; + dentry = dtmp; + } while (true); +} + +static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *kf_root = kfc->root; + struct inode *inode; + struct dentry *root; + + info->sb = sb; + /* Userspace would break if executables or devices appear on sysfs */ + sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = kfc->magic; + sb->s_op = &kernfs_sops; + sb->s_xattr = kernfs_xattr_handlers; + if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) + sb->s_export_op = &kernfs_export_ops; + sb->s_time_gran = 1; + + /* sysfs dentries and inodes don't require IO to create */ + sb->s_shrink.seeks = 0; + + /* get root inode, initialize and unlock it */ + down_read(&kf_root->kernfs_rwsem); + inode = kernfs_get_inode(sb, info->root->kn); + up_read(&kf_root->kernfs_rwsem); + if (!inode) { + pr_debug("kernfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n", __func__); + return -ENOMEM; + } + sb->s_root = root; + sb->s_d_op = &kernfs_dops; + return 0; +} + +static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) +{ + struct kernfs_super_info *sb_info = kernfs_info(sb); + struct kernfs_super_info *info = fc->s_fs_info; + + return sb_info->root == info->root && sb_info->ns == info->ns; +} + +static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + kfc->ns_tag = NULL; + return set_anon_super_fc(sb, fc); +} + +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return: the namespace tag associated with kernfs super_block @sb. + */ +const void *kernfs_super_ns(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + + return info->ns; +} + +/** + * kernfs_get_tree - kernfs filesystem access/retrieval helper + * @fc: The filesystem context. + * + * This is to be called from each kernfs user's fs_context->ops->get_tree() + * implementation, which should set the specified ->@fs_type and ->@flags, and + * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, + * respectively. + * + * Return: %0 on success, -errno on failure. + */ +int kernfs_get_tree(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + struct super_block *sb; + struct kernfs_super_info *info; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->root = kfc->root; + info->ns = kfc->ns_tag; + INIT_LIST_HEAD(&info->node); + + fc->s_fs_info = info; + sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = kfc->root; + + kfc->new_sb_created = true; + + error = kernfs_fill_super(sb, kfc); + if (error) { + deactivate_locked_super(sb); + return error; + } + sb->s_flags |= SB_ACTIVE; + + uuid_gen(&sb->s_uuid); + + down_write(&root->kernfs_supers_rwsem); + list_add(&info->node, &info->root->supers); + up_write(&root->kernfs_supers_rwsem); + } + + fc->root = dget(sb->s_root); + return 0; +} + +void kernfs_free_fs_context(struct fs_context *fc) +{ + /* Note that we don't deal with kfc->ns_tag here. */ + kfree(fc->s_fs_info); + fc->s_fs_info = NULL; +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = info->root; + + down_write(&root->kernfs_supers_rwsem); + list_del(&info->node); + up_write(&root->kernfs_supers_rwsem); + + /* + * Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing kernfs_super_info. + */ + kill_anon_super(sb); + kfree(info); +} + +static void __init kernfs_mutex_init(void) +{ + int count; + + for (count = 0; count < NR_KERNFS_LOCKS; count++) + mutex_init(&kernfs_locks->open_file_mutex[count]); +} + +static void __init kernfs_lock_init(void) +{ + kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL); + WARN_ON(!kernfs_locks); + + kernfs_mutex_init(); +} + +void __init kernfs_init(void) +{ + kernfs_node_cache = kmem_cache_create("kernfs_node_cache", + sizeof(struct kernfs_node), + 0, SLAB_PANIC, NULL); + + /* Creates slab cache for kernfs inode attributes */ + kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", + sizeof(struct kernfs_iattrs), + 0, SLAB_PANIC, NULL); + + kernfs_lock_init(); +} diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c new file mode 100644 index 0000000000..45371a70ca --- /dev/null +++ b/fs/kernfs/symlink.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/kernfs/symlink.c - kernfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/namei.h> + +#include "kernfs-internal.h" + +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Return: the created node on success, ERR_PTR() value on error. + * Ownership of the link matches ownership of the target. + */ +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target) +{ + struct kernfs_node *kn; + int error; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; + + if (target->iattr) { + uid = target->iattr->ia_uid; + gid = target->iattr->ia_gid; + } + + kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK); + if (!kn) + return ERR_PTR(-ENOMEM); + + if (kernfs_ns_enabled(parent)) + kn->ns = target->ns; + kn->symlink.target_kn = target; + kernfs_get(target); /* ref owned by symlink */ + + error = kernfs_add_one(kn); + if (!error) + return kn; + + kernfs_put(kn); + return ERR_PTR(error); +} + +static int kernfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) +{ + struct kernfs_node *base, *kn; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent; + while (base->parent) { + kn = target->parent; + while (kn->parent && base != kn) + kn = kn->parent; + + if (base == kn) + break; + + if ((s - path) + 3 >= PATH_MAX) + return -ENAMETOOLONG; + + strcpy(s, "../"); + s += 3; + base = base->parent; + } + + /* determine end of target string for reverse fillup */ + kn = target; + while (kn->parent && kn != base) { + len += strlen(kn->name) + 1; + kn = kn->parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len >= PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + kn = target; + while (kn->parent && kn != base) { + int slen = strlen(kn->name); + + len -= slen; + memcpy(s + len, kn->name, slen); + if (len) + s[--len] = '/'; + + kn = kn->parent; + } + + return 0; +} + +static int kernfs_getlink(struct inode *inode, char *path) +{ + struct kernfs_node *kn = inode->i_private; + struct kernfs_node *parent = kn->parent; + struct kernfs_node *target = kn->symlink.target_kn; + struct kernfs_root *root = kernfs_root(parent); + int error; + + down_read(&root->kernfs_rwsem); + error = kernfs_get_target_path(parent, target, path); + up_read(&root->kernfs_rwsem); + + return error; +} + +static const char *kernfs_iop_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + char *body; + int error; + + if (!dentry) + return ERR_PTR(-ECHILD); + body = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!body) + return ERR_PTR(-ENOMEM); + error = kernfs_getlink(inode, body); + if (unlikely(error < 0)) { + kfree(body); + return ERR_PTR(error); + } + set_delayed_call(done, kfree_link, body); + return body; +} + +const struct inode_operations kernfs_symlink_iops = { + .listxattr = kernfs_iop_listxattr, + .get_link = kernfs_iop_get_link, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .permission = kernfs_iop_permission, +}; |