diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
commit | 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch) | |
tree | 848558de17fb3008cdf4d861b01ac7781903ce39 /fs/orangefs | |
parent | Initial commit. (diff) | |
download | linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip |
Adding upstream version 6.1.76.upstream/6.1.76upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/orangefs')
28 files changed, 10637 insertions, 0 deletions
diff --git a/fs/orangefs/Kconfig b/fs/orangefs/Kconfig new file mode 100644 index 000000000..890c0ae9a --- /dev/null +++ b/fs/orangefs/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +config ORANGEFS_FS + tristate "ORANGEFS (Powered by PVFS) support" + select FS_POSIX_ACL + help + Orange is a parallel file system designed for use on high end + computing (HEC) systems. diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile new file mode 100644 index 000000000..9b6c50bb1 --- /dev/null +++ b/fs/orangefs/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the ORANGEFS filesystem. +# + +obj-$(CONFIG_ORANGEFS_FS) += orangefs.o + +orangefs-objs := acl.o file.o orangefs-cache.o orangefs-utils.o xattr.o \ + dcache.o inode.o orangefs-sysfs.o orangefs-mod.o super.o \ + devorangefs-req.o namei.o symlink.o dir.o orangefs-bufmap.o \ + orangefs-debugfs.o waitqueue.o diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c new file mode 100644 index 000000000..605e5a350 --- /dev/null +++ b/fs/orangefs/acl.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" +#include <linux/posix_acl_xattr.h> + +struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu) +{ + struct posix_acl *acl; + int ret; + char *key = NULL, *value = NULL; + + if (rcu) + return ERR_PTR(-ECHILD); + + switch (type) { + case ACL_TYPE_ACCESS: + key = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + key = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + gossip_err("orangefs_get_acl: bogus value of type %d\n", type); + return ERR_PTR(-EINVAL); + } + /* + * Rather than incurring a network call just to determine the exact + * length of the attribute, I just allocate a max length to save on + * the network call. Conceivably, we could pass NULL to + * orangefs_inode_getxattr() to probe the length of the value, but + * I don't do that for now. + */ + value = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + + gossip_debug(GOSSIP_ACL_DEBUG, + "inode %pU, key %s, type %d\n", + get_khandle_from_ino(inode), + key, + type); + ret = orangefs_inode_getxattr(inode, key, value, + ORANGEFS_MAX_XATTR_VALUELEN); + /* if the key exists, convert it to an in-memory rep */ + if (ret > 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, ret); + } else if (ret == -ENODATA || ret == -ENOSYS) { + acl = NULL; + } else { + gossip_err("inode %pU retrieving acl's failed with error %d\n", + get_khandle_from_ino(inode), + ret); + acl = ERR_PTR(ret); + } + /* kfree(NULL) is safe, so don't worry if value ever got used */ + kfree(value); + return acl; +} + +static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) +{ + int error = 0; + void *value = NULL; + size_t size = 0; + const char *name = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + gossip_err("%s: invalid type %d!\n", __func__, type); + return -EINVAL; + } + + gossip_debug(GOSSIP_ACL_DEBUG, + "%s: inode %pU, key %s type %d\n", + __func__, get_khandle_from_ino(inode), + name, + type); + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (error < 0) + goto out; + } + + gossip_debug(GOSSIP_ACL_DEBUG, + "%s: name %s, value %p, size %zd, acl %p\n", + __func__, name, value, size, acl); + /* + * Go ahead and set the extended attribute now. NOTE: Suppose acl + * was NULL, then value will be NULL and size will be 0 and that + * will xlate to a removexattr. However, we don't want removexattr + * complain if attributes does not exist. + */ + error = orangefs_inode_setxattr(inode, name, value, size, 0); + +out: + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) +{ + int error; + struct iattr iattr; + int rc; + + memset(&iattr, 0, sizeof iattr); + + if (type == ACL_TYPE_ACCESS && acl) { + /* + * posix_acl_update_mode checks to see if the permissions + * described by the ACL can be encoded into the + * object's mode. If so, it sets "acl" to NULL + * and "mode" to the new desired value. It is up to + * us to propagate the new mode back to the server... + */ + error = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, &acl); + if (error) { + gossip_err("%s: posix_acl_update_mode err: %d\n", + __func__, + error); + return error; + } + + if (inode->i_mode != iattr.ia_mode) + iattr.ia_valid = ATTR_MODE; + + } + + rc = __orangefs_set_acl(inode, acl, type); + + if (!rc && (iattr.ia_valid == ATTR_MODE)) + rc = __orangefs_setattr(inode, &iattr); + + return rc; +} + +int orangefs_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + umode_t mode = inode->i_mode; + struct iattr iattr; + int error = 0; + + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; + + if (default_acl) { + error = __orangefs_set_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; + } + + if (acl) { + if (!error) + error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } else { + inode->i_acl = NULL; + } + + /* If mode of the inode was changed, then do a forcible ->setattr */ + if (mode != inode->i_mode) { + memset(&iattr, 0, sizeof iattr); + inode->i_mode = mode; + iattr.ia_mode = mode; + iattr.ia_valid |= ATTR_MODE; + __orangefs_setattr(inode, &iattr); + } + + return error; +} diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c new file mode 100644 index 000000000..8bbe9486e --- /dev/null +++ b/fs/orangefs/dcache.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +/* + * Implementation of dentry (directory cache) functions. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" + +/* Returns 1 if dentry can still be trusted, else 0. */ +static int orangefs_revalidate_lookup(struct dentry *dentry) +{ + struct dentry *parent_dentry = dget_parent(dentry); + struct inode *parent_inode = parent_dentry->d_inode; + struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode); + struct inode *inode = dentry->d_inode; + struct orangefs_kernel_op_s *new_op; + int ret = 0; + int err = 0; + + gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__); + + new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP); + if (!new_op) { + ret = -ENOMEM; + goto out_put_parent; + } + + new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW; + new_op->upcall.req.lookup.parent_refn = parent->refn; + strncpy(new_op->upcall.req.lookup.d_name, + dentry->d_name.name, + ORANGEFS_NAME_MAX - 1); + + gossip_debug(GOSSIP_DCACHE_DEBUG, + "%s:%s:%d interrupt flag [%d]\n", + __FILE__, + __func__, + __LINE__, + get_interruptible_flag(parent_inode)); + + err = service_operation(new_op, "orangefs_lookup", + get_interruptible_flag(parent_inode)); + + /* Positive dentry: reject if error or not the same inode. */ + if (inode) { + if (err) { + gossip_debug(GOSSIP_DCACHE_DEBUG, + "%s:%s:%d lookup failure.\n", + __FILE__, __func__, __LINE__); + goto out_drop; + } + if (!match_handle(new_op->downcall.resp.lookup.refn.khandle, + inode)) { + gossip_debug(GOSSIP_DCACHE_DEBUG, + "%s:%s:%d no match.\n", + __FILE__, __func__, __LINE__); + goto out_drop; + } + + /* Negative dentry: reject if success or error other than ENOENT. */ + } else { + gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n", + __func__); + if (!err || err != -ENOENT) { + if (new_op->downcall.status != 0) + gossip_debug(GOSSIP_DCACHE_DEBUG, + "%s:%s:%d lookup failure.\n", + __FILE__, __func__, __LINE__); + goto out_drop; + } + } + + orangefs_set_timeout(dentry); + ret = 1; +out_release_op: + op_release(new_op); +out_put_parent: + dput(parent_dentry); + return ret; +out_drop: + gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n", + __FILE__, __func__, __LINE__); + goto out_release_op; +} + +/* + * Verify that dentry is valid. + * + * Should return 1 if dentry can still be trusted, else 0. + */ +static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + int ret; + unsigned long time = (unsigned long) dentry->d_fsdata; + + if (time_before(jiffies, time)) + return 1; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n", + __func__, dentry); + + /* skip root handle lookups. */ + if (dentry->d_inode && is_root_handle(dentry->d_inode)) + return 1; + + /* + * If this passes, the positive dentry still exists or the negative + * dentry still does not exist. + */ + if (!orangefs_revalidate_lookup(dentry)) + return 0; + + /* We do not need to continue with negative dentries. */ + if (!dentry->d_inode) { + gossip_debug(GOSSIP_DCACHE_DEBUG, + "%s: negative dentry or positive dentry and inode valid.\n", + __func__); + return 1; + } + + /* Now we must perform a getattr to validate the inode contents. */ + + ret = orangefs_inode_check_changed(dentry->d_inode); + if (ret < 0) { + gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d getattr failure.\n", + __FILE__, __func__, __LINE__); + return 0; + } + return !ret; +} + +const struct dentry_operations orangefs_dentry_operations = { + .d_revalidate = orangefs_d_revalidate, +}; diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c new file mode 100644 index 000000000..33ee8cb32 --- /dev/null +++ b/fs/orangefs/devorangefs-req.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * Changes by Acxiom Corporation to add protocol version to kernel + * communication, Copyright Acxiom Corporation, 2005. + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-dev-proto.h" +#include "orangefs-bufmap.h" +#include "orangefs-debugfs.h" + +#include <linux/debugfs.h> +#include <linux/slab.h> + +/* this file implements the /dev/pvfs2-req device node */ + +uint32_t orangefs_userspace_version; + +static int open_access_count; + +static DEFINE_MUTEX(devreq_mutex); + +#define DUMP_DEVICE_ERROR() \ +do { \ + gossip_err("*****************************************************\n");\ + gossip_err("ORANGEFS Device Error: You cannot open the device file "); \ + gossip_err("\n/dev/%s more than once. Please make sure that\nthere " \ + "are no ", ORANGEFS_REQDEVICE_NAME); \ + gossip_err("instances of a program using this device\ncurrently " \ + "running. (You must verify this!)\n"); \ + gossip_err("For example, you can use the lsof program as follows:\n");\ + gossip_err("'lsof | grep %s' (run this as root)\n", \ + ORANGEFS_REQDEVICE_NAME); \ + gossip_err(" open_access_count = %d\n", open_access_count); \ + gossip_err("*****************************************************\n");\ +} while (0) + +static int hash_func(__u64 tag, int table_size) +{ + return do_div(tag, (unsigned int)table_size); +} + +static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op) +{ + int index = hash_func(op->tag, hash_table_size); + + list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]); +} + +/* + * find the op with this tag and remove it from the in progress + * hash table. + */ +static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag) +{ + struct orangefs_kernel_op_s *op, *next; + int index; + + index = hash_func(tag, hash_table_size); + + spin_lock(&orangefs_htable_ops_in_progress_lock); + list_for_each_entry_safe(op, + next, + &orangefs_htable_ops_in_progress[index], + list) { + if (op->tag == tag && !op_state_purged(op) && + !op_state_given_up(op)) { + list_del_init(&op->list); + spin_unlock(&orangefs_htable_ops_in_progress_lock); + return op; + } + } + + spin_unlock(&orangefs_htable_ops_in_progress_lock); + return NULL; +} + +/* Returns whether any FS are still pending remounted */ +static int mark_all_pending_mounts(void) +{ + int unmounted = 1; + struct orangefs_sb_info_s *orangefs_sb = NULL; + + spin_lock(&orangefs_superblocks_lock); + list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { + /* All of these file system require a remount */ + orangefs_sb->mount_pending = 1; + unmounted = 0; + } + spin_unlock(&orangefs_superblocks_lock); + return unmounted; +} + +/* + * Determine if a given file system needs to be remounted or not + * Returns -1 on error + * 0 if already mounted + * 1 if needs remount + */ +static int fs_mount_pending(__s32 fsid) +{ + int mount_pending = -1; + struct orangefs_sb_info_s *orangefs_sb = NULL; + + spin_lock(&orangefs_superblocks_lock); + list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { + if (orangefs_sb->fs_id == fsid) { + mount_pending = orangefs_sb->mount_pending; + break; + } + } + spin_unlock(&orangefs_superblocks_lock); + return mount_pending; +} + +static int orangefs_devreq_open(struct inode *inode, struct file *file) +{ + int ret = -EINVAL; + + /* in order to ensure that the filesystem driver sees correct UIDs */ + if (file->f_cred->user_ns != &init_user_ns) { + gossip_err("%s: device cannot be opened outside init_user_ns\n", + __func__); + goto out; + } + + if (!(file->f_flags & O_NONBLOCK)) { + gossip_err("%s: device cannot be opened in blocking mode\n", + __func__); + goto out; + } + ret = -EACCES; + gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n"); + mutex_lock(&devreq_mutex); + + if (open_access_count == 0) { + open_access_count = 1; + ret = 0; + } else { + DUMP_DEVICE_ERROR(); + } + mutex_unlock(&devreq_mutex); + +out: + + gossip_debug(GOSSIP_DEV_DEBUG, + "pvfs2-client-core: open device complete (ret = %d)\n", + ret); + return ret; +} + +/* Function for read() callers into the device */ +static ssize_t orangefs_devreq_read(struct file *file, + char __user *buf, + size_t count, loff_t *offset) +{ + struct orangefs_kernel_op_s *op, *temp; + __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION; + static __s32 magic = ORANGEFS_DEVREQ_MAGIC; + struct orangefs_kernel_op_s *cur_op; + unsigned long ret; + + /* We do not support blocking IO. */ + if (!(file->f_flags & O_NONBLOCK)) { + gossip_err("%s: blocking read from client-core.\n", + __func__); + return -EINVAL; + } + + /* + * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then + * always read with that size buffer. + */ + if (count != MAX_DEV_REQ_UPSIZE) { + gossip_err("orangefs: client-core tried to read wrong size\n"); + return -EINVAL; + } + + /* Check for an empty list before locking. */ + if (list_empty(&orangefs_request_list)) + return -EAGAIN; + +restart: + cur_op = NULL; + /* Get next op (if any) from top of list. */ + spin_lock(&orangefs_request_list_lock); + list_for_each_entry_safe(op, temp, &orangefs_request_list, list) { + __s32 fsid; + /* This lock is held past the end of the loop when we break. */ + spin_lock(&op->lock); + if (unlikely(op_state_purged(op) || op_state_given_up(op))) { + spin_unlock(&op->lock); + continue; + } + + fsid = fsid_of_op(op); + if (fsid != ORANGEFS_FS_ID_NULL) { + int ret; + /* Skip ops whose filesystem needs to be mounted. */ + ret = fs_mount_pending(fsid); + if (ret == 1) { + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: mount pending, skipping op tag " + "%llu %s\n", + __func__, + llu(op->tag), + get_opname_string(op)); + spin_unlock(&op->lock); + continue; + /* + * Skip ops whose filesystem we don't know about unless + * it is being mounted or unmounted. It is possible for + * a filesystem we don't know about to be unmounted if + * it fails to mount in the kernel after userspace has + * been sent the mount request. + */ + /* XXX: is there a better way to detect this? */ + } else if (ret == -1 && + !(op->upcall.type == + ORANGEFS_VFS_OP_FS_MOUNT || + op->upcall.type == + ORANGEFS_VFS_OP_GETATTR || + op->upcall.type == + ORANGEFS_VFS_OP_FS_UMOUNT)) { + gossip_debug(GOSSIP_DEV_DEBUG, + "orangefs: skipping op tag %llu %s\n", + llu(op->tag), get_opname_string(op)); + gossip_err( + "orangefs: ERROR: fs_mount_pending %d\n", + fsid); + spin_unlock(&op->lock); + continue; + } + } + /* + * Either this op does not pertain to a filesystem, is mounting + * a filesystem, or pertains to a mounted filesystem. Let it + * through. + */ + cur_op = op; + break; + } + + /* + * At this point we either have a valid op and can continue or have not + * found an op and must ask the client to try again later. + */ + if (!cur_op) { + spin_unlock(&orangefs_request_list_lock); + return -EAGAIN; + } + + gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n", + __func__, + llu(cur_op->tag), + get_opname_string(cur_op)); + + /* + * Such an op should never be on the list in the first place. If so, we + * will abort. + */ + if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) { + gossip_err("orangefs: ERROR: Current op already queued.\n"); + list_del_init(&cur_op->list); + spin_unlock(&cur_op->lock); + spin_unlock(&orangefs_request_list_lock); + return -EAGAIN; + } + + list_del_init(&cur_op->list); + spin_unlock(&orangefs_request_list_lock); + + spin_unlock(&cur_op->lock); + + /* Push the upcall out. */ + ret = copy_to_user(buf, &proto_ver, sizeof(__s32)); + if (ret != 0) + goto error; + ret = copy_to_user(buf + sizeof(__s32), &magic, sizeof(__s32)); + if (ret != 0) + goto error; + ret = copy_to_user(buf + 2 * sizeof(__s32), + &cur_op->tag, + sizeof(__u64)); + if (ret != 0) + goto error; + ret = copy_to_user(buf + 2 * sizeof(__s32) + sizeof(__u64), + &cur_op->upcall, + sizeof(struct orangefs_upcall_s)); + if (ret != 0) + goto error; + + spin_lock(&orangefs_htable_ops_in_progress_lock); + spin_lock(&cur_op->lock); + if (unlikely(op_state_given_up(cur_op))) { + spin_unlock(&cur_op->lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); + complete(&cur_op->waitq); + goto restart; + } + + /* + * Set the operation to be in progress and move it between lists since + * it has been sent to the client. + */ + set_op_state_inprogress(cur_op); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: 1 op:%s: op_state:%d: process:%s:\n", + __func__, + get_opname_string(cur_op), + cur_op->op_state, + current->comm); + orangefs_devreq_add_op(cur_op); + spin_unlock(&cur_op->lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); + + /* The client only asks to read one size buffer. */ + return MAX_DEV_REQ_UPSIZE; +error: + /* + * We were unable to copy the op data to the client. Put the op back in + * list. If client has crashed, the op will be purged later when the + * device is released. + */ + gossip_err("orangefs: Failed to copy data to user space\n"); + spin_lock(&orangefs_request_list_lock); + spin_lock(&cur_op->lock); + if (likely(!op_state_given_up(cur_op))) { + set_op_state_waiting(cur_op); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: 2 op:%s: op_state:%d: process:%s:\n", + __func__, + get_opname_string(cur_op), + cur_op->op_state, + current->comm); + list_add(&cur_op->list, &orangefs_request_list); + spin_unlock(&cur_op->lock); + } else { + spin_unlock(&cur_op->lock); + complete(&cur_op->waitq); + } + spin_unlock(&orangefs_request_list_lock); + return -EFAULT; +} + +/* + * Function for writev() callers into the device. + * + * Userspace should have written: + * - __u32 version + * - __u32 magic + * - __u64 tag + * - struct orangefs_downcall_s + * - trailer buffer (in the case of READDIR operations) + */ +static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + ssize_t ret; + struct orangefs_kernel_op_s *op = NULL; + struct { + __u32 version; + __u32 magic; + __u64 tag; + } head; + int total = ret = iov_iter_count(iter); + int downcall_size = sizeof(struct orangefs_downcall_s); + int head_size = sizeof(head); + + gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n", + __func__, + total, + ret); + + if (total < MAX_DEV_REQ_DOWNSIZE) { + gossip_err("%s: total:%d: must be at least:%u:\n", + __func__, + total, + (unsigned int) MAX_DEV_REQ_DOWNSIZE); + return -EFAULT; + } + + if (!copy_from_iter_full(&head, head_size, iter)) { + gossip_err("%s: failed to copy head.\n", __func__); + return -EFAULT; + } + + if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) { + gossip_err("%s: userspace claims version" + "%d, minimum version required: %d.\n", + __func__, + head.version, + ORANGEFS_MINIMUM_USERSPACE_VERSION); + return -EPROTO; + } + + if (head.magic != ORANGEFS_DEVREQ_MAGIC) { + gossip_err("Error: Device magic number does not match.\n"); + return -EPROTO; + } + + if (!orangefs_userspace_version) { + orangefs_userspace_version = head.version; + } else if (orangefs_userspace_version != head.version) { + gossip_err("Error: userspace version changes\n"); + return -EPROTO; + } + + /* remove the op from the in progress hash table */ + op = orangefs_devreq_remove_op(head.tag); + if (!op) { + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: No one's waiting for tag %llu\n", + __func__, llu(head.tag)); + return ret; + } + + if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) { + gossip_err("%s: failed to copy downcall.\n", __func__); + goto Efault; + } + + if (op->downcall.status) + goto wakeup; + + /* + * We've successfully peeled off the head and the downcall. + * Something has gone awry if total doesn't equal the + * sum of head_size, downcall_size and trailer_size. + */ + if ((head_size + downcall_size + op->downcall.trailer_size) != total) { + gossip_err("%s: funky write, head_size:%d" + ": downcall_size:%d: trailer_size:%lld" + ": total size:%d:\n", + __func__, + head_size, + downcall_size, + op->downcall.trailer_size, + total); + goto Efault; + } + + /* Only READDIR operations should have trailers. */ + if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) && + (op->downcall.trailer_size != 0)) { + gossip_err("%s: %x operation with trailer.", + __func__, + op->downcall.type); + goto Efault; + } + + /* READDIR operations should always have trailers. */ + if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) && + (op->downcall.trailer_size == 0)) { + gossip_err("%s: %x operation with no trailer.", + __func__, + op->downcall.type); + goto Efault; + } + + if (op->downcall.type != ORANGEFS_VFS_OP_READDIR) + goto wakeup; + + op->downcall.trailer_buf = vzalloc(op->downcall.trailer_size); + if (!op->downcall.trailer_buf) + goto Enomem; + + if (!copy_from_iter_full(op->downcall.trailer_buf, + op->downcall.trailer_size, iter)) { + gossip_err("%s: failed to copy trailer.\n", __func__); + vfree(op->downcall.trailer_buf); + goto Efault; + } + +wakeup: + /* + * Return to vfs waitqueue, and back to service_operation + * through wait_for_matching_downcall. + */ + spin_lock(&op->lock); + if (unlikely(op_is_cancel(op))) { + spin_unlock(&op->lock); + put_cancel(op); + } else if (unlikely(op_state_given_up(op))) { + spin_unlock(&op->lock); + complete(&op->waitq); + } else { + set_op_state_serviced(op); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: op:%s: op_state:%d: process:%s:\n", + __func__, + get_opname_string(op), + op->op_state, + current->comm); + spin_unlock(&op->lock); + } + return ret; + +Efault: + op->downcall.status = -(ORANGEFS_ERROR_BIT | 9); + ret = -EFAULT; + goto wakeup; + +Enomem: + op->downcall.status = -(ORANGEFS_ERROR_BIT | 8); + ret = -ENOMEM; + goto wakeup; +} + +/* + * NOTE: gets called when the last reference to this device is dropped. + * Using the open_access_count variable, we enforce a reference count + * on this file so that it can be opened by only one process at a time. + * the devreq_mutex is used to make sure all i/o has completed + * before we call orangefs_bufmap_finalize, and similar such tricky + * situations + */ +static int orangefs_devreq_release(struct inode *inode, struct file *file) +{ + int unmounted = 0; + + gossip_debug(GOSSIP_DEV_DEBUG, + "%s:pvfs2-client-core: exiting, closing device\n", + __func__); + + mutex_lock(&devreq_mutex); + orangefs_bufmap_finalize(); + + open_access_count = -1; + + unmounted = mark_all_pending_mounts(); + gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n", + (unmounted ? "UNMOUNTED" : "MOUNTED")); + + purge_waiting_ops(); + purge_inprogress_ops(); + + orangefs_bufmap_run_down(); + + gossip_debug(GOSSIP_DEV_DEBUG, + "pvfs2-client-core: device close complete\n"); + open_access_count = 0; + orangefs_userspace_version = 0; + mutex_unlock(&devreq_mutex); + return 0; +} + +int is_daemon_in_service(void) +{ + int in_service; + + /* + * What this function does is checks if client-core is alive + * based on the access count we maintain on the device. + */ + mutex_lock(&devreq_mutex); + in_service = open_access_count == 1 ? 0 : -EIO; + mutex_unlock(&devreq_mutex); + return in_service; +} + +bool __is_daemon_in_service(void) +{ + return open_access_count == 1; +} + +static inline long check_ioctl_command(unsigned int command) +{ + /* Check for valid ioctl codes */ + if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) { + gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n", + command, + _IOC_TYPE(command), + ORANGEFS_DEV_MAGIC); + return -EINVAL; + } + /* and valid ioctl commands */ + if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) { + gossip_err("Invalid ioctl command number [%d >= %d]\n", + _IOC_NR(command), ORANGEFS_DEV_MAXNR); + return -ENOIOCTLCMD; + } + return 0; +} + +static long dispatch_ioctl_command(unsigned int command, unsigned long arg) +{ + static __s32 magic = ORANGEFS_DEVREQ_MAGIC; + static __s32 max_up_size = MAX_DEV_REQ_UPSIZE; + static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE; + struct ORANGEFS_dev_map_desc user_desc; + int ret = 0; + int upstream_kmod = 1; + struct orangefs_sb_info_s *orangefs_sb; + + /* mtmoore: add locking here */ + + switch (command) { + case ORANGEFS_DEV_GET_MAGIC: + return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ? + -EIO : + 0); + case ORANGEFS_DEV_GET_MAX_UPSIZE: + return ((put_user(max_up_size, + (__s32 __user *) arg) == -EFAULT) ? + -EIO : + 0); + case ORANGEFS_DEV_GET_MAX_DOWNSIZE: + return ((put_user(max_down_size, + (__s32 __user *) arg) == -EFAULT) ? + -EIO : + 0); + case ORANGEFS_DEV_MAP: + ret = copy_from_user(&user_desc, + (struct ORANGEFS_dev_map_desc __user *) + arg, + sizeof(struct ORANGEFS_dev_map_desc)); + /* WTF -EIO and not -EFAULT? */ + return ret ? -EIO : orangefs_bufmap_initialize(&user_desc); + case ORANGEFS_DEV_REMOUNT_ALL: + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: got ORANGEFS_DEV_REMOUNT_ALL\n", + __func__); + + /* + * remount all mounted orangefs volumes to regain the lost + * dynamic mount tables (if any) -- NOTE: this is done + * without keeping the superblock list locked due to the + * upcall/downcall waiting. also, the request mutex is + * used to ensure that no operations will be serviced until + * all of the remounts are serviced (to avoid ops between + * mounts to fail) + */ + ret = mutex_lock_interruptible(&orangefs_request_mutex); + if (ret < 0) + return ret; + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: priority remount in progress\n", + __func__); + spin_lock(&orangefs_superblocks_lock); + list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { + /* + * We have to drop the spinlock, so entries can be + * removed. They can't be freed, though, so we just + * keep the forward pointers and zero the back ones - + * that way we can get to the rest of the list. + */ + if (!orangefs_sb->list.prev) + continue; + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: Remounting SB %p\n", + __func__, + orangefs_sb); + + spin_unlock(&orangefs_superblocks_lock); + ret = orangefs_remount(orangefs_sb); + spin_lock(&orangefs_superblocks_lock); + if (ret) { + gossip_debug(GOSSIP_DEV_DEBUG, + "SB %p remount failed\n", + orangefs_sb); + break; + } + } + spin_unlock(&orangefs_superblocks_lock); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: priority remount complete\n", + __func__); + mutex_unlock(&orangefs_request_mutex); + return ret; + + case ORANGEFS_DEV_UPSTREAM: + ret = copy_to_user((void __user *)arg, + &upstream_kmod, + sizeof(upstream_kmod)); + + if (ret != 0) + return -EIO; + else + return ret; + + case ORANGEFS_DEV_CLIENT_MASK: + return orangefs_debugfs_new_client_mask((void __user *)arg); + case ORANGEFS_DEV_CLIENT_STRING: + return orangefs_debugfs_new_client_string((void __user *)arg); + case ORANGEFS_DEV_DEBUG: + return orangefs_debugfs_new_debug((void __user *)arg); + default: + return -ENOIOCTLCMD; + } + return -ENOIOCTLCMD; +} + +static long orangefs_devreq_ioctl(struct file *file, + unsigned int command, unsigned long arg) +{ + long ret; + + /* Check for properly constructed commands */ + ret = check_ioctl_command(command); + if (ret < 0) + return (int)ret; + + return (int)dispatch_ioctl_command(command, arg); +} + +#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ + +/* Compat structure for the ORANGEFS_DEV_MAP ioctl */ +struct ORANGEFS_dev_map_desc32 { + compat_uptr_t ptr; + __s32 total_size; + __s32 size; + __s32 count; +}; + +/* + * 32 bit user-space apps' ioctl handlers when kernel modules + * is compiled as a 64 bit one + */ +static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long args) +{ + long ret; + + /* Check for properly constructed commands */ + ret = check_ioctl_command(cmd); + if (ret < 0) + return ret; + if (cmd == ORANGEFS_DEV_MAP) { + struct ORANGEFS_dev_map_desc desc; + struct ORANGEFS_dev_map_desc32 d32; + + if (copy_from_user(&d32, (void __user *)args, sizeof(d32))) + return -EFAULT; + + desc.ptr = compat_ptr(d32.ptr); + desc.total_size = d32.total_size; + desc.size = d32.size; + desc.count = d32.count; + return orangefs_bufmap_initialize(&desc); + } + /* no other ioctl requires translation */ + return dispatch_ioctl_command(cmd, args); +} + +#endif /* CONFIG_COMPAT is in .config */ + +static __poll_t orangefs_devreq_poll(struct file *file, + struct poll_table_struct *poll_table) +{ + __poll_t poll_revent_mask = 0; + + poll_wait(file, &orangefs_request_list_waitq, poll_table); + + if (!list_empty(&orangefs_request_list)) + poll_revent_mask |= EPOLLIN; + return poll_revent_mask; +} + +/* the assigned character device major number */ +static int orangefs_dev_major; + +static const struct file_operations orangefs_devreq_file_operations = { + .owner = THIS_MODULE, + .read = orangefs_devreq_read, + .write_iter = orangefs_devreq_write_iter, + .open = orangefs_devreq_open, + .release = orangefs_devreq_release, + .unlocked_ioctl = orangefs_devreq_ioctl, + +#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ + .compat_ioctl = orangefs_devreq_compat_ioctl, +#endif + .poll = orangefs_devreq_poll +}; + +/* + * Initialize orangefs device specific state: + * Must be called at module load time only + */ +int orangefs_dev_init(void) +{ + /* register orangefs-req device */ + orangefs_dev_major = register_chrdev(0, + ORANGEFS_REQDEVICE_NAME, + &orangefs_devreq_file_operations); + if (orangefs_dev_major < 0) { + gossip_debug(GOSSIP_DEV_DEBUG, + "Failed to register /dev/%s (error %d)\n", + ORANGEFS_REQDEVICE_NAME, orangefs_dev_major); + return orangefs_dev_major; + } + + gossip_debug(GOSSIP_DEV_DEBUG, + "*** /dev/%s character device registered ***\n", + ORANGEFS_REQDEVICE_NAME); + gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n", + ORANGEFS_REQDEVICE_NAME, orangefs_dev_major); + return 0; +} + +void orangefs_dev_cleanup(void) +{ + unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME); + gossip_debug(GOSSIP_DEV_DEBUG, + "*** /dev/%s character device unregistered ***\n", + ORANGEFS_REQDEVICE_NAME); +} diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c new file mode 100644 index 000000000..9cacce5d5 --- /dev/null +++ b/fs/orangefs/dir.c @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2017 Omnibond Systems, L.L.C. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" + +struct orangefs_dir_part { + struct orangefs_dir_part *next; + size_t len; +}; + +struct orangefs_dir { + __u64 token; + struct orangefs_dir_part *part; + loff_t end; + int error; +}; + +#define PART_SHIFT (24) +#define PART_SIZE (1<<24) +#define PART_MASK (~(PART_SIZE - 1)) + +/* + * There can be up to 512 directory entries. Each entry is encoded as + * follows: + * 4 bytes: string size (n) + * n bytes: string + * 1 byte: trailing zero + * padding to 8 bytes + * 16 bytes: khandle + * padding to 8 bytes + * + * The trailer_buf starts with a struct orangefs_readdir_response_s + * which must be skipped to get to the directory data. + * + * The data which is received from the userspace daemon is termed a + * part and is stored in a linked list in case more than one part is + * needed for a large directory. + * + * The position pointer (ctx->pos) encodes the part and offset on which + * to begin reading at. Bits above PART_SHIFT encode the part and bits + * below PART_SHIFT encode the offset. Parts are stored in a linked + * list which grows as data is received from the server. The overhead + * associated with managing the list is presumed to be small compared to + * the overhead of communicating with the server. + * + * As data is received from the server, it is placed at the end of the + * part list. Data is parsed from the current position as it is needed. + * When data is determined to be corrupt, it is either because the + * userspace component has sent back corrupt data or because the file + * pointer has been moved to an invalid location. Since the two cannot + * be differentiated, return EIO. + * + * Part zero is synthesized to contains `.' and `..'. Part one is the + * first part of the part list. + */ + +static int do_readdir(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct orangefs_kernel_op_s *op) +{ + struct orangefs_readdir_response_s *resp; + int bufi, r; + + /* + * Despite the badly named field, readdir does not use shared + * memory. However, there are a limited number of readdir + * slots, which must be allocated here. This flag simply tells + * the op scheduler to return the op here for retry. + */ + op->uses_shared_memory = 1; + op->upcall.req.readdir.refn = oi->refn; + op->upcall.req.readdir.token = od->token; + op->upcall.req.readdir.max_dirent_count = + ORANGEFS_MAX_DIRENT_COUNT_READDIR; + +again: + bufi = orangefs_readdir_index_get(); + if (bufi < 0) { + od->error = bufi; + return bufi; + } + + op->upcall.req.readdir.buf_index = bufi; + + r = service_operation(op, "orangefs_readdir", + get_interruptible_flag(dentry->d_inode)); + + orangefs_readdir_index_put(bufi); + + if (op_state_purged(op)) { + if (r == -EAGAIN) { + vfree(op->downcall.trailer_buf); + goto again; + } else if (r == -EIO) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } + } + + if (r < 0) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } else if (op->downcall.status) { + vfree(op->downcall.trailer_buf); + od->error = op->downcall.status; + return op->downcall.status; + } + + /* + * The maximum size is size per entry times the 512 entries plus + * the header. This is well under the limit. + */ + if (op->downcall.trailer_size > PART_SIZE) { + vfree(op->downcall.trailer_buf); + od->error = -EIO; + return -EIO; + } + + resp = (struct orangefs_readdir_response_s *) + op->downcall.trailer_buf; + od->token = resp->token; + return 0; +} + +static int parse_readdir(struct orangefs_dir *od, + struct orangefs_kernel_op_s *op) +{ + struct orangefs_dir_part *part, *new; + size_t count; + + count = 1; + part = od->part; + while (part) { + count++; + if (part->next) + part = part->next; + else + break; + } + + new = (void *)op->downcall.trailer_buf; + new->next = NULL; + new->len = op->downcall.trailer_size - + sizeof(struct orangefs_readdir_response_s); + if (!od->part) + od->part = new; + else + part->next = new; + count++; + od->end = count << PART_SHIFT; + + return 0; +} + +static int orangefs_dir_more(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry) +{ + struct orangefs_kernel_op_s *op; + int r; + + op = op_alloc(ORANGEFS_VFS_OP_READDIR); + if (!op) { + od->error = -ENOMEM; + return -ENOMEM; + } + r = do_readdir(oi, od, dentry, op); + if (r) { + od->error = r; + goto out; + } + r = parse_readdir(od, op); + if (r) { + od->error = r; + goto out; + } + + od->error = 0; +out: + op_release(op); + return od->error; +} + +static int fill_from_part(struct orangefs_dir_part *part, + struct dir_context *ctx) +{ + const int offset = sizeof(struct orangefs_readdir_response_s); + struct orangefs_khandle *khandle; + __u32 *len, padlen; + loff_t i; + char *s; + i = ctx->pos & ~PART_MASK; + + /* The file offset from userspace is too large. */ + if (i > part->len) + return 1; + + /* + * If the seek pointer is positioned just before an entry it + * should find the next entry. + */ + if (i % 8) + i = i + (8 - i%8)%8; + + while (i < part->len) { + if (part->len < i + sizeof *len) + break; + len = (void *)part + offset + i; + /* + * len is the size of the string itself. padlen is the + * total size of the encoded string. + */ + padlen = (sizeof *len + *len + 1) + + (8 - (sizeof *len + *len + 1)%8)%8; + if (part->len < i + padlen + sizeof *khandle) + goto next; + s = (void *)part + offset + i + sizeof *len; + if (s[*len] != 0) + goto next; + khandle = (void *)part + offset + i + padlen; + if (!dir_emit(ctx, s, *len, + orangefs_khandle_to_ino(khandle), + DT_UNKNOWN)) + return 0; + i += padlen + sizeof *khandle; + i = i + (8 - i%8)%8; + BUG_ON(i > part->len); + ctx->pos = (ctx->pos & PART_MASK) | i; + continue; +next: + i += 8; + } + return 1; +} + +static int orangefs_dir_fill(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct dir_context *ctx) +{ + struct orangefs_dir_part *part; + size_t count; + + count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; + + part = od->part; + while (part->next && count) { + count--; + part = part->next; + } + /* This means the userspace file offset is invalid. */ + if (count) { + od->error = -EIO; + return -EIO; + } + + while (part && part->len) { + int r; + r = fill_from_part(part, ctx); + if (r < 0) { + od->error = r; + return r; + } else if (r == 0) { + /* Userspace buffer is full. */ + break; + } else { + /* + * The part ran out of data. Move to the next + * part. */ + ctx->pos = (ctx->pos & PART_MASK) + + (1 << PART_SHIFT); + part = part->next; + } + } + return 0; +} + +static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, + int whence) +{ + struct orangefs_dir *od = file->private_data; + /* + * Delete the stored data so userspace sees new directory + * entries. + */ + if (!whence && offset < od->end) { + struct orangefs_dir_part *part = od->part; + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; + } + return default_llseek(file, offset, whence); +} + +static int orangefs_dir_iterate(struct file *file, + struct dir_context *ctx) +{ + struct orangefs_inode_s *oi; + struct orangefs_dir *od; + struct dentry *dentry; + int r; + + dentry = file->f_path.dentry; + oi = ORANGEFS_I(dentry->d_inode); + od = file->private_data; + + if (od->error) + return od->error; + + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos++; + } + if (ctx->pos == 1) { + if (!dir_emit_dotdot(file, ctx)) + return 0; + ctx->pos = 1 << PART_SHIFT; + } + + /* + * The seek position is in the first synthesized part but is not + * valid. + */ + if ((ctx->pos & PART_MASK) == 0) + return -EIO; + + r = 0; + + /* + * Must read more if the user has sought past what has been read + * so far. Stop a user who has sought past the end. + */ + while (od->token != ORANGEFS_ITERATE_END && + ctx->pos > od->end) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + } + if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end) + return -EIO; + + /* Then try to fill if there's any left in the buffer. */ + if (ctx->pos < od->end) { + r = orangefs_dir_fill(oi, od, dentry, ctx); + if (r) + return r; + } + + /* Finally get some more and try to fill. */ + if (od->token != ORANGEFS_ITERATE_END) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + r = orangefs_dir_fill(oi, od, dentry, ctx); + } + + return r; +} + +static int orangefs_dir_open(struct inode *inode, struct file *file) +{ + struct orangefs_dir *od; + file->private_data = kmalloc(sizeof(struct orangefs_dir), + GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + od = file->private_data; + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; + od->error = 0; + return 0; +} + +static int orangefs_dir_release(struct inode *inode, struct file *file) +{ + struct orangefs_dir *od = file->private_data; + struct orangefs_dir_part *part = od->part; + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + kfree(od); + return 0; +} + +const struct file_operations orangefs_dir_operations = { + .llseek = orangefs_dir_llseek, + .read = generic_read_dir, + .iterate_shared = orangefs_dir_iterate, + .open = orangefs_dir_open, + .release = orangefs_dir_release +}; diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h new file mode 100644 index 000000000..ea2332e16 --- /dev/null +++ b/fs/orangefs/downcall.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +/* + * Definitions of downcalls used in Linux kernel module. + */ + +#ifndef __DOWNCALL_H +#define __DOWNCALL_H + +/* + * Sanitized the device-client core interaction + * for clean 32-64 bit usage + */ +struct orangefs_io_response { + __s64 amt_complete; +}; + +struct orangefs_lookup_response { + struct orangefs_object_kref refn; +}; + +struct orangefs_create_response { + struct orangefs_object_kref refn; +}; + +struct orangefs_symlink_response { + struct orangefs_object_kref refn; +}; + +struct orangefs_getattr_response { + struct ORANGEFS_sys_attr_s attributes; + char link_target[ORANGEFS_NAME_MAX]; +}; + +struct orangefs_mkdir_response { + struct orangefs_object_kref refn; +}; + +struct orangefs_statfs_response { + __s64 block_size; + __s64 blocks_total; + __s64 blocks_avail; + __s64 files_total; + __s64 files_avail; +}; + +struct orangefs_fs_mount_response { + __s32 fs_id; + __s32 id; + struct orangefs_khandle root_khandle; +}; + +/* the getxattr response is the attribute value */ +struct orangefs_getxattr_response { + __s32 val_sz; + __s32 __pad1; + char val[ORANGEFS_MAX_XATTR_VALUELEN]; +}; + +/* the listxattr response is an array of attribute names */ +struct orangefs_listxattr_response { + __s32 returned_count; + __s32 __pad1; + __u64 token; + char key[ORANGEFS_MAX_XATTR_LISTLEN * ORANGEFS_MAX_XATTR_NAMELEN]; + __s32 keylen; + __s32 __pad2; + __s32 lengths[ORANGEFS_MAX_XATTR_LISTLEN]; +}; + +struct orangefs_param_response { + union { + __s64 value64; + __s32 value32[2]; + } u; +}; + +#define PERF_COUNT_BUF_SIZE 4096 +struct orangefs_perf_count_response { + char buffer[PERF_COUNT_BUF_SIZE]; +}; + +#define FS_KEY_BUF_SIZE 4096 +struct orangefs_fs_key_response { + __s32 fs_keylen; + __s32 __pad1; + char fs_key[FS_KEY_BUF_SIZE]; +}; + +/* 2.9.6 */ +struct orangefs_features_response { + __u64 features; +}; + +struct orangefs_downcall_s { + __s32 type; + __s32 status; + /* currently trailer is used only by readdir */ + __s64 trailer_size; + char *trailer_buf; + + union { + struct orangefs_io_response io; + struct orangefs_lookup_response lookup; + struct orangefs_create_response create; + struct orangefs_symlink_response sym; + struct orangefs_getattr_response getattr; + struct orangefs_mkdir_response mkdir; + struct orangefs_statfs_response statfs; + struct orangefs_fs_mount_response fs_mount; + struct orangefs_getxattr_response getxattr; + struct orangefs_listxattr_response listxattr; + struct orangefs_param_response param; + struct orangefs_perf_count_response perf_count; + struct orangefs_fs_key_response fs_key; + struct orangefs_features_response features; + } resp; +}; + +/* + * The readdir response comes in the trailer. It is followed by the + * directory entries as described in dir.c. + */ + +struct orangefs_readdir_response_s { + __u64 token; + __u64 directory_version; + __u32 __pad2; + __u32 orangefs_dirent_outcount; +}; + +#endif /* __DOWNCALL_H */ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c new file mode 100644 index 000000000..732661aa2 --- /dev/null +++ b/fs/orangefs/file.c @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. + * + * See COPYING in top-level directory. + */ + +/* + * Linux VFS file operations. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" +#include <linux/fs.h> +#include <linux/pagemap.h> + +static int flush_racache(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: %pU: Handle is %pU | fs_id %d\n", __func__, + get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, + orangefs_inode->refn.fs_id); + + new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; + + ret = service_operation(new_op, "orangefs_flush_racache", + get_interruptible_flag(inode)); + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", + __func__, ret); + + op_release(new_op); + return ret; +} + +/* + * Post and wait for the I/O upcall to finish + */ +ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, + loff_t *offset, struct iov_iter *iter, size_t total_size, + loff_t readahead_size, struct orangefs_write_range *wr, + int *index_return, struct file *file) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; + struct orangefs_kernel_op_s *new_op = NULL; + int buffer_index; + ssize_t ret; + size_t copy_amount; + int open_for_read; + int open_for_write; + + new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); + if (!new_op) + return -ENOMEM; + + /* synchronous I/O */ + new_op->upcall.req.io.readahead_size = readahead_size; + new_op->upcall.req.io.io_type = type; + new_op->upcall.req.io.refn = orangefs_inode->refn; + +populate_shared_memory: + /* get a shared buffer index */ + buffer_index = orangefs_bufmap_get(); + if (buffer_index < 0) { + ret = buffer_index; + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: orangefs_bufmap_get failure (%zd)\n", + __func__, ret); + goto out; + } + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): GET op %p -> buffer_index %d\n", + __func__, + handle, + new_op, + buffer_index); + + new_op->uses_shared_memory = 1; + new_op->upcall.req.io.buf_index = buffer_index; + new_op->upcall.req.io.count = total_size; + new_op->upcall.req.io.offset = *offset; + if (type == ORANGEFS_IO_WRITE && wr) { + new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); + new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); + } + /* + * Orangefs has no open, and orangefs checks file permissions + * on each file access. Posix requires that file permissions + * be checked on open and nowhere else. Orangefs-through-the-kernel + * needs to seem posix compliant. + * + * The VFS opens files, even if the filesystem provides no + * method. We can see if a file was successfully opened for + * read and or for write by looking at file->f_mode. + * + * When writes are flowing from the page cache, file is no + * longer available. We can trust the VFS to have checked + * file->f_mode before writing to the page cache. + * + * The mode of a file might change between when it is opened + * and IO commences, or it might be created with an arbitrary mode. + * + * We'll make sure we don't hit EACCES during the IO stage by + * using UID 0. Some of the time we have access without changing + * to UID 0 - how to check? + */ + if (file) { + open_for_write = file->f_mode & FMODE_WRITE; + open_for_read = file->f_mode & FMODE_READ; + } else { + open_for_write = 1; + open_for_read = 0; /* not relevant? */ + } + if ((type == ORANGEFS_IO_WRITE) && open_for_write) + new_op->upcall.uid = 0; + if ((type == ORANGEFS_IO_READ) && open_for_read) + new_op->upcall.uid = 0; + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): offset: %llu total_size: %zd\n", + __func__, + handle, + llu(*offset), + total_size); + /* + * Stage 1: copy the buffers into client-core's address space + */ + if (type == ORANGEFS_IO_WRITE && total_size) { + ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index, + total_size); + if (ret < 0) { + gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", + __func__, (long)ret); + goto out; + } + } + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): Calling post_io_request with tag (%llu)\n", + __func__, + handle, + llu(new_op->tag)); + + /* Stage 2: Service the I/O operation */ + ret = service_operation(new_op, + type == ORANGEFS_IO_WRITE ? + "file_write" : + "file_read", + get_interruptible_flag(inode)); + + /* + * If service_operation() returns -EAGAIN #and# the operation was + * purged from orangefs_request_list or htable_ops_in_progress, then + * we know that the client was restarted, causing the shared memory + * area to be wiped clean. To restart a write operation in this + * case, we must re-copy the data from the user's iovec to a NEW + * shared memory location. To restart a read operation, we must get + * a new shared memory location. + */ + if (ret == -EAGAIN && op_state_purged(new_op)) { + orangefs_bufmap_put(buffer_index); + if (type == ORANGEFS_IO_WRITE) + iov_iter_revert(iter, total_size); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s:going to repopulate_shared_memory.\n", + __func__); + goto populate_shared_memory; + } + + if (ret < 0) { + if (ret == -EINTR) { + /* + * We can't return EINTR if any data was written, + * it's not POSIX. It is minimally acceptable + * to give a partial write, the way NFS does. + * + * It would be optimal to return all or nothing, + * but if a userspace write is bigger than + * an IO buffer, and the interrupt occurs + * between buffer writes, that would not be + * possible. + */ + switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) { + /* + * If the op was waiting when the interrupt + * occurred, then the client-core did not + * trigger the write. + */ + case OP_VFS_STATE_WAITING: + if (*offset == 0) + ret = -EINTR; + else + ret = 0; + break; + /* + * If the op was in progress when the interrupt + * occurred, then the client-core was able to + * trigger the write. + */ + case OP_VFS_STATE_INPROGR: + if (type == ORANGEFS_IO_READ) + ret = -EINTR; + else + ret = total_size; + break; + default: + gossip_err("%s: unexpected op state :%d:.\n", + __func__, + new_op->op_state); + ret = 0; + break; + } + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: got EINTR, state:%d: %p\n", + __func__, + new_op->op_state, + new_op); + } else { + gossip_err("%s: error in %s handle %pU, returning %zd\n", + __func__, + type == ORANGEFS_IO_READ ? + "read from" : "write to", + handle, ret); + } + if (orangefs_cancel_op_in_progress(new_op)) + return ret; + + goto out; + } + + /* + * Stage 3: Post copy buffers from client-core's address space + */ + if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) { + /* + * NOTE: the iovector can either contain addresses which + * can futher be kernel-space or user-space addresses. + * or it can pointers to struct page's + */ + + copy_amount = new_op->downcall.resp.io.amt_complete; + + ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, + copy_amount); + if (ret < 0) { + gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", + __func__, (long)ret); + goto out; + } + } + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): Amount %s, returned by the sys-io call:%d\n", + __func__, + handle, + type == ORANGEFS_IO_READ ? "read" : "written", + (int)new_op->downcall.resp.io.amt_complete); + + ret = new_op->downcall.resp.io.amt_complete; + +out: + if (buffer_index >= 0) { + orangefs_bufmap_put(buffer_index); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): PUT buffer_index %d\n", + __func__, handle, buffer_index); + buffer_index = -1; + } + op_release(new_op); + return ret; +} + +int orangefs_revalidate_mapping(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long *bitlock = &orangefs_inode->bitlock; + int ret; + + while (1) { + ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); + if (ret) + return ret; + spin_lock(&inode->i_lock); + if (test_bit(1, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!time_before(jiffies, orangefs_inode->mapping_time)) + break; + spin_unlock(&inode->i_lock); + return 0; + } + + set_bit(1, bitlock); + smp_wmb(); + spin_unlock(&inode->i_lock); + + unmap_mapping_range(mapping, 0, 0, 0); + ret = filemap_write_and_wait(mapping); + if (!ret) + ret = invalidate_inode_pages2(mapping); + + orangefs_inode->mapping_time = jiffies + + orangefs_cache_timeout_msecs*HZ/1000; + + clear_bit(1, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, 1); + + return ret; +} + +static ssize_t orangefs_file_read_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + int ret; + orangefs_stats.reads++; + + down_read(&file_inode(iocb->ki_filp)->i_rwsem); + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + goto out; + + ret = generic_file_read_iter(iocb, iter); +out: + up_read(&file_inode(iocb->ki_filp)->i_rwsem); + return ret; +} + +static ssize_t orangefs_file_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + int ret; + orangefs_stats.writes++; + + if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + return ret; + } + + ret = generic_file_write_iter(iocb, iter); + return ret; +} + +static vm_fault_t orangefs_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + int ret; + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); + if (ret == -ESTALE) + ret = -EIO; + if (ret) { + gossip_err("%s: orangefs_inode_getattr failed, " + "ret:%d:.\n", __func__, ret); + return VM_FAULT_SIGBUS; + } + return filemap_fault(vmf); +} + +static const struct vm_operations_struct orangefs_file_vm_ops = { + .fault = orangefs_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = orangefs_page_mkwrite, +}; + +/* + * Memory map a region of a file. + */ +static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + + ret = orangefs_revalidate_mapping(file_inode(file)); + if (ret) + return ret; + + gossip_debug(GOSSIP_FILE_DEBUG, + "orangefs_file_mmap: called on %pD\n", file); + + /* set the sequential readahead hint */ + vma->vm_flags |= VM_SEQ_READ; + vma->vm_flags &= ~VM_RAND_READ; + + file_accessed(file); + vma->vm_ops = &orangefs_file_vm_ops; + return 0; +} + +#define mapping_nrpages(idata) ((idata)->nrpages) + +/* + * Called to notify the module that there are no more references to + * this file (i.e. no processes have it open). + * + * \note Not called when each file is closed. + */ +static int orangefs_file_release(struct inode *inode, struct file *file) +{ + gossip_debug(GOSSIP_FILE_DEBUG, + "orangefs_file_release: called on %pD\n", + file); + + /* + * remove all associated inode pages from the page cache and + * readahead cache (if any); this forces an expensive refresh of + * data for the next caller of mmap (or 'get_block' accesses) + */ + if (mapping_nrpages(file->f_mapping)) { + if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { + gossip_debug(GOSSIP_INODE_DEBUG, + "calling flush_racache on %pU\n", + get_khandle_from_ino(inode)); + flush_racache(inode); + gossip_debug(GOSSIP_INODE_DEBUG, + "flush_racache finished\n"); + } + + } + return 0; +} + +/* + * Push all data for a specific file onto permanent storage. + */ +static int orangefs_fsync(struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + int ret; + struct orangefs_inode_s *orangefs_inode = + ORANGEFS_I(file_inode(file)); + struct orangefs_kernel_op_s *new_op = NULL; + + ret = filemap_write_and_wait_range(file_inode(file)->i_mapping, + start, end); + if (ret < 0) + return ret; + + new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.fsync.refn = orangefs_inode->refn; + + ret = service_operation(new_op, + "orangefs_fsync", + get_interruptible_flag(file_inode(file))); + + gossip_debug(GOSSIP_FILE_DEBUG, + "orangefs_fsync got return value of %d\n", + ret); + + op_release(new_op); + return ret; +} + +/* + * Change the file pointer position for an instance of an open file. + * + * \note If .llseek is overriden, we must acquire lock as described in + * Documentation/filesystems/locking.rst. + * + * Future upgrade could support SEEK_DATA and SEEK_HOLE but would + * require much changes to the FS + */ +static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) +{ + int ret = -EINVAL; + struct inode *inode = file_inode(file); + + if (origin == SEEK_END) { + /* + * revalidate the inode's file size. + * NOTE: We are only interested in file size here, + * so we set mask accordingly. + */ + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); + if (ret == -ESTALE) + ret = -EIO; + if (ret) { + gossip_debug(GOSSIP_FILE_DEBUG, + "%s:%s:%d calling make bad inode\n", + __FILE__, + __func__, + __LINE__); + return ret; + } + } + + gossip_debug(GOSSIP_FILE_DEBUG, + "orangefs_file_llseek: offset is %ld | origin is %d" + " | inode size is %lu\n", + (long)offset, + origin, + (unsigned long)i_size_read(inode)); + + return generic_file_llseek(file, offset, origin); +} + +/* + * Support local locks (locks that only this kernel knows about) + * if Orangefs was mounted -o local_lock. + */ +static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + int rc = -EINVAL; + + if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { + if (cmd == F_GETLK) { + rc = 0; + posix_test_lock(filp, fl); + } else { + rc = posix_lock_file(filp, fl, NULL); + } + } + + return rc; +} + +static int orangefs_flush(struct file *file, fl_owner_t id) +{ + /* + * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the + * service_operation in orangefs_fsync. + * + * Do not send fsync to OrangeFS server on a close. Do send fsync + * on an explicit fsync call. This duplicates historical OrangeFS + * behavior. + */ + int r; + + r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); + if (r > 0) + return 0; + else + return r; +} + +/** ORANGEFS implementation of VFS file operations */ +const struct file_operations orangefs_file_operations = { + .llseek = orangefs_file_llseek, + .read_iter = orangefs_file_read_iter, + .write_iter = orangefs_file_write_iter, + .lock = orangefs_lock, + .mmap = orangefs_file_mmap, + .open = generic_file_open, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .flush = orangefs_flush, + .release = orangefs_file_release, + .fsync = orangefs_fsync, +}; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c new file mode 100644 index 000000000..b3bbb5a57 --- /dev/null +++ b/fs/orangefs/inode.c @@ -0,0 +1,1141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. + * + * See COPYING in top-level directory. + */ + +/* + * Linux VFS inode operations. + */ + +#include <linux/blkdev.h> +#include <linux/fileattr.h> +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" + +static int orangefs_writepage_locked(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct orangefs_write_range *wr = NULL; + struct iov_iter iter; + struct bio_vec bv; + size_t len, wlen; + ssize_t ret; + loff_t off; + + set_page_writeback(page); + + len = i_size_read(inode); + if (PagePrivate(page)) { + wr = (struct orangefs_write_range *)page_private(page); + WARN_ON(wr->pos >= len); + off = wr->pos; + if (off + wr->len > len) + wlen = len - off; + else + wlen = wr->len; + } else { + WARN_ON(1); + off = page_offset(page); + if (off + PAGE_SIZE > len) + wlen = len - off; + else + wlen = PAGE_SIZE; + } + /* Should've been handled in orangefs_invalidate_folio. */ + WARN_ON(off == len || off + wlen > len); + + bv.bv_page = page; + bv.bv_len = wlen; + bv.bv_offset = off % PAGE_SIZE; + WARN_ON(wlen == 0); + iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); + + ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, + len, wr, NULL, NULL); + if (ret < 0) { + SetPageError(page); + mapping_set_error(page->mapping, ret); + } else { + ret = 0; + } + kfree(detach_page_private(page)); + return ret; +} + +static int orangefs_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + ret = orangefs_writepage_locked(page, wbc); + unlock_page(page); + end_page_writeback(page); + return ret; +} + +struct orangefs_writepages { + loff_t off; + size_t len; + kuid_t uid; + kgid_t gid; + int maxpages; + int npages; + struct page **pages; + struct bio_vec *bv; +}; + +static int orangefs_writepages_work(struct orangefs_writepages *ow, + struct writeback_control *wbc) +{ + struct inode *inode = ow->pages[0]->mapping->host; + struct orangefs_write_range *wrp, wr; + struct iov_iter iter; + ssize_t ret; + size_t len; + loff_t off; + int i; + + len = i_size_read(inode); + + for (i = 0; i < ow->npages; i++) { + set_page_writeback(ow->pages[i]); + ow->bv[i].bv_page = ow->pages[i]; + ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE, + ow->off + ow->len) - + max(ow->off, page_offset(ow->pages[i])); + if (i == 0) + ow->bv[i].bv_offset = ow->off - + page_offset(ow->pages[i]); + else + ow->bv[i].bv_offset = 0; + } + iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); + + WARN_ON(ow->off >= len); + if (ow->off + ow->len > len) + ow->len = len - ow->off; + + off = ow->off; + wr.uid = ow->uid; + wr.gid = ow->gid; + ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, + 0, &wr, NULL, NULL); + if (ret < 0) { + for (i = 0; i < ow->npages; i++) { + SetPageError(ow->pages[i]); + mapping_set_error(ow->pages[i]->mapping, ret); + if (PagePrivate(ow->pages[i])) { + wrp = (struct orangefs_write_range *) + page_private(ow->pages[i]); + ClearPagePrivate(ow->pages[i]); + put_page(ow->pages[i]); + kfree(wrp); + } + end_page_writeback(ow->pages[i]); + unlock_page(ow->pages[i]); + } + } else { + ret = 0; + for (i = 0; i < ow->npages; i++) { + if (PagePrivate(ow->pages[i])) { + wrp = (struct orangefs_write_range *) + page_private(ow->pages[i]); + ClearPagePrivate(ow->pages[i]); + put_page(ow->pages[i]); + kfree(wrp); + } + end_page_writeback(ow->pages[i]); + unlock_page(ow->pages[i]); + } + } + return ret; +} + +static int orangefs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct orangefs_writepages *ow = data; + struct orangefs_write_range *wr; + int ret; + + if (!PagePrivate(page)) { + unlock_page(page); + /* It's not private so there's nothing to write, right? */ + printk("writepages_callback not private!\n"); + BUG(); + return 0; + } + wr = (struct orangefs_write_range *)page_private(page); + + ret = -1; + if (ow->npages == 0) { + ow->off = wr->pos; + ow->len = wr->len; + ow->uid = wr->uid; + ow->gid = wr->gid; + ow->pages[ow->npages++] = page; + ret = 0; + goto done; + } + if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + ret = -1; + goto done; + } + if (ow->off + ow->len == wr->pos) { + ow->len += wr->len; + ow->pages[ow->npages++] = page; + ret = 0; + goto done; + } +done: + if (ret == -1) { + if (ow->npages) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + } + ret = orangefs_writepage_locked(page, wbc); + mapping_set_error(page->mapping, ret); + unlock_page(page); + end_page_writeback(page); + } else { + if (ow->npages == ow->maxpages) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + } + } + return ret; +} + +static int orangefs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct orangefs_writepages *ow; + struct blk_plug plug; + int ret; + ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL); + if (!ow) + return -ENOMEM; + ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE; + ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL); + if (!ow->pages) { + kfree(ow); + return -ENOMEM; + } + ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL); + if (!ow->bv) { + kfree(ow->pages); + kfree(ow); + return -ENOMEM; + } + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow); + if (ow->npages) + ret = orangefs_writepages_work(ow, wbc); + blk_finish_plug(&plug); + kfree(ow->pages); + kfree(ow->bv); + kfree(ow); + return ret; +} + +static int orangefs_launder_folio(struct folio *); + +static void orangefs_readahead(struct readahead_control *rac) +{ + loff_t offset; + struct iov_iter iter; + struct inode *inode = rac->mapping->host; + struct xarray *i_pages; + struct page *page; + loff_t new_start = readahead_pos(rac); + int ret; + size_t new_len = 0; + + loff_t bytes_remaining = inode->i_size - readahead_pos(rac); + loff_t pages_remaining = bytes_remaining / PAGE_SIZE; + + if (pages_remaining >= 1024) + new_len = 4194304; + else if (pages_remaining > readahead_count(rac)) + new_len = bytes_remaining; + + if (new_len) + readahead_expand(rac, new_start, new_len); + + offset = readahead_pos(rac); + i_pages = &rac->mapping->i_pages; + + iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac)); + + /* read in the pages. */ + if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, + &offset, &iter, readahead_length(rac), + inode->i_size, NULL, NULL, rac->file)) < 0) + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: wait_for_direct_io failed. \n", __func__); + else + ret = 0; + + /* clean up. */ + while ((page = readahead_page(rac))) { + page_endio(page, false, ret); + put_page(page); + } +} + +static int orangefs_read_folio(struct file *file, struct folio *folio) +{ + struct inode *inode = folio->mapping->host; + struct iov_iter iter; + struct bio_vec bv; + ssize_t ret; + loff_t off; /* offset of this folio in the file */ + + if (folio_test_dirty(folio)) + orangefs_launder_folio(folio); + + off = folio_pos(folio); + bv.bv_page = &folio->page; + bv.bv_len = folio_size(folio); + bv.bv_offset = 0; + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio)); + + ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, + folio_size(folio), inode->i_size, NULL, NULL, file); + /* this will only zero remaining unread portions of the folio data */ + iov_iter_zero(~0U, &iter); + /* takes care of potential aliasing */ + flush_dcache_folio(folio); + if (ret < 0) { + folio_set_error(folio); + } else { + folio_mark_uptodate(folio); + ret = 0; + } + /* unlock the folio after the ->read_folio() routine completes */ + folio_unlock(folio); + return ret; +} + +static int orangefs_write_begin(struct file *file, + struct address_space *mapping, loff_t pos, unsigned len, + struct page **pagep, void **fsdata) +{ + struct orangefs_write_range *wr; + struct folio *folio; + struct page *page; + pgoff_t index; + int ret; + + index = pos >> PAGE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index); + if (!page) + return -ENOMEM; + + *pagep = page; + folio = page_folio(page); + + if (folio_test_dirty(folio) && !folio_test_private(folio)) { + /* + * Should be impossible. If it happens, launder the page + * since we don't know what's dirty. This will WARN in + * orangefs_writepage_locked. + */ + ret = orangefs_launder_folio(folio); + if (ret) + return ret; + } + if (folio_test_private(folio)) { + struct orangefs_write_range *wr; + wr = folio_get_private(folio); + if (wr->pos + wr->len == pos && + uid_eq(wr->uid, current_fsuid()) && + gid_eq(wr->gid, current_fsgid())) { + wr->len += len; + goto okay; + } else { + ret = orangefs_launder_folio(folio); + if (ret) + return ret; + } + } + + wr = kmalloc(sizeof *wr, GFP_KERNEL); + if (!wr) + return -ENOMEM; + + wr->pos = pos; + wr->len = len; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + folio_attach_private(folio, wr); +okay: + return 0; +} + +static int orangefs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + + /* zero the stale part of the page if we did a short copy */ + if (!PageUptodate(page)) { + unsigned from = pos & (PAGE_SIZE - 1); + if (copied < len) { + zero_user(page, from + copied, len - copied); + } + /* Set fully written pages uptodate. */ + if (pos == page_offset(page) && + (len == PAGE_SIZE || pos + len == inode->i_size)) { + zero_user_segment(page, from + copied, PAGE_SIZE); + SetPageUptodate(page); + } + } + + set_page_dirty(page); + unlock_page(page); + put_page(page); + + mark_inode_dirty_sync(file_inode(file)); + return copied; +} + +static void orangefs_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + struct orangefs_write_range *wr = folio_get_private(folio); + + if (offset == 0 && length == PAGE_SIZE) { + kfree(folio_detach_private(folio)); + return; + /* write range entirely within invalidate range (or equal) */ + } else if (folio_pos(folio) + offset <= wr->pos && + wr->pos + wr->len <= folio_pos(folio) + offset + length) { + kfree(folio_detach_private(folio)); + /* XXX is this right? only caller in fs */ + folio_cancel_dirty(folio); + return; + /* invalidate range chops off end of write range */ + } else if (wr->pos < folio_pos(folio) + offset && + wr->pos + wr->len <= folio_pos(folio) + offset + length && + folio_pos(folio) + offset < wr->pos + wr->len) { + size_t x; + x = wr->pos + wr->len - (folio_pos(folio) + offset); + WARN_ON(x > wr->len); + wr->len -= x; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + /* invalidate range chops off beginning of write range */ + } else if (folio_pos(folio) + offset <= wr->pos && + folio_pos(folio) + offset + length < wr->pos + wr->len && + wr->pos < folio_pos(folio) + offset + length) { + size_t x; + x = folio_pos(folio) + offset + length - wr->pos; + WARN_ON(x > wr->len); + wr->pos += x; + wr->len -= x; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + /* invalidate range entirely within write range (punch hole) */ + } else if (wr->pos < folio_pos(folio) + offset && + folio_pos(folio) + offset + length < wr->pos + wr->len) { + /* XXX what do we do here... should not WARN_ON */ + WARN_ON(1); + /* punch hole */ + /* + * should we just ignore this and write it out anyway? + * it hardly makes sense + */ + return; + /* non-overlapping ranges */ + } else { + /* WARN if they do overlap */ + if (!((folio_pos(folio) + offset + length <= wr->pos) ^ + (wr->pos + wr->len <= folio_pos(folio) + offset))) { + WARN_ON(1); + printk("invalidate range offset %llu length %zu\n", + folio_pos(folio) + offset, length); + printk("write range offset %llu length %zu\n", + wr->pos, wr->len); + } + return; + } + + /* + * Above there are returns where wr is freed or where we WARN. + * Thus the following runs if wr was modified above. + */ + + orangefs_launder_folio(folio); +} + +static bool orangefs_release_folio(struct folio *folio, gfp_t foo) +{ + return !folio_test_private(folio); +} + +static void orangefs_free_folio(struct folio *folio) +{ + kfree(folio_detach_private(folio)); +} + +static int orangefs_launder_folio(struct folio *folio) +{ + int r = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + }; + folio_wait_writeback(folio); + if (folio_clear_dirty_for_io(folio)) { + r = orangefs_writepage_locked(&folio->page, &wbc); + folio_end_writeback(folio); + } + return r; +} + +static ssize_t orangefs_direct_IO(struct kiocb *iocb, + struct iov_iter *iter) +{ + /* + * Comment from original do_readv_writev: + * Common entry point for read/write/readv/writev + * This function will dispatch it to either the direct I/O + * or buffered I/O path depending on the mount options and/or + * augmented/extended metadata attached to the file. + * Note: File extended attributes override any mount options. + */ + struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; + enum ORANGEFS_io_type type = iov_iter_rw(iter) == WRITE ? + ORANGEFS_IO_WRITE : ORANGEFS_IO_READ; + loff_t *offset = &pos; + struct inode *inode = file->f_mapping->host; + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; + size_t count = iov_iter_count(iter); + ssize_t total_count = 0; + ssize_t ret = -EINVAL; + int i = 0; + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", + __func__, + handle, + (int)count); + + if (type == ORANGEFS_IO_WRITE) { + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): proceeding with offset : %llu, " + "size %d\n", + __func__, + handle, + llu(*offset), + (int)count); + } + + if (count == 0) { + ret = 0; + goto out; + } + + while (iov_iter_count(iter)) { + size_t each_count = iov_iter_count(iter); + size_t amt_complete; + i++; + + /* how much to transfer in this loop iteration */ + if (each_count > orangefs_bufmap_size_query()) + each_count = orangefs_bufmap_size_query(); + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): size of each_count(%d)\n", + __func__, + handle, + (int)each_count); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): BEFORE wait_for_io: offset is %d\n", + __func__, + handle, + (int)*offset); + + ret = wait_for_direct_io(type, inode, offset, iter, + each_count, 0, NULL, NULL, file); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): return from wait_for_io:%d\n", + __func__, + handle, + (int)ret); + + if (ret < 0) + goto out; + + *offset += ret; + total_count += ret; + amt_complete = ret; + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): AFTER wait_for_io: offset is %d\n", + __func__, + handle, + (int)*offset); + + /* + * if we got a short I/O operations, + * fall out and return what we got so far + */ + if (amt_complete < each_count) + break; + } /*end while */ + +out: + if (total_count > 0) + ret = total_count; + if (ret > 0) { + if (type == ORANGEFS_IO_READ) { + file_accessed(file); + } else { + file_update_time(file); + if (*offset > i_size_read(inode)) + i_size_write(inode, *offset); + } + } + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): Value(%d) returned.\n", + __func__, + handle, + (int)ret); + + return ret; +} + +/** ORANGEFS2 implementation of address space operations */ +static const struct address_space_operations orangefs_address_operations = { + .writepage = orangefs_writepage, + .readahead = orangefs_readahead, + .read_folio = orangefs_read_folio, + .writepages = orangefs_writepages, + .dirty_folio = filemap_dirty_folio, + .write_begin = orangefs_write_begin, + .write_end = orangefs_write_end, + .invalidate_folio = orangefs_invalidate_folio, + .release_folio = orangefs_release_folio, + .free_folio = orangefs_free_folio, + .launder_folio = orangefs_launder_folio, + .direct_IO = orangefs_direct_IO, +}; + +vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) +{ + struct folio *folio = page_folio(vmf->page); + struct inode *inode = file_inode(vmf->vma->vm_file); + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + unsigned long *bitlock = &orangefs_inode->bitlock; + vm_fault_t ret; + struct orangefs_write_range *wr; + + sb_start_pagefault(inode->i_sb); + + if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) { + ret = VM_FAULT_RETRY; + goto out; + } + + folio_lock(folio); + if (folio_test_dirty(folio) && !folio_test_private(folio)) { + /* + * Should be impossible. If it happens, launder the folio + * since we don't know what's dirty. This will WARN in + * orangefs_writepage_locked. + */ + if (orangefs_launder_folio(folio)) { + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; + goto out; + } + } + if (folio_test_private(folio)) { + wr = folio_get_private(folio); + if (uid_eq(wr->uid, current_fsuid()) && + gid_eq(wr->gid, current_fsgid())) { + wr->pos = page_offset(vmf->page); + wr->len = PAGE_SIZE; + goto okay; + } else { + if (orangefs_launder_folio(folio)) { + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; + goto out; + } + } + } + wr = kmalloc(sizeof *wr, GFP_KERNEL); + if (!wr) { + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; + goto out; + } + wr->pos = page_offset(vmf->page); + wr->len = PAGE_SIZE; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + folio_attach_private(folio, wr); +okay: + + file_update_time(vmf->vma->vm_file); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE; + goto out; + } + + /* + * We mark the folio dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty folio and writeprotect it again. + */ + folio_mark_dirty(folio); + folio_wait_stable(folio); + ret = VM_FAULT_LOCKED; +out: + sb_end_pagefault(inode->i_sb); + return ret; +} + +static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + loff_t orig_size; + int ret = -EINVAL; + + gossip_debug(GOSSIP_INODE_DEBUG, + "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n", + __func__, + get_khandle_from_ino(inode), + &orangefs_inode->refn.khandle, + orangefs_inode->refn.fs_id, + iattr->ia_size); + + /* Ensure that we have a up to date size, so we know if it changed. */ + ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE); + if (ret == -ESTALE) + ret = -EIO; + if (ret) { + gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n", + __func__, ret); + return ret; + } + orig_size = i_size_read(inode); + + /* This is truncate_setsize in a different order. */ + truncate_pagecache(inode, iattr->ia_size); + i_size_write(inode, iattr->ia_size); + if (iattr->ia_size > orig_size) + pagecache_isize_extended(inode, orig_size, iattr->ia_size); + + new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.truncate.refn = orangefs_inode->refn; + new_op->upcall.req.truncate.size = (__s64) iattr->ia_size; + + ret = service_operation(new_op, + __func__, + get_interruptible_flag(inode)); + + /* + * the truncate has no downcall members to retrieve, but + * the status value tells us if it went through ok or not + */ + gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret); + + op_release(new_op); + + if (ret != 0) + return ret; + + if (orig_size != i_size_read(inode)) + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; + + return ret; +} + +int __orangefs_setattr(struct inode *inode, struct iattr *iattr) +{ + int ret; + + if (iattr->ia_valid & ATTR_MODE) { + if (iattr->ia_mode & (S_ISVTX)) { + if (is_root_handle(inode)) { + /* + * allow sticky bit to be set on root (since + * it shows up that way by default anyhow), + * but don't show it to the server + */ + iattr->ia_mode -= S_ISVTX; + } else { + gossip_debug(GOSSIP_UTILS_DEBUG, + "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); + ret = -EINVAL; + goto out; + } + } + if (iattr->ia_mode & (S_ISUID)) { + gossip_debug(GOSSIP_UTILS_DEBUG, + "Attempting to set setuid bit (not supported); returning EINVAL.\n"); + ret = -EINVAL; + goto out; + } + } + + if (iattr->ia_valid & ATTR_SIZE) { + ret = orangefs_setattr_size(inode, iattr); + if (ret) + goto out; + } + +again: + spin_lock(&inode->i_lock); + if (ORANGEFS_I(inode)->attr_valid) { + if (uid_eq(ORANGEFS_I(inode)->attr_uid, current_fsuid()) && + gid_eq(ORANGEFS_I(inode)->attr_gid, current_fsgid())) { + ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; + } else { + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + goto again; + } + } else { + ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; + ORANGEFS_I(inode)->attr_uid = current_fsuid(); + ORANGEFS_I(inode)->attr_gid = current_fsgid(); + } + setattr_copy(&init_user_ns, inode, iattr); + spin_unlock(&inode->i_lock); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) + /* change mod on a file that has ACLs */ + ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + + ret = 0; +out: + return ret; +} + +/* + * Change attributes of an object referenced by dentry. + */ +int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) +{ + int ret; + gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", + dentry); + ret = setattr_prepare(&init_user_ns, dentry, iattr); + if (ret) + goto out; + ret = __orangefs_setattr(d_inode(dentry), iattr); + sync_inode_metadata(d_inode(dentry), 1); +out: + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", + ret); + return ret; +} + +/* + * Obtain attributes of an object given a dentry + */ +int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) +{ + int ret; + struct inode *inode = path->dentry->d_inode; + + gossip_debug(GOSSIP_INODE_DEBUG, + "orangefs_getattr: called on %pd mask %u\n", + path->dentry, request_mask); + + ret = orangefs_inode_getattr(inode, + request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); + if (ret == 0) { + generic_fillattr(&init_user_ns, inode, stat); + + /* override block size reported to stat */ + if (!(request_mask & STATX_SIZE)) + stat->result_mask &= ~STATX_SIZE; + + generic_fill_statx_attr(inode, stat); + } + return ret; +} + +int orangefs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + int ret; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__); + + /* Make sure the permission (and other common attrs) are up to date. */ + ret = orangefs_inode_getattr(inode, 0); + if (ret < 0) + return ret; + + return generic_permission(&init_user_ns, inode, mask); +} + +int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) +{ + struct iattr iattr; + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", + get_khandle_from_ino(inode)); + generic_update_time(inode, time, flags); + memset(&iattr, 0, sizeof iattr); + if (flags & S_ATIME) + iattr.ia_valid |= ATTR_ATIME; + if (flags & S_CTIME) + iattr.ia_valid |= ATTR_CTIME; + if (flags & S_MTIME) + iattr.ia_valid |= ATTR_MTIME; + return __orangefs_setattr(inode, &iattr); +} + +static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + u64 val = 0; + int ret; + + gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__, + dentry); + + ret = orangefs_inode_getxattr(d_inode(dentry), + "user.pvfs2.meta_hint", + &val, sizeof(val)); + if (ret < 0 && ret != -ENODATA) + return ret; + + gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val); + + fileattr_fill_flags(fa, val); + return 0; +} + +static int orangefs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + u64 val = 0; + + gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__, + dentry); + /* + * ORANGEFS_MIRROR_FL is set internally when the mirroring mode is + * turned on for a file. The user is not allowed to turn on this bit, + * but the bit is present if the user first gets the flags and then + * updates the flags with some new settings. So, we ignore it in the + * following edit. bligon. + */ + if (fileattr_has_fsx(fa) || + (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | ORANGEFS_MIRROR_FL))) { + gossip_err("%s: only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n", + __func__); + return -EOPNOTSUPP; + } + val = fa->flags; + gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val); + return orangefs_inode_setxattr(d_inode(dentry), + "user.pvfs2.meta_hint", + &val, sizeof(val), 0); +} + +/* ORANGEFS2 implementation of VFS inode operations for files */ +static const struct inode_operations orangefs_file_inode_operations = { + .get_acl = orangefs_get_acl, + .set_acl = orangefs_set_acl, + .setattr = orangefs_setattr, + .getattr = orangefs_getattr, + .listxattr = orangefs_listxattr, + .permission = orangefs_permission, + .update_time = orangefs_update_time, + .fileattr_get = orangefs_fileattr_get, + .fileattr_set = orangefs_fileattr_set, +}; + +static int orangefs_init_iops(struct inode *inode) +{ + inode->i_mapping->a_ops = &orangefs_address_operations; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &orangefs_file_inode_operations; + inode->i_fop = &orangefs_file_operations; + break; + case S_IFLNK: + inode->i_op = &orangefs_symlink_inode_operations; + break; + case S_IFDIR: + inode->i_op = &orangefs_dir_inode_operations; + inode->i_fop = &orangefs_dir_operations; + break; + default: + gossip_debug(GOSSIP_INODE_DEBUG, + "%s: unsupported mode\n", + __func__); + return -EINVAL; + } + + return 0; +} + +/* + * Given an ORANGEFS object identifier (fsid, handle), convert it into + * a ino_t type that will be used as a hash-index from where the handle will + * be searched for in the VFS hash table of inodes. + */ +static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref) +{ + if (!ref) + return 0; + return orangefs_khandle_to_ino(&(ref->khandle)); +} + +/* + * Called to set up an inode from iget5_locked. + */ +static int orangefs_set_inode(struct inode *inode, void *data) +{ + struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data; + ORANGEFS_I(inode)->refn.fs_id = ref->fs_id; + ORANGEFS_I(inode)->refn.khandle = ref->khandle; + ORANGEFS_I(inode)->attr_valid = 0; + hash_init(ORANGEFS_I(inode)->xattr_cache); + ORANGEFS_I(inode)->mapping_time = jiffies - 1; + ORANGEFS_I(inode)->bitlock = 0; + return 0; +} + +/* + * Called to determine if handles match. + */ +static int orangefs_test_inode(struct inode *inode, void *data) +{ + struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data; + struct orangefs_inode_s *orangefs_inode = NULL; + + orangefs_inode = ORANGEFS_I(inode); + /* test handles and fs_ids... */ + return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), + &(ref->khandle)) && + orangefs_inode->refn.fs_id == ref->fs_id); +} + +/* + * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS + * file handle. + * + * @sb: the file system super block instance. + * @ref: The ORANGEFS object for which we are trying to locate an inode. + */ +struct inode *orangefs_iget(struct super_block *sb, + struct orangefs_object_kref *ref) +{ + struct inode *inode = NULL; + unsigned long hash; + int error; + + hash = orangefs_handle_hash(ref); + inode = iget5_locked(sb, + hash, + orangefs_test_inode, + orangefs_set_inode, + ref); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); + if (error) { + iget_failed(inode); + return ERR_PTR(error); + } + + inode->i_ino = hash; /* needed for stat etc */ + orangefs_init_iops(inode); + unlock_new_inode(inode); + + gossip_debug(GOSSIP_INODE_DEBUG, + "iget handle %pU, fsid %d hash %ld i_ino %lu\n", + &ref->khandle, + ref->fs_id, + hash, + inode->i_ino); + + return inode; +} + +/* + * Allocate an inode for a newly created file and insert it into the inode hash. + */ +struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, + int mode, dev_t dev, struct orangefs_object_kref *ref) +{ + unsigned long hash = orangefs_handle_hash(ref); + struct inode *inode; + int error; + + gossip_debug(GOSSIP_INODE_DEBUG, + "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n", + __func__, + sb, + MAJOR(dev), + MINOR(dev), + mode); + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + orangefs_set_inode(inode, ref); + inode->i_ino = hash; /* needed for stat etc */ + + error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); + if (error) + goto out_iput; + + orangefs_init_iops(inode); + inode->i_rdev = dev; + + error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref); + if (error < 0) + goto out_iput; + + gossip_debug(GOSSIP_INODE_DEBUG, + "Initializing ACL's for inode %pU\n", + get_khandle_from_ino(inode)); + orangefs_init_acl(inode, dir); + return inode; + +out_iput: + iput(inode); + return ERR_PTR(error); +} diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c new file mode 100644 index 000000000..600e8eee5 --- /dev/null +++ b/fs/orangefs/namei.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +/* + * Linux VFS namei operations. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" + +/* + * Get a newly allocated inode to go with a negative dentry. + */ +static int orangefs_create(struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool exclusive) +{ + struct orangefs_inode_s *parent = ORANGEFS_I(dir); + struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; + struct inode *inode; + struct iattr iattr; + int ret; + + gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n", + __func__, + dentry); + + new_op = op_alloc(ORANGEFS_VFS_OP_CREATE); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.create.parent_refn = parent->refn; + + fill_default_sys_attrs(new_op->upcall.req.create.attributes, + ORANGEFS_TYPE_METAFILE, mode); + + strncpy(new_op->upcall.req.create.d_name, + dentry->d_name.name, ORANGEFS_NAME_MAX - 1); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: %pd: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n", + __func__, + dentry, + &new_op->downcall.resp.create.refn.khandle, + new_op->downcall.resp.create.refn.fs_id, + new_op, + ret); + + if (ret < 0) + goto out; + + ref = new_op->downcall.resp.create.refn; + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref); + if (IS_ERR(inode)) { + gossip_err("%s: Failed to allocate inode for file :%pd:\n", + __func__, + dentry); + ret = PTR_ERR(inode); + goto out; + } + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: Assigned inode :%pU: for file :%pd:\n", + __func__, + get_khandle_from_ino(inode), + dentry); + + d_instantiate_new(dentry, inode); + orangefs_set_timeout(dentry); + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: dentry instantiated for %pd\n", + __func__, + dentry); + + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); + ret = 0; +out: + op_release(new_op); + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: %pd: returning %d\n", + __func__, + dentry, + ret); + return ret; +} + +/* + * Attempt to resolve an object name (dentry->d_name), parent handle, and + * fsid into a handle for the object. + */ +static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct orangefs_inode_s *parent = ORANGEFS_I(dir); + struct orangefs_kernel_op_s *new_op; + struct inode *inode; + int ret = -EINVAL; + + /* + * in theory we could skip a lookup here (if the intent is to + * create) in order to avoid a potentially failed lookup, but + * leaving it in can skip a valid lookup and try to create a file + * that already exists (e.g. the vfs already handles checking for + * -EEXIST on O_EXCL opens, which is broken if we skip this lookup + * in the create path) + */ + gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %pd\n", + __func__, dentry); + + if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1)) + return ERR_PTR(-ENAMETOOLONG); + + new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP); + if (!new_op) + return ERR_PTR(-ENOMEM); + + new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW; + + gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n", + __FILE__, + __func__, + __LINE__, + &parent->refn.khandle); + new_op->upcall.req.lookup.parent_refn = parent->refn; + + strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name, + ORANGEFS_NAME_MAX - 1); + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: doing lookup on %s under %pU,%d\n", + __func__, + new_op->upcall.req.lookup.d_name, + &new_op->upcall.req.lookup.parent_refn.khandle, + new_op->upcall.req.lookup.parent_refn.fs_id); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Lookup Got %pU, fsid %d (ret=%d)\n", + &new_op->downcall.resp.lookup.refn.khandle, + new_op->downcall.resp.lookup.refn.fs_id, + ret); + + if (ret == 0) { + orangefs_set_timeout(dentry); + inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); + } else if (ret == -ENOENT) { + inode = NULL; + } else { + /* must be a non-recoverable error */ + inode = ERR_PTR(ret); + } + + op_release(new_op); + return d_splice_alias(inode, dentry); +} + +/* return 0 on success; non-zero otherwise */ +static int orangefs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct orangefs_inode_s *parent = ORANGEFS_I(dir); + struct orangefs_kernel_op_s *new_op; + struct iattr iattr; + int ret; + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: called on %pd\n" + " (inode %pU): Parent is %pU | fs_id %d\n", + __func__, + dentry, + get_khandle_from_ino(inode), + &parent->refn.khandle, + parent->refn.fs_id); + + new_op = op_alloc(ORANGEFS_VFS_OP_REMOVE); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.remove.parent_refn = parent->refn; + strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name, + ORANGEFS_NAME_MAX - 1); + + ret = service_operation(new_op, "orangefs_unlink", + get_interruptible_flag(inode)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: service_operation returned:%d:\n", + __func__, + ret); + + op_release(new_op); + + if (!ret) { + drop_nlink(inode); + + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); + } + return ret; +} + +static int orangefs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct orangefs_inode_s *parent = ORANGEFS_I(dir); + struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; + struct inode *inode; + struct iattr iattr; + int mode = 0755; + int ret; + + gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__); + + if (!symname) + return -EINVAL; + + if (strlen(symname)+1 > ORANGEFS_NAME_MAX) + return -ENAMETOOLONG; + + new_op = op_alloc(ORANGEFS_VFS_OP_SYMLINK); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.sym.parent_refn = parent->refn; + + fill_default_sys_attrs(new_op->upcall.req.sym.attributes, + ORANGEFS_TYPE_SYMLINK, + mode); + + strncpy(new_op->upcall.req.sym.entry_name, + dentry->d_name.name, + ORANGEFS_NAME_MAX - 1); + strncpy(new_op->upcall.req.sym.target, symname, ORANGEFS_NAME_MAX - 1); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Symlink Got ORANGEFS handle %pU on fsid %d (ret=%d)\n", + &new_op->downcall.resp.sym.refn.khandle, + new_op->downcall.resp.sym.refn.fs_id, ret); + + if (ret < 0) { + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: failed with error code %d\n", + __func__, ret); + goto out; + } + + ref = new_op->downcall.resp.sym.refn; + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref); + if (IS_ERR(inode)) { + gossip_err + ("*** Failed to allocate orangefs symlink inode\n"); + ret = PTR_ERR(inode); + goto out; + } + /* + * This is necessary because orangefs_inode_getattr will not + * re-read symlink size as it is impossible for it to change. + * Invalidating the cache does not help. orangefs_new_inode + * does not set the correct size (it does not know symname). + */ + inode->i_size = strlen(symname); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Assigned symlink inode new number of %pU\n", + get_khandle_from_ino(inode)); + + d_instantiate_new(dentry, inode); + orangefs_set_timeout(dentry); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Inode (Symlink) %pU -> %pd\n", + get_khandle_from_ino(inode), + dentry); + + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); + ret = 0; +out: + op_release(new_op); + return ret; +} + +static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct orangefs_inode_s *parent = ORANGEFS_I(dir); + struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; + struct inode *inode; + struct iattr iattr; + int ret; + + new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.mkdir.parent_refn = parent->refn; + + fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, + ORANGEFS_TYPE_DIRECTORY, mode); + + strncpy(new_op->upcall.req.mkdir.d_name, + dentry->d_name.name, ORANGEFS_NAME_MAX - 1); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Mkdir Got ORANGEFS handle %pU on fsid %d\n", + &new_op->downcall.resp.mkdir.refn.khandle, + new_op->downcall.resp.mkdir.refn.fs_id); + + if (ret < 0) { + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: failed with error code %d\n", + __func__, ret); + goto out; + } + + ref = new_op->downcall.resp.mkdir.refn; + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref); + if (IS_ERR(inode)) { + gossip_err("*** Failed to allocate orangefs dir inode\n"); + ret = PTR_ERR(inode); + goto out; + } + + gossip_debug(GOSSIP_NAME_DEBUG, + "Assigned dir inode new number of %pU\n", + get_khandle_from_ino(inode)); + + d_instantiate_new(dentry, inode); + orangefs_set_timeout(dentry); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Inode (Directory) %pU -> %pd\n", + get_khandle_from_ino(inode), + dentry); + + /* + * NOTE: we have no good way to keep nlink consistent for directories + * across clients; keep constant at 1. + */ + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); +out: + op_release(new_op); + return ret; +} + +static int orangefs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + struct orangefs_kernel_op_s *new_op; + struct iattr iattr; + int ret; + + if (flags) + return -EINVAL; + + gossip_debug(GOSSIP_NAME_DEBUG, + "orangefs_rename: called (%pd2 => %pd2) ct=%d\n", + old_dentry, new_dentry, d_count(new_dentry)); + + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(new_dir); + __orangefs_setattr(new_dir, &iattr); + + new_op = op_alloc(ORANGEFS_VFS_OP_RENAME); + if (!new_op) + return -EINVAL; + + new_op->upcall.req.rename.old_parent_refn = ORANGEFS_I(old_dir)->refn; + new_op->upcall.req.rename.new_parent_refn = ORANGEFS_I(new_dir)->refn; + + strncpy(new_op->upcall.req.rename.d_old_name, + old_dentry->d_name.name, + ORANGEFS_NAME_MAX - 1); + strncpy(new_op->upcall.req.rename.d_new_name, + new_dentry->d_name.name, + ORANGEFS_NAME_MAX - 1); + + ret = service_operation(new_op, + "orangefs_rename", + get_interruptible_flag(old_dentry->d_inode)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "orangefs_rename: got downcall status %d\n", + ret); + + if (new_dentry->d_inode) + new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode); + + op_release(new_op); + return ret; +} + +/* ORANGEFS implementation of VFS inode operations for directories */ +const struct inode_operations orangefs_dir_inode_operations = { + .lookup = orangefs_lookup, + .get_acl = orangefs_get_acl, + .set_acl = orangefs_set_acl, + .create = orangefs_create, + .unlink = orangefs_unlink, + .symlink = orangefs_symlink, + .mkdir = orangefs_mkdir, + .rmdir = orangefs_unlink, + .rename = orangefs_rename, + .setattr = orangefs_setattr, + .getattr = orangefs_getattr, + .listxattr = orangefs_listxattr, + .permission = orangefs_permission, + .update_time = orangefs_update_time, +}; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c new file mode 100644 index 000000000..b501dc07f --- /dev/null +++ b/fs/orangefs/orangefs-bufmap.c @@ -0,0 +1,549 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" + +struct slot_map { + int c; + wait_queue_head_t q; + int count; + unsigned long *map; +}; + +static struct slot_map rw_map = { + .c = -1, + .q = __WAIT_QUEUE_HEAD_INITIALIZER(rw_map.q) +}; +static struct slot_map readdir_map = { + .c = -1, + .q = __WAIT_QUEUE_HEAD_INITIALIZER(readdir_map.q) +}; + + +static void install(struct slot_map *m, int count, unsigned long *map) +{ + spin_lock(&m->q.lock); + m->c = m->count = count; + m->map = map; + wake_up_all_locked(&m->q); + spin_unlock(&m->q.lock); +} + +static void mark_killed(struct slot_map *m) +{ + spin_lock(&m->q.lock); + m->c -= m->count + 1; + spin_unlock(&m->q.lock); +} + +static void run_down(struct slot_map *m) +{ + DEFINE_WAIT(wait); + spin_lock(&m->q.lock); + if (m->c != -1) { + for (;;) { + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail(&m->q, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + + if (m->c == -1) + break; + + spin_unlock(&m->q.lock); + schedule(); + spin_lock(&m->q.lock); + } + __remove_wait_queue(&m->q, &wait); + __set_current_state(TASK_RUNNING); + } + m->map = NULL; + spin_unlock(&m->q.lock); +} + +static void put(struct slot_map *m, int slot) +{ + int v; + spin_lock(&m->q.lock); + __clear_bit(slot, m->map); + v = ++m->c; + if (v > 0) + wake_up_locked(&m->q); + if (unlikely(v == -1)) /* finished dying */ + wake_up_all_locked(&m->q); + spin_unlock(&m->q.lock); +} + +static int wait_for_free(struct slot_map *m) +{ + long left = slot_timeout_secs * HZ; + DEFINE_WAIT(wait); + + do { + long n = left, t; + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail_exclusive(&m->q, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + if (m->c > 0) + break; + + if (m->c < 0) { + /* we are waiting for map to be installed */ + /* it would better be there soon, or we go away */ + if (n > ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ) + n = ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ; + } + spin_unlock(&m->q.lock); + t = schedule_timeout(n); + spin_lock(&m->q.lock); + if (unlikely(!t) && n != left && m->c < 0) + left = t; + else + left = t + (left - n); + if (signal_pending(current)) + left = -EINTR; + } while (left > 0); + + if (!list_empty(&wait.entry)) + list_del(&wait.entry); + else if (left <= 0 && waitqueue_active(&m->q)) + __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL); + __set_current_state(TASK_RUNNING); + + if (likely(left > 0)) + return 0; + + return left < 0 ? -EINTR : -ETIMEDOUT; +} + +static int get(struct slot_map *m) +{ + int res = 0; + spin_lock(&m->q.lock); + if (unlikely(m->c <= 0)) + res = wait_for_free(m); + if (likely(!res)) { + m->c--; + res = find_first_zero_bit(m->map, m->count); + __set_bit(res, m->map); + } + spin_unlock(&m->q.lock); + return res; +} + +/* used to describe mapped buffers */ +struct orangefs_bufmap_desc { + void __user *uaddr; /* user space address pointer */ + struct page **page_array; /* array of mapped pages */ + int array_count; /* size of above arrays */ + struct list_head list_link; +}; + +static struct orangefs_bufmap { + int desc_size; + int desc_shift; + int desc_count; + int total_size; + int page_count; + + struct page **page_array; + struct orangefs_bufmap_desc *desc_array; + + /* array to track usage of buffer descriptors */ + unsigned long *buffer_index_array; + + /* array to track usage of buffer descriptors for readdir */ +#define N DIV_ROUND_UP(ORANGEFS_READDIR_DEFAULT_DESC_COUNT, BITS_PER_LONG) + unsigned long readdir_index_array[N]; +#undef N +} *__orangefs_bufmap; + +static DEFINE_SPINLOCK(orangefs_bufmap_lock); + +static void +orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap) +{ + unpin_user_pages(bufmap->page_array, bufmap->page_count); +} + +static void +orangefs_bufmap_free(struct orangefs_bufmap *bufmap) +{ + kfree(bufmap->page_array); + kfree(bufmap->desc_array); + bitmap_free(bufmap->buffer_index_array); + kfree(bufmap); +} + +/* + * XXX: Can the size and shift change while the caller gives up the + * XXX: lock between calling this and doing something useful? + */ + +int orangefs_bufmap_size_query(void) +{ + struct orangefs_bufmap *bufmap; + int size = 0; + spin_lock(&orangefs_bufmap_lock); + bufmap = __orangefs_bufmap; + if (bufmap) + size = bufmap->desc_size; + spin_unlock(&orangefs_bufmap_lock); + return size; +} + +int orangefs_bufmap_shift_query(void) +{ + struct orangefs_bufmap *bufmap; + int shift = 0; + spin_lock(&orangefs_bufmap_lock); + bufmap = __orangefs_bufmap; + if (bufmap) + shift = bufmap->desc_shift; + spin_unlock(&orangefs_bufmap_lock); + return shift; +} + +static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); +static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); + +static struct orangefs_bufmap * +orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) +{ + struct orangefs_bufmap *bufmap; + + bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL); + if (!bufmap) + goto out; + + bufmap->total_size = user_desc->total_size; + bufmap->desc_count = user_desc->count; + bufmap->desc_size = user_desc->size; + bufmap->desc_shift = ilog2(bufmap->desc_size); + + bufmap->buffer_index_array = bitmap_zalloc(bufmap->desc_count, GFP_KERNEL); + if (!bufmap->buffer_index_array) + goto out_free_bufmap; + + bufmap->desc_array = + kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc), + GFP_KERNEL); + if (!bufmap->desc_array) + goto out_free_index_array; + + bufmap->page_count = bufmap->total_size / PAGE_SIZE; + + /* allocate storage to track our page mappings */ + bufmap->page_array = + kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL); + if (!bufmap->page_array) + goto out_free_desc_array; + + return bufmap; + +out_free_desc_array: + kfree(bufmap->desc_array); +out_free_index_array: + bitmap_free(bufmap->buffer_index_array); +out_free_bufmap: + kfree(bufmap); +out: + return NULL; +} + +static int +orangefs_bufmap_map(struct orangefs_bufmap *bufmap, + struct ORANGEFS_dev_map_desc *user_desc) +{ + int pages_per_desc = bufmap->desc_size / PAGE_SIZE; + int offset = 0, ret, i; + + /* map the pages */ + ret = pin_user_pages_fast((unsigned long)user_desc->ptr, + bufmap->page_count, FOLL_WRITE, bufmap->page_array); + + if (ret < 0) + return ret; + + if (ret != bufmap->page_count) { + gossip_err("orangefs error: asked for %d pages, only got %d.\n", + bufmap->page_count, ret); + + for (i = 0; i < ret; i++) { + SetPageError(bufmap->page_array[i]); + unpin_user_page(bufmap->page_array[i]); + } + return -ENOMEM; + } + + /* + * ideally we want to get kernel space pointers for each page, but + * we can't kmap that many pages at once if highmem is being used. + * so instead, we just kmap/kunmap the page address each time the + * kaddr is needed. + */ + for (i = 0; i < bufmap->page_count; i++) + flush_dcache_page(bufmap->page_array[i]); + + /* build a list of available descriptors */ + for (offset = 0, i = 0; i < bufmap->desc_count; i++) { + bufmap->desc_array[i].page_array = &bufmap->page_array[offset]; + bufmap->desc_array[i].array_count = pages_per_desc; + bufmap->desc_array[i].uaddr = + (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE)); + offset += pages_per_desc; + } + + return 0; +} + +/* + * orangefs_bufmap_initialize() + * + * initializes the mapped buffer interface + * + * returns 0 on success, -errno on failure + */ +int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc) +{ + struct orangefs_bufmap *bufmap; + int ret = -EINVAL; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "orangefs_bufmap_initialize: called (ptr (" + "%p) sz (%d) cnt(%d).\n", + user_desc->ptr, + user_desc->size, + user_desc->count); + + if (user_desc->total_size < 0 || + user_desc->size < 0 || + user_desc->count < 0) + goto out; + + /* + * sanity check alignment and size of buffer that caller wants to + * work with + */ + if (PAGE_ALIGN((unsigned long)user_desc->ptr) != + (unsigned long)user_desc->ptr) { + gossip_err("orangefs error: memory alignment (front). %p\n", + user_desc->ptr); + goto out; + } + + if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size)) + != (unsigned long)(user_desc->ptr + user_desc->total_size)) { + gossip_err("orangefs error: memory alignment (back).(%p + %d)\n", + user_desc->ptr, + user_desc->total_size); + goto out; + } + + if (user_desc->total_size != (user_desc->size * user_desc->count)) { + gossip_err("orangefs error: user provided an oddly sized buffer: (%d, %d, %d)\n", + user_desc->total_size, + user_desc->size, + user_desc->count); + goto out; + } + + if ((user_desc->size % PAGE_SIZE) != 0) { + gossip_err("orangefs error: bufmap size not page size divisible (%d).\n", + user_desc->size); + goto out; + } + + ret = -ENOMEM; + bufmap = orangefs_bufmap_alloc(user_desc); + if (!bufmap) + goto out; + + ret = orangefs_bufmap_map(bufmap, user_desc); + if (ret) + goto out_free_bufmap; + + + spin_lock(&orangefs_bufmap_lock); + if (__orangefs_bufmap) { + spin_unlock(&orangefs_bufmap_lock); + gossip_err("orangefs: error: bufmap already initialized.\n"); + ret = -EINVAL; + goto out_unmap_bufmap; + } + __orangefs_bufmap = bufmap; + install(&rw_map, + bufmap->desc_count, + bufmap->buffer_index_array); + install(&readdir_map, + ORANGEFS_READDIR_DEFAULT_DESC_COUNT, + bufmap->readdir_index_array); + spin_unlock(&orangefs_bufmap_lock); + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "orangefs_bufmap_initialize: exiting normally\n"); + return 0; + +out_unmap_bufmap: + orangefs_bufmap_unmap(bufmap); +out_free_bufmap: + orangefs_bufmap_free(bufmap); +out: + return ret; +} + +/* + * orangefs_bufmap_finalize() + * + * shuts down the mapped buffer interface and releases any resources + * associated with it + * + * no return value + */ +void orangefs_bufmap_finalize(void) +{ + struct orangefs_bufmap *bufmap = __orangefs_bufmap; + if (!bufmap) + return; + gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs_bufmap_finalize: called\n"); + mark_killed(&rw_map); + mark_killed(&readdir_map); + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "orangefs_bufmap_finalize: exiting normally\n"); +} + +void orangefs_bufmap_run_down(void) +{ + struct orangefs_bufmap *bufmap = __orangefs_bufmap; + if (!bufmap) + return; + run_down(&rw_map); + run_down(&readdir_map); + spin_lock(&orangefs_bufmap_lock); + __orangefs_bufmap = NULL; + spin_unlock(&orangefs_bufmap_lock); + orangefs_bufmap_unmap(bufmap); + orangefs_bufmap_free(bufmap); +} + +/* + * orangefs_bufmap_get() + * + * gets a free mapped buffer descriptor, will sleep until one becomes + * available if necessary + * + * returns slot on success, -errno on failure + */ +int orangefs_bufmap_get(void) +{ + return get(&rw_map); +} + +/* + * orangefs_bufmap_put() + * + * returns a mapped buffer descriptor to the collection + * + * no return value + */ +void orangefs_bufmap_put(int buffer_index) +{ + put(&rw_map, buffer_index); +} + +/* + * orangefs_readdir_index_get() + * + * gets a free descriptor, will sleep until one becomes + * available if necessary. + * Although the readdir buffers are not mapped into kernel space + * we could do that at a later point of time. Regardless, these + * indices are used by the client-core. + * + * returns slot on success, -errno on failure + */ +int orangefs_readdir_index_get(void) +{ + return get(&readdir_map); +} + +void orangefs_readdir_index_put(int buffer_index) +{ + put(&readdir_map, buffer_index); +} + +/* + * we've been handed an iovec, we need to copy it to + * the shared memory descriptor at "buffer_index". + */ +int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter, + int buffer_index, + size_t size) +{ + struct orangefs_bufmap_desc *to; + int i; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "%s: buffer_index:%d: size:%zu:\n", + __func__, buffer_index, size); + + to = &__orangefs_bufmap->desc_array[buffer_index]; + for (i = 0; size; i++) { + struct page *page = to->page_array[i]; + size_t n = size; + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (copy_page_from_iter(page, 0, n, iter) != n) + return -EFAULT; + size -= n; + } + return 0; +} + +/* + * we've been handed an iovec, we need to fill it from + * the shared memory descriptor at "buffer_index". + */ +int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, + int buffer_index, + size_t size) +{ + struct orangefs_bufmap_desc *from; + int i; + + from = &__orangefs_bufmap->desc_array[buffer_index]; + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "%s: buffer_index:%d: size:%zu:\n", + __func__, buffer_index, size); + + + for (i = 0; size; i++) { + struct page *page = from->page_array[i]; + size_t n = size; + if (n > PAGE_SIZE) + n = PAGE_SIZE; + n = copy_page_to_iter(page, 0, n, iter); + if (!n) + return -EFAULT; + size -= n; + } + return 0; +} + +void orangefs_bufmap_page_fill(void *page_to, + int buffer_index, + int slot_index) +{ + struct orangefs_bufmap_desc *from; + void *page_from; + + from = &__orangefs_bufmap->desc_array[buffer_index]; + page_from = kmap_atomic(from->page_array[slot_index]); + memcpy(page_to, page_from, PAGE_SIZE); + kunmap_atomic(page_from); +} diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h new file mode 100644 index 000000000..75b2d2833 --- /dev/null +++ b/fs/orangefs/orangefs-bufmap.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#ifndef __ORANGEFS_BUFMAP_H +#define __ORANGEFS_BUFMAP_H + +int orangefs_bufmap_size_query(void); + +int orangefs_bufmap_shift_query(void); + +int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc); + +void orangefs_bufmap_finalize(void); + +void orangefs_bufmap_run_down(void); + +int orangefs_bufmap_get(void); + +void orangefs_bufmap_put(int buffer_index); + +int orangefs_readdir_index_get(void); + +void orangefs_readdir_index_put(int buffer_index); + +int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter, + int buffer_index, + size_t size); + +int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, + int buffer_index, + size_t size); + +void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index); + +#endif /* __ORANGEFS_BUFMAP_H */ diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c new file mode 100644 index 000000000..3b6982bf6 --- /dev/null +++ b/fs/orangefs/orangefs-cache.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" + +/* tags assigned to kernel upcall operations */ +static __u64 next_tag_value; +static DEFINE_SPINLOCK(next_tag_value_lock); + +/* the orangefs memory caches */ + +/* a cache for orangefs upcall/downcall operations */ +static struct kmem_cache *op_cache; + +int op_cache_initialize(void) +{ + op_cache = kmem_cache_create("orangefs_op_cache", + sizeof(struct orangefs_kernel_op_s), + 0, + ORANGEFS_CACHE_CREATE_FLAGS, + NULL); + + if (!op_cache) { + gossip_err("Cannot create orangefs_op_cache\n"); + return -ENOMEM; + } + + /* initialize our atomic tag counter */ + spin_lock(&next_tag_value_lock); + next_tag_value = 100; + spin_unlock(&next_tag_value_lock); + return 0; +} + +int op_cache_finalize(void) +{ + kmem_cache_destroy(op_cache); + return 0; +} + +char *get_opname_string(struct orangefs_kernel_op_s *new_op) +{ + if (new_op) { + __s32 type = new_op->upcall.type; + + if (type == ORANGEFS_VFS_OP_FILE_IO) + return "OP_FILE_IO"; + else if (type == ORANGEFS_VFS_OP_LOOKUP) + return "OP_LOOKUP"; + else if (type == ORANGEFS_VFS_OP_CREATE) + return "OP_CREATE"; + else if (type == ORANGEFS_VFS_OP_GETATTR) + return "OP_GETATTR"; + else if (type == ORANGEFS_VFS_OP_REMOVE) + return "OP_REMOVE"; + else if (type == ORANGEFS_VFS_OP_MKDIR) + return "OP_MKDIR"; + else if (type == ORANGEFS_VFS_OP_READDIR) + return "OP_READDIR"; + else if (type == ORANGEFS_VFS_OP_READDIRPLUS) + return "OP_READDIRPLUS"; + else if (type == ORANGEFS_VFS_OP_SETATTR) + return "OP_SETATTR"; + else if (type == ORANGEFS_VFS_OP_SYMLINK) + return "OP_SYMLINK"; + else if (type == ORANGEFS_VFS_OP_RENAME) + return "OP_RENAME"; + else if (type == ORANGEFS_VFS_OP_STATFS) + return "OP_STATFS"; + else if (type == ORANGEFS_VFS_OP_TRUNCATE) + return "OP_TRUNCATE"; + else if (type == ORANGEFS_VFS_OP_RA_FLUSH) + return "OP_RA_FLUSH"; + else if (type == ORANGEFS_VFS_OP_FS_MOUNT) + return "OP_FS_MOUNT"; + else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) + return "OP_FS_UMOUNT"; + else if (type == ORANGEFS_VFS_OP_GETXATTR) + return "OP_GETXATTR"; + else if (type == ORANGEFS_VFS_OP_SETXATTR) + return "OP_SETXATTR"; + else if (type == ORANGEFS_VFS_OP_LISTXATTR) + return "OP_LISTXATTR"; + else if (type == ORANGEFS_VFS_OP_REMOVEXATTR) + return "OP_REMOVEXATTR"; + else if (type == ORANGEFS_VFS_OP_PARAM) + return "OP_PARAM"; + else if (type == ORANGEFS_VFS_OP_PERF_COUNT) + return "OP_PERF_COUNT"; + else if (type == ORANGEFS_VFS_OP_CANCEL) + return "OP_CANCEL"; + else if (type == ORANGEFS_VFS_OP_FSYNC) + return "OP_FSYNC"; + else if (type == ORANGEFS_VFS_OP_FSKEY) + return "OP_FSKEY"; + else if (type == ORANGEFS_VFS_OP_FEATURES) + return "OP_FEATURES"; + } + return "OP_UNKNOWN?"; +} + +void orangefs_new_tag(struct orangefs_kernel_op_s *op) +{ + spin_lock(&next_tag_value_lock); + op->tag = next_tag_value++; + if (next_tag_value == 0) + next_tag_value = 100; + spin_unlock(&next_tag_value_lock); +} + +struct orangefs_kernel_op_s *op_alloc(__s32 type) +{ + struct orangefs_kernel_op_s *new_op = NULL; + + new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL); + if (new_op) { + INIT_LIST_HEAD(&new_op->list); + spin_lock_init(&new_op->lock); + init_completion(&new_op->waitq); + + new_op->upcall.type = ORANGEFS_VFS_OP_INVALID; + new_op->downcall.type = ORANGEFS_VFS_OP_INVALID; + new_op->downcall.status = -1; + + new_op->op_state = OP_VFS_STATE_UNKNOWN; + + /* initialize the op specific tag and upcall credentials */ + orangefs_new_tag(new_op); + new_op->upcall.type = type; + new_op->attempts = 0; + gossip_debug(GOSSIP_CACHE_DEBUG, + "Alloced OP (%p: %llu %s)\n", + new_op, + llu(new_op->tag), + get_opname_string(new_op)); + + new_op->upcall.uid = from_kuid(&init_user_ns, + current_fsuid()); + + new_op->upcall.gid = from_kgid(&init_user_ns, + current_fsgid()); + } else { + gossip_err("op_alloc: kmem_cache_zalloc failed!\n"); + } + return new_op; +} + +void op_release(struct orangefs_kernel_op_s *orangefs_op) +{ + if (orangefs_op) { + gossip_debug(GOSSIP_CACHE_DEBUG, + "Releasing OP (%p: %llu)\n", + orangefs_op, + llu(orangefs_op->tag)); + kmem_cache_free(op_cache, orangefs_op); + } else { + gossip_err("NULL pointer in op_release\n"); + } +} diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h new file mode 100644 index 000000000..6e079d423 --- /dev/null +++ b/fs/orangefs/orangefs-debug.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +/* This file just defines debugging masks to be used with the gossip + * logging utility. All debugging masks for ORANGEFS are kept here to make + * sure we don't have collisions. + */ + +#ifndef __ORANGEFS_DEBUG_H +#define __ORANGEFS_DEBUG_H + +#ifdef __KERNEL__ +#include <linux/types.h> +#include <linux/kernel.h> +#else +#include <stdint.h> +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +#define GOSSIP_NO_DEBUG (__u64)0 + +#define GOSSIP_SUPER_DEBUG ((__u64)1 << 0) +#define GOSSIP_INODE_DEBUG ((__u64)1 << 1) +#define GOSSIP_FILE_DEBUG ((__u64)1 << 2) +#define GOSSIP_DIR_DEBUG ((__u64)1 << 3) +#define GOSSIP_UTILS_DEBUG ((__u64)1 << 4) +#define GOSSIP_WAIT_DEBUG ((__u64)1 << 5) +#define GOSSIP_ACL_DEBUG ((__u64)1 << 6) +#define GOSSIP_DCACHE_DEBUG ((__u64)1 << 7) +#define GOSSIP_DEV_DEBUG ((__u64)1 << 8) +#define GOSSIP_NAME_DEBUG ((__u64)1 << 9) +#define GOSSIP_BUFMAP_DEBUG ((__u64)1 << 10) +#define GOSSIP_CACHE_DEBUG ((__u64)1 << 11) +#define GOSSIP_DEBUGFS_DEBUG ((__u64)1 << 12) +#define GOSSIP_XATTR_DEBUG ((__u64)1 << 13) +#define GOSSIP_INIT_DEBUG ((__u64)1 << 14) +#define GOSSIP_SYSFS_DEBUG ((__u64)1 << 15) + +#define GOSSIP_MAX_NR 16 +#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1) + +/* a private internal type */ +struct __keyword_mask_s { + const char *keyword; + __u64 mask_val; +}; + +/* + * Map all kmod keywords to kmod debug masks here. Keep this + * structure "packed": + * + * "all" is always last... + * + * keyword mask_val index + * foo 1 0 + * bar 2 1 + * baz 4 2 + * qux 8 3 + * . . . + */ +static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { + {"super", GOSSIP_SUPER_DEBUG}, + {"inode", GOSSIP_INODE_DEBUG}, + {"file", GOSSIP_FILE_DEBUG}, + {"dir", GOSSIP_DIR_DEBUG}, + {"utils", GOSSIP_UTILS_DEBUG}, + {"wait", GOSSIP_WAIT_DEBUG}, + {"acl", GOSSIP_ACL_DEBUG}, + {"dcache", GOSSIP_DCACHE_DEBUG}, + {"dev", GOSSIP_DEV_DEBUG}, + {"name", GOSSIP_NAME_DEBUG}, + {"bufmap", GOSSIP_BUFMAP_DEBUG}, + {"cache", GOSSIP_CACHE_DEBUG}, + {"debugfs", GOSSIP_DEBUGFS_DEBUG}, + {"xattr", GOSSIP_XATTR_DEBUG}, + {"init", GOSSIP_INIT_DEBUG}, + {"sysfs", GOSSIP_SYSFS_DEBUG}, + {"none", GOSSIP_NO_DEBUG}, + {"all", GOSSIP_MAX_DEBUG} +}; + +static const int num_kmod_keyword_mask_map = (int) + (ARRAY_SIZE(s_kmod_keyword_mask_map)); + +#endif /* __ORANGEFS_DEBUG_H */ diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c new file mode 100644 index 000000000..1b508f543 --- /dev/null +++ b/fs/orangefs/orangefs-debugfs.c @@ -0,0 +1,1010 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * What: /sys/kernel/debug/orangefs/debug-help + * Date: June 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * List of client and kernel debug keywords. + * + * + * What: /sys/kernel/debug/orangefs/client-debug + * Date: June 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Debug setting for "the client", the userspace + * helper for the kernel module. + * + * + * What: /sys/kernel/debug/orangefs/kernel-debug + * Date: June 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Debug setting for the orangefs kernel module. + * + * Any of the keywords, or comma-separated lists + * of keywords, from debug-help can be catted to + * client-debug or kernel-debug. + * + * "none", "all" and "verbose" are special keywords + * for client-debug. Setting client-debug to "all" + * is kind of like trying to drink water from a + * fire hose, "verbose" triggers most of the same + * output except for the constant flow of output + * from the main wait loop. + * + * "none" and "all" are similar settings for kernel-debug + * no need for a "verbose". + */ +#include <linux/debugfs.h> +#include <linux/slab.h> + +#include <linux/uaccess.h> + +#include "orangefs-debugfs.h" +#include "protocol.h" +#include "orangefs-kernel.h" + +#define DEBUG_HELP_STRING_SIZE 4096 +#define HELP_STRING_UNINITIALIZED \ + "Client Debug Keywords are unknown until the first time\n" \ + "the client is started after boot.\n" +#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help" +#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug" +#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug" +#define ORANGEFS_VERBOSE "verbose" +#define ORANGEFS_ALL "all" + +/* + * An array of client_debug_mask will be built to hold debug keyword/mask + * values fetched from userspace. + */ +struct client_debug_mask { + char *keyword; + __u64 mask1; + __u64 mask2; +}; + +static void orangefs_kernel_debug_init(void); + +static int orangefs_debug_help_open(struct inode *, struct file *); +static void *help_start(struct seq_file *, loff_t *); +static void *help_next(struct seq_file *, void *, loff_t *); +static void help_stop(struct seq_file *, void *); +static int help_show(struct seq_file *, void *); + +static int orangefs_debug_open(struct inode *, struct file *); + +static ssize_t orangefs_debug_read(struct file *, + char __user *, + size_t, + loff_t *); + +static ssize_t orangefs_debug_write(struct file *, + const char __user *, + size_t, + loff_t *); + +static int orangefs_prepare_cdm_array(char *); +static void debug_mask_to_string(void *, int); +static void do_k_string(void *, int); +static void do_c_string(void *, int); +static int keyword_is_amalgam(char *); +static int check_amalgam_keyword(void *, int); +static void debug_string_to_mask(char *, void *, int); +static void do_c_mask(int, char *, struct client_debug_mask **); +static void do_k_mask(int, char *, __u64 **); + +static char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none"; +static char *debug_help_string; +static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; +static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; + +static struct dentry *client_debug_dentry; +static struct dentry *debug_dir; + +static unsigned int kernel_mask_set_mod_init; +static int orangefs_debug_disabled = 1; +static int help_string_initialized; + +static const struct seq_operations help_debug_ops = { + .start = help_start, + .next = help_next, + .stop = help_stop, + .show = help_show, +}; + +static const struct file_operations debug_help_fops = { + .owner = THIS_MODULE, + .open = orangefs_debug_help_open, + .read = seq_read, + .release = seq_release, + .llseek = seq_lseek, +}; + +static const struct file_operations kernel_debug_fops = { + .owner = THIS_MODULE, + .open = orangefs_debug_open, + .read = orangefs_debug_read, + .write = orangefs_debug_write, + .llseek = generic_file_llseek, +}; + +static int client_all_index; +static int client_verbose_index; + +static struct client_debug_mask *cdm_array; +static int cdm_element_count; + +static struct client_debug_mask client_debug_mask; + +/* + * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and + * ORANGEFS_KMOD_DEBUG_FILE. + */ +static DEFINE_MUTEX(orangefs_debug_lock); + +/* Used to protect data in ORANGEFS_KMOD_DEBUG_HELP_FILE */ +static DEFINE_MUTEX(orangefs_help_file_lock); + +/* + * initialize kmod debug operations, create orangefs debugfs dir and + * ORANGEFS_KMOD_DEBUG_HELP_FILE. + */ +void orangefs_debugfs_init(int debug_mask) +{ + /* convert input debug mask to a 64-bit unsigned integer */ + orangefs_gossip_debug_mask = (unsigned long long)debug_mask; + + /* + * set the kernel's gossip debug string; invalid mask values will + * be ignored. + */ + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); + + /* remove any invalid values from the mask */ + debug_string_to_mask(kernel_debug_string, &orangefs_gossip_debug_mask, + 0); + + /* + * if the mask has a non-zero value, then indicate that the mask + * was set when the kernel module was loaded. The orangefs dev ioctl + * command will look at this boolean to determine if the kernel's + * debug mask should be overwritten when the client-core is started. + */ + if (orangefs_gossip_debug_mask != 0) + kernel_mask_set_mod_init = true; + + pr_info("%s: called with debug mask: :%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + + debug_dir = debugfs_create_dir("orangefs", NULL); + + debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE, 0444, debug_dir, + debug_help_string, &debug_help_fops); + + orangefs_debug_disabled = 0; + + orangefs_kernel_debug_init(); +} + +/* + * initialize the kernel-debug file. + */ +static void orangefs_kernel_debug_init(void) +{ + static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); + + if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { + strcpy(k_buffer, kernel_debug_string); + strcat(k_buffer, "\n"); + } else { + strcpy(k_buffer, "none\n"); + pr_info("%s: overflow 1!\n", __func__); + } + + debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer, + &kernel_debug_fops); +} + + +void orangefs_debugfs_cleanup(void) +{ + debugfs_remove_recursive(debug_dir); + kfree(debug_help_string); + debug_help_string = NULL; +} + +/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */ +static int orangefs_debug_help_open(struct inode *inode, struct file *file) +{ + int rc = -ENODEV; + int ret; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "orangefs_debug_help_open: start\n"); + + if (orangefs_debug_disabled) + goto out; + + ret = seq_open(file, &help_debug_ops); + if (ret) + goto out; + + ((struct seq_file *)(file->private_data))->private = inode->i_private; + + rc = 0; + +out: + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "orangefs_debug_help_open: rc:%d:\n", + rc); + return rc; +} + +/* + * I think start always gets called again after stop. Start + * needs to return NULL when it is done. The whole "payload" + * in this case is a single (long) string, so by the second + * time we get to start (pos = 1), we're done. + */ +static void *help_start(struct seq_file *m, loff_t *pos) +{ + void *payload = NULL; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_start: start\n"); + + mutex_lock(&orangefs_help_file_lock); + + if (*pos == 0) + payload = m->private; + + return payload; +} + +static void *help_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n"); + + return NULL; +} + +static void help_stop(struct seq_file *m, void *p) +{ + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_stop: start\n"); + mutex_unlock(&orangefs_help_file_lock); +} + +static int help_show(struct seq_file *m, void *v) +{ + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_show: start\n"); + + seq_puts(m, v); + + return 0; +} + +/* + * initialize the client-debug file. + */ +static void orangefs_client_debug_init(void) +{ + + static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); + + if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { + strcpy(c_buffer, client_debug_string); + strcat(c_buffer, "\n"); + } else { + strcpy(c_buffer, "none\n"); + pr_info("%s: overflow! 2\n", __func__); + } + + client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE, + 0444, + debug_dir, + c_buffer, + &kernel_debug_fops); +} + +/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ +static int orangefs_debug_open(struct inode *inode, struct file *file) +{ + int rc = -ENODEV; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "%s: orangefs_debug_disabled: %d\n", + __func__, + orangefs_debug_disabled); + + if (orangefs_debug_disabled) + goto out; + + rc = 0; + mutex_lock(&orangefs_debug_lock); + file->private_data = inode->i_private; + mutex_unlock(&orangefs_debug_lock); + +out: + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "orangefs_debug_open: rc: %d\n", + rc); + return rc; +} + +static ssize_t orangefs_debug_read(struct file *file, + char __user *ubuf, + size_t count, + loff_t *ppos) +{ + char *buf; + int sprintf_ret; + ssize_t read_ret = -ENOMEM; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_read: start\n"); + + buf = kmalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); + if (!buf) + goto out; + + mutex_lock(&orangefs_debug_lock); + sprintf_ret = sprintf(buf, "%s", (char *)file->private_data); + mutex_unlock(&orangefs_debug_lock); + + read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret); + + kfree(buf); + +out: + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "orangefs_debug_read: ret: %zu\n", + read_ret); + + return read_ret; +} + +static ssize_t orangefs_debug_write(struct file *file, + const char __user *ubuf, + size_t count, + loff_t *ppos) +{ + char *buf; + int rc = -EFAULT; + size_t silly = 0; + char *debug_string; + struct orangefs_kernel_op_s *new_op = NULL; + struct client_debug_mask c_mask = { NULL, 0, 0 }; + char *s; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "orangefs_debug_write: %pD\n", + file); + + if (count == 0) + return 0; + + /* + * Thwart users who try to jamb a ridiculous number + * of bytes into the debug file... + */ + if (count > ORANGEFS_MAX_DEBUG_STRING_LEN + 1) { + silly = count; + count = ORANGEFS_MAX_DEBUG_STRING_LEN + 1; + } + + buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); + if (!buf) + goto out; + + if (copy_from_user(buf, ubuf, count - 1)) { + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "%s: copy_from_user failed!\n", + __func__); + goto out; + } + + /* + * Map the keyword string from userspace into a valid debug mask. + * The mapping process involves mapping the human-inputted string + * into a valid mask, and then rebuilding the string from the + * verified valid mask. + * + * A service operation is required to set a new client-side + * debug mask. + */ + if (!strcmp(file->f_path.dentry->d_name.name, + ORANGEFS_KMOD_DEBUG_FILE)) { + debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0); + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); + debug_string = kernel_debug_string; + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "New kernel debug string is %s\n", + kernel_debug_string); + } else { + /* Can't reset client debug mask if client is not running. */ + if (is_daemon_in_service()) { + pr_info("%s: Client not running :%d:\n", + __func__, + is_daemon_in_service()); + goto out; + } + + debug_string_to_mask(buf, &c_mask, 1); + debug_mask_to_string(&c_mask, 1); + debug_string = client_debug_string; + + new_op = op_alloc(ORANGEFS_VFS_OP_PARAM); + if (!new_op) { + pr_info("%s: op_alloc failed!\n", __func__); + goto out; + } + + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES; + new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; + memset(new_op->upcall.req.param.s_value, + 0, + ORANGEFS_MAX_DEBUG_STRING_LEN); + sprintf(new_op->upcall.req.param.s_value, + "%llx %llx\n", + c_mask.mask1, + c_mask.mask2); + + /* service_operation returns 0 on success... */ + rc = service_operation(new_op, + "orangefs_param", + ORANGEFS_OP_INTERRUPTIBLE); + + if (rc) + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "%s: service_operation failed! rc:%d:\n", + __func__, + rc); + + op_release(new_op); + } + + mutex_lock(&orangefs_debug_lock); + s = file_inode(file)->i_private; + memset(s, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + sprintf(s, "%s\n", debug_string); + mutex_unlock(&orangefs_debug_lock); + + *ppos += count; + if (silly) + rc = silly; + else + rc = count; + +out: + gossip_debug(GOSSIP_DEBUGFS_DEBUG, + "orangefs_debug_write: rc: %d\n", + rc); + kfree(buf); + return rc; +} + +/* + * After obtaining a string representation of the client's debug + * keywords and their associated masks, this function is called to build an + * array of these values. + */ +static int orangefs_prepare_cdm_array(char *debug_array_string) +{ + int i; + int rc = -EINVAL; + char *cds_head = NULL; + char *cds_delimiter = NULL; + int keyword_len = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + /* + * figure out how many elements the cdm_array needs. + */ + for (i = 0; i < strlen(debug_array_string); i++) + if (debug_array_string[i] == '\n') + cdm_element_count++; + + if (!cdm_element_count) { + pr_info("No elements in client debug array string!\n"); + goto out; + } + + cdm_array = kcalloc(cdm_element_count, sizeof(*cdm_array), GFP_KERNEL); + if (!cdm_array) { + rc = -ENOMEM; + goto out; + } + + cds_head = debug_array_string; + + for (i = 0; i < cdm_element_count; i++) { + cds_delimiter = strchr(cds_head, '\n'); + *cds_delimiter = '\0'; + + keyword_len = strcspn(cds_head, " "); + + cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL); + if (!cdm_array[i].keyword) { + rc = -ENOMEM; + goto out; + } + + sscanf(cds_head, + "%s %llx %llx", + cdm_array[i].keyword, + (unsigned long long *)&(cdm_array[i].mask1), + (unsigned long long *)&(cdm_array[i].mask2)); + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE)) + client_verbose_index = i; + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL)) + client_all_index = i; + + cds_head = cds_delimiter + 1; + } + + rc = cdm_element_count; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc); + +out: + + return rc; + +} + +/* + * /sys/kernel/debug/orangefs/debug-help can be catted to + * see all the available kernel and client debug keywords. + * + * When orangefs.ko initializes, we have no idea what keywords the + * client supports, nor their associated masks. + * + * We pass through this function once at module-load and stamp a + * boilerplate "we don't know" message for the client in the + * debug-help file. We pass through here again when the client + * starts and then we can fill out the debug-help file fully. + * + * The client might be restarted any number of times between + * module reloads, we only build the debug-help file the first time. + */ +int orangefs_prepare_debugfs_help_string(int at_boot) +{ + char *client_title = "Client Debug Keywords:\n"; + char *kernel_title = "Kernel Debug Keywords:\n"; + size_t string_size = DEBUG_HELP_STRING_SIZE; + size_t result_size; + size_t i; + char *new; + int rc = -EINVAL; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (at_boot) + client_title = HELP_STRING_UNINITIALIZED; + + /* build a new debug_help_string. */ + new = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL); + if (!new) { + rc = -ENOMEM; + goto out; + } + + /* + * strlcat(dst, src, size) will append at most + * "size - strlen(dst) - 1" bytes of src onto dst, + * null terminating the result, and return the total + * length of the string it tried to create. + * + * We'll just plow through here building our new debug + * help string and let strlcat take care of assuring that + * dst doesn't overflow. + */ + strlcat(new, client_title, string_size); + + if (!at_boot) { + + /* + * fill the client keyword/mask array and remember + * how many elements there were. + */ + cdm_element_count = + orangefs_prepare_cdm_array(client_debug_array_string); + if (cdm_element_count <= 0) { + kfree(new); + goto out; + } + + for (i = 0; i < cdm_element_count; i++) { + strlcat(new, "\t", string_size); + strlcat(new, cdm_array[i].keyword, string_size); + strlcat(new, "\n", string_size); + } + } + + strlcat(new, "\n", string_size); + strlcat(new, kernel_title, string_size); + + for (i = 0; i < num_kmod_keyword_mask_map; i++) { + strlcat(new, "\t", string_size); + strlcat(new, s_kmod_keyword_mask_map[i].keyword, string_size); + result_size = strlcat(new, "\n", string_size); + } + + /* See if we tried to put too many bytes into "new"... */ + if (result_size >= string_size) { + kfree(new); + goto out; + } + + if (at_boot) { + debug_help_string = new; + } else { + mutex_lock(&orangefs_help_file_lock); + memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE); + strlcat(debug_help_string, new, string_size); + mutex_unlock(&orangefs_help_file_lock); + kfree(new); + } + + rc = 0; + +out: return rc; + +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_mask_to_string(void *mask, int type) +{ + int i; + int len = 0; + char *debug_string; + int element_count = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + debug_string = client_debug_string; + element_count = cdm_element_count; + } else { + debug_string = kernel_debug_string; + element_count = num_kmod_keyword_mask_map; + } + + memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + + /* + * Some keywords, like "all" or "verbose", are amalgams of + * numerous other keywords. Make a special check for those + * before grinding through the whole mask only to find out + * later... + */ + if (check_amalgam_keyword(mask, type)) + goto out; + + /* Build the debug string. */ + for (i = 0; i < element_count; i++) + if (type) + do_c_string(mask, i); + else + do_k_string(mask, i); + + len = strlen(debug_string); + + if ((len) && (type)) + client_debug_string[len - 1] = '\0'; + else if (len) + kernel_debug_string[len - 1] = '\0'; + else if (type) + strcpy(client_debug_string, "none"); + else + strcpy(kernel_debug_string, "none"); + +out: +gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string); + + return; + +} + +static void do_k_string(void *k_mask, int index) +{ + __u64 *mask = (__u64 *) k_mask; + + if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword)) + goto out; + + if (*mask & s_kmod_keyword_mask_map[index].mask_val) { + if ((strlen(kernel_debug_string) + + strlen(s_kmod_keyword_mask_map[index].keyword)) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { + strcat(kernel_debug_string, + s_kmod_keyword_mask_map[index].keyword); + strcat(kernel_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(kernel_debug_string, ORANGEFS_ALL); + goto out; + } + } + +out: + + return; +} + +static void do_c_string(void *c_mask, int index) +{ + struct client_debug_mask *mask = (struct client_debug_mask *) c_mask; + + if (keyword_is_amalgam(cdm_array[index].keyword)) + goto out; + + if ((mask->mask1 & cdm_array[index].mask1) || + (mask->mask2 & cdm_array[index].mask2)) { + if ((strlen(client_debug_string) + + strlen(cdm_array[index].keyword) + 1) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { + strcat(client_debug_string, + cdm_array[index].keyword); + strcat(client_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(client_debug_string, ORANGEFS_ALL); + goto out; + } + } +out: + return; +} + +static int keyword_is_amalgam(char *keyword) +{ + int rc = 0; + + if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE))) + rc = 1; + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + * + * return 1 if we found an amalgam. + */ +static int check_amalgam_keyword(void *mask, int type) +{ + __u64 *k_mask; + struct client_debug_mask *c_mask; + int k_all_index = num_kmod_keyword_mask_map - 1; + int rc = 0; + + if (type) { + c_mask = (struct client_debug_mask *) mask; + + if ((c_mask->mask1 == cdm_array[client_all_index].mask1) && + (c_mask->mask2 == cdm_array[client_all_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + + if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) && + (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_VERBOSE); + rc = 1; + goto out; + } + + } else { + k_mask = (__u64 *) mask; + + if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) { + strcpy(kernel_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + } + +out: + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_string_to_mask(char *debug_string, void *mask, int type) +{ + char *unchecked_keyword; + int i; + char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL); + char *original_pointer; + int element_count = 0; + struct client_debug_mask *c_mask = NULL; + __u64 *k_mask = NULL; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + c_mask = (struct client_debug_mask *)mask; + element_count = cdm_element_count; + } else { + k_mask = (__u64 *)mask; + *k_mask = 0; + element_count = num_kmod_keyword_mask_map; + } + + original_pointer = strsep_fodder; + while ((unchecked_keyword = strsep(&strsep_fodder, ","))) + if (strlen(unchecked_keyword)) { + for (i = 0; i < element_count; i++) + if (type) + do_c_mask(i, + unchecked_keyword, + &c_mask); + else + do_k_mask(i, + unchecked_keyword, + &k_mask); + } + + kfree(original_pointer); +} + +static void do_c_mask(int i, char *unchecked_keyword, + struct client_debug_mask **sane_mask) +{ + + if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) { + (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1; + (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2; + } +} + +static void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask) +{ + + if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword)) + **sane_mask = (**sane_mask) | + s_kmod_keyword_mask_map[i].mask_val; +} + +int orangefs_debugfs_new_client_mask(void __user *arg) +{ + struct dev_mask2_info_s mask2_info = {0}; + int ret; + + ret = copy_from_user(&mask2_info, + (void __user *)arg, + sizeof(struct dev_mask2_info_s)); + + if (ret != 0) + return -EIO; + + client_debug_mask.mask1 = mask2_info.mask1_value; + client_debug_mask.mask2 = mask2_info.mask2_value; + + pr_info("%s: client debug mask has been been received " + ":%llx: :%llx:\n", + __func__, + (unsigned long long)client_debug_mask.mask1, + (unsigned long long)client_debug_mask.mask2); + + return ret; +} + +int orangefs_debugfs_new_client_string(void __user *arg) +{ + int ret; + + ret = copy_from_user(&client_debug_array_string, + (void __user *)arg, + ORANGEFS_MAX_DEBUG_STRING_LEN); + + if (ret != 0) { + pr_info("%s: CLIENT_STRING: copy_from_user failed\n", + __func__); + return -EFAULT; + } + + /* + * The real client-core makes an effort to ensure + * that actual strings that aren't too long to fit in + * this buffer is what we get here. We're going to use + * string functions on the stuff we got, so we'll make + * this extra effort to try and keep from + * flowing out of this buffer when we use the string + * functions, even if somehow the stuff we end up + * with here is garbage. + */ + client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = + '\0'; + + pr_info("%s: client debug array string has been received.\n", + __func__); + + if (!help_string_initialized) { + + /* Build a proper debug help string. */ + ret = orangefs_prepare_debugfs_help_string(0); + if (ret) { + gossip_err("%s: no debug help string \n", + __func__); + return ret; + } + + } + + debug_mask_to_string(&client_debug_mask, 1); + + debugfs_remove(client_debug_dentry); + + orangefs_client_debug_init(); + + help_string_initialized++; + + return 0; +} + +int orangefs_debugfs_new_debug(void __user *arg) +{ + struct dev_mask_info_s mask_info = {0}; + int ret; + + ret = copy_from_user(&mask_info, + (void __user *)arg, + sizeof(mask_info)); + + if (ret != 0) + return -EIO; + + if (mask_info.mask_type == KERNEL_MASK) { + if ((mask_info.mask_value == 0) + && (kernel_mask_set_mod_init)) { + /* + * the kernel debug mask was set when the + * kernel module was loaded; don't override + * it if the client-core was started without + * a value for ORANGEFS_KMODMASK. + */ + return 0; + } + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + orangefs_gossip_debug_mask = mask_info.mask_value; + pr_info("%s: kernel debug mask has been modified to " + ":%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + } else if (mask_info.mask_type == CLIENT_MASK) { + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + pr_info("%s: client debug mask has been modified to" + ":%s: :%llx:\n", + __func__, + client_debug_string, + llu(mask_info.mask_value)); + } else { + gossip_err("Invalid mask type....\n"); + return -EINVAL; + } + + return ret; +} diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h new file mode 100644 index 000000000..502f6dedc --- /dev/null +++ b/fs/orangefs/orangefs-debugfs.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +void orangefs_debugfs_init(int); +void orangefs_debugfs_cleanup(void); +int orangefs_prepare_debugfs_help_string(int); +int orangefs_debugfs_new_client_mask(void __user *); +int orangefs_debugfs_new_client_string(void __user *); +int orangefs_debugfs_new_debug(void __user *); diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h new file mode 100644 index 000000000..dc6609824 --- /dev/null +++ b/fs/orangefs/orangefs-dev-proto.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#ifndef _ORANGEFS_DEV_PROTO_H +#define _ORANGEFS_DEV_PROTO_H + +/* + * types and constants shared between user space and kernel space for + * device interaction using a common protocol + */ + +/* + * valid orangefs kernel operation types + */ +#define ORANGEFS_VFS_OP_INVALID 0xFF000000 +#define ORANGEFS_VFS_OP_FILE_IO 0xFF000001 +#define ORANGEFS_VFS_OP_LOOKUP 0xFF000002 +#define ORANGEFS_VFS_OP_CREATE 0xFF000003 +#define ORANGEFS_VFS_OP_GETATTR 0xFF000004 +#define ORANGEFS_VFS_OP_REMOVE 0xFF000005 +#define ORANGEFS_VFS_OP_MKDIR 0xFF000006 +#define ORANGEFS_VFS_OP_READDIR 0xFF000007 +#define ORANGEFS_VFS_OP_SETATTR 0xFF000008 +#define ORANGEFS_VFS_OP_SYMLINK 0xFF000009 +#define ORANGEFS_VFS_OP_RENAME 0xFF00000A +#define ORANGEFS_VFS_OP_STATFS 0xFF00000B +#define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C +#define ORANGEFS_VFS_OP_RA_FLUSH 0xFF00000D +#define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E +#define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F +#define ORANGEFS_VFS_OP_GETXATTR 0xFF000010 +#define ORANGEFS_VFS_OP_SETXATTR 0xFF000011 +#define ORANGEFS_VFS_OP_LISTXATTR 0xFF000012 +#define ORANGEFS_VFS_OP_REMOVEXATTR 0xFF000013 +#define ORANGEFS_VFS_OP_PARAM 0xFF000014 +#define ORANGEFS_VFS_OP_PERF_COUNT 0xFF000015 +#define ORANGEFS_VFS_OP_CANCEL 0xFF00EE00 +#define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01 +#define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02 +#define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03 +#define ORANGEFS_VFS_OP_FEATURES 0xFF00EE05 /* 2.9.6 */ + +/* features is a 64-bit unsigned bitmask */ +#define ORANGEFS_FEATURE_READAHEAD 1 + +/* + * Misc constants. Please retain them as multiples of 8! + * Otherwise 32-64 bit interactions will be messed up :) + */ +#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800 + +#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512 + +#include "upcall.h" +#include "downcall.h" + +#endif diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h new file mode 100644 index 000000000..b5940ec18 --- /dev/null +++ b/fs/orangefs/orangefs-kernel.h @@ -0,0 +1,492 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +/* + * The ORANGEFS Linux kernel support allows ORANGEFS volumes to be mounted and + * accessed through the Linux VFS (i.e. using standard I/O system calls). + * This support is only needed on clients that wish to mount the file system. + * + */ + +/* + * Declarations and macros for the ORANGEFS Linux kernel support. + */ + +#ifndef __ORANGEFSKERNEL_H +#define __ORANGEFSKERNEL_H + +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/statfs.h> +#include <linux/backing-dev.h> +#include <linux/device.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> + +#include <linux/aio.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/compat.h> +#include <linux/mount.h> +#include <linux/uaccess.h> +#include <linux/atomic.h> +#include <linux/uio.h> +#include <linux/sched/signal.h> +#include <linux/mm.h> +#include <linux/wait.h> +#include <linux/dcache.h> +#include <linux/pagemap.h> +#include <linux/poll.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/hashtable.h> + +#include <asm/unaligned.h> + +#include "orangefs-dev-proto.h" + +#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 20 + +#define ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS 30 + +#define ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS 900 /* 15 minutes */ + +#define ORANGEFS_REQDEVICE_NAME "pvfs2-req" + +#define ORANGEFS_DEVREQ_MAGIC 0x20030529 +#define ORANGEFS_PURGE_RETRY_COUNT 0x00000005 + +#define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) + \ +sizeof(__u64) + sizeof(struct orangefs_upcall_s)) +#define MAX_DEV_REQ_DOWNSIZE (2 * sizeof(__s32) + \ +sizeof(__u64) + sizeof(struct orangefs_downcall_s)) + +/* + * valid orangefs kernel operation states + * + * unknown - op was just initialized + * waiting - op is on request_list (upward bound) + * inprogr - op is in progress (waiting for downcall) + * serviced - op has matching downcall; ok + * purged - op has to start a timer since client-core + * exited uncleanly before servicing op + * given up - submitter has given up waiting for it + */ +enum orangefs_vfs_op_states { + OP_VFS_STATE_UNKNOWN = 0, + OP_VFS_STATE_WAITING = 1, + OP_VFS_STATE_INPROGR = 2, + OP_VFS_STATE_SERVICED = 4, + OP_VFS_STATE_PURGED = 8, + OP_VFS_STATE_GIVEN_UP = 16, +}; + +/* + * orangefs kernel memory related flags + */ + +#if (defined CONFIG_DEBUG_SLAB) +#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE +#else +#define ORANGEFS_CACHE_CREATE_FLAGS 0 +#endif + +extern int orangefs_init_acl(struct inode *inode, struct inode *dir); +extern const struct xattr_handler *orangefs_xattr_handlers[]; + +extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); +extern int orangefs_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, + int type); + +/* + * orangefs data structures + */ +struct orangefs_kernel_op_s { + enum orangefs_vfs_op_states op_state; + __u64 tag; + + /* + * Set uses_shared_memory to non zero if this operation uses + * shared memory. If true, then a retry on the op must also + * get a new shared memory buffer and re-populate it. + * Cancels don't care - it only matters for service_operation() + * retry logics and cancels don't go through it anymore. It + * safely stays non-zero when we use it as slot_to_free. + */ + union { + int uses_shared_memory; + int slot_to_free; + }; + + struct orangefs_upcall_s upcall; + struct orangefs_downcall_s downcall; + + struct completion waitq; + spinlock_t lock; + + int attempts; + + struct list_head list; +}; + +#define set_op_state_waiting(op) ((op)->op_state = OP_VFS_STATE_WAITING) +#define set_op_state_inprogress(op) ((op)->op_state = OP_VFS_STATE_INPROGR) +#define set_op_state_given_up(op) ((op)->op_state = OP_VFS_STATE_GIVEN_UP) +static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op) +{ + op->op_state = OP_VFS_STATE_SERVICED; + complete(&op->waitq); +} + +#define op_state_waiting(op) ((op)->op_state & OP_VFS_STATE_WAITING) +#define op_state_in_progress(op) ((op)->op_state & OP_VFS_STATE_INPROGR) +#define op_state_serviced(op) ((op)->op_state & OP_VFS_STATE_SERVICED) +#define op_state_purged(op) ((op)->op_state & OP_VFS_STATE_PURGED) +#define op_state_given_up(op) ((op)->op_state & OP_VFS_STATE_GIVEN_UP) +#define op_is_cancel(op) ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL) + +void op_release(struct orangefs_kernel_op_s *op); + +extern void orangefs_bufmap_put(int); +static inline void put_cancel(struct orangefs_kernel_op_s *op) +{ + orangefs_bufmap_put(op->slot_to_free); + op_release(op); +} + +static inline void set_op_state_purged(struct orangefs_kernel_op_s *op) +{ + spin_lock(&op->lock); + if (unlikely(op_is_cancel(op))) { + list_del_init(&op->list); + spin_unlock(&op->lock); + put_cancel(op); + } else { + op->op_state |= OP_VFS_STATE_PURGED; + complete(&op->waitq); + spin_unlock(&op->lock); + } +} + +/* per inode private orangefs info */ +struct orangefs_inode_s { + struct orangefs_object_kref refn; + char link_target[ORANGEFS_NAME_MAX]; + /* + * Reading/Writing Extended attributes need to acquire the appropriate + * reader/writer semaphore on the orangefs_inode_s structure. + */ + struct rw_semaphore xattr_sem; + + struct inode vfs_inode; + sector_t last_failed_block_index_read; + + unsigned long getattr_time; + unsigned long mapping_time; + int attr_valid; + kuid_t attr_uid; + kgid_t attr_gid; + unsigned long bitlock; + + DECLARE_HASHTABLE(xattr_cache, 4); +}; + +/* per superblock private orangefs info */ +struct orangefs_sb_info_s { + struct orangefs_khandle root_khandle; + __s32 fs_id; + int id; + int flags; +#define ORANGEFS_OPT_INTR 0x01 +#define ORANGEFS_OPT_LOCAL_LOCK 0x02 + char devname[ORANGEFS_MAX_SERVER_ADDR_LEN]; + struct super_block *sb; + int mount_pending; + int no_list; + struct list_head list; +}; + +struct orangefs_stats { + unsigned long cache_hits; + unsigned long cache_misses; + unsigned long reads; + unsigned long writes; +}; + +struct orangefs_cached_xattr { + struct hlist_node node; + char key[ORANGEFS_MAX_XATTR_NAMELEN]; + char val[ORANGEFS_MAX_XATTR_VALUELEN]; + ssize_t length; + unsigned long timeout; +}; + +struct orangefs_write_range { + loff_t pos; + size_t len; + kuid_t uid; + kgid_t gid; +}; + +extern struct orangefs_stats orangefs_stats; + +/* + * NOTE: See Documentation/filesystems/porting.rst for information + * on implementing FOO_I and properly accessing fs private data + */ +static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode) +{ + return container_of(inode, struct orangefs_inode_s, vfs_inode); +} + +static inline struct orangefs_sb_info_s *ORANGEFS_SB(struct super_block *sb) +{ + return (struct orangefs_sb_info_s *) sb->s_fs_info; +} + +/* ino_t descends from "unsigned long", 8 bytes, 64 bits. */ +static inline ino_t orangefs_khandle_to_ino(struct orangefs_khandle *khandle) +{ + union { + unsigned char u[8]; + __u64 ino; + } ihandle; + + ihandle.u[0] = khandle->u[0] ^ khandle->u[4]; + ihandle.u[1] = khandle->u[1] ^ khandle->u[5]; + ihandle.u[2] = khandle->u[2] ^ khandle->u[6]; + ihandle.u[3] = khandle->u[3] ^ khandle->u[7]; + ihandle.u[4] = khandle->u[12] ^ khandle->u[8]; + ihandle.u[5] = khandle->u[13] ^ khandle->u[9]; + ihandle.u[6] = khandle->u[14] ^ khandle->u[10]; + ihandle.u[7] = khandle->u[15] ^ khandle->u[11]; + + return ihandle.ino; +} + +static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode) +{ + return &(ORANGEFS_I(inode)->refn.khandle); +} + +static inline int is_root_handle(struct inode *inode) +{ + gossip_debug(GOSSIP_DCACHE_DEBUG, + "%s: root handle: %pU, this handle: %pU:\n", + __func__, + &ORANGEFS_SB(inode->i_sb)->root_khandle, + get_khandle_from_ino(inode)); + + if (ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode->i_sb)->root_khandle), + get_khandle_from_ino(inode))) + return 0; + else + return 1; +} + +static inline int match_handle(struct orangefs_khandle resp_handle, + struct inode *inode) +{ + gossip_debug(GOSSIP_DCACHE_DEBUG, + "%s: one handle: %pU, another handle:%pU:\n", + __func__, + &resp_handle, + get_khandle_from_ino(inode)); + + if (ORANGEFS_khandle_cmp(&resp_handle, get_khandle_from_ino(inode))) + return 0; + else + return 1; +} + +/* + * defined in orangefs-cache.c + */ +int op_cache_initialize(void); +int op_cache_finalize(void); +struct orangefs_kernel_op_s *op_alloc(__s32 type); +void orangefs_new_tag(struct orangefs_kernel_op_s *op); +char *get_opname_string(struct orangefs_kernel_op_s *new_op); + +int orangefs_inode_cache_initialize(void); +int orangefs_inode_cache_finalize(void); + +/* + * defined in orangefs-mod.c + */ +void purge_inprogress_ops(void); + +/* + * defined in waitqueue.c + */ +void purge_waiting_ops(void); + +/* + * defined in super.c + */ +extern uint64_t orangefs_features; + +struct dentry *orangefs_mount(struct file_system_type *fst, + int flags, + const char *devname, + void *data); + +void orangefs_kill_sb(struct super_block *sb); +int orangefs_remount(struct orangefs_sb_info_s *); + +int fsid_key_table_initialize(void); +void fsid_key_table_finalize(void); + +/* + * defined in inode.c + */ +vm_fault_t orangefs_page_mkwrite(struct vm_fault *); +struct inode *orangefs_new_inode(struct super_block *sb, + struct inode *dir, + int mode, + dev_t dev, + struct orangefs_object_kref *ref); + +int __orangefs_setattr(struct inode *, struct iattr *); +int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); + +int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); + +int orangefs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); + +int orangefs_update_time(struct inode *, struct timespec64 *, int); + +/* + * defined in xattr.c + */ +ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* + * defined in namei.c + */ +struct inode *orangefs_iget(struct super_block *sb, + struct orangefs_object_kref *ref); + +/* + * defined in devorangefs-req.c + */ +extern uint32_t orangefs_userspace_version; + +int orangefs_dev_init(void); +void orangefs_dev_cleanup(void); +int is_daemon_in_service(void); +bool __is_daemon_in_service(void); + +/* + * defined in file.c + */ +int orangefs_revalidate_mapping(struct inode *); +ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, + struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *, + struct file *); +ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, + struct iov_iter *); + +/* + * defined in orangefs-utils.c + */ +__s32 fsid_of_op(struct orangefs_kernel_op_s *op); + +ssize_t orangefs_inode_getxattr(struct inode *inode, + const char *name, + void *buffer, + size_t size); + +int orangefs_inode_setxattr(struct inode *inode, + const char *name, + const void *value, + size_t size, + int flags); + +#define ORANGEFS_GETATTR_NEW 1 +#define ORANGEFS_GETATTR_SIZE 2 + +int orangefs_inode_getattr(struct inode *, int); + +int orangefs_inode_check_changed(struct inode *inode); + +int orangefs_inode_setattr(struct inode *inode); + +bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op); + +int orangefs_normalize_to_errno(__s32 error_code); + +extern struct mutex orangefs_request_mutex; +extern int op_timeout_secs; +extern int slot_timeout_secs; +extern int orangefs_cache_timeout_msecs; +extern int orangefs_dcache_timeout_msecs; +extern int orangefs_getattr_timeout_msecs; +extern struct list_head orangefs_superblocks; +extern spinlock_t orangefs_superblocks_lock; +extern struct list_head orangefs_request_list; +extern spinlock_t orangefs_request_list_lock; +extern wait_queue_head_t orangefs_request_list_waitq; +extern struct list_head *orangefs_htable_ops_in_progress; +extern spinlock_t orangefs_htable_ops_in_progress_lock; +extern int hash_table_size; + +extern const struct file_operations orangefs_file_operations; +extern const struct inode_operations orangefs_symlink_inode_operations; +extern const struct inode_operations orangefs_dir_inode_operations; +extern const struct file_operations orangefs_dir_operations; +extern const struct dentry_operations orangefs_dentry_operations; + +/* + * misc convenience macros + */ + +#define ORANGEFS_OP_INTERRUPTIBLE 1 /* service_operation() is interruptible */ +#define ORANGEFS_OP_PRIORITY 2 /* service_operation() is high priority */ +#define ORANGEFS_OP_CANCELLATION 4 /* this is a cancellation */ +#define ORANGEFS_OP_NO_MUTEX 8 /* don't acquire request_mutex */ +#define ORANGEFS_OP_ASYNC 16 /* Queue it, but don't wait */ +#define ORANGEFS_OP_WRITEBACK 32 + +int service_operation(struct orangefs_kernel_op_s *op, + const char *op_name, + int flags); + +#define get_interruptible_flag(inode) \ + ((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \ + ORANGEFS_OP_INTERRUPTIBLE : 0) + +#define fill_default_sys_attrs(sys_attr, type, mode) \ +do { \ + sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \ + sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \ + sys_attr.perms = ORANGEFS_util_translate_mode(mode); \ + sys_attr.mtime = 0; \ + sys_attr.atime = 0; \ + sys_attr.ctime = 0; \ + sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \ +} while (0) + +static inline void orangefs_set_timeout(struct dentry *dentry) +{ + unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + + dentry->d_fsdata = (void *) time; +} + +#endif /* __ORANGEFSKERNEL_H */ diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c new file mode 100644 index 000000000..5ab741c60 --- /dev/null +++ b/fs/orangefs/orangefs-mod.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2001 Clemson University and The University of Chicago + * + * Changes by Acxiom Corporation to add proc file handler for pvfs2 client + * parameters, Copyright Acxiom Corporation, 2005. + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-debugfs.h" +#include "orangefs-sysfs.h" + +/* ORANGEFS_VERSION is a ./configure define */ +#ifndef ORANGEFS_VERSION +#define ORANGEFS_VERSION "upstream" +#endif + +/* + * global variables declared here + */ + +struct orangefs_stats orangefs_stats; + +/* the size of the hash tables for ops in progress */ +int hash_table_size = 509; + +static ulong module_parm_debug_mask; +__u64 orangefs_gossip_debug_mask; +int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; +int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; +int orangefs_cache_timeout_msecs = 500; +int orangefs_dcache_timeout_msecs = 50; +int orangefs_getattr_timeout_msecs = 50; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("ORANGEFS Development Team"); +MODULE_DESCRIPTION("The Linux Kernel VFS interface to ORANGEFS"); +MODULE_PARM_DESC(module_parm_debug_mask, "debugging level (see orangefs-debug.h for values)"); +MODULE_PARM_DESC(op_timeout_secs, "Operation timeout in seconds"); +MODULE_PARM_DESC(slot_timeout_secs, "Slot timeout in seconds"); +MODULE_PARM_DESC(hash_table_size, + "size of hash table for operations in progress"); + +static struct file_system_type orangefs_fs_type = { + .name = "pvfs2", + .mount = orangefs_mount, + .kill_sb = orangefs_kill_sb, + .owner = THIS_MODULE, +}; + +module_param(hash_table_size, int, 0); +module_param(module_parm_debug_mask, ulong, 0644); +module_param(op_timeout_secs, int, 0); +module_param(slot_timeout_secs, int, 0); + +/* + * Blocks non-priority requests from being queued for servicing. This + * could be used for protecting the request list data structure, but + * for now it's only being used to stall the op addition to the request + * list + */ +DEFINE_MUTEX(orangefs_request_mutex); + +/* hash table for storing operations waiting for matching downcall */ +struct list_head *orangefs_htable_ops_in_progress; +DEFINE_SPINLOCK(orangefs_htable_ops_in_progress_lock); + +/* list for queueing upcall operations */ +LIST_HEAD(orangefs_request_list); + +/* used to protect the above orangefs_request_list */ +DEFINE_SPINLOCK(orangefs_request_list_lock); + +/* used for incoming request notification */ +DECLARE_WAIT_QUEUE_HEAD(orangefs_request_list_waitq); + +static int __init orangefs_init(void) +{ + int ret; + __u32 i = 0; + + if (op_timeout_secs < 0) + op_timeout_secs = 0; + + if (slot_timeout_secs < 0) + slot_timeout_secs = 0; + + /* initialize global book keeping data structures */ + ret = op_cache_initialize(); + if (ret < 0) + goto out; + + ret = orangefs_inode_cache_initialize(); + if (ret < 0) + goto cleanup_op; + + orangefs_htable_ops_in_progress = + kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL); + if (!orangefs_htable_ops_in_progress) { + ret = -ENOMEM; + goto cleanup_inode; + } + + /* initialize a doubly linked at each hash table index */ + for (i = 0; i < hash_table_size; i++) + INIT_LIST_HEAD(&orangefs_htable_ops_in_progress[i]); + + ret = fsid_key_table_initialize(); + if (ret < 0) + goto cleanup_progress_table; + + /* + * Build the contents of /sys/kernel/debug/orangefs/debug-help + * from the keywords in the kernel keyword/mask array. + * + * The keywords in the client keyword/mask array are + * unknown at boot time. + * + * orangefs_prepare_debugfs_help_string will be used again + * later to rebuild the debug-help-string after the client starts + * and passes along the needed info. The argument signifies + * which time orangefs_prepare_debugfs_help_string is being + * called. + */ + ret = orangefs_prepare_debugfs_help_string(1); + if (ret) + goto cleanup_key_table; + + orangefs_debugfs_init(module_parm_debug_mask); + + ret = orangefs_sysfs_init(); + if (ret) + goto sysfs_init_failed; + + /* Initialize the orangefsdev subsystem. */ + ret = orangefs_dev_init(); + if (ret < 0) { + gossip_err("%s: could not initialize device subsystem %d!\n", + __func__, + ret); + goto cleanup_sysfs; + } + + ret = register_filesystem(&orangefs_fs_type); + if (ret == 0) { + pr_info("%s: module version %s loaded\n", + __func__, + ORANGEFS_VERSION); + goto out; + } + + orangefs_dev_cleanup(); + +cleanup_sysfs: + orangefs_sysfs_exit(); + +sysfs_init_failed: + orangefs_debugfs_cleanup(); + +cleanup_key_table: + fsid_key_table_finalize(); + +cleanup_progress_table: + kfree(orangefs_htable_ops_in_progress); + +cleanup_inode: + orangefs_inode_cache_finalize(); + +cleanup_op: + op_cache_finalize(); + +out: + return ret; +} + +static void __exit orangefs_exit(void) +{ + int i = 0; + gossip_debug(GOSSIP_INIT_DEBUG, "orangefs: orangefs_exit called\n"); + + unregister_filesystem(&orangefs_fs_type); + orangefs_debugfs_cleanup(); + orangefs_sysfs_exit(); + fsid_key_table_finalize(); + orangefs_dev_cleanup(); + BUG_ON(!list_empty(&orangefs_request_list)); + for (i = 0; i < hash_table_size; i++) + BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i])); + + orangefs_inode_cache_finalize(); + op_cache_finalize(); + + kfree(orangefs_htable_ops_in_progress); + + pr_info("orangefs: module version %s unloaded\n", ORANGEFS_VERSION); +} + +/* + * What we do in this function is to walk the list of operations + * that are in progress in the hash table and mark them as purged as well. + */ +void purge_inprogress_ops(void) +{ + int i; + + for (i = 0; i < hash_table_size; i++) { + struct orangefs_kernel_op_s *op; + struct orangefs_kernel_op_s *next; + + spin_lock(&orangefs_htable_ops_in_progress_lock); + list_for_each_entry_safe(op, + next, + &orangefs_htable_ops_in_progress[i], + list) { + set_op_state_purged(op); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: op:%s: op_state:%d: process:%s:\n", + __func__, + get_opname_string(op), + op->op_state, + current->comm); + } + spin_unlock(&orangefs_htable_ops_in_progress_lock); + } +} + +module_init(orangefs_init); +module_exit(orangefs_exit); diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c new file mode 100644 index 000000000..be4ba03a0 --- /dev/null +++ b/fs/orangefs/orangefs-sysfs.c @@ -0,0 +1,1326 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Documentation/ABI/stable/sysfs-fs-orangefs: + * + * What: /sys/fs/orangefs/perf_counter_reset + * Date: June 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * echo a 0 or a 1 into perf_counter_reset to + * reset all the counters in + * /sys/fs/orangefs/perf_counters + * except ones with PINT_PERF_PRESERVE set. + * + * + * What: /sys/fs/orangefs/perf_counters/... + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Counters and settings for various caches. + * Read only. + * + * + * What: /sys/fs/orangefs/perf_time_interval_secs + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Length of perf counter intervals in + * seconds. + * + * + * What: /sys/fs/orangefs/perf_history_size + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * The perf_counters cache statistics have N, or + * perf_history_size, samples. The default is + * one. + * + * Every perf_time_interval_secs the (first) + * samples are reset. + * + * If N is greater than one, the "current" set + * of samples is reset, and the samples from the + * other N-1 intervals remain available. + * + * + * What: /sys/fs/orangefs/op_timeout_secs + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Service operation timeout in seconds. + * + * + * What: /sys/fs/orangefs/slot_timeout_secs + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * "Slot" timeout in seconds. A "slot" + * is an indexed buffer in the shared + * memory segment used for communication + * between the kernel module and userspace. + * Slots are requested and waited for, + * the wait times out after slot_timeout_secs. + * + * What: /sys/fs/orangefs/cache_timeout_msecs + * Date: Mar 2018 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Time in milliseconds between which + * orangefs_revalidate_mapping will invalidate the page + * cache. + * + * What: /sys/fs/orangefs/dcache_timeout_msecs + * Date: Jul 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Time lookup is valid in milliseconds. + * + * What: /sys/fs/orangefs/getattr_timeout_msecs + * Date: Jul 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Time getattr is valid in milliseconds. + * + * What: /sys/fs/orangefs/readahead_count + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count. + * + * What: /sys/fs/orangefs/readahead_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer size. + * + * What: /sys/fs/orangefs/readahead_count_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count and size. + * + * What: /sys/fs/orangefs/readahead_readcnt + * Date: Jan 2017 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Number of buffers (in multiples of readahead_size) + * which can be read ahead for a single file at once. + * + * What: /sys/fs/orangefs/acache/... + * Date: Jun 2015 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Attribute cache configurable settings. + * + * + * What: /sys/fs/orangefs/ncache/... + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Name cache configurable settings. + * + * + * What: /sys/fs/orangefs/capcache/... + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Capability cache configurable settings. + * + * + * What: /sys/fs/orangefs/ccache/... + * Date: Jun 2015 + * Contact: Mike Marshall <hubcap@omnibond.com> + * Description: + * Credential cache configurable settings. + * + */ + +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/module.h> +#include <linux/init.h> + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-sysfs.h" + +#define ORANGEFS_KOBJ_ID "orangefs" +#define ACACHE_KOBJ_ID "acache" +#define CAPCACHE_KOBJ_ID "capcache" +#define CCACHE_KOBJ_ID "ccache" +#define NCACHE_KOBJ_ID "ncache" +#define PC_KOBJ_ID "pc" +#define STATS_KOBJ_ID "stats" + +/* + * Every item calls orangefs_attr_show and orangefs_attr_store through + * orangefs_sysfs_ops. They look at the orangefs_attributes further below to + * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or + * sysfs_service_op_store. + */ + +struct orangefs_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct orangefs_attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, + struct orangefs_attribute *attr, + const char *buf, + size_t count); +}; + +static ssize_t orangefs_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct orangefs_attribute *attribute; + + attribute = container_of(attr, struct orangefs_attribute, attr); + if (!attribute->show) + return -EIO; + return attribute->show(kobj, attribute, buf); +} + +static ssize_t orangefs_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t len) +{ + struct orangefs_attribute *attribute; + + if (!strcmp(kobj->name, PC_KOBJ_ID) || + !strcmp(kobj->name, STATS_KOBJ_ID)) + return -EPERM; + + attribute = container_of(attr, struct orangefs_attribute, attr); + if (!attribute->store) + return -EIO; + return attribute->store(kobj, attribute, buf, len); +} + +static const struct sysfs_ops orangefs_sysfs_ops = { + .show = orangefs_attr_show, + .store = orangefs_attr_store, +}; + +static ssize_t sysfs_int_show(struct kobject *kobj, + struct orangefs_attribute *attr, char *buf) +{ + int rc = -EIO; + + gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", + kobj->name); + + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "op_timeout_secs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + op_timeout_secs); + goto out; + } else if (!strcmp(attr->attr.name, + "slot_timeout_secs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + slot_timeout_secs); + goto out; + } else if (!strcmp(attr->attr.name, + "cache_timeout_msecs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + orangefs_cache_timeout_msecs); + goto out; + } else if (!strcmp(attr->attr.name, + "dcache_timeout_msecs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + orangefs_dcache_timeout_msecs); + goto out; + } else if (!strcmp(attr->attr.name, + "getattr_timeout_msecs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + orangefs_getattr_timeout_msecs); + goto out; + } else { + goto out; + } + + } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "reads")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%lu\n", + orangefs_stats.reads); + goto out; + } else if (!strcmp(attr->attr.name, "writes")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%lu\n", + orangefs_stats.writes); + goto out; + } else { + goto out; + } + } + +out: + + return rc; +} + +static ssize_t sysfs_int_store(struct kobject *kobj, + struct orangefs_attribute *attr, const char *buf, size_t count) +{ + int rc = 0; + + gossip_debug(GOSSIP_SYSFS_DEBUG, + "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n", + attr->attr.name, buf); + + if (!strcmp(attr->attr.name, "op_timeout_secs")) { + rc = kstrtoint(buf, 0, &op_timeout_secs); + goto out; + } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { + rc = kstrtoint(buf, 0, &slot_timeout_secs); + goto out; + } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { + rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs); + goto out; + } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { + rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs); + goto out; + } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { + rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs); + goto out; + } else { + goto out; + } + +out: + if (rc) + rc = -EINVAL; + else + rc = count; + + return rc; +} + +/* + * obtain attribute values from userspace with a service operation. + */ +static ssize_t sysfs_service_op_show(struct kobject *kobj, + struct orangefs_attribute *attr, char *buf) +{ + struct orangefs_kernel_op_s *new_op = NULL; + int rc = 0; + char *ser_op_type = NULL; + __u32 op_alloc_type; + + gossip_debug(GOSSIP_SYSFS_DEBUG, + "sysfs_service_op_show: id:%s:\n", + kobj->name); + + if (strcmp(kobj->name, PC_KOBJ_ID)) + op_alloc_type = ORANGEFS_VFS_OP_PARAM; + else + op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT; + + new_op = op_alloc(op_alloc_type); + if (!new_op) + return -ENOMEM; + + /* Can't do a service_operation if the client is not running... */ + rc = is_daemon_in_service(); + if (rc) { + pr_info_ratelimited("%s: Client not running :%d:\n", + __func__, + is_daemon_in_service()); + goto out; + } + + if (strcmp(kobj->name, PC_KOBJ_ID)) + new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET; + + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + /* Drop unsupported requests first. */ + if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && + (!strcmp(attr->attr.name, "readahead_count") || + !strcmp(attr->attr.name, "readahead_size") || + !strcmp(attr->attr.name, "readahead_count_size") || + !strcmp(attr->attr.name, "readahead_readcnt"))) { + rc = -EINVAL; + goto out; + } + + if (!strcmp(attr->attr.name, "perf_history_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; + else if (!strcmp(attr->attr.name, + "perf_time_interval_secs")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; + else if (!strcmp(attr->attr.name, + "perf_counter_reset")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; + + else if (!strcmp(attr->attr.name, + "readahead_count")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + + else if (!strcmp(attr->attr.name, + "readahead_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; + + else if (!strcmp(attr->attr.name, + "readahead_count_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + + else if (!strcmp(attr->attr.name, + "readahead_readcnt")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_READCNT; + } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_msecs")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; + + if (!strcmp(attr->attr.name, "hard_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; + + if (!strcmp(attr->attr.name, "soft_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; + + if (!strcmp(attr->attr.name, "reclaim_percentage")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; + + } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_secs")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; + + if (!strcmp(attr->attr.name, "hard_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; + + if (!strcmp(attr->attr.name, "soft_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; + + if (!strcmp(attr->attr.name, "reclaim_percentage")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; + + } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_secs")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; + + if (!strcmp(attr->attr.name, "hard_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; + + if (!strcmp(attr->attr.name, "soft_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; + + if (!strcmp(attr->attr.name, "reclaim_percentage")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; + + } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_msecs")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; + + if (!strcmp(attr->attr.name, "hard_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; + + if (!strcmp(attr->attr.name, "soft_limit")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; + + if (!strcmp(attr->attr.name, "reclaim_percentage")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; + + } else if (!strcmp(kobj->name, PC_KOBJ_ID)) { + if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID)) + new_op->upcall.req.perf_count.type = + ORANGEFS_PERF_COUNT_REQUEST_ACACHE; + + if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID)) + new_op->upcall.req.perf_count.type = + ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE; + + if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID)) + new_op->upcall.req.perf_count.type = + ORANGEFS_PERF_COUNT_REQUEST_NCACHE; + + } else { + gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n", + kobj->name); + rc = -EINVAL; + goto out; + } + + + if (strcmp(kobj->name, PC_KOBJ_ID)) + ser_op_type = "orangefs_param"; + else + ser_op_type = "orangefs_perf_count"; + + /* + * The service_operation will return an errno return code on + * error, and zero on success. + */ + rc = service_operation(new_op, ser_op_type, ORANGEFS_OP_INTERRUPTIBLE); + +out: + if (!rc) { + if (strcmp(kobj->name, PC_KOBJ_ID)) { + if (new_op->upcall.req.param.op == + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { + rc = scnprintf(buf, PAGE_SIZE, "%d %d\n", + (int)new_op->downcall.resp.param.u. + value32[0], + (int)new_op->downcall.resp.param.u. + value32[1]); + } else { + rc = scnprintf(buf, PAGE_SIZE, "%d\n", + (int)new_op->downcall.resp.param.u.value64); + } + } else { + rc = scnprintf( + buf, + PAGE_SIZE, + "%s", + new_op->downcall.resp.perf_count.buffer); + } + } + + op_release(new_op); + + return rc; + +} + +/* + * pass attribute values back to userspace with a service operation. + * + * We have to do a memory allocation, an sscanf and a service operation. + * And we have to evaluate what the user entered, to make sure the + * value is within the range supported by the attribute. So, there's + * a lot of return code checking and mapping going on here. + * + * We want to return 1 if we think everything went OK, and + * EINVAL if not. + */ +static ssize_t sysfs_service_op_store(struct kobject *kobj, + struct orangefs_attribute *attr, const char *buf, size_t count) +{ + struct orangefs_kernel_op_s *new_op = NULL; + int val = 0; + int rc = 0; + + gossip_debug(GOSSIP_SYSFS_DEBUG, + "sysfs_service_op_store: id:%s:\n", + kobj->name); + + new_op = op_alloc(ORANGEFS_VFS_OP_PARAM); + if (!new_op) + return -EINVAL; /* sic */ + + /* Can't do a service_operation if the client is not running... */ + rc = is_daemon_in_service(); + if (rc) { + pr_info("%s: Client not running :%d:\n", + __func__, + is_daemon_in_service()); + goto out; + } + + /* + * The value we want to send back to userspace is in buf, unless this + * there are two parameters, which is specially handled below. + */ + if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) || + strcmp(attr->attr.name, "readahead_count_size")) { + rc = kstrtoint(buf, 0, &val); + if (rc) + goto out; + } + + new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; + + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + /* Drop unsupported requests first. */ + if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && + (!strcmp(attr->attr.name, "readahead_count") || + !strcmp(attr->attr.name, "readahead_size") || + !strcmp(attr->attr.name, "readahead_count_size") || + !strcmp(attr->attr.name, "readahead_readcnt"))) { + rc = -EINVAL; + goto out; + } + + if (!strcmp(attr->attr.name, "perf_history_size")) { + if (val > 0) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "perf_time_interval_secs")) { + if (val > 0) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "perf_counter_reset")) { + if ((val == 0) || (val == 1)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "readahead_count")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "readahead_size")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "readahead_count_size")) { + int val1, val2; + rc = sscanf(buf, "%d %d", &val1, &val2); + if (rc < 2) { + rc = 0; + goto out; + } + if ((val1 >= 0) && (val2 >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else { + rc = 0; + goto out; + } + new_op->upcall.req.param.u.value32[0] = val1; + new_op->upcall.req.param.u.value32[1] = val2; + goto value_set; + } else if (!strcmp(attr->attr.name, + "readahead_readcnt")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_READCNT; + } else { + rc = 0; + goto out; + } + } + + } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "soft_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "reclaim_percentage")) { + if ((val > -1) && (val < 101)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "timeout_msecs")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; + } else { + rc = 0; + goto out; + } + } + + } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "soft_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "reclaim_percentage")) { + if ((val > -1) && (val < 101)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "timeout_secs")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; + } else { + rc = 0; + goto out; + } + } + + } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "soft_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "reclaim_percentage")) { + if ((val > -1) && (val < 101)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "timeout_secs")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; + } else { + rc = 0; + goto out; + } + } + + } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "soft_limit")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "reclaim_percentage")) { + if ((val > -1) && (val < 101)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, "timeout_msecs")) { + if (val > -1) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; + } else { + rc = 0; + goto out; + } + } + + } else { + gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n", + kobj->name); + rc = -EINVAL; + goto out; + } + + new_op->upcall.req.param.u.value64 = val; +value_set: + + /* + * The service_operation will return a errno return code on + * error, and zero on success. + */ + rc = service_operation(new_op, "orangefs_param", ORANGEFS_OP_INTERRUPTIBLE); + + if (rc < 0) { + gossip_err("sysfs_service_op_store: service op returned:%d:\n", + rc); + rc = 0; + } else { + rc = count; + } + +out: + op_release(new_op); + + if (rc == -ENOMEM || rc == 0) + rc = -EINVAL; + + return rc; +} + +static struct orangefs_attribute op_timeout_secs_attribute = + __ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); + +static struct orangefs_attribute slot_timeout_secs_attribute = + __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); + +static struct orangefs_attribute cache_timeout_msecs_attribute = + __ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); + +static struct orangefs_attribute dcache_timeout_msecs_attribute = + __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); + +static struct orangefs_attribute getattr_timeout_msecs_attribute = + __ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); + +static struct orangefs_attribute readahead_count_attribute = + __ATTR(readahead_count, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute readahead_size_attribute = + __ATTR(readahead_size, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute readahead_count_size_attribute = + __ATTR(readahead_count_size, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute readahead_readcnt_attribute = + __ATTR(readahead_readcnt, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute perf_counter_reset_attribute = + __ATTR(perf_counter_reset, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute perf_history_size_attribute = + __ATTR(perf_history_size, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute perf_time_interval_secs_attribute = + __ATTR(perf_time_interval_secs, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct attribute *orangefs_default_attrs[] = { + &op_timeout_secs_attribute.attr, + &slot_timeout_secs_attribute.attr, + &cache_timeout_msecs_attribute.attr, + &dcache_timeout_msecs_attribute.attr, + &getattr_timeout_msecs_attribute.attr, + &readahead_count_attribute.attr, + &readahead_size_attribute.attr, + &readahead_count_size_attribute.attr, + &readahead_readcnt_attribute.attr, + &perf_counter_reset_attribute.attr, + &perf_history_size_attribute.attr, + &perf_time_interval_secs_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(orangefs_default); + +static struct kobject *orangefs_obj; + +static void orangefs_obj_release(struct kobject *kobj) +{ + kfree(orangefs_obj); + orangefs_obj = NULL; +} + +static struct kobj_type orangefs_ktype = { + .sysfs_ops = &orangefs_sysfs_ops, + .default_groups = orangefs_default_groups, + .release = orangefs_obj_release, +}; + +static struct orangefs_attribute acache_hard_limit_attribute = + __ATTR(hard_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute acache_reclaim_percent_attribute = + __ATTR(reclaim_percentage, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute acache_soft_limit_attribute = + __ATTR(soft_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute acache_timeout_msecs_attribute = + __ATTR(timeout_msecs, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct attribute *acache_orangefs_default_attrs[] = { + &acache_hard_limit_attribute.attr, + &acache_reclaim_percent_attribute.attr, + &acache_soft_limit_attribute.attr, + &acache_timeout_msecs_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(acache_orangefs_default); + +static struct kobject *acache_orangefs_obj; + +static void acache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(acache_orangefs_obj); + acache_orangefs_obj = NULL; +} + +static struct kobj_type acache_orangefs_ktype = { + .sysfs_ops = &orangefs_sysfs_ops, + .default_groups = acache_orangefs_default_groups, + .release = acache_orangefs_obj_release, +}; + +static struct orangefs_attribute capcache_hard_limit_attribute = + __ATTR(hard_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute capcache_reclaim_percent_attribute = + __ATTR(reclaim_percentage, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute capcache_soft_limit_attribute = + __ATTR(soft_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute capcache_timeout_secs_attribute = + __ATTR(timeout_secs, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct attribute *capcache_orangefs_default_attrs[] = { + &capcache_hard_limit_attribute.attr, + &capcache_reclaim_percent_attribute.attr, + &capcache_soft_limit_attribute.attr, + &capcache_timeout_secs_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(capcache_orangefs_default); + +static struct kobject *capcache_orangefs_obj; + +static void capcache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(capcache_orangefs_obj); + capcache_orangefs_obj = NULL; +} + +static struct kobj_type capcache_orangefs_ktype = { + .sysfs_ops = &orangefs_sysfs_ops, + .default_groups = capcache_orangefs_default_groups, + .release = capcache_orangefs_obj_release, +}; + +static struct orangefs_attribute ccache_hard_limit_attribute = + __ATTR(hard_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute ccache_reclaim_percent_attribute = + __ATTR(reclaim_percentage, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute ccache_soft_limit_attribute = + __ATTR(soft_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute ccache_timeout_secs_attribute = + __ATTR(timeout_secs, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct attribute *ccache_orangefs_default_attrs[] = { + &ccache_hard_limit_attribute.attr, + &ccache_reclaim_percent_attribute.attr, + &ccache_soft_limit_attribute.attr, + &ccache_timeout_secs_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(ccache_orangefs_default); + +static struct kobject *ccache_orangefs_obj; + +static void ccache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ccache_orangefs_obj); + ccache_orangefs_obj = NULL; +} + +static struct kobj_type ccache_orangefs_ktype = { + .sysfs_ops = &orangefs_sysfs_ops, + .default_groups = ccache_orangefs_default_groups, + .release = ccache_orangefs_obj_release, +}; + +static struct orangefs_attribute ncache_hard_limit_attribute = + __ATTR(hard_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute ncache_reclaim_percent_attribute = + __ATTR(reclaim_percentage, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute ncache_soft_limit_attribute = + __ATTR(soft_limit, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute ncache_timeout_msecs_attribute = + __ATTR(timeout_msecs, + 0664, + sysfs_service_op_show, + sysfs_service_op_store); + +static struct attribute *ncache_orangefs_default_attrs[] = { + &ncache_hard_limit_attribute.attr, + &ncache_reclaim_percent_attribute.attr, + &ncache_soft_limit_attribute.attr, + &ncache_timeout_msecs_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(ncache_orangefs_default); + +static struct kobject *ncache_orangefs_obj; + +static void ncache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ncache_orangefs_obj); + ncache_orangefs_obj = NULL; +} + +static struct kobj_type ncache_orangefs_ktype = { + .sysfs_ops = &orangefs_sysfs_ops, + .default_groups = ncache_orangefs_default_groups, + .release = ncache_orangefs_obj_release, +}; + +static struct orangefs_attribute pc_acache_attribute = + __ATTR(acache, + 0664, + sysfs_service_op_show, + NULL); + +static struct orangefs_attribute pc_capcache_attribute = + __ATTR(capcache, + 0664, + sysfs_service_op_show, + NULL); + +static struct orangefs_attribute pc_ncache_attribute = + __ATTR(ncache, + 0664, + sysfs_service_op_show, + NULL); + +static struct attribute *pc_orangefs_default_attrs[] = { + &pc_acache_attribute.attr, + &pc_capcache_attribute.attr, + &pc_ncache_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(pc_orangefs_default); + +static struct kobject *pc_orangefs_obj; + +static void pc_orangefs_obj_release(struct kobject *kobj) +{ + kfree(pc_orangefs_obj); + pc_orangefs_obj = NULL; +} + +static struct kobj_type pc_orangefs_ktype = { + .sysfs_ops = &orangefs_sysfs_ops, + .default_groups = pc_orangefs_default_groups, + .release = pc_orangefs_obj_release, +}; + +static struct orangefs_attribute stats_reads_attribute = + __ATTR(reads, + 0664, + sysfs_int_show, + NULL); + +static struct orangefs_attribute stats_writes_attribute = + __ATTR(writes, + 0664, + sysfs_int_show, + NULL); + +static struct attribute *stats_orangefs_default_attrs[] = { + &stats_reads_attribute.attr, + &stats_writes_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(stats_orangefs_default); + +static struct kobject *stats_orangefs_obj; + +static void stats_orangefs_obj_release(struct kobject *kobj) +{ + kfree(stats_orangefs_obj); + stats_orangefs_obj = NULL; +} + +static struct kobj_type stats_orangefs_ktype = { + .sysfs_ops = &orangefs_sysfs_ops, + .default_groups = stats_orangefs_default_groups, + .release = stats_orangefs_obj_release, +}; + +int orangefs_sysfs_init(void) +{ + int rc = -EINVAL; + + gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_init: start\n"); + + /* create /sys/fs/orangefs. */ + orangefs_obj = kzalloc(sizeof(*orangefs_obj), GFP_KERNEL); + if (!orangefs_obj) + goto out; + + rc = kobject_init_and_add(orangefs_obj, + &orangefs_ktype, + fs_kobj, + ORANGEFS_KOBJ_ID); + + if (rc) + goto ofs_obj_bail; + + kobject_uevent(orangefs_obj, KOBJ_ADD); + + /* create /sys/fs/orangefs/acache. */ + acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL); + if (!acache_orangefs_obj) { + rc = -EINVAL; + goto ofs_obj_bail; + } + + rc = kobject_init_and_add(acache_orangefs_obj, + &acache_orangefs_ktype, + orangefs_obj, + ACACHE_KOBJ_ID); + + if (rc) + goto acache_obj_bail; + + kobject_uevent(acache_orangefs_obj, KOBJ_ADD); + + /* create /sys/fs/orangefs/capcache. */ + capcache_orangefs_obj = + kzalloc(sizeof(*capcache_orangefs_obj), GFP_KERNEL); + if (!capcache_orangefs_obj) { + rc = -EINVAL; + goto acache_obj_bail; + } + + rc = kobject_init_and_add(capcache_orangefs_obj, + &capcache_orangefs_ktype, + orangefs_obj, + CAPCACHE_KOBJ_ID); + if (rc) + goto capcache_obj_bail; + + kobject_uevent(capcache_orangefs_obj, KOBJ_ADD); + + /* create /sys/fs/orangefs/ccache. */ + ccache_orangefs_obj = + kzalloc(sizeof(*ccache_orangefs_obj), GFP_KERNEL); + if (!ccache_orangefs_obj) { + rc = -EINVAL; + goto capcache_obj_bail; + } + + rc = kobject_init_and_add(ccache_orangefs_obj, + &ccache_orangefs_ktype, + orangefs_obj, + CCACHE_KOBJ_ID); + if (rc) + goto ccache_obj_bail; + + kobject_uevent(ccache_orangefs_obj, KOBJ_ADD); + + /* create /sys/fs/orangefs/ncache. */ + ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL); + if (!ncache_orangefs_obj) { + rc = -EINVAL; + goto ccache_obj_bail; + } + + rc = kobject_init_and_add(ncache_orangefs_obj, + &ncache_orangefs_ktype, + orangefs_obj, + NCACHE_KOBJ_ID); + + if (rc) + goto ncache_obj_bail; + + kobject_uevent(ncache_orangefs_obj, KOBJ_ADD); + + /* create /sys/fs/orangefs/perf_counters. */ + pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL); + if (!pc_orangefs_obj) { + rc = -EINVAL; + goto ncache_obj_bail; + } + + rc = kobject_init_and_add(pc_orangefs_obj, + &pc_orangefs_ktype, + orangefs_obj, + "perf_counters"); + + if (rc) + goto pc_obj_bail; + + kobject_uevent(pc_orangefs_obj, KOBJ_ADD); + + /* create /sys/fs/orangefs/stats. */ + stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL); + if (!stats_orangefs_obj) { + rc = -EINVAL; + goto pc_obj_bail; + } + + rc = kobject_init_and_add(stats_orangefs_obj, + &stats_orangefs_ktype, + orangefs_obj, + STATS_KOBJ_ID); + + if (rc) + goto stats_obj_bail; + + kobject_uevent(stats_orangefs_obj, KOBJ_ADD); + goto out; + +stats_obj_bail: + kobject_put(stats_orangefs_obj); +pc_obj_bail: + kobject_put(pc_orangefs_obj); +ncache_obj_bail: + kobject_put(ncache_orangefs_obj); +ccache_obj_bail: + kobject_put(ccache_orangefs_obj); +capcache_obj_bail: + kobject_put(capcache_orangefs_obj); +acache_obj_bail: + kobject_put(acache_orangefs_obj); +ofs_obj_bail: + kobject_put(orangefs_obj); +out: + return rc; +} + +void orangefs_sysfs_exit(void) +{ + gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n"); + kobject_put(acache_orangefs_obj); + kobject_put(capcache_orangefs_obj); + kobject_put(ccache_orangefs_obj); + kobject_put(ncache_orangefs_obj); + kobject_put(pc_orangefs_obj); + kobject_put(stats_orangefs_obj); + kobject_put(orangefs_obj); +} diff --git a/fs/orangefs/orangefs-sysfs.h b/fs/orangefs/orangefs-sysfs.h new file mode 100644 index 000000000..f0b76382d --- /dev/null +++ b/fs/orangefs/orangefs-sysfs.h @@ -0,0 +1,2 @@ +extern int orangefs_sysfs_init(void); +extern void orangefs_sysfs_exit(void); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c new file mode 100644 index 000000000..46b7dcff1 --- /dev/null +++ b/fs/orangefs/orangefs-utils.c @@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. + * + * See COPYING in top-level directory. + */ +#include <linux/kernel.h> +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-dev-proto.h" +#include "orangefs-bufmap.h" + +__s32 fsid_of_op(struct orangefs_kernel_op_s *op) +{ + __s32 fsid = ORANGEFS_FS_ID_NULL; + + if (op) { + switch (op->upcall.type) { + case ORANGEFS_VFS_OP_FILE_IO: + fsid = op->upcall.req.io.refn.fs_id; + break; + case ORANGEFS_VFS_OP_LOOKUP: + fsid = op->upcall.req.lookup.parent_refn.fs_id; + break; + case ORANGEFS_VFS_OP_CREATE: + fsid = op->upcall.req.create.parent_refn.fs_id; + break; + case ORANGEFS_VFS_OP_GETATTR: + fsid = op->upcall.req.getattr.refn.fs_id; + break; + case ORANGEFS_VFS_OP_REMOVE: + fsid = op->upcall.req.remove.parent_refn.fs_id; + break; + case ORANGEFS_VFS_OP_MKDIR: + fsid = op->upcall.req.mkdir.parent_refn.fs_id; + break; + case ORANGEFS_VFS_OP_READDIR: + fsid = op->upcall.req.readdir.refn.fs_id; + break; + case ORANGEFS_VFS_OP_SETATTR: + fsid = op->upcall.req.setattr.refn.fs_id; + break; + case ORANGEFS_VFS_OP_SYMLINK: + fsid = op->upcall.req.sym.parent_refn.fs_id; + break; + case ORANGEFS_VFS_OP_RENAME: + fsid = op->upcall.req.rename.old_parent_refn.fs_id; + break; + case ORANGEFS_VFS_OP_STATFS: + fsid = op->upcall.req.statfs.fs_id; + break; + case ORANGEFS_VFS_OP_TRUNCATE: + fsid = op->upcall.req.truncate.refn.fs_id; + break; + case ORANGEFS_VFS_OP_RA_FLUSH: + fsid = op->upcall.req.ra_cache_flush.refn.fs_id; + break; + case ORANGEFS_VFS_OP_FS_UMOUNT: + fsid = op->upcall.req.fs_umount.fs_id; + break; + case ORANGEFS_VFS_OP_GETXATTR: + fsid = op->upcall.req.getxattr.refn.fs_id; + break; + case ORANGEFS_VFS_OP_SETXATTR: + fsid = op->upcall.req.setxattr.refn.fs_id; + break; + case ORANGEFS_VFS_OP_LISTXATTR: + fsid = op->upcall.req.listxattr.refn.fs_id; + break; + case ORANGEFS_VFS_OP_REMOVEXATTR: + fsid = op->upcall.req.removexattr.refn.fs_id; + break; + case ORANGEFS_VFS_OP_FSYNC: + fsid = op->upcall.req.fsync.refn.fs_id; + break; + default: + break; + } + } + return fsid; +} + +static int orangefs_inode_flags(struct ORANGEFS_sys_attr_s *attrs) +{ + int flags = 0; + if (attrs->flags & ORANGEFS_IMMUTABLE_FL) + flags |= S_IMMUTABLE; + else + flags &= ~S_IMMUTABLE; + if (attrs->flags & ORANGEFS_APPEND_FL) + flags |= S_APPEND; + else + flags &= ~S_APPEND; + if (attrs->flags & ORANGEFS_NOATIME_FL) + flags |= S_NOATIME; + else + flags &= ~S_NOATIME; + return flags; +} + +static int orangefs_inode_perms(struct ORANGEFS_sys_attr_s *attrs) +{ + int perm_mode = 0; + + if (attrs->perms & ORANGEFS_O_EXECUTE) + perm_mode |= S_IXOTH; + if (attrs->perms & ORANGEFS_O_WRITE) + perm_mode |= S_IWOTH; + if (attrs->perms & ORANGEFS_O_READ) + perm_mode |= S_IROTH; + + if (attrs->perms & ORANGEFS_G_EXECUTE) + perm_mode |= S_IXGRP; + if (attrs->perms & ORANGEFS_G_WRITE) + perm_mode |= S_IWGRP; + if (attrs->perms & ORANGEFS_G_READ) + perm_mode |= S_IRGRP; + + if (attrs->perms & ORANGEFS_U_EXECUTE) + perm_mode |= S_IXUSR; + if (attrs->perms & ORANGEFS_U_WRITE) + perm_mode |= S_IWUSR; + if (attrs->perms & ORANGEFS_U_READ) + perm_mode |= S_IRUSR; + + if (attrs->perms & ORANGEFS_G_SGID) + perm_mode |= S_ISGID; + if (attrs->perms & ORANGEFS_U_SUID) + perm_mode |= S_ISUID; + + return perm_mode; +} + +/* + * NOTE: in kernel land, we never use the sys_attr->link_target for + * anything, so don't bother copying it into the sys_attr object here. + */ +static inline void copy_attributes_from_inode(struct inode *inode, + struct ORANGEFS_sys_attr_s *attrs) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + attrs->mask = 0; + if (orangefs_inode->attr_valid & ATTR_UID) { + attrs->owner = from_kuid(&init_user_ns, inode->i_uid); + attrs->mask |= ORANGEFS_ATTR_SYS_UID; + gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner); + } + if (orangefs_inode->attr_valid & ATTR_GID) { + attrs->group = from_kgid(&init_user_ns, inode->i_gid); + attrs->mask |= ORANGEFS_ATTR_SYS_GID; + gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group); + } + + if (orangefs_inode->attr_valid & ATTR_ATIME) { + attrs->mask |= ORANGEFS_ATTR_SYS_ATIME; + if (orangefs_inode->attr_valid & ATTR_ATIME_SET) { + attrs->atime = (time64_t)inode->i_atime.tv_sec; + attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET; + } + } + if (orangefs_inode->attr_valid & ATTR_MTIME) { + attrs->mask |= ORANGEFS_ATTR_SYS_MTIME; + if (orangefs_inode->attr_valid & ATTR_MTIME_SET) { + attrs->mtime = (time64_t)inode->i_mtime.tv_sec; + attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET; + } + } + if (orangefs_inode->attr_valid & ATTR_CTIME) + attrs->mask |= ORANGEFS_ATTR_SYS_CTIME; + + /* + * ORANGEFS cannot set size with a setattr operation. Probably not + * likely to be requested through the VFS, but just in case, don't + * worry about ATTR_SIZE + */ + + if (orangefs_inode->attr_valid & ATTR_MODE) { + attrs->perms = ORANGEFS_util_translate_mode(inode->i_mode); + attrs->mask |= ORANGEFS_ATTR_SYS_PERM; + } +} + +static int orangefs_inode_type(enum orangefs_ds_type objtype) +{ + if (objtype == ORANGEFS_TYPE_METAFILE) + return S_IFREG; + else if (objtype == ORANGEFS_TYPE_DIRECTORY) + return S_IFDIR; + else if (objtype == ORANGEFS_TYPE_SYMLINK) + return S_IFLNK; + else + return -1; +} + +static void orangefs_make_bad_inode(struct inode *inode) +{ + if (is_root_handle(inode)) { + /* + * if this occurs, the pvfs2-client-core was killed but we + * can't afford to lose the inode operations and such + * associated with the root handle in any case. + */ + gossip_debug(GOSSIP_UTILS_DEBUG, + "*** NOT making bad root inode %pU\n", + get_khandle_from_ino(inode)); + } else { + gossip_debug(GOSSIP_UTILS_DEBUG, + "*** making bad inode %pU\n", + get_khandle_from_ino(inode)); + make_bad_inode(inode); + } +} + +static int orangefs_inode_is_stale(struct inode *inode, + struct ORANGEFS_sys_attr_s *attrs, char *link_target) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + int type = orangefs_inode_type(attrs->objtype); + /* + * If the inode type or symlink target have changed then this + * inode is stale. + */ + if (type == -1 || inode_wrong_type(inode, type)) { + orangefs_make_bad_inode(inode); + return 1; + } + if (type == S_IFLNK && strncmp(orangefs_inode->link_target, + link_target, ORANGEFS_NAME_MAX)) { + orangefs_make_bad_inode(inode); + return 1; + } + return 0; +} + +int orangefs_inode_getattr(struct inode *inode, int flags) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + loff_t inode_size; + int ret, type; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU flags %d\n", + __func__, get_khandle_from_ino(inode), flags); + +again: + spin_lock(&inode->i_lock); + /* Must have all the attributes in the mask and be within cache time. */ + if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || + orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + if (orangefs_inode->attr_valid) { + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + goto again; + } + spin_unlock(&inode->i_lock); + return 0; + } + spin_unlock(&inode->i_lock); + + new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.getattr.refn = orangefs_inode->refn; + /* + * Size is the hardest attribute to get. The incremental cost of any + * other attribute is essentially zero. + */ + if (flags) + new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + else + new_op->upcall.req.getattr.mask = + ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE; + + ret = service_operation(new_op, __func__, + get_interruptible_flag(inode)); + if (ret != 0) + goto out; + +again2: + spin_lock(&inode->i_lock); + /* Must have all the attributes in the mask and be within cache time. */ + if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || + orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + if (orangefs_inode->attr_valid) { + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + goto again2; + } + if (inode->i_state & I_DIRTY_PAGES) { + ret = 0; + goto out_unlock; + } + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n", + __func__); + ret = 0; + goto out_unlock; + } + + if (!(flags & ORANGEFS_GETATTR_NEW)) { + ret = orangefs_inode_is_stale(inode, + &new_op->downcall.resp.getattr.attributes, + new_op->downcall.resp.getattr.link_target); + if (ret) { + ret = -ESTALE; + goto out_unlock; + } + } + + type = orangefs_inode_type(new_op-> + downcall.resp.getattr.attributes.objtype); + switch (type) { + case S_IFREG: + inode->i_flags = orangefs_inode_flags(&new_op-> + downcall.resp.getattr.attributes); + if (flags) { + inode_size = (loff_t)new_op-> + downcall.resp.getattr.attributes.size; + inode->i_size = inode_size; + inode->i_blkbits = ffs(new_op->downcall.resp.getattr. + attributes.blksize); + inode->i_bytes = inode_size; + inode->i_blocks = + (inode_size + 512 - inode_size % 512)/512; + } + break; + case S_IFDIR: + if (flags) { + inode->i_size = PAGE_SIZE; + inode_set_bytes(inode, inode->i_size); + } + set_nlink(inode, 1); + break; + case S_IFLNK: + if (flags & ORANGEFS_GETATTR_NEW) { + inode->i_size = (loff_t)strlen(new_op-> + downcall.resp.getattr.link_target); + ret = strscpy(orangefs_inode->link_target, + new_op->downcall.resp.getattr.link_target, + ORANGEFS_NAME_MAX); + if (ret == -E2BIG) { + ret = -EIO; + goto out_unlock; + } + inode->i_link = orangefs_inode->link_target; + } + break; + /* i.e. -1 */ + default: + /* XXX: ESTALE? This is what is done if it is not new. */ + orangefs_make_bad_inode(inode); + ret = -ESTALE; + goto out_unlock; + } + + inode->i_uid = make_kuid(&init_user_ns, new_op-> + downcall.resp.getattr.attributes.owner); + inode->i_gid = make_kgid(&init_user_ns, new_op-> + downcall.resp.getattr.attributes.group); + inode->i_atime.tv_sec = (time64_t)new_op-> + downcall.resp.getattr.attributes.atime; + inode->i_mtime.tv_sec = (time64_t)new_op-> + downcall.resp.getattr.attributes.mtime; + inode->i_ctime.tv_sec = (time64_t)new_op-> + downcall.resp.getattr.attributes.ctime; + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + + /* special case: mark the root inode as sticky */ + inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | + orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); + + orangefs_inode->getattr_time = jiffies + + orangefs_getattr_timeout_msecs*HZ/1000; + ret = 0; +out_unlock: + spin_unlock(&inode->i_lock); +out: + op_release(new_op); + return ret; +} + +int orangefs_inode_check_changed(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__, + get_khandle_from_ino(inode)); + + new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.getattr.refn = orangefs_inode->refn; + new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_TYPE | + ORANGEFS_ATTR_SYS_LNK_TARGET; + + ret = service_operation(new_op, __func__, + get_interruptible_flag(inode)); + if (ret != 0) + goto out; + + ret = orangefs_inode_is_stale(inode, + &new_op->downcall.resp.getattr.attributes, + new_op->downcall.resp.getattr.link_target); +out: + op_release(new_op); + return ret; +} + +/* + * issues a orangefs setattr request to make sure the new attribute values + * take effect if successful. returns 0 on success; -errno otherwise + */ +int orangefs_inode_setattr(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int ret; + + new_op = op_alloc(ORANGEFS_VFS_OP_SETATTR); + if (!new_op) + return -ENOMEM; + + spin_lock(&inode->i_lock); + new_op->upcall.uid = from_kuid(&init_user_ns, orangefs_inode->attr_uid); + new_op->upcall.gid = from_kgid(&init_user_ns, orangefs_inode->attr_gid); + new_op->upcall.req.setattr.refn = orangefs_inode->refn; + copy_attributes_from_inode(inode, + &new_op->upcall.req.setattr.attributes); + orangefs_inode->attr_valid = 0; + if (!new_op->upcall.req.setattr.attributes.mask) { + spin_unlock(&inode->i_lock); + op_release(new_op); + return 0; + } + spin_unlock(&inode->i_lock); + + ret = service_operation(new_op, __func__, + get_interruptible_flag(inode) | ORANGEFS_OP_WRITEBACK); + gossip_debug(GOSSIP_UTILS_DEBUG, + "orangefs_inode_setattr: returning %d\n", ret); + if (ret) + orangefs_make_bad_inode(inode); + + op_release(new_op); + + if (ret == 0) + orangefs_inode->getattr_time = jiffies - 1; + return ret; +} + +/* + * The following is a very dirty hack that is now a permanent part of the + * ORANGEFS protocol. See protocol.h for more error definitions. + */ + +/* The order matches include/orangefs-types.h in the OrangeFS source. */ +static int PINT_errno_mapping[] = { + 0, EPERM, ENOENT, EINTR, EIO, ENXIO, EBADF, EAGAIN, ENOMEM, + EFAULT, EBUSY, EEXIST, ENODEV, ENOTDIR, EISDIR, EINVAL, EMFILE, + EFBIG, ENOSPC, EROFS, EMLINK, EPIPE, EDEADLK, ENAMETOOLONG, + ENOLCK, ENOSYS, ENOTEMPTY, ELOOP, EWOULDBLOCK, ENOMSG, EUNATCH, + EBADR, EDEADLOCK, ENODATA, ETIME, ENONET, EREMOTE, ECOMM, + EPROTO, EBADMSG, EOVERFLOW, ERESTART, EMSGSIZE, EPROTOTYPE, + ENOPROTOOPT, EPROTONOSUPPORT, EOPNOTSUPP, EADDRINUSE, + EADDRNOTAVAIL, ENETDOWN, ENETUNREACH, ENETRESET, ENOBUFS, + ETIMEDOUT, ECONNREFUSED, EHOSTDOWN, EHOSTUNREACH, EALREADY, + EACCES, ECONNRESET, ERANGE +}; + +int orangefs_normalize_to_errno(__s32 error_code) +{ + __u32 i; + + /* Success */ + if (error_code == 0) { + return 0; + /* + * This shouldn't ever happen. If it does it should be fixed on the + * server. + */ + } else if (error_code > 0) { + gossip_err("orangefs: error status received.\n"); + gossip_err("orangefs: assuming error code is inverted.\n"); + error_code = -error_code; + } + + /* + * XXX: This is very bad since error codes from ORANGEFS may not be + * suitable for return into userspace. + */ + + /* + * Convert ORANGEFS error values into errno values suitable for return + * from the kernel. + */ + if ((-error_code) & ORANGEFS_NON_ERRNO_ERROR_BIT) { + if (((-error_code) & + (ORANGEFS_ERROR_NUMBER_BITS|ORANGEFS_NON_ERRNO_ERROR_BIT| + ORANGEFS_ERROR_BIT)) == ORANGEFS_ECANCEL) { + /* + * cancellation error codes generally correspond to + * a timeout from the client's perspective + */ + error_code = -ETIMEDOUT; + } else { + /* assume a default error code */ + gossip_err("%s: bad error code :%d:.\n", + __func__, + error_code); + error_code = -EINVAL; + } + + /* Convert ORANGEFS encoded errno values into regular errno values. */ + } else if ((-error_code) & ORANGEFS_ERROR_BIT) { + i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS); + if (i < ARRAY_SIZE(PINT_errno_mapping)) + error_code = -PINT_errno_mapping[i]; + else + error_code = -EINVAL; + + /* + * Only ORANGEFS protocol error codes should ever come here. Otherwise + * there is a bug somewhere. + */ + } else { + gossip_err("%s: unknown error code.\n", __func__); + error_code = -EINVAL; + } + return error_code; +} + +#define NUM_MODES 11 +__s32 ORANGEFS_util_translate_mode(int mode) +{ + int ret = 0; + int i = 0; + static int modes[NUM_MODES] = { + S_IXOTH, S_IWOTH, S_IROTH, + S_IXGRP, S_IWGRP, S_IRGRP, + S_IXUSR, S_IWUSR, S_IRUSR, + S_ISGID, S_ISUID + }; + static int orangefs_modes[NUM_MODES] = { + ORANGEFS_O_EXECUTE, ORANGEFS_O_WRITE, ORANGEFS_O_READ, + ORANGEFS_G_EXECUTE, ORANGEFS_G_WRITE, ORANGEFS_G_READ, + ORANGEFS_U_EXECUTE, ORANGEFS_U_WRITE, ORANGEFS_U_READ, + ORANGEFS_G_SGID, ORANGEFS_U_SUID + }; + + for (i = 0; i < NUM_MODES; i++) + if (mode & modes[i]) + ret |= orangefs_modes[i]; + + return ret; +} +#undef NUM_MODES diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h new file mode 100644 index 000000000..d403cf29a --- /dev/null +++ b/fs/orangefs/protocol.h @@ -0,0 +1,362 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/spinlock_types.h> +#include <linux/slab.h> +#include <linux/ioctl.h> + +/* khandle stuff ***********************************************************/ + +/* + * The 2.9 core will put 64 bit handles in here like this: + * 1234 0000 0000 5678 + * The 3.0 and beyond cores will put 128 bit handles in here like this: + * 1234 5678 90AB CDEF + * The kernel module will always use the first four bytes and + * the last four bytes as an inum. + */ +struct orangefs_khandle { + unsigned char u[16]; +} __aligned(8); + +/* + * kernel version of an object ref. + */ +struct orangefs_object_kref { + struct orangefs_khandle khandle; + __s32 fs_id; + __s32 __pad1; +}; + +/* + * compare 2 khandles assumes little endian thus from large address to + * small address + */ +static inline int ORANGEFS_khandle_cmp(const struct orangefs_khandle *kh1, + const struct orangefs_khandle *kh2) +{ + int i; + + for (i = 15; i >= 0; i--) { + if (kh1->u[i] > kh2->u[i]) + return 1; + if (kh1->u[i] < kh2->u[i]) + return -1; + } + + return 0; +} + +static inline void ORANGEFS_khandle_to(const struct orangefs_khandle *kh, + void *p, int size) +{ + + memcpy(p, kh->u, 16); + memset(p + 16, 0, size - 16); + +} + +static inline void ORANGEFS_khandle_from(struct orangefs_khandle *kh, + void *p, int size) +{ + memset(kh, 0, 16); + memcpy(kh->u, p, 16); + +} + +/* pvfs2-types.h ************************************************************/ + +#define ORANGEFS_SUPER_MAGIC 0x20030528 + +/* + * ORANGEFS error codes are a signed 32-bit integer. Error codes are negative, but + * the sign is stripped before decoding. + */ + +/* Bit 31 is not used since it is the sign. */ + +/* + * Bit 30 specifies that this is a ORANGEFS error. A ORANGEFS error is either an + * encoded errno value or a ORANGEFS protocol error. + */ +#define ORANGEFS_ERROR_BIT (1 << 30) + +/* + * Bit 29 specifies that this is a ORANGEFS protocol error and not an encoded + * errno value. + */ +#define ORANGEFS_NON_ERRNO_ERROR_BIT (1 << 29) + +/* + * Bits 9, 8, and 7 specify the error class, which encodes the section of + * server code the error originated in for logging purposes. It is not used + * in the kernel except to be masked out. + */ +#define ORANGEFS_ERROR_CLASS_BITS 0x380 + +/* Bits 6 - 0 are reserved for the actual error code. */ +#define ORANGEFS_ERROR_NUMBER_BITS 0x7f + +/* Encoded errno values decoded by PINT_errno_mapping in orangefs-utils.c. */ + +/* Our own ORANGEFS protocol error codes. */ +#define ORANGEFS_ECANCEL (1|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_EDEVINIT (2|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_EDETAIL (3|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_EHOSTNTFD (4|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_EADDRNTFD (5|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_ENORECVR (6|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_ETRYAGAIN (7|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_ENOTPVFS (8|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) +#define ORANGEFS_ESECURITY (9|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT) + +/* permission bits */ +#define ORANGEFS_O_EXECUTE (1 << 0) +#define ORANGEFS_O_WRITE (1 << 1) +#define ORANGEFS_O_READ (1 << 2) +#define ORANGEFS_G_EXECUTE (1 << 3) +#define ORANGEFS_G_WRITE (1 << 4) +#define ORANGEFS_G_READ (1 << 5) +#define ORANGEFS_U_EXECUTE (1 << 6) +#define ORANGEFS_U_WRITE (1 << 7) +#define ORANGEFS_U_READ (1 << 8) +/* no ORANGEFS_U_VTX (sticky bit) */ +#define ORANGEFS_G_SGID (1 << 10) +#define ORANGEFS_U_SUID (1 << 11) + +#define ORANGEFS_ITERATE_START 2147483646 +#define ORANGEFS_ITERATE_END 2147483645 +#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL +#define ORANGEFS_APPEND_FL FS_APPEND_FL +#define ORANGEFS_NOATIME_FL FS_NOATIME_FL +#define ORANGEFS_MIRROR_FL 0x01000000ULL +#define ORANGEFS_FS_ID_NULL ((__s32)0) +#define ORANGEFS_ATTR_SYS_UID (1 << 0) +#define ORANGEFS_ATTR_SYS_GID (1 << 1) +#define ORANGEFS_ATTR_SYS_PERM (1 << 2) +#define ORANGEFS_ATTR_SYS_ATIME (1 << 3) +#define ORANGEFS_ATTR_SYS_CTIME (1 << 4) +#define ORANGEFS_ATTR_SYS_MTIME (1 << 5) +#define ORANGEFS_ATTR_SYS_TYPE (1 << 6) +#define ORANGEFS_ATTR_SYS_ATIME_SET (1 << 7) +#define ORANGEFS_ATTR_SYS_MTIME_SET (1 << 8) +#define ORANGEFS_ATTR_SYS_SIZE (1 << 20) +#define ORANGEFS_ATTR_SYS_LNK_TARGET (1 << 24) +#define ORANGEFS_ATTR_SYS_DFILE_COUNT (1 << 25) +#define ORANGEFS_ATTR_SYS_DIRENT_COUNT (1 << 26) +#define ORANGEFS_ATTR_SYS_BLKSIZE (1 << 28) +#define ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT (1 << 29) +#define ORANGEFS_ATTR_SYS_COMMON_ALL \ + (ORANGEFS_ATTR_SYS_UID | \ + ORANGEFS_ATTR_SYS_GID | \ + ORANGEFS_ATTR_SYS_PERM | \ + ORANGEFS_ATTR_SYS_ATIME | \ + ORANGEFS_ATTR_SYS_CTIME | \ + ORANGEFS_ATTR_SYS_MTIME | \ + ORANGEFS_ATTR_SYS_TYPE) + +#define ORANGEFS_ATTR_SYS_ALL_SETABLE \ +(ORANGEFS_ATTR_SYS_COMMON_ALL-ORANGEFS_ATTR_SYS_TYPE) + +#define ORANGEFS_ATTR_SYS_ALL_NOHINT \ + (ORANGEFS_ATTR_SYS_COMMON_ALL | \ + ORANGEFS_ATTR_SYS_SIZE | \ + ORANGEFS_ATTR_SYS_LNK_TARGET | \ + ORANGEFS_ATTR_SYS_DFILE_COUNT | \ + ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \ + ORANGEFS_ATTR_SYS_DIRENT_COUNT | \ + ORANGEFS_ATTR_SYS_BLKSIZE) + +#define ORANGEFS_XATTR_REPLACE 0x2 +#define ORANGEFS_XATTR_CREATE 0x1 +#define ORANGEFS_MAX_SERVER_ADDR_LEN 256 +#define ORANGEFS_NAME_MAX 256 +/* + * max extended attribute name len as imposed by the VFS and exploited for the + * upcall request types. + * NOTE: Please retain them as multiples of 8 even if you wish to change them + * This is *NECESSARY* for supporting 32 bit user-space binaries on a 64-bit + * kernel. Due to implementation within DBPF, this really needs to be + * ORANGEFS_NAME_MAX, which it was the same value as, but no reason to let it + * break if that changes in the future. + */ +#define ORANGEFS_MAX_XATTR_NAMELEN ORANGEFS_NAME_MAX /* Not the same as + * XATTR_NAME_MAX defined + * by <linux/xattr.h> + */ +#define ORANGEFS_MAX_XATTR_VALUELEN 8192 /* Not the same as XATTR_SIZE_MAX + * defined by <linux/xattr.h> + */ +#define ORANGEFS_MAX_XATTR_LISTLEN 16 /* Not the same as XATTR_LIST_MAX + * defined by <linux/xattr.h> + */ +/* + * ORANGEFS I/O operation types, used in both system and server interfaces. + */ +enum ORANGEFS_io_type { + ORANGEFS_IO_READ = 1, + ORANGEFS_IO_WRITE = 2 +}; + +/* + * If this enum is modified the server parameters related to the precreate pool + * batch and low threshold sizes may need to be modified to reflect this + * change. + */ +enum orangefs_ds_type { + ORANGEFS_TYPE_NONE = 0, + ORANGEFS_TYPE_METAFILE = (1 << 0), + ORANGEFS_TYPE_DATAFILE = (1 << 1), + ORANGEFS_TYPE_DIRECTORY = (1 << 2), + ORANGEFS_TYPE_SYMLINK = (1 << 3), + ORANGEFS_TYPE_DIRDATA = (1 << 4), + ORANGEFS_TYPE_INTERNAL = (1 << 5) /* for the server's private use */ +}; + +/* This structure is used by the VFS-client interaction alone */ +struct ORANGEFS_keyval_pair { + char key[ORANGEFS_MAX_XATTR_NAMELEN]; + __s32 key_sz; /* __s32 for portable, fixed-size structures */ + __s32 val_sz; + char val[ORANGEFS_MAX_XATTR_VALUELEN]; +}; + +/* pvfs2-sysint.h ***********************************************************/ +/* Describes attributes for a file, directory, or symlink. */ +struct ORANGEFS_sys_attr_s { + __u32 owner; + __u32 group; + __u32 perms; + __u64 atime; + __u64 mtime; + __u64 ctime; + __s64 size; + + /* NOTE: caller must free if valid */ + char *link_target; + + /* Changed to __s32 so that size of structure does not change */ + __s32 dfile_count; + + /* Changed to __s32 so that size of structure does not change */ + __s32 distr_dir_servers_initial; + + /* Changed to __s32 so that size of structure does not change */ + __s32 distr_dir_servers_max; + + /* Changed to __s32 so that size of structure does not change */ + __s32 distr_dir_split_size; + + __u32 mirror_copies_count; + + /* NOTE: caller must free if valid */ + char *dist_name; + + /* NOTE: caller must free if valid */ + char *dist_params; + + __s64 dirent_count; + enum orangefs_ds_type objtype; + __u64 flags; + __u32 mask; + __s64 blksize; +}; + +#define ORANGEFS_LOOKUP_LINK_NO_FOLLOW 0 + +/* pint-dev.h ***************************************************************/ + +/* parameter structure used in ORANGEFS_DEV_DEBUG ioctl command */ +struct dev_mask_info_s { + enum { + KERNEL_MASK, + CLIENT_MASK, + } mask_type; + __u64 mask_value; +}; + +struct dev_mask2_info_s { + __u64 mask1_value; + __u64 mask2_value; +}; + +/* pvfs2-util.h *************************************************************/ +__s32 ORANGEFS_util_translate_mode(int mode); + +/* pvfs2-debug.h ************************************************************/ +#include "orangefs-debug.h" + +/* pvfs2-internal.h *********************************************************/ +#define llu(x) (unsigned long long)(x) +#define lld(x) (long long)(x) + +/* pint-dev-shared.h ********************************************************/ +#define ORANGEFS_DEV_MAGIC 'k' + +#define ORANGEFS_READDIR_DEFAULT_DESC_COUNT 5 + +#define DEV_GET_MAGIC 0x1 +#define DEV_GET_MAX_UPSIZE 0x2 +#define DEV_GET_MAX_DOWNSIZE 0x3 +#define DEV_MAP 0x4 +#define DEV_REMOUNT_ALL 0x5 +#define DEV_DEBUG 0x6 +#define DEV_UPSTREAM 0x7 +#define DEV_CLIENT_MASK 0x8 +#define DEV_CLIENT_STRING 0x9 +#define DEV_MAX_NR 0xa + +/* supported ioctls, codes are with respect to user-space */ +enum { + ORANGEFS_DEV_GET_MAGIC = _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAGIC, __s32), + ORANGEFS_DEV_GET_MAX_UPSIZE = + _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_UPSIZE, __s32), + ORANGEFS_DEV_GET_MAX_DOWNSIZE = + _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_DOWNSIZE, __s32), + ORANGEFS_DEV_MAP = _IO(ORANGEFS_DEV_MAGIC, DEV_MAP), + ORANGEFS_DEV_REMOUNT_ALL = _IO(ORANGEFS_DEV_MAGIC, DEV_REMOUNT_ALL), + ORANGEFS_DEV_DEBUG = _IOR(ORANGEFS_DEV_MAGIC, DEV_DEBUG, __s32), + ORANGEFS_DEV_UPSTREAM = _IOW(ORANGEFS_DEV_MAGIC, DEV_UPSTREAM, int), + ORANGEFS_DEV_CLIENT_MASK = _IOW(ORANGEFS_DEV_MAGIC, + DEV_CLIENT_MASK, + struct dev_mask2_info_s), + ORANGEFS_DEV_CLIENT_STRING = _IOW(ORANGEFS_DEV_MAGIC, + DEV_CLIENT_STRING, + char *), + ORANGEFS_DEV_MAXNR = DEV_MAX_NR, +}; + +/* + * version number for use in communicating between kernel space and user + * space. Zero signifies the upstream version of the kernel module. + */ +#define ORANGEFS_KERNEL_PROTO_VERSION 0 +#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20903 + +/* + * describes memory regions to map in the ORANGEFS_DEV_MAP ioctl. + * NOTE: See devorangefs-req.c for 32 bit compat structure. + * Since this structure has a variable-sized layout that is different + * on 32 and 64 bit platforms, we need to normalize to a 64 bit layout + * on such systems before servicing ioctl calls from user-space binaries + * that may be 32 bit! + */ +struct ORANGEFS_dev_map_desc { + void __user *ptr; + __s32 total_size; + __s32 size; + __s32 count; +}; + +/* gossip.h *****************************************************************/ + +extern __u64 orangefs_gossip_debug_mask; + +/* try to avoid function call overhead by checking masks in macro */ +#define gossip_debug(mask, fmt, ...) \ +do { \ + if (orangefs_gossip_debug_mask & (mask)) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) + +#define gossip_err pr_err diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c new file mode 100644 index 000000000..5254256a2 --- /dev/null +++ b/fs/orangefs/super.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" + +#include <linux/parser.h> +#include <linux/hashtable.h> +#include <linux/seq_file.h> + +/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ +static struct kmem_cache *orangefs_inode_cache; + +/* list for storing orangefs specific superblocks in use */ +LIST_HEAD(orangefs_superblocks); + +DEFINE_SPINLOCK(orangefs_superblocks_lock); + +enum { + Opt_intr, + Opt_acl, + Opt_local_lock, + + Opt_err +}; + +static const match_table_t tokens = { + { Opt_acl, "acl" }, + { Opt_intr, "intr" }, + { Opt_local_lock, "local_lock" }, + { Opt_err, NULL } +}; + +uint64_t orangefs_features; + +static int orangefs_show_options(struct seq_file *m, struct dentry *root) +{ + struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); + + if (root->d_sb->s_flags & SB_POSIXACL) + seq_puts(m, ",acl"); + if (orangefs_sb->flags & ORANGEFS_OPT_INTR) + seq_puts(m, ",intr"); + if (orangefs_sb->flags & ORANGEFS_OPT_LOCAL_LOCK) + seq_puts(m, ",local_lock"); + return 0; +} + +static int parse_mount_options(struct super_block *sb, char *options, + int silent) +{ + struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p; + + /* + * Force any potential flags that might be set from the mount + * to zero, ie, initialize to unset. + */ + sb->s_flags &= ~SB_POSIXACL; + orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; + orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_acl: + sb->s_flags |= SB_POSIXACL; + break; + case Opt_intr: + orangefs_sb->flags |= ORANGEFS_OPT_INTR; + break; + case Opt_local_lock: + orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; + break; + default: + goto fail; + } + } + + return 0; +fail: + if (!silent) + gossip_err("Error: mount option [%s] is not supported.\n", p); + return -EINVAL; +} + +static void orangefs_inode_cache_ctor(void *req) +{ + struct orangefs_inode_s *orangefs_inode = req; + + inode_init_once(&orangefs_inode->vfs_inode); + init_rwsem(&orangefs_inode->xattr_sem); +} + +static struct inode *orangefs_alloc_inode(struct super_block *sb) +{ + struct orangefs_inode_s *orangefs_inode; + + orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL); + if (!orangefs_inode) + return NULL; + + /* + * We want to clear everything except for rw_semaphore and the + * vfs_inode. + */ + memset(&orangefs_inode->refn.khandle, 0, 16); + orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL; + orangefs_inode->last_failed_block_index_read = 0; + memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target)); + + gossip_debug(GOSSIP_SUPER_DEBUG, + "orangefs_alloc_inode: allocated %p\n", + &orangefs_inode->vfs_inode); + return &orangefs_inode->vfs_inode; +} + +static void orangefs_free_inode(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_cached_xattr *cx; + struct hlist_node *tmp; + int i; + + hash_for_each_safe(orangefs_inode->xattr_cache, i, tmp, cx, node) { + hlist_del(&cx->node); + kfree(cx); + } + + kmem_cache_free(orangefs_inode_cache, orangefs_inode); +} + +static void orangefs_destroy_inode(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + + gossip_debug(GOSSIP_SUPER_DEBUG, + "%s: deallocated %p destroying inode %pU\n", + __func__, orangefs_inode, get_khandle_from_ino(inode)); +} + +static int orangefs_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n"); + return orangefs_inode_setattr(inode); +} + +/* + * NOTE: information filled in here is typically reflected in the + * output of the system command 'df' +*/ +static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int ret = -ENOMEM; + struct orangefs_kernel_op_s *new_op = NULL; + int flags = 0; + struct super_block *sb = NULL; + + sb = dentry->d_sb; + + gossip_debug(GOSSIP_SUPER_DEBUG, + "%s: called on sb %p (fs_id is %d)\n", + __func__, + sb, + (int)(ORANGEFS_SB(sb)->fs_id)); + + new_op = op_alloc(ORANGEFS_VFS_OP_STATFS); + if (!new_op) + return ret; + new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id; + + if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR) + flags = ORANGEFS_OP_INTERRUPTIBLE; + + ret = service_operation(new_op, "orangefs_statfs", flags); + + if (new_op->downcall.status < 0) + goto out_op_release; + + gossip_debug(GOSSIP_SUPER_DEBUG, + "%s: got %ld blocks available | " + "%ld blocks total | %ld block size | " + "%ld files total | %ld files avail\n", + __func__, + (long)new_op->downcall.resp.statfs.blocks_avail, + (long)new_op->downcall.resp.statfs.blocks_total, + (long)new_op->downcall.resp.statfs.block_size, + (long)new_op->downcall.resp.statfs.files_total, + (long)new_op->downcall.resp.statfs.files_avail); + + buf->f_type = sb->s_magic; + memcpy(&buf->f_fsid, &ORANGEFS_SB(sb)->fs_id, sizeof(buf->f_fsid)); + buf->f_bsize = new_op->downcall.resp.statfs.block_size; + buf->f_namelen = ORANGEFS_NAME_MAX; + + buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total; + buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail; + buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail; + buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total; + buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail; + buf->f_frsize = 0; + +out_op_release: + op_release(new_op); + gossip_debug(GOSSIP_SUPER_DEBUG, "%s: returning %d\n", __func__, ret); + return ret; +} + +/* + * Remount as initiated by VFS layer. We just need to reparse the mount + * options, no need to signal pvfs2-client-core about it. + */ +static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n"); + return parse_mount_options(sb, data, 1); +} + +/* + * Remount as initiated by pvfs2-client-core on restart. This is used to + * repopulate mount information left from previous pvfs2-client-core. + * + * the idea here is that given a valid superblock, we're + * re-initializing the user space client with the initial mount + * information specified when the super block was first initialized. + * this is very different than the first initialization/creation of a + * superblock. we use the special service_priority_operation to make + * sure that the mount gets ahead of any other pending operation that + * is waiting for servicing. this means that the pvfs2-client won't + * fail to start several times for all other pending operations before + * the client regains all of the mount information from us. + * NOTE: this function assumes that the request_mutex is already acquired! + */ +int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb) +{ + struct orangefs_kernel_op_s *new_op; + int ret = -EINVAL; + + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n"); + + new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); + if (!new_op) + return -ENOMEM; + strncpy(new_op->upcall.req.fs_mount.orangefs_config_server, + orangefs_sb->devname, + ORANGEFS_MAX_SERVER_ADDR_LEN); + + gossip_debug(GOSSIP_SUPER_DEBUG, + "Attempting ORANGEFS Remount via host %s\n", + new_op->upcall.req.fs_mount.orangefs_config_server); + + /* + * we assume that the calling function has already acquired the + * request_mutex to prevent other operations from bypassing + * this one + */ + ret = service_operation(new_op, "orangefs_remount", + ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); + gossip_debug(GOSSIP_SUPER_DEBUG, + "orangefs_remount: mount got return value of %d\n", + ret); + if (ret == 0) { + /* + * store the id assigned to this sb -- it's just a + * short-lived mapping that the system interface uses + * to map this superblock to a particular mount entry + */ + orangefs_sb->id = new_op->downcall.resp.fs_mount.id; + orangefs_sb->mount_pending = 0; + } + + op_release(new_op); + + if (orangefs_userspace_version >= 20906) { + new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.features.features = 0; + ret = service_operation(new_op, "orangefs_features", + ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); + if (!ret) + orangefs_features = + new_op->downcall.resp.features.features; + else + orangefs_features = 0; + op_release(new_op); + } else { + orangefs_features = 0; + } + + return ret; +} + +int fsid_key_table_initialize(void) +{ + return 0; +} + +void fsid_key_table_finalize(void) +{ +} + +static const struct super_operations orangefs_s_ops = { + .alloc_inode = orangefs_alloc_inode, + .free_inode = orangefs_free_inode, + .destroy_inode = orangefs_destroy_inode, + .write_inode = orangefs_write_inode, + .drop_inode = generic_delete_inode, + .statfs = orangefs_statfs, + .remount_fs = orangefs_remount_fs, + .show_options = orangefs_show_options, +}; + +static struct dentry *orangefs_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, + int fh_type) +{ + struct orangefs_object_kref refn; + + if (fh_len < 5 || fh_type > 2) + return NULL; + + ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16); + refn.fs_id = (u32) fid->raw[4]; + gossip_debug(GOSSIP_SUPER_DEBUG, + "fh_to_dentry: handle %pU, fs_id %d\n", + &refn.khandle, + refn.fs_id); + + return d_obtain_alias(orangefs_iget(sb, &refn)); +} + +static int orangefs_encode_fh(struct inode *inode, + __u32 *fh, + int *max_len, + struct inode *parent) +{ + int len = parent ? 10 : 5; + int type = 1; + struct orangefs_object_kref refn; + + if (*max_len < len) { + gossip_err("fh buffer is too small for encoding\n"); + *max_len = len; + type = 255; + goto out; + } + + refn = ORANGEFS_I(inode)->refn; + ORANGEFS_khandle_to(&refn.khandle, fh, 16); + fh[4] = refn.fs_id; + + gossip_debug(GOSSIP_SUPER_DEBUG, + "Encoding fh: handle %pU, fsid %u\n", + &refn.khandle, + refn.fs_id); + + + if (parent) { + refn = ORANGEFS_I(parent)->refn; + ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16); + fh[9] = refn.fs_id; + + type = 2; + gossip_debug(GOSSIP_SUPER_DEBUG, + "Encoding parent: handle %pU, fsid %u\n", + &refn.khandle, + refn.fs_id); + } + *max_len = len; + +out: + return type; +} + +static const struct export_operations orangefs_export_ops = { + .encode_fh = orangefs_encode_fh, + .fh_to_dentry = orangefs_fh_to_dentry, +}; + +static int orangefs_unmount(int id, __s32 fs_id, const char *devname) +{ + struct orangefs_kernel_op_s *op; + int r; + op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); + if (!op) + return -ENOMEM; + op->upcall.req.fs_umount.id = id; + op->upcall.req.fs_umount.fs_id = fs_id; + strncpy(op->upcall.req.fs_umount.orangefs_config_server, + devname, ORANGEFS_MAX_SERVER_ADDR_LEN - 1); + r = service_operation(op, "orangefs_fs_umount", 0); + /* Not much to do about an error here. */ + if (r) + gossip_err("orangefs_unmount: service_operation %d\n", r); + op_release(op); + return r; +} + +static int orangefs_fill_sb(struct super_block *sb, + struct orangefs_fs_mount_response *fs_mount, + void *data, int silent) +{ + int ret; + struct inode *root; + struct dentry *root_dentry; + struct orangefs_object_kref root_object; + + ORANGEFS_SB(sb)->sb = sb; + + ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle; + ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id; + ORANGEFS_SB(sb)->id = fs_mount->id; + + if (data) { + ret = parse_mount_options(sb, data, silent); + if (ret) + return ret; + } + + /* Hang the xattr handlers off the superblock */ + sb->s_xattr = orangefs_xattr_handlers; + sb->s_magic = ORANGEFS_SUPER_MAGIC; + sb->s_op = &orangefs_s_ops; + sb->s_d_op = &orangefs_dentry_operations; + + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + ret = super_setup_bdi(sb); + if (ret) + return ret; + + root_object.khandle = ORANGEFS_SB(sb)->root_khandle; + root_object.fs_id = ORANGEFS_SB(sb)->fs_id; + gossip_debug(GOSSIP_SUPER_DEBUG, + "get inode %pU, fsid %d\n", + &root_object.khandle, + root_object.fs_id); + + root = orangefs_iget(sb, &root_object); + if (IS_ERR(root)) + return PTR_ERR(root); + + gossip_debug(GOSSIP_SUPER_DEBUG, + "Allocated root inode [%p] with mode %x\n", + root, + root->i_mode); + + /* allocates and places root dentry in dcache */ + root_dentry = d_make_root(root); + if (!root_dentry) + return -ENOMEM; + + sb->s_export_op = &orangefs_export_ops; + sb->s_root = root_dentry; + return 0; +} + +struct dentry *orangefs_mount(struct file_system_type *fst, + int flags, + const char *devname, + void *data) +{ + int ret; + struct super_block *sb = ERR_PTR(-EINVAL); + struct orangefs_kernel_op_s *new_op; + struct dentry *d = ERR_PTR(-EINVAL); + + gossip_debug(GOSSIP_SUPER_DEBUG, + "orangefs_mount: called with devname %s\n", + devname); + + if (!devname) { + gossip_err("ERROR: device name not specified.\n"); + return ERR_PTR(-EINVAL); + } + + new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); + if (!new_op) + return ERR_PTR(-ENOMEM); + + strncpy(new_op->upcall.req.fs_mount.orangefs_config_server, + devname, + ORANGEFS_MAX_SERVER_ADDR_LEN - 1); + + gossip_debug(GOSSIP_SUPER_DEBUG, + "Attempting ORANGEFS Mount via host %s\n", + new_op->upcall.req.fs_mount.orangefs_config_server); + + ret = service_operation(new_op, "orangefs_mount", 0); + gossip_debug(GOSSIP_SUPER_DEBUG, + "orangefs_mount: mount got return value of %d\n", ret); + if (ret) + goto free_op; + + if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) { + gossip_err("ERROR: Retrieved null fs_id\n"); + ret = -EINVAL; + goto free_op; + } + + sb = sget(fst, NULL, set_anon_super, flags, NULL); + + if (IS_ERR(sb)) { + d = ERR_CAST(sb); + orangefs_unmount(new_op->downcall.resp.fs_mount.id, + new_op->downcall.resp.fs_mount.fs_id, devname); + goto free_op; + } + + /* alloc and init our private orangefs sb info */ + sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); + if (!ORANGEFS_SB(sb)) { + d = ERR_PTR(-ENOMEM); + goto free_sb_and_op; + } + + ret = orangefs_fill_sb(sb, + &new_op->downcall.resp.fs_mount, data, + flags & SB_SILENT ? 1 : 0); + + if (ret) { + d = ERR_PTR(ret); + goto free_sb_and_op; + } + + /* + * on successful mount, store the devname and data + * used + */ + strncpy(ORANGEFS_SB(sb)->devname, + devname, + ORANGEFS_MAX_SERVER_ADDR_LEN - 1); + + /* mount_pending must be cleared */ + ORANGEFS_SB(sb)->mount_pending = 0; + + /* + * finally, add this sb to our list of known orangefs + * sb's + */ + gossip_debug(GOSSIP_SUPER_DEBUG, + "Adding SB %p to orangefs superblocks\n", + ORANGEFS_SB(sb)); + spin_lock(&orangefs_superblocks_lock); + list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks); + spin_unlock(&orangefs_superblocks_lock); + op_release(new_op); + + /* Must be removed from the list now. */ + ORANGEFS_SB(sb)->no_list = 0; + + if (orangefs_userspace_version >= 20906) { + new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); + if (!new_op) + return ERR_PTR(-ENOMEM); + new_op->upcall.req.features.features = 0; + ret = service_operation(new_op, "orangefs_features", 0); + orangefs_features = new_op->downcall.resp.features.features; + op_release(new_op); + } else { + orangefs_features = 0; + } + + return dget(sb->s_root); + +free_sb_and_op: + /* Will call orangefs_kill_sb with sb not in list. */ + ORANGEFS_SB(sb)->no_list = 1; + /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ + deactivate_locked_super(sb); +free_op: + gossip_err("orangefs_mount: mount request failed with %d\n", ret); + if (ret == -EINVAL) { + gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n"); + gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n"); + } + + op_release(new_op); + + return d; +} + +void orangefs_kill_sb(struct super_block *sb) +{ + int r; + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); + + /* provided sb cleanup */ + kill_anon_super(sb); + + if (!ORANGEFS_SB(sb)) { + mutex_lock(&orangefs_request_mutex); + mutex_unlock(&orangefs_request_mutex); + return; + } + /* + * issue the unmount to userspace to tell it to remove the + * dynamic mount info it has for this superblock + */ + r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, + ORANGEFS_SB(sb)->devname); + if (!r) + ORANGEFS_SB(sb)->mount_pending = 1; + + if (!ORANGEFS_SB(sb)->no_list) { + /* remove the sb from our list of orangefs specific sb's */ + spin_lock(&orangefs_superblocks_lock); + /* not list_del_init */ + __list_del_entry(&ORANGEFS_SB(sb)->list); + ORANGEFS_SB(sb)->list.prev = NULL; + spin_unlock(&orangefs_superblocks_lock); + } + + /* + * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us + * gets completed before we free the dang thing. + */ + mutex_lock(&orangefs_request_mutex); + mutex_unlock(&orangefs_request_mutex); + + /* free the orangefs superblock private data */ + kfree(ORANGEFS_SB(sb)); +} + +int orangefs_inode_cache_initialize(void) +{ + orangefs_inode_cache = kmem_cache_create_usercopy( + "orangefs_inode_cache", + sizeof(struct orangefs_inode_s), + 0, + ORANGEFS_CACHE_CREATE_FLAGS, + offsetof(struct orangefs_inode_s, + link_target), + sizeof_field(struct orangefs_inode_s, + link_target), + orangefs_inode_cache_ctor); + + if (!orangefs_inode_cache) { + gossip_err("Cannot create orangefs_inode_cache\n"); + return -ENOMEM; + } + return 0; +} + +int orangefs_inode_cache_finalize(void) +{ + kmem_cache_destroy(orangefs_inode_cache); + return 0; +} diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c new file mode 100644 index 000000000..db107fe91 --- /dev/null +++ b/fs/orangefs/symlink.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" + +const struct inode_operations orangefs_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr = orangefs_setattr, + .getattr = orangefs_getattr, + .listxattr = orangefs_listxattr, + .permission = orangefs_permission, + .update_time = orangefs_update_time, +}; diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h new file mode 100644 index 000000000..16118452a --- /dev/null +++ b/fs/orangefs/upcall.h @@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +#ifndef __UPCALL_H +#define __UPCALL_H + +/* + * Sanitized this header file to fix + * 32-64 bit interaction issues between + * client-core and device + */ +struct orangefs_io_request_s { + __s32 __pad1; + __s32 buf_index; + __s32 count; + __s32 __pad2; + __s64 offset; + struct orangefs_object_kref refn; + enum ORANGEFS_io_type io_type; + __s32 readahead_size; +}; + +struct orangefs_lookup_request_s { + __s32 sym_follow; + __s32 __pad1; + struct orangefs_object_kref parent_refn; + char d_name[ORANGEFS_NAME_MAX]; +}; + +struct orangefs_create_request_s { + struct orangefs_object_kref parent_refn; + struct ORANGEFS_sys_attr_s attributes; + char d_name[ORANGEFS_NAME_MAX]; +}; + +struct orangefs_symlink_request_s { + struct orangefs_object_kref parent_refn; + struct ORANGEFS_sys_attr_s attributes; + char entry_name[ORANGEFS_NAME_MAX]; + char target[ORANGEFS_NAME_MAX]; +}; + +struct orangefs_getattr_request_s { + struct orangefs_object_kref refn; + __u32 mask; + __u32 __pad1; +}; + +struct orangefs_setattr_request_s { + struct orangefs_object_kref refn; + struct ORANGEFS_sys_attr_s attributes; +}; + +struct orangefs_remove_request_s { + struct orangefs_object_kref parent_refn; + char d_name[ORANGEFS_NAME_MAX]; +}; + +struct orangefs_mkdir_request_s { + struct orangefs_object_kref parent_refn; + struct ORANGEFS_sys_attr_s attributes; + char d_name[ORANGEFS_NAME_MAX]; +}; + +struct orangefs_readdir_request_s { + struct orangefs_object_kref refn; + __u64 token; + __s32 max_dirent_count; + __s32 buf_index; +}; + +struct orangefs_readdirplus_request_s { + struct orangefs_object_kref refn; + __u64 token; + __s32 max_dirent_count; + __u32 mask; + __s32 buf_index; + __s32 __pad1; +}; + +struct orangefs_rename_request_s { + struct orangefs_object_kref old_parent_refn; + struct orangefs_object_kref new_parent_refn; + char d_old_name[ORANGEFS_NAME_MAX]; + char d_new_name[ORANGEFS_NAME_MAX]; +}; + +struct orangefs_statfs_request_s { + __s32 fs_id; + __s32 __pad1; +}; + +struct orangefs_truncate_request_s { + struct orangefs_object_kref refn; + __s64 size; +}; + +struct orangefs_ra_cache_flush_request_s { + struct orangefs_object_kref refn; +}; + +struct orangefs_fs_mount_request_s { + char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN]; +}; + +struct orangefs_fs_umount_request_s { + __s32 id; + __s32 fs_id; + char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN]; +}; + +struct orangefs_getxattr_request_s { + struct orangefs_object_kref refn; + __s32 key_sz; + __s32 __pad1; + char key[ORANGEFS_MAX_XATTR_NAMELEN]; +}; + +struct orangefs_setxattr_request_s { + struct orangefs_object_kref refn; + struct ORANGEFS_keyval_pair keyval; + __s32 flags; + __s32 __pad1; +}; + +struct orangefs_listxattr_request_s { + struct orangefs_object_kref refn; + __s32 requested_count; + __s32 __pad1; + __u64 token; +}; + +struct orangefs_removexattr_request_s { + struct orangefs_object_kref refn; + __s32 key_sz; + __s32 __pad1; + char key[ORANGEFS_MAX_XATTR_NAMELEN]; +}; + +struct orangefs_op_cancel_s { + __u64 op_tag; +}; + +struct orangefs_fsync_request_s { + struct orangefs_object_kref refn; +}; + +enum orangefs_param_request_type { + ORANGEFS_PARAM_REQUEST_SET = 1, + ORANGEFS_PARAM_REQUEST_GET = 2 +}; + +enum orangefs_param_request_op { + ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS = 1, + ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT = 2, + ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT = 3, + ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE = 4, + ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS = 5, + ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE = 6, + ORANGEFS_PARAM_REQUEST_OP_PERF_RESET = 7, + ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS = 8, + ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT = 9, + ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT = 10, + ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE = 11, + ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_TIMEOUT_MSECS = 12, + ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_HARD_LIMIT = 13, + ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_SOFT_LIMIT = 14, + ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_RECLAIM_PERCENTAGE = 15, + ORANGEFS_PARAM_REQUEST_OP_CLIENT_DEBUG = 16, + ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS = 17, + ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT = 18, + ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT = 19, + ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE = 20, + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS = 21, + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT = 22, + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23, + ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24, + ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_READCNT = 29, +}; + +struct orangefs_param_request_s { + enum orangefs_param_request_type type; + enum orangefs_param_request_op op; + union { + __s64 value64; + __s32 value32[2]; + } u; + char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN]; +}; + +enum orangefs_perf_count_request_type { + ORANGEFS_PERF_COUNT_REQUEST_ACACHE = 1, + ORANGEFS_PERF_COUNT_REQUEST_NCACHE = 2, + ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE = 3, +}; + +struct orangefs_perf_count_request_s { + enum orangefs_perf_count_request_type type; + __s32 __pad1; +}; + +struct orangefs_fs_key_request_s { + __s32 fsid; + __s32 __pad1; +}; + +/* 2.9.6 */ +struct orangefs_features_request_s { + __u64 features; +}; + +struct orangefs_upcall_s { + __s32 type; + __u32 uid; + __u32 gid; + int pid; + int tgid; + /* Trailers unused but must be retained for protocol compatibility. */ + __s64 trailer_size; + char *trailer_buf; + + union { + struct orangefs_io_request_s io; + struct orangefs_lookup_request_s lookup; + struct orangefs_create_request_s create; + struct orangefs_symlink_request_s sym; + struct orangefs_getattr_request_s getattr; + struct orangefs_setattr_request_s setattr; + struct orangefs_remove_request_s remove; + struct orangefs_mkdir_request_s mkdir; + struct orangefs_readdir_request_s readdir; + struct orangefs_readdirplus_request_s readdirplus; + struct orangefs_rename_request_s rename; + struct orangefs_statfs_request_s statfs; + struct orangefs_truncate_request_s truncate; + struct orangefs_ra_cache_flush_request_s ra_cache_flush; + struct orangefs_fs_mount_request_s fs_mount; + struct orangefs_fs_umount_request_s fs_umount; + struct orangefs_getxattr_request_s getxattr; + struct orangefs_setxattr_request_s setxattr; + struct orangefs_listxattr_request_s listxattr; + struct orangefs_removexattr_request_s removexattr; + struct orangefs_op_cancel_s cancel; + struct orangefs_fsync_request_s fsync; + struct orangefs_param_request_s param; + struct orangefs_perf_count_request_s perf_count; + struct orangefs_fs_key_request_s fs_key; + struct orangefs_features_request_s features; + } req; +}; + +#endif /* __UPCALL_H */ diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c new file mode 100644 index 000000000..beafc33d5 --- /dev/null +++ b/fs/orangefs/waitqueue.c @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * (C) 2011 Omnibond Systems + * + * Changes by Acxiom Corporation to implement generic service_operation() + * function, Copyright Acxiom Corporation, 2005. + * + * See COPYING in top-level directory. + */ + +/* + * In-kernel waitqueue operations. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" + +static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, + long timeout, + int flags) + __acquires(op->lock); +static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) + __releases(op->lock); + +/* + * What we do in this function is to walk the list of operations that are + * present in the request queue and mark them as purged. + * NOTE: This is called from the device close after client-core has + * guaranteed that no new operations could appear on the list since the + * client-core is anyway going to exit. + */ +void purge_waiting_ops(void) +{ + struct orangefs_kernel_op_s *op, *tmp; + + spin_lock(&orangefs_request_list_lock); + list_for_each_entry_safe(op, tmp, &orangefs_request_list, list) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "pvfs2-client-core: purging op tag %llu %s\n", + llu(op->tag), + get_opname_string(op)); + set_op_state_purged(op); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: op:%s: op_state:%d: process:%s:\n", + __func__, + get_opname_string(op), + op->op_state, + current->comm); + } + spin_unlock(&orangefs_request_list_lock); +} + +/* + * submits a ORANGEFS operation and waits for it to complete + * + * Note op->downcall.status will contain the status of the operation (in + * errno format), whether provided by pvfs2-client or a result of failure to + * service the operation. If the caller wishes to distinguish, then + * op->state can be checked to see if it was serviced or not. + * + * Returns contents of op->downcall.status for convenience + */ +int service_operation(struct orangefs_kernel_op_s *op, + const char *op_name, + int flags) +{ + long timeout = MAX_SCHEDULE_TIMEOUT; + int ret = 0; + + DEFINE_WAIT(wait_entry); + + op->upcall.tgid = current->tgid; + op->upcall.pid = current->pid; + +retry_servicing: + op->downcall.status = 0; + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: %s op:%p: process:%s: pid:%d:\n", + __func__, + op_name, + op, + current->comm, + current->pid); + + /* + * If ORANGEFS_OP_NO_MUTEX was set in flags, we need to avoid + * acquiring the request_mutex because we're servicing a + * high priority remount operation and the request_mutex is + * already taken. + */ + if (!(flags & ORANGEFS_OP_NO_MUTEX)) { + if (flags & ORANGEFS_OP_INTERRUPTIBLE) + ret = mutex_lock_interruptible(&orangefs_request_mutex); + else + ret = mutex_lock_killable(&orangefs_request_mutex); + /* + * check to see if we were interrupted while waiting for + * mutex + */ + if (ret < 0) { + op->downcall.status = ret; + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: service_operation interrupted.\n", + __func__); + return ret; + } + } + + /* queue up the operation */ + spin_lock(&orangefs_request_list_lock); + spin_lock(&op->lock); + set_op_state_waiting(op); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: op:%s: op_state:%d: process:%s:\n", + __func__, + get_opname_string(op), + op->op_state, + current->comm); + /* add high priority remount op to the front of the line. */ + if (flags & ORANGEFS_OP_PRIORITY) + list_add(&op->list, &orangefs_request_list); + else + list_add_tail(&op->list, &orangefs_request_list); + spin_unlock(&op->lock); + wake_up_interruptible(&orangefs_request_list_waitq); + if (!__is_daemon_in_service()) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s:client core is NOT in service.\n", + __func__); + /* + * Don't wait for the userspace component to return if + * the filesystem is being umounted anyway. + */ + if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT) + timeout = 0; + else + timeout = op_timeout_secs * HZ; + } + spin_unlock(&orangefs_request_list_lock); + + if (!(flags & ORANGEFS_OP_NO_MUTEX)) + mutex_unlock(&orangefs_request_mutex); + + ret = wait_for_matching_downcall(op, timeout, flags); + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: wait_for_matching_downcall returned %d for %p\n", + __func__, + ret, + op); + + /* got matching downcall; make sure status is in errno format */ + if (!ret) { + spin_unlock(&op->lock); + op->downcall.status = + orangefs_normalize_to_errno(op->downcall.status); + ret = op->downcall.status; + goto out; + } + + /* failed to get matching downcall */ + if (ret == -ETIMEDOUT) { + gossip_err("%s: %s -- wait timed out; aborting attempt.\n", + __func__, + op_name); + } + + /* + * remove a waiting op from the request list or + * remove an in-progress op from the in-progress list. + */ + orangefs_clean_up_interrupted_operation(op); + + op->downcall.status = ret; + /* retry if operation has not been serviced and if requested */ + if (ret == -EAGAIN) { + op->attempts++; + timeout = op_timeout_secs * HZ; + gossip_debug(GOSSIP_WAIT_DEBUG, + "orangefs: tag %llu (%s)" + " -- operation to be retried (%d attempt)\n", + llu(op->tag), + op_name, + op->attempts); + + /* + * io ops (ops that use the shared memory buffer) have + * to be returned to their caller for a retry. Other ops + * can just be recycled here. + */ + if (!op->uses_shared_memory) + goto retry_servicing; + } + +out: + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: %s returning: %d for %p.\n", + __func__, + op_name, + ret, + op); + return ret; +} + +/* This can get called on an I/O op if it had a bad service_operation. */ +bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op) +{ + u64 tag = op->tag; + if (!op_state_in_progress(op)) + return false; + + op->slot_to_free = op->upcall.req.io.buf_index; + memset(&op->upcall, 0, sizeof(op->upcall)); + memset(&op->downcall, 0, sizeof(op->downcall)); + op->upcall.type = ORANGEFS_VFS_OP_CANCEL; + op->upcall.req.cancel.op_tag = tag; + op->downcall.type = ORANGEFS_VFS_OP_INVALID; + op->downcall.status = -1; + orangefs_new_tag(op); + + spin_lock(&orangefs_request_list_lock); + /* orangefs_request_list_lock is enough of a barrier here */ + if (!__is_daemon_in_service()) { + spin_unlock(&orangefs_request_list_lock); + return false; + } + spin_lock(&op->lock); + set_op_state_waiting(op); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: op:%s: op_state:%d: process:%s:\n", + __func__, + get_opname_string(op), + op->op_state, + current->comm); + list_add(&op->list, &orangefs_request_list); + spin_unlock(&op->lock); + spin_unlock(&orangefs_request_list_lock); + + gossip_debug(GOSSIP_WAIT_DEBUG, + "Attempting ORANGEFS operation cancellation of tag %llu\n", + llu(tag)); + return true; +} + +/* + * Change an op to the "given up" state and remove it from its list. + */ +static void + orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) + __releases(op->lock) +{ + /* + * handle interrupted cases depending on what state we were in when + * the interruption is detected. + * + * Called with op->lock held. + */ + + /* + * List manipulation code elsewhere will ignore ops that + * have been given up upon. + */ + op->op_state |= OP_VFS_STATE_GIVEN_UP; + + if (list_empty(&op->list)) { + /* caught copying to/from daemon */ + BUG_ON(op_state_serviced(op)); + spin_unlock(&op->lock); + wait_for_completion(&op->waitq); + } else if (op_state_waiting(op)) { + /* + * upcall hasn't been read; remove op from upcall request + * list. + */ + spin_unlock(&op->lock); + spin_lock(&orangefs_request_list_lock); + list_del_init(&op->list); + spin_unlock(&orangefs_request_list_lock); + gossip_debug(GOSSIP_WAIT_DEBUG, + "Interrupted: Removed op %p from request_list\n", + op); + } else if (op_state_in_progress(op)) { + /* op must be removed from the in progress htable */ + spin_unlock(&op->lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); + list_del_init(&op->list); + spin_unlock(&orangefs_htable_ops_in_progress_lock); + gossip_debug(GOSSIP_WAIT_DEBUG, + "Interrupted: Removed op %p" + " from htable_ops_in_progress\n", + op); + } else { + spin_unlock(&op->lock); + gossip_err("interrupted operation is in a weird state 0x%x\n", + op->op_state); + } + reinit_completion(&op->waitq); +} + +/* + * Sleeps on waitqueue waiting for matching downcall. + * If client-core finishes servicing, then we are good to go. + * else if client-core exits, we get woken up here, and retry with a timeout + * + * When this call returns to the caller, the specified op will no + * longer be in either the in_progress hash table or on the request list. + * + * Returns 0 on success and -errno on failure + * Errors are: + * EAGAIN in case we want the caller to requeue and try again.. + * EINTR/EIO/ETIMEDOUT indicating we are done trying to service this + * operation since client-core seems to be exiting too often + * or if we were interrupted. + * + * Returns with op->lock taken. + */ +static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, + long timeout, + int flags) + __acquires(op->lock) +{ + long n; + int writeback = flags & ORANGEFS_OP_WRITEBACK, + interruptible = flags & ORANGEFS_OP_INTERRUPTIBLE; + + /* + * There's a "schedule_timeout" inside of these wait + * primitives, during which the op is out of the hands of the + * user process that needs something done and is being + * manipulated by the client-core process. + */ + if (writeback) + n = wait_for_completion_io_timeout(&op->waitq, timeout); + else if (!writeback && interruptible) + n = wait_for_completion_interruptible_timeout(&op->waitq, + timeout); + else /* !writeback && !interruptible but compiler complains */ + n = wait_for_completion_killable_timeout(&op->waitq, timeout); + + spin_lock(&op->lock); + + if (op_state_serviced(op)) + return 0; + + if (unlikely(n < 0)) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: operation interrupted, tag %llu, %p\n", + __func__, + llu(op->tag), + op); + return -EINTR; + } + if (op_state_purged(op)) { + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: operation purged, tag %llu, %p, %d\n", + __func__, + llu(op->tag), + op, + op->attempts); + return (op->attempts < ORANGEFS_PURGE_RETRY_COUNT) ? + -EAGAIN : + -EIO; + } + /* must have timed out, then... */ + gossip_debug(GOSSIP_WAIT_DEBUG, + "%s: operation timed out, tag %llu, %p, %d)\n", + __func__, + llu(op->tag), + op, + op->attempts); + return -ETIMEDOUT; +} diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c new file mode 100644 index 000000000..9a5b757fb --- /dev/null +++ b/fs/orangefs/xattr.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. + * + * See COPYING in top-level directory. + */ + +/* + * Linux VFS extended attribute operations. + */ + +#include "protocol.h" +#include "orangefs-kernel.h" +#include "orangefs-bufmap.h" +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> +#include <linux/hashtable.h> + +#define SYSTEM_ORANGEFS_KEY "system.pvfs2." +#define SYSTEM_ORANGEFS_KEY_LEN 13 + +/* + * this function returns + * 0 if the key corresponding to name is not meant to be printed as part + * of a listxattr. + * 1 if the key corresponding to name is meant to be returned as part of + * a listxattr. + * The ones that start SYSTEM_ORANGEFS_KEY are the ones to avoid printing. + */ +static int is_reserved_key(const char *key, size_t size) +{ + + if (size < SYSTEM_ORANGEFS_KEY_LEN) + return 1; + + return strncmp(key, SYSTEM_ORANGEFS_KEY, SYSTEM_ORANGEFS_KEY_LEN) ? 1 : 0; +} + +static inline int convert_to_internal_xattr_flags(int setxattr_flags) +{ + int internal_flag = 0; + + if (setxattr_flags & XATTR_REPLACE) { + /* Attribute must exist! */ + internal_flag = ORANGEFS_XATTR_REPLACE; + } else if (setxattr_flags & XATTR_CREATE) { + /* Attribute must not exist */ + internal_flag = ORANGEFS_XATTR_CREATE; + } + return internal_flag; +} + +static unsigned int xattr_key(const char *key) +{ + unsigned int i = 0; + while (key) + i += *key++; + return i % 16; +} + +static struct orangefs_cached_xattr *find_cached_xattr(struct inode *inode, + const char *key) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_cached_xattr *cx; + struct hlist_head *h; + struct hlist_node *tmp; + h = &orangefs_inode->xattr_cache[xattr_key(key)]; + if (hlist_empty(h)) + return NULL; + hlist_for_each_entry_safe(cx, tmp, h, node) { +/* if (!time_before(jiffies, cx->timeout)) { + hlist_del(&cx->node); + kfree(cx); + continue; + }*/ + if (!strcmp(cx->key, key)) + return cx; + } + return NULL; +} + +/* + * Tries to get a specified key's attributes of a given + * file into a user-specified buffer. Note that the getxattr + * interface allows for the users to probe the size of an + * extended attribute by passing in a value of 0 to size. + * Thus our return value is always the size of the attribute + * unless the key does not exist for the file and/or if + * there were errors in fetching the attribute value. + */ +ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op = NULL; + struct orangefs_cached_xattr *cx; + ssize_t ret = -ENOMEM; + ssize_t length = 0; + int fsuid; + int fsgid; + + gossip_debug(GOSSIP_XATTR_DEBUG, + "%s: name %s, buffer_size %zd\n", + __func__, name, size); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + + fsuid = from_kuid(&init_user_ns, current_fsuid()); + fsgid = from_kgid(&init_user_ns, current_fsgid()); + + gossip_debug(GOSSIP_XATTR_DEBUG, + "getxattr on inode %pU, name %s " + "(uid %o, gid %o)\n", + get_khandle_from_ino(inode), + name, + fsuid, + fsgid); + + down_read(&orangefs_inode->xattr_sem); + + cx = find_cached_xattr(inode, name); + if (cx && time_before(jiffies, cx->timeout)) { + if (cx->length == -1) { + ret = -ENODATA; + goto out_unlock; + } else { + if (size == 0) { + ret = cx->length; + goto out_unlock; + } + if (cx->length > size) { + ret = -ERANGE; + goto out_unlock; + } + memcpy(buffer, cx->val, cx->length); + memset(buffer + cx->length, 0, size - cx->length); + ret = cx->length; + goto out_unlock; + } + } + + new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR); + if (!new_op) + goto out_unlock; + + new_op->upcall.req.getxattr.refn = orangefs_inode->refn; + strcpy(new_op->upcall.req.getxattr.key, name); + + /* + * NOTE: Although keys are meant to be NULL terminated textual + * strings, I am going to explicitly pass the length just in case + * we change this later on... + */ + new_op->upcall.req.getxattr.key_sz = strlen(name) + 1; + + ret = service_operation(new_op, "orangefs_inode_getxattr", + get_interruptible_flag(inode)); + if (ret != 0) { + if (ret == -ENOENT) { + ret = -ENODATA; + gossip_debug(GOSSIP_XATTR_DEBUG, + "orangefs_inode_getxattr: inode %pU key %s" + " does not exist!\n", + get_khandle_from_ino(inode), + (char *)new_op->upcall.req.getxattr.key); + cx = kmalloc(sizeof *cx, GFP_KERNEL); + if (cx) { + strcpy(cx->key, name); + cx->length = -1; + cx->timeout = jiffies + + orangefs_getattr_timeout_msecs*HZ/1000; + hash_add(orangefs_inode->xattr_cache, &cx->node, + xattr_key(cx->key)); + } + } + goto out_release_op; + } + + /* + * Length returned includes null terminator. + */ + length = new_op->downcall.resp.getxattr.val_sz; + + /* + * Just return the length of the queried attribute. + */ + if (size == 0) { + ret = length; + goto out_release_op; + } + + /* + * Check to see if key length is > provided buffer size. + */ + if (length > size) { + ret = -ERANGE; + goto out_release_op; + } + + memcpy(buffer, new_op->downcall.resp.getxattr.val, length); + memset(buffer + length, 0, size - length); + gossip_debug(GOSSIP_XATTR_DEBUG, + "orangefs_inode_getxattr: inode %pU " + "key %s key_sz %d, val_len %d\n", + get_khandle_from_ino(inode), + (char *)new_op-> + upcall.req.getxattr.key, + (int)new_op-> + upcall.req.getxattr.key_sz, + (int)ret); + + ret = length; + + if (cx) { + strcpy(cx->key, name); + memcpy(cx->val, buffer, length); + cx->length = length; + cx->timeout = jiffies + HZ; + } else { + cx = kmalloc(sizeof *cx, GFP_KERNEL); + if (cx) { + strcpy(cx->key, name); + memcpy(cx->val, buffer, length); + cx->length = length; + cx->timeout = jiffies + HZ; + hash_add(orangefs_inode->xattr_cache, &cx->node, + xattr_key(cx->key)); + } + } + +out_release_op: + op_release(new_op); +out_unlock: + up_read(&orangefs_inode->xattr_sem); + return ret; +} + +static int orangefs_inode_removexattr(struct inode *inode, const char *name, + int flags) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op = NULL; + struct orangefs_cached_xattr *cx; + struct hlist_head *h; + struct hlist_node *tmp; + int ret = -ENOMEM; + + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + + down_write(&orangefs_inode->xattr_sem); + new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR); + if (!new_op) + goto out_unlock; + + new_op->upcall.req.removexattr.refn = orangefs_inode->refn; + /* + * NOTE: Although keys are meant to be NULL terminated + * textual strings, I am going to explicitly pass the + * length just in case we change this later on... + */ + strcpy(new_op->upcall.req.removexattr.key, name); + new_op->upcall.req.removexattr.key_sz = strlen(name) + 1; + + gossip_debug(GOSSIP_XATTR_DEBUG, + "orangefs_inode_removexattr: key %s, key_sz %d\n", + (char *)new_op->upcall.req.removexattr.key, + (int)new_op->upcall.req.removexattr.key_sz); + + ret = service_operation(new_op, + "orangefs_inode_removexattr", + get_interruptible_flag(inode)); + if (ret == -ENOENT) { + /* + * Request to replace a non-existent attribute is an error. + */ + if (flags & XATTR_REPLACE) + ret = -ENODATA; + else + ret = 0; + } + + gossip_debug(GOSSIP_XATTR_DEBUG, + "orangefs_inode_removexattr: returning %d\n", ret); + + op_release(new_op); + + h = &orangefs_inode->xattr_cache[xattr_key(name)]; + hlist_for_each_entry_safe(cx, tmp, h, node) { + if (!strcmp(cx->key, name)) { + hlist_del(&cx->node); + kfree(cx); + break; + } + } + +out_unlock: + up_write(&orangefs_inode->xattr_sem); + return ret; +} + +/* + * Tries to set an attribute for a given key on a file. + * + * Returns a -ve number on error and 0 on success. Key is text, but value + * can be binary! + */ +int orangefs_inode_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int internal_flag = 0; + struct orangefs_cached_xattr *cx; + struct hlist_head *h; + struct hlist_node *tmp; + int ret = -ENOMEM; + + gossip_debug(GOSSIP_XATTR_DEBUG, + "%s: name %s, buffer_size %zd\n", + __func__, name, size); + + if (size > ORANGEFS_MAX_XATTR_VALUELEN) + return -EINVAL; + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + + internal_flag = convert_to_internal_xattr_flags(flags); + + /* This is equivalent to a removexattr */ + if (size == 0 && !value) { + gossip_debug(GOSSIP_XATTR_DEBUG, + "removing xattr (%s)\n", + name); + return orangefs_inode_removexattr(inode, name, flags); + } + + gossip_debug(GOSSIP_XATTR_DEBUG, + "setxattr on inode %pU, name %s\n", + get_khandle_from_ino(inode), + name); + + down_write(&orangefs_inode->xattr_sem); + new_op = op_alloc(ORANGEFS_VFS_OP_SETXATTR); + if (!new_op) + goto out_unlock; + + + new_op->upcall.req.setxattr.refn = orangefs_inode->refn; + new_op->upcall.req.setxattr.flags = internal_flag; + /* + * NOTE: Although keys are meant to be NULL terminated textual + * strings, I am going to explicitly pass the length just in + * case we change this later on... + */ + strcpy(new_op->upcall.req.setxattr.keyval.key, name); + new_op->upcall.req.setxattr.keyval.key_sz = strlen(name) + 1; + memcpy(new_op->upcall.req.setxattr.keyval.val, value, size); + new_op->upcall.req.setxattr.keyval.val_sz = size; + + gossip_debug(GOSSIP_XATTR_DEBUG, + "orangefs_inode_setxattr: key %s, key_sz %d " + " value size %zd\n", + (char *)new_op->upcall.req.setxattr.keyval.key, + (int)new_op->upcall.req.setxattr.keyval.key_sz, + size); + + ret = service_operation(new_op, + "orangefs_inode_setxattr", + get_interruptible_flag(inode)); + + gossip_debug(GOSSIP_XATTR_DEBUG, + "orangefs_inode_setxattr: returning %d\n", + ret); + + /* when request is serviced properly, free req op struct */ + op_release(new_op); + + h = &orangefs_inode->xattr_cache[xattr_key(name)]; + hlist_for_each_entry_safe(cx, tmp, h, node) { + if (!strcmp(cx->key, name)) { + hlist_del(&cx->node); + kfree(cx); + break; + } + } + +out_unlock: + up_write(&orangefs_inode->xattr_sem); + return ret; +} + +/* + * Tries to get a specified object's keys into a user-specified buffer of a + * given size. Note that like the previous instances of xattr routines, this + * also allows you to pass in a NULL pointer and 0 size to probe the size for + * subsequent memory allocations. Thus our return value is always the size of + * all the keys unless there were errors in fetching the keys! + */ +ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + __u64 token = ORANGEFS_ITERATE_START; + ssize_t ret = -ENOMEM; + ssize_t total = 0; + int count_keys = 0; + int key_size; + int i = 0; + int returned_count = 0; + + if (size > 0 && !buffer) { + gossip_err("%s: bogus NULL pointers\n", __func__); + return -EINVAL; + } + + down_read(&orangefs_inode->xattr_sem); + new_op = op_alloc(ORANGEFS_VFS_OP_LISTXATTR); + if (!new_op) + goto out_unlock; + + if (buffer && size > 0) + memset(buffer, 0, size); + +try_again: + key_size = 0; + new_op->upcall.req.listxattr.refn = orangefs_inode->refn; + new_op->upcall.req.listxattr.token = token; + new_op->upcall.req.listxattr.requested_count = + (size == 0) ? 0 : ORANGEFS_MAX_XATTR_LISTLEN; + ret = service_operation(new_op, __func__, + get_interruptible_flag(inode)); + if (ret != 0) + goto done; + + if (size == 0) { + /* + * This is a bit of a big upper limit, but I did not want to + * spend too much time getting this correct, since users end + * up allocating memory rather than us... + */ + total = new_op->downcall.resp.listxattr.returned_count * + ORANGEFS_MAX_XATTR_NAMELEN; + goto done; + } + + returned_count = new_op->downcall.resp.listxattr.returned_count; + if (returned_count < 0 || + returned_count > ORANGEFS_MAX_XATTR_LISTLEN) { + gossip_err("%s: impossible value for returned_count:%d:\n", + __func__, + returned_count); + ret = -EIO; + goto done; + } + + /* + * Check to see how much can be fit in the buffer. Fit only whole keys. + */ + for (i = 0; i < returned_count; i++) { + if (new_op->downcall.resp.listxattr.lengths[i] < 0 || + new_op->downcall.resp.listxattr.lengths[i] > + ORANGEFS_MAX_XATTR_NAMELEN) { + gossip_err("%s: impossible value for lengths[%d]\n", + __func__, + new_op->downcall.resp.listxattr.lengths[i]); + ret = -EIO; + goto done; + } + if (total + new_op->downcall.resp.listxattr.lengths[i] > size) + goto done; + + /* + * Since many dumb programs try to setxattr() on our reserved + * xattrs this is a feeble attempt at defeating those by not + * listing them in the output of listxattr.. sigh + */ + if (is_reserved_key(new_op->downcall.resp.listxattr.key + + key_size, + new_op->downcall.resp. + listxattr.lengths[i])) { + gossip_debug(GOSSIP_XATTR_DEBUG, "Copying key %d -> %s\n", + i, new_op->downcall.resp.listxattr.key + + key_size); + memcpy(buffer + total, + new_op->downcall.resp.listxattr.key + key_size, + new_op->downcall.resp.listxattr.lengths[i]); + total += new_op->downcall.resp.listxattr.lengths[i]; + count_keys++; + } else { + gossip_debug(GOSSIP_XATTR_DEBUG, "[RESERVED] key %d -> %s\n", + i, new_op->downcall.resp.listxattr.key + + key_size); + } + key_size += new_op->downcall.resp.listxattr.lengths[i]; + } + + /* + * Since the buffer was large enough, we might have to continue + * fetching more keys! + */ + token = new_op->downcall.resp.listxattr.token; + if (token != ORANGEFS_ITERATE_END) + goto try_again; + +done: + gossip_debug(GOSSIP_XATTR_DEBUG, "%s: returning %d" + " [size of buffer %ld] (filled in %d keys)\n", + __func__, + ret ? (int)ret : (int)total, + (long)size, + count_keys); + op_release(new_op); + if (ret == 0) + ret = total; +out_unlock: + up_read(&orangefs_inode->xattr_sem); + return ret; +} + +static int orangefs_xattr_set_default(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, + struct inode *inode, + const char *name, + const void *buffer, + size_t size, + int flags) +{ + return orangefs_inode_setxattr(inode, name, buffer, size, flags); +} + +static int orangefs_xattr_get_default(const struct xattr_handler *handler, + struct dentry *unused, + struct inode *inode, + const char *name, + void *buffer, + size_t size) +{ + return orangefs_inode_getxattr(inode, name, buffer, size); + +} + +static const struct xattr_handler orangefs_xattr_default_handler = { + .prefix = "", /* match any name => handlers called with full name */ + .get = orangefs_xattr_get_default, + .set = orangefs_xattr_set_default, +}; + +const struct xattr_handler *orangefs_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &orangefs_xattr_default_handler, + NULL +}; |