diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
commit | 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch) | |
tree | 848558de17fb3008cdf4d861b01ac7781903ce39 /fs/overlayfs | |
parent | Initial commit. (diff) | |
download | linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip |
Adding upstream version 6.1.76.upstream/6.1.76upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/overlayfs')
-rw-r--r-- | fs/overlayfs/Kconfig | 126 | ||||
-rw-r--r-- | fs/overlayfs/Makefile | 9 | ||||
-rw-r--r-- | fs/overlayfs/copy_up.c | 1108 | ||||
-rw-r--r-- | fs/overlayfs/dir.c | 1331 | ||||
-rw-r--r-- | fs/overlayfs/export.c | 876 | ||||
-rw-r--r-- | fs/overlayfs/file.c | 716 | ||||
-rw-r--r-- | fs/overlayfs/inode.c | 1285 | ||||
-rw-r--r-- | fs/overlayfs/namei.c | 1203 | ||||
-rw-r--r-- | fs/overlayfs/overlayfs.h | 698 | ||||
-rw-r--r-- | fs/overlayfs/ovl_entry.h | 160 | ||||
-rw-r--r-- | fs/overlayfs/readdir.c | 1235 | ||||
-rw-r--r-- | fs/overlayfs/super.c | 2244 | ||||
-rw-r--r-- | fs/overlayfs/util.c | 1136 |
13 files changed, 12127 insertions, 0 deletions
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000..dd188c799 --- /dev/null +++ b/fs/overlayfs/Kconfig @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: GPL-2.0-only +config OVERLAY_FS + tristate "Overlay filesystem support" + select EXPORTFS + help + An overlay filesystem combines two filesystems - an 'upper' filesystem + and a 'lower' filesystem. When a name exists in both filesystems, the + object in the 'upper' filesystem is visible while the object in the + 'lower' filesystem is either hidden or, in the case of directories, + merged with the 'upper' object. + + For more information see Documentation/filesystems/overlayfs.rst + +config OVERLAY_FS_REDIRECT_DIR + bool "Overlayfs: turn on redirect directory feature by default" + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + redirects when renaming directories by default. In this case it is + still possible to turn off redirects globally with the + "redirect_dir=off" module option or on a filesystem instance basis + with the "redirect_dir=off" mount option. + + Note, that redirects are not backward compatible. That is, mounting + an overlay which has redirects on a kernel that doesn't support this + feature will have unexpected results. + + If unsure, say N. + +config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW + bool "Overlayfs: follow redirects even if redirects are turned off" + default y + depends on OVERLAY_FS + help + Disable this to get a possibly more secure configuration, but that + might not be backward compatible with previous kernels. + + If backward compatibility is not an issue, then it is safe and + recommended to say N here. + + For more information, see Documentation/filesystems/overlayfs.rst + + If unsure, say Y. + +config OVERLAY_FS_INDEX + bool "Overlayfs: turn on inodes index feature by default" + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + the index directory to map lower inodes to upper inodes by default. + In this case it is still possible to turn off index globally with the + "index=off" module option or on a filesystem instance basis with the + "index=off" mount option. + + The inodes index feature prevents breaking of lower hardlinks on copy + up. + + Note, that the inodes index feature is not backward compatible. + That is, mounting an overlay which has an inodes index on a kernel + that doesn't support this feature will have unexpected results. + + If unsure, say N. + +config OVERLAY_FS_NFS_EXPORT + bool "Overlayfs: turn on NFS export feature by default" + depends on OVERLAY_FS + depends on OVERLAY_FS_INDEX + depends on !OVERLAY_FS_METACOPY + help + If this config option is enabled then overlay filesystems will use + the index directory to decode overlay NFS file handles by default. + In this case, it is still possible to turn off NFS export support + globally with the "nfs_export=off" module option or on a filesystem + instance basis with the "nfs_export=off" mount option. + + The NFS export feature creates an index on copy up of every file and + directory. This full index is used to detect overlay filesystems + inconsistencies on lookup, like redirect from multiple upper dirs to + the same lower dir. The full index may incur some overhead on mount + time, especially when verifying that directory file handles are not + stale. + + Note, that the NFS export feature is not backward compatible. + That is, mounting an overlay which has a full index on a kernel + that doesn't support this feature will have unexpected results. + + Most users should say N here and enable this feature on a case-by- + case basis with the "nfs_export=on" mount option. + + Say N unless you fully understand the consequences. + +config OVERLAY_FS_XINO_AUTO + bool "Overlayfs: auto enable inode number mapping" + default n + depends on OVERLAY_FS + depends on 64BIT + help + If this config option is enabled then overlay filesystems will use + unused high bits in undelying filesystem inode numbers to map all + inodes to a unified address space. The mapped 64bit inode numbers + might not be compatible with applications that expect 32bit inodes. + + If compatibility with applications that expect 32bit inodes is not an + issue, then it is safe and recommended to say Y here. + + For more information, see Documentation/filesystems/overlayfs.rst + + If unsure, say N. + +config OVERLAY_FS_METACOPY + bool "Overlayfs: turn on metadata only copy up feature by default" + depends on OVERLAY_FS + select OVERLAY_FS_REDIRECT_DIR + help + If this config option is enabled then overlay filesystems will + copy up only metadata where appropriate and data copy up will + happen when a file is opened for WRITE operation. It is still + possible to turn off this feature globally with the "metacopy=off" + module option or on a filesystem instance basis with the + "metacopy=off" mount option. + + Note, that this feature is not backward compatible. That is, + mounting an overlay which has metacopy only inodes on a kernel + that doesn't support this feature will have unexpected results. + + If unsure, say N. diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000..9164c585e --- /dev/null +++ b/fs/overlayfs/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAY_FS) += overlay.o + +overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \ + copy_up.o export.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000..86d4b6975 --- /dev/null +++ b/fs/overlayfs/copy_up.c @@ -0,0 +1,1108 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (C) 2011 Novell Inc. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/fileattr.h> +#include <linux/splice.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/uaccess.h> +#include <linux/sched/signal.h> +#include <linux/cred.h> +#include <linux/namei.h> +#include <linux/fdtable.h> +#include <linux/ratelimit.h> +#include <linux/exportfs.h> +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +static int ovl_ccup_set(const char *buf, const struct kernel_param *param) +{ + pr_warn("\"check_copy_up\" module option is obsolete\n"); + return 0; +} + +static int ovl_ccup_get(char *buf, const struct kernel_param *param) +{ + return sprintf(buf, "N\n"); +} + +module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644); +MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing"); + +static bool ovl_must_copy_xattr(const char *name) +{ + return !strcmp(name, XATTR_POSIX_ACL_ACCESS) || + !strcmp(name, XATTR_POSIX_ACL_DEFAULT) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); +} + +int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new) +{ + struct dentry *old = oldpath->dentry; + ssize_t list_size, size, value_size = 0; + char *buf, *name, *value = NULL; + int error = 0; + size_t slen; + + if (!(old->d_inode->i_opflags & IOP_XATTR) || + !(new->d_inode->i_opflags & IOP_XATTR)) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kvzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out; + } + + for (name = buf; list_size; name += slen) { + slen = strnlen(name, list_size) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > list_size)) { + error = -EIO; + break; + } + list_size -= slen; + + if (ovl_is_private_xattr(sb, name)) + continue; + + error = security_inode_copy_up_xattr(name); + if (error < 0 && error != -EOPNOTSUPP) + break; + if (error == 1) { + error = 0; + continue; /* Discard */ + } +retry: + size = ovl_do_getxattr(oldpath, name, value, value_size); + if (size == -ERANGE) + size = ovl_do_getxattr(oldpath, name, NULL, 0); + + if (size < 0) { + error = size; + break; + } + + if (size > value_size) { + void *new; + + new = kvmalloc(size, GFP_KERNEL); + if (!new) { + error = -ENOMEM; + break; + } + kvfree(value); + value = new; + value_size = size; + goto retry; + } + + error = ovl_do_setxattr(OVL_FS(sb), new, name, value, size, 0); + if (error) { + if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) + break; + + /* Ignore failure to copy unknown xattrs */ + error = 0; + } + } + kvfree(value); +out: + kvfree(buf); + return error; +} + +static int ovl_copy_fileattr(struct inode *inode, const struct path *old, + const struct path *new) +{ + struct fileattr oldfa = { .flags_valid = true }; + struct fileattr newfa = { .flags_valid = true }; + int err; + + err = ovl_real_fileattr_get(old, &oldfa); + if (err) { + /* Ntfs-3g returns -EINVAL for "no fileattr support" */ + if (err == -ENOTTY || err == -EINVAL) + return 0; + pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n", + old->dentry, err); + return err; + } + + /* + * We cannot set immutable and append-only flags on upper inode, + * because we would not be able to link upper inode to upper dir + * not set overlay private xattr on upper inode. + * Store these flags in overlay.protattr xattr instead. + */ + if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) { + err = ovl_set_protattr(inode, new->dentry, &oldfa); + if (err == -EPERM) + pr_warn_once("copying fileattr: no xattr on upper\n"); + else if (err) + return err; + } + + /* Don't bother copying flags if none are set */ + if (!(oldfa.flags & OVL_COPY_FS_FLAGS_MASK)) + return 0; + + err = ovl_real_fileattr_get(new, &newfa); + if (err) { + /* + * Returning an error if upper doesn't support fileattr will + * result in a regression, so revert to the old behavior. + */ + if (err == -ENOTTY || err == -EINVAL) { + pr_warn_once("copying fileattr: no support on upper\n"); + return 0; + } + pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n", + new->dentry, err); + return err; + } + + BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL); + newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK; + newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK); + + BUILD_BUG_ON(OVL_COPY_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); + newfa.fsx_xflags &= ~OVL_COPY_FSX_FLAGS_MASK; + newfa.fsx_xflags |= (oldfa.fsx_xflags & OVL_COPY_FSX_FLAGS_MASK); + + return ovl_real_fileattr_set(new, &newfa); +} + +static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, + struct file *new_file, loff_t len) +{ + struct path datapath; + struct file *old_file; + loff_t old_pos = 0; + loff_t new_pos = 0; + loff_t cloned; + loff_t data_pos = -1; + loff_t hole_len; + bool skip_hole = false; + int error = 0; + + ovl_path_lowerdata(dentry, &datapath); + if (WARN_ON(datapath.dentry == NULL)) + return -EIO; + + old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + /* Try to use clone_file_range to clone up within the same fs */ + cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); + if (cloned == len) + goto out_fput; + /* Couldn't clone, so now we try to copy the data */ + + /* Check if lower fs supports seek operation */ + if (old_file->f_mode & FMODE_LSEEK) + skip_hole = true; + + while (len) { + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + /* + * Fill zero for hole will cost unnecessary disk space + * and meanwhile slow down the copy-up speed, so we do + * an optimization for hole during copy-up, it relies + * on SEEK_DATA implementation in lower fs so if lower + * fs does not support it, copy-up will behave as before. + * + * Detail logic of hole detection as below: + * When we detect next data position is larger than current + * position we will skip that hole, otherwise we copy + * data in the size of OVL_COPY_UP_CHUNK_SIZE. Actually, + * it may not recognize all kind of holes and sometimes + * only skips partial of hole area. However, it will be + * enough for most of the use cases. + */ + + if (skip_hole && data_pos < old_pos) { + data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA); + if (data_pos > old_pos) { + hole_len = data_pos - old_pos; + len -= hole_len; + old_pos = new_pos = data_pos; + continue; + } else if (data_pos == -ENXIO) { + break; + } else if (data_pos < 0) { + skip_hole = false; + } + } + + bytes = do_splice_direct(old_file, &old_pos, + new_file, &new_pos, + this_len, SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + WARN_ON(old_pos != new_pos); + + len -= bytes; + } + if (!error && ovl_should_sync(ofs)) + error = vfs_fsync(new_file, 0); +out_fput: + fput(old_file); + return error; +} + +static int ovl_set_size(struct ovl_fs *ofs, + struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = ATTR_SIZE, + .ia_size = stat->size, + }; + + return ovl_do_notify_change(ofs, upperdentry, &attr); +} + +static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry, + struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return ovl_do_notify_change(ofs, upperdentry, &attr); +} + +int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, + struct kstat *stat) +{ + int err = 0; + + if (!S_ISLNK(stat->mode)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + err = ovl_do_notify_change(ofs, upperdentry, &attr); + } + if (!err) { + struct iattr attr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_vfsuid = VFSUIDT_INIT(stat->uid), + .ia_vfsgid = VFSGIDT_INIT(stat->gid), + }; + err = ovl_do_notify_change(ofs, upperdentry, &attr); + } + if (!err) + ovl_set_timestamps(ofs, upperdentry, stat); + + return err; +} + +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, + bool is_upper) +{ + struct ovl_fh *fh; + int fh_type, dwords; + int buflen = MAX_HANDLE_SZ; + uuid_t *uuid = &real->d_sb->s_uuid; + int err; + + /* Make sure the real fid stays 32bit aligned */ + BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4); + BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255); + + fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + /* + * We encode a non-connectable file handle for non-dir, because we + * only need to find the lower inode number and we don't want to pay + * the price or reconnecting the dentry. + */ + dwords = buflen >> 2; + fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); + buflen = (dwords << 2); + + err = -EIO; + if (WARN_ON(fh_type < 0) || + WARN_ON(buflen > MAX_HANDLE_SZ) || + WARN_ON(fh_type == FILEID_INVALID)) + goto out_err; + + fh->fb.version = OVL_FH_VERSION; + fh->fb.magic = OVL_FH_MAGIC; + fh->fb.type = fh_type; + fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN; + /* + * When we will want to decode an overlay dentry from this handle + * and all layers are on the same fs, if we get a disconncted real + * dentry when we decode fid, the only way to tell if we should assign + * it to upperdentry or to lowerstack is by checking this flag. + */ + if (is_upper) + fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; + fh->fb.len = sizeof(fh->fb) + buflen; + if (ofs->config.uuid) + fh->fb.uuid = *uuid; + + return fh; + +out_err: + kfree(fh); + return ERR_PTR(err); +} + +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, + struct dentry *upper) +{ + const struct ovl_fh *fh = NULL; + int err; + + /* + * When lower layer doesn't support export operations store a 'null' fh, + * so we can use the overlay.origin xattr to distignuish between a copy + * up and a pure upper inode. + */ + if (ovl_can_decode_fh(lower->d_sb)) { + fh = ovl_encode_real_fh(ofs, lower, false); + if (IS_ERR(fh)) + return PTR_ERR(fh); + } + + /* + * Do not fail when upper doesn't support xattrs. + */ + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf, + fh ? fh->fb.len : 0, 0); + kfree(fh); + + /* Ignore -EPERM from setting "user.*" on symlink/special */ + return err == -EPERM ? 0 : err; +} + +/* Store file handle of @upper dir in @index dir entry */ +static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *index) +{ + const struct ovl_fh *fh; + int err; + + fh = ovl_encode_real_fh(ofs, upper, true); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + err = ovl_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len); + + kfree(fh); + return err; +} + +/* + * Create and install index entry. + * + * Caller must hold i_mutex on indexdir. + */ +static int ovl_create_index(struct dentry *dentry, struct dentry *origin, + struct dentry *upper) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *indexdir = ovl_indexdir(dentry->d_sb); + struct inode *dir = d_inode(indexdir); + struct dentry *index = NULL; + struct dentry *temp = NULL; + struct qstr name = { }; + int err; + + /* + * For now this is only used for creating index entry for directories, + * because non-dir are copied up directly to index and then hardlinked + * to upper dir. + * + * TODO: implement create index for non-dir, so we can call it when + * encoding file handle for non-dir in case index does not exist. + */ + if (WARN_ON(!d_is_dir(dentry))) + return -EIO; + + /* Directory not expected to be indexed before copy up */ + if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry)))) + return -EIO; + + err = ovl_get_index_name(ofs, origin, &name); + if (err) + return err; + + temp = ovl_create_temp(ofs, indexdir, OVL_CATTR(S_IFDIR | 0)); + err = PTR_ERR(temp); + if (IS_ERR(temp)) + goto free_name; + + err = ovl_set_upper_fh(ofs, upper, temp); + if (err) + goto out; + + index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); + if (IS_ERR(index)) { + err = PTR_ERR(index); + } else { + err = ovl_do_rename(ofs, dir, temp, dir, index, 0); + dput(index); + } +out: + if (err) + ovl_cleanup(ofs, dir, temp); + dput(temp); +free_name: + kfree(name.name); + return err; +} + +struct ovl_copy_up_ctx { + struct dentry *parent; + struct dentry *dentry; + struct path lowerpath; + struct kstat stat; + struct kstat pstat; + const char *link; + struct dentry *destdir; + struct qstr destname; + struct dentry *workdir; + bool origin; + bool indexed; + bool metacopy; +}; + +static int ovl_link_up(struct ovl_copy_up_ctx *c) +{ + int err; + struct dentry *upper; + struct dentry *upperdir = ovl_dentry_upper(c->parent); + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *udir = d_inode(upperdir); + + /* Mark parent "impure" because it may now contain non-pure upper */ + err = ovl_set_impure(c->parent, upperdir); + if (err) + return err; + + err = ovl_set_nlink_lower(c->dentry); + if (err) + return err; + + inode_lock_nested(udir, I_MUTEX_PARENT); + upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, + c->dentry->d_name.len); + err = PTR_ERR(upper); + if (!IS_ERR(upper)) { + err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); + dput(upper); + + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(ofs, upperdir, &c->pstat); + ovl_dentry_set_upper_alias(c->dentry); + ovl_dentry_update_reval(c->dentry, upper); + } + } + inode_unlock(udir); + if (err) + return err; + + err = ovl_set_nlink_upper(c->dentry); + + return err; +} + +static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct file *new_file; + int err; + + if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size) + return 0; + + new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY); + if (IS_ERR(new_file)) + return PTR_ERR(new_file); + + err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size); + fput(new_file); + + return err; +} + +static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *inode = d_inode(c->dentry); + struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp }; + int err; + + err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp); + if (err) + return err; + + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { + /* + * Copy the fileattr inode flags that are the source of already + * copied i_flags + */ + err = ovl_copy_fileattr(inode, &c->lowerpath, &upperpath); + if (err) + return err; + } + + /* + * Store identifier of lower inode in upper inode xattr to + * allow lookup of the copy up origin inode. + * + * Don't set origin when we are breaking the association with a lower + * hard link. + */ + if (c->origin) { + err = ovl_set_origin(ofs, c->lowerpath.dentry, temp); + if (err) + return err; + } + + if (c->metacopy) { + err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY, + NULL, 0, -EOPNOTSUPP); + if (err) + return err; + } + + inode_lock(temp->d_inode); + if (S_ISREG(c->stat.mode)) + err = ovl_set_size(ofs, temp, &c->stat); + if (!err) + err = ovl_set_attr(ofs, temp, &c->stat); + inode_unlock(temp->d_inode); + + return err; +} + +struct ovl_cu_creds { + const struct cred *old; + struct cred *new; +}; + +static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) +{ + int err; + + cc->old = cc->new = NULL; + err = security_inode_copy_up(dentry, &cc->new); + if (err < 0) + return err; + + if (cc->new) + cc->old = override_creds(cc->new); + + return 0; +} + +static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) +{ + if (cc->new) { + revert_creds(cc->old); + put_cred(cc->new); + } +} + +/* + * Copyup using workdir to prepare temp file. Used when copying up directories, + * special files or when upper fs doesn't support O_TMPFILE. + */ +static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *inode; + struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct path path = { .mnt = ovl_upper_mnt(ofs) }; + struct dentry *temp, *upper; + struct ovl_cu_creds cc; + int err; + struct ovl_cattr cattr = { + /* Can't properly set mode on creation because of the umask */ + .mode = c->stat.mode & S_IFMT, + .rdev = c->stat.rdev, + .link = c->link + }; + + /* workdir and destdir could be the same when copying up to indexdir */ + err = -EIO; + if (lock_rename(c->workdir, c->destdir) != NULL) + goto unlock; + + err = ovl_prep_cu_creds(c->dentry, &cc); + if (err) + goto unlock; + + temp = ovl_create_temp(ofs, c->workdir, &cattr); + ovl_revert_cu_creds(&cc); + + err = PTR_ERR(temp); + if (IS_ERR(temp)) + goto unlock; + + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + path.dentry = temp; + err = ovl_copy_up_data(c, &path); + if (err) + goto cleanup; + + err = ovl_copy_up_metadata(c, temp); + if (err) + goto cleanup; + + if (S_ISDIR(c->stat.mode) && c->indexed) { + err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); + if (err) + goto cleanup; + } + + upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, + c->destname.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto cleanup; + + err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0); + dput(upper); + if (err) + goto cleanup; + + if (!c->metacopy) + ovl_set_upperdata(d_inode(c->dentry)); + inode = d_inode(c->dentry); + ovl_inode_update(inode, temp); + if (S_ISDIR(inode->i_mode)) + ovl_set_flag(OVL_WHITEOUTS, inode); +unlock: + unlock_rename(c->workdir, c->destdir); + + return err; + +cleanup: + ovl_cleanup(ofs, wdir, temp); + dput(temp); + goto unlock; +} + +/* Copyup using O_TMPFILE which does not require cross dir locking */ +static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *udir = d_inode(c->destdir); + struct dentry *temp, *upper; + struct file *tmpfile; + struct ovl_cu_creds cc; + int err; + + err = ovl_prep_cu_creds(c->dentry, &cc); + if (err) + return err; + + tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + ovl_revert_cu_creds(&cc); + + if (IS_ERR(tmpfile)) + return PTR_ERR(tmpfile); + + temp = tmpfile->f_path.dentry; + if (!c->metacopy && c->stat.size) { + err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); + if (err) + goto out_fput; + } + + err = ovl_copy_up_metadata(c, temp); + if (err) + goto out_fput; + + inode_lock_nested(udir, I_MUTEX_PARENT); + + upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, + c->destname.len); + err = PTR_ERR(upper); + if (!IS_ERR(upper)) { + err = ovl_do_link(ofs, temp, udir, upper); + dput(upper); + } + inode_unlock(udir); + + if (err) + goto out_fput; + + if (!c->metacopy) + ovl_set_upperdata(d_inode(c->dentry)); + ovl_inode_update(d_inode(c->dentry), dget(temp)); + +out_fput: + fput(tmpfile); + return err; +} + +/* + * Copy up a single dentry + * + * All renames start with copy up of source if necessary. The actual + * rename will only proceed once the copy up was successful. Copy up uses + * upper parent i_mutex for exclusion. Since rename can change d_parent it + * is possible that the copy up will lock the old parent. At that point + * the file will have already been copied up anyway. + */ +static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) +{ + int err; + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + bool to_index = false; + + /* + * Indexed non-dir is copied up directly to the index entry and then + * hardlinked to upper dir. Indexed dir is copied up to indexdir, + * then index entry is created and then copied up dir installed. + * Copying dir up to indexdir instead of workdir simplifies locking. + */ + if (ovl_need_index(c->dentry)) { + c->indexed = true; + if (S_ISDIR(c->stat.mode)) + c->workdir = ovl_indexdir(c->dentry->d_sb); + else + to_index = true; + } + + if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) + c->origin = true; + + if (to_index) { + c->destdir = ovl_indexdir(c->dentry->d_sb); + err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname); + if (err) + return err; + } else if (WARN_ON(!c->parent)) { + /* Disconnected dentry must be copied up to index dir */ + return -EIO; + } else { + /* + * Mark parent "impure" because it may now contain non-pure + * upper + */ + err = ovl_set_impure(c->parent, c->destdir); + if (err) + return err; + } + + /* Should we copyup with O_TMPFILE or with workdir? */ + if (S_ISREG(c->stat.mode) && ofs->tmpfile) + err = ovl_copy_up_tmpfile(c); + else + err = ovl_copy_up_workdir(c); + if (err) + goto out; + + if (c->indexed) + ovl_set_flag(OVL_INDEX, d_inode(c->dentry)); + + if (to_index) { + /* Initialize nlink for copy up of disconnected dentry */ + err = ovl_set_nlink_upper(c->dentry); + } else { + struct inode *udir = d_inode(c->destdir); + + /* Restore timestamps on parent (best effort) */ + inode_lock(udir); + ovl_set_timestamps(ofs, c->destdir, &c->pstat); + inode_unlock(udir); + + ovl_dentry_set_upper_alias(c->dentry); + ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry)); + } + +out: + if (to_index) + kfree(c->destname.name); + return err; +} + +static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, + int flags) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (!ofs->config.metacopy) + return false; + + if (!S_ISREG(mode)) + return false; + + if (flags && ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC))) + return false; + + return true; +} + +static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **value) +{ + ssize_t res; + char *buf; + + res = ovl_do_getxattr(path, name, NULL, 0); + if (res == -ENODATA || res == -EOPNOTSUPP) + res = 0; + + if (res > 0) { + buf = kzalloc(res, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = ovl_do_getxattr(path, name, buf, res); + if (res < 0) + kfree(buf); + else + *value = buf; + } + return res; +} + +/* Copy up data of an inode which was copied up metadata only in the past. */ +static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct path upperpath; + int err; + char *capability = NULL; + ssize_t cap_size; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry == NULL)) + return -EIO; + + if (c->stat.size) { + err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS, + &capability); + if (cap_size < 0) + goto out; + } + + err = ovl_copy_up_data(c, &upperpath); + if (err) + goto out_free; + + /* + * Writing to upper file will clear security.capability xattr. We + * don't want that to happen for normal copy-up operation. + */ + if (capability) { + err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); + if (err) + goto out_free; + } + + + err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); + if (err) + goto out_free; + + ovl_set_upperdata(d_inode(c->dentry)); +out_free: + kfree(capability); +out: + return err; +} + +static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + int flags) +{ + int err; + DEFINE_DELAYED_CALL(done); + struct path parentpath; + struct ovl_copy_up_ctx ctx = { + .parent = parent, + .dentry = dentry, + .workdir = ovl_workdir(dentry), + }; + + if (WARN_ON(!ctx.workdir)) + return -EROFS; + + ovl_path_lower(dentry, &ctx.lowerpath); + err = vfs_getattr(&ctx.lowerpath, &ctx.stat, + STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + if (err) + return err; + + if (!kuid_has_mapping(current_user_ns(), ctx.stat.uid) || + !kgid_has_mapping(current_user_ns(), ctx.stat.gid)) + return -EOVERFLOW; + + ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); + + if (parent) { + ovl_path_upper(parent, &parentpath); + ctx.destdir = parentpath.dentry; + ctx.destname = dentry->d_name; + + err = vfs_getattr(&parentpath, &ctx.pstat, + STATX_ATIME | STATX_MTIME, + AT_STATX_SYNC_AS_STAT); + if (err) + return err; + } + + /* maybe truncate regular file. this has no effect on dirs */ + if (flags & O_TRUNC) + ctx.stat.size = 0; + + if (S_ISLNK(ctx.stat.mode)) { + ctx.link = vfs_get_link(ctx.lowerpath.dentry, &done); + if (IS_ERR(ctx.link)) + return PTR_ERR(ctx.link); + } + + err = ovl_copy_up_start(dentry, flags); + /* err < 0: interrupted, err > 0: raced with another copy-up */ + if (unlikely(err)) { + if (err > 0) + err = 0; + } else { + if (!ovl_dentry_upper(dentry)) + err = ovl_do_copy_up(&ctx); + if (!err && parent && !ovl_dentry_has_upper_alias(dentry)) + err = ovl_link_up(&ctx); + if (!err && ovl_dentry_needs_data_copy_up_locked(dentry, flags)) + err = ovl_copy_up_meta_inode_data(&ctx); + ovl_copy_up_end(dentry); + } + do_delayed_call(&done); + + return err; +} + +static int ovl_copy_up_flags(struct dentry *dentry, int flags) +{ + int err = 0; + const struct cred *old_cred; + bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED); + + /* + * With NFS export, copy up can get called for a disconnected non-dir. + * In this case, we will copy up lower inode to index dir without + * linking it to upper dir. + */ + if (WARN_ON(disconnected && d_is_dir(dentry))) + return -EIO; + + old_cred = ovl_override_creds(dentry->d_sb); + while (!err) { + struct dentry *next; + struct dentry *parent = NULL; + + if (ovl_already_copied_up(dentry, flags)) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (; !disconnected;) { + parent = dget_parent(next); + + if (ovl_dentry_upper(parent)) + break; + + dput(next); + next = parent; + } + + err = ovl_copy_up_one(parent, next, flags); + + dput(parent); + dput(next); + } + revert_creds(old_cred); + + return err; +} + +static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) +{ + /* Copy up of disconnected dentry does not set upper alias */ + if (ovl_already_copied_up(dentry, flags)) + return false; + + if (special_file(d_inode(dentry)->i_mode)) + return false; + + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return true; +} + +int ovl_maybe_copy_up(struct dentry *dentry, int flags) +{ + int err = 0; + + if (ovl_open_need_copy_up(dentry, flags)) { + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up_flags(dentry, flags); + ovl_drop_write(dentry); + } + } + + return err; +} + +int ovl_copy_up_with_data(struct dentry *dentry) +{ + return ovl_copy_up_flags(dentry, O_WRONLY); +} + +int ovl_copy_up(struct dentry *dentry) +{ + return ovl_copy_up_flags(dentry, 0); +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000..5339ff08b --- /dev/null +++ b/fs/overlayfs/dir.c @@ -0,0 +1,1331 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (C) 2011 Novell Inc. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/cred.h> +#include <linux/module.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/atomic.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +static unsigned short ovl_redirect_max = 256; +module_param_named(redirect_max, ovl_redirect_max, ushort, 0644); +MODULE_PARM_DESC(redirect_max, + "Maximum length of absolute redirect xattr value"); + +static int ovl_set_redirect(struct dentry *dentry, bool samedir); + +int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry) +{ + int err; + + dget(wdentry); + if (d_is_dir(wdentry)) + err = ovl_do_rmdir(ofs, wdir, wdentry); + else + err = ovl_do_unlink(ofs, wdir, wdentry); + dput(wdentry); + + if (err) { + pr_err("cleanup of '%pd2' failed (%i)\n", + wdentry, err); + } + + return err; +} + +struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) +{ + struct dentry *temp; + char name[20]; + static atomic_t temp_id = ATOMIC_INIT(0); + + /* counter is allowed to wrap, since temp dentries are ephemeral */ + snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); + + temp = ovl_lookup_upper(ofs, name, workdir, strlen(name)); + if (!IS_ERR(temp) && temp->d_inode) { + pr_err("workdir/%s already exists\n", name); + dput(temp); + temp = ERR_PTR(-EIO); + } + + return temp; +} + +/* caller holds i_mutex on workdir */ +static struct dentry *ovl_whiteout(struct ovl_fs *ofs) +{ + int err; + struct dentry *whiteout; + struct dentry *workdir = ofs->workdir; + struct inode *wdir = workdir->d_inode; + + if (!ofs->whiteout) { + whiteout = ovl_lookup_temp(ofs, workdir); + if (IS_ERR(whiteout)) + goto out; + + err = ovl_do_whiteout(ofs, wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + goto out; + } + ofs->whiteout = whiteout; + } + + if (ofs->share_whiteout) { + whiteout = ovl_lookup_temp(ofs, workdir); + if (IS_ERR(whiteout)) + goto out; + + err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); + if (!err) + goto out; + + if (err != -EMLINK) { + pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n", + ofs->whiteout->d_inode->i_nlink, err); + ofs->share_whiteout = false; + } + dput(whiteout); + } + whiteout = ofs->whiteout; + ofs->whiteout = NULL; +out: + return whiteout; +} + +/* Caller must hold i_mutex on both workdir and dir */ +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, + struct dentry *dentry) +{ + struct inode *wdir = ofs->workdir->d_inode; + struct dentry *whiteout; + int err; + int flags = 0; + + whiteout = ovl_whiteout(ofs); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + return err; + + if (d_is_dir(dentry)) + flags = RENAME_EXCHANGE; + + err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags); + if (err) + goto kill_whiteout; + if (flags) + ovl_cleanup(ofs, wdir, dentry); + +out: + dput(whiteout); + return err; + +kill_whiteout: + ovl_cleanup(ofs, wdir, whiteout); + goto out; +} + +int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry **newdentry, umode_t mode) +{ + int err; + struct dentry *d, *dentry = *newdentry; + + err = ovl_do_mkdir(ofs, dir, dentry, mode); + if (err) + return err; + + if (likely(!d_unhashed(dentry))) + return 0; + + /* + * vfs_mkdir() may succeed and leave the dentry passed + * to it unhashed and negative. If that happens, try to + * lookup a new hashed and positive dentry. + */ + d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); + if (IS_ERR(d)) { + pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", + dentry, err); + return PTR_ERR(d); + } + dput(dentry); + *newdentry = d; + + return 0; +} + +struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry *newdentry, struct ovl_cattr *attr) +{ + int err; + + if (IS_ERR(newdentry)) + return newdentry; + + err = -ESTALE; + if (newdentry->d_inode) + goto out; + + if (attr->hardlink) { + err = ovl_do_link(ofs, attr->hardlink, dir, newdentry); + } else { + switch (attr->mode & S_IFMT) { + case S_IFREG: + err = ovl_do_create(ofs, dir, newdentry, attr->mode); + break; + + case S_IFDIR: + /* mkdir is special... */ + err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ovl_do_mknod(ofs, dir, newdentry, attr->mode, + attr->rdev); + break; + + case S_IFLNK: + err = ovl_do_symlink(ofs, dir, newdentry, attr->link); + break; + + default: + err = -EPERM; + } + } + if (!err && WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + err = -EIO; + } +out: + if (err) { + dput(newdentry); + return ERR_PTR(err); + } + return newdentry; +} + +struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, + struct ovl_cattr *attr) +{ + return ovl_create_real(ofs, d_inode(workdir), + ovl_lookup_temp(ofs, workdir), attr); +} + +static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, + int xerr) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + int err; + + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); + if (!err) + ovl_dentry_set_opaque(dentry); + + return err; +} + +static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) +{ + /* + * Fail with -EIO when trying to create opaque dir and upper doesn't + * support xattrs. ovl_rename() calls ovl_set_opaque_xerr(-EXDEV) to + * return a specific error for noxattr case. + */ + return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); +} + +/* + * Common operations required to be done after creation of file on upper. + * If @hardlink is false, then @inode is a pre-allocated inode, we may or + * may not use to instantiate the new dentry. + */ +static int ovl_instantiate(struct dentry *dentry, struct inode *inode, + struct dentry *newdentry, bool hardlink) +{ + struct ovl_inode_params oip = { + .upperdentry = newdentry, + .newinode = inode, + }; + + ovl_dir_modified(dentry->d_parent, false); + ovl_dentry_set_upper_alias(dentry); + ovl_dentry_init_reval(dentry, newdentry); + + if (!hardlink) { + /* + * ovl_obtain_alias() can be called after ovl_create_real() + * and before we get here, so we may get an inode from cache + * with the same real upperdentry that is not the inode we + * pre-allocated. In this case we will use the cached inode + * to instantiate the new dentry. + * + * XXX: if we ever use ovl_obtain_alias() to decode directory + * file handles, need to use ovl_get_inode_locked() and + * d_instantiate_new() here to prevent from creating two + * hashed directory inode aliases. + */ + inode = ovl_get_inode(dentry->d_sb, &oip); + if (IS_ERR(inode)) + return PTR_ERR(inode); + if (inode == oip.newinode) + ovl_set_flag(OVL_UPPERDATA, inode); + } else { + WARN_ON(ovl_inode_real(inode) != d_inode(newdentry)); + dput(newdentry); + inc_nlink(inode); + } + + d_instantiate(dentry, inode); + if (inode != oip.newinode) { + pr_warn_ratelimited("newly created inode found in cache (%pd2)\n", + dentry); + } + + /* Force lookup of new upper hardlink to find its lower */ + if (hardlink) + d_drop(dentry); + + return 0; +} + +static bool ovl_type_merge(struct dentry *dentry) +{ + return OVL_TYPE_MERGE(ovl_path_type(dentry)); +} + +static bool ovl_type_origin(struct dentry *dentry) +{ + return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); +} + +static int ovl_create_upper(struct dentry *dentry, struct inode *inode, + struct ovl_cattr *attr) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry; + int err; + + if (!attr->hardlink && !IS_POSIXACL(udir)) + attr->mode &= ~current_umask(); + + inode_lock_nested(udir, I_MUTEX_PARENT); + newdentry = ovl_create_real(ofs, udir, + ovl_lookup_upper(ofs, dentry->d_name.name, + upperdir, dentry->d_name.len), + attr); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && + !ovl_allow_offline_changes(ofs)) { + /* Setting opaque here is just an optimization, allow to fail */ + ovl_set_opaque(dentry, newdentry); + } + + err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink); + if (err) + goto out_cleanup; +out_unlock: + inode_unlock(udir); + return err; + +out_cleanup: + ovl_cleanup(ofs, udir, newdentry); + dput(newdentry); + goto out_unlock; +} + +static struct dentry *ovl_clear_empty(struct dentry *dentry, + struct list_head *list) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct path upperpath; + struct dentry *upper; + struct dentry *opaquedir; + struct kstat stat; + int err; + + if (WARN_ON(!workdir)) + return ERR_PTR(-EROFS); + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + ovl_path_upper(dentry, &upperpath); + err = vfs_getattr(&upperpath, &stat, + STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + if (err) + goto out_unlock; + + err = -ESTALE; + if (!S_ISDIR(stat.mode)) + goto out_unlock; + upper = upperpath.dentry; + if (upper->d_parent->d_inode != udir) + goto out_unlock; + + opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode)); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out_unlock; + + err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir); + if (err) + goto out_cleanup; + + err = ovl_set_opaque(dentry, opaquedir); + if (err) + goto out_cleanup; + + inode_lock(opaquedir->d_inode); + err = ovl_set_attr(ofs, opaquedir, &stat); + inode_unlock(opaquedir->d_inode); + if (err) + goto out_cleanup; + + err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup_whiteouts(ofs, upper, list); + ovl_cleanup(ofs, wdir, upper); + unlock_rename(workdir, upperdir); + + /* dentry's upper doesn't match now, get rid of it */ + d_drop(dentry); + + return opaquedir; + +out_cleanup: + ovl_cleanup(ofs, wdir, opaquedir); + dput(opaquedir); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return ERR_PTR(err); +} + +static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry, + const char *name, const struct posix_acl *acl) +{ + void *buffer; + size_t size; + int err; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) + return 0; + + size = posix_acl_xattr_size(acl->a_count); + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + if (err < 0) + goto out_free; + + err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE); +out_free: + kfree(buffer); + return err; +} + +static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, + struct ovl_cattr *cattr) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *upper; + struct dentry *newdentry; + int err; + struct posix_acl *acl, *default_acl; + bool hardlink = !!cattr->hardlink; + + if (WARN_ON(!workdir)) + return -EROFS; + + if (!hardlink) { + err = posix_acl_create(dentry->d_parent->d_inode, + &cattr->mode, &default_acl, &acl); + if (err) + return err; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_unlock; + + err = -ESTALE; + if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) + goto out_dput; + + newdentry = ovl_create_temp(ofs, workdir, cattr); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_dput; + + /* + * mode could have been mutilated due to umask (e.g. sgid directory) + */ + if (!hardlink && + !S_ISLNK(cattr->mode) && + newdentry->d_inode->i_mode != cattr->mode) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = cattr->mode, + }; + inode_lock(newdentry->d_inode); + err = ovl_do_notify_change(ofs, newdentry, &attr); + inode_unlock(newdentry->d_inode); + if (err) + goto out_cleanup; + } + if (!hardlink) { + err = ovl_set_upper_acl(ofs, newdentry, + XATTR_NAME_POSIX_ACL_ACCESS, acl); + if (err) + goto out_cleanup; + + err = ovl_set_upper_acl(ofs, newdentry, + XATTR_NAME_POSIX_ACL_DEFAULT, default_acl); + if (err) + goto out_cleanup; + } + + if (!hardlink && S_ISDIR(cattr->mode)) { + err = ovl_set_opaque(dentry, newdentry); + if (err) + goto out_cleanup; + + err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, + RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup(ofs, wdir, upper); + } else { + err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + } + err = ovl_instantiate(dentry, inode, newdentry, hardlink); + if (err) { + ovl_cleanup(ofs, udir, newdentry); + dput(newdentry); + } +out_dput: + dput(upper); +out_unlock: + unlock_rename(workdir, upperdir); +out: + if (!hardlink) { + posix_acl_release(acl); + posix_acl_release(default_acl); + } + return err; + +out_cleanup: + ovl_cleanup(ofs, wdir, newdentry); + dput(newdentry); + goto out_dput; +} + +static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, + struct ovl_cattr *attr, bool origin) +{ + int err; + const struct cred *old_cred; + struct cred *override_cred; + struct dentry *parent = dentry->d_parent; + + err = ovl_copy_up(parent); + if (err) + return err; + + old_cred = ovl_override_creds(dentry->d_sb); + + /* + * When linking a file with copy up origin into a new parent, mark the + * new parent dir "impure". + */ + if (origin) { + err = ovl_set_impure(parent, ovl_dentry_upper(parent)); + if (err) + goto out_revert_creds; + } + + if (!attr->hardlink) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_revert_creds; + /* + * In the creation cases(create, mkdir, mknod, symlink), + * ovl should transfer current's fs{u,g}id to underlying + * fs. Because underlying fs want to initialize its new + * inode owner using current's fs{u,g}id. And in this + * case, the @inode is a new inode that is initialized + * in inode_init_owner() to current's fs{u,g}id. So use + * the inode's i_{u,g}id to override the cred's fs{u,g}id. + * + * But in the other hardlink case, ovl_link() does not + * create a new inode, so just use the ovl mounter's + * fs{u,g}id. + */ + override_cred->fsuid = inode->i_uid; + override_cred->fsgid = inode->i_gid; + err = security_dentry_create_files_as(dentry, + attr->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; + } + put_cred(override_creds(override_cred)); + put_cred(override_cred); + } + + if (!ovl_dentry_is_whiteout(dentry)) + err = ovl_create_upper(dentry, inode, attr); + else + err = ovl_create_over_whiteout(dentry, inode, attr); + +out_revert_creds: + revert_creds(old_cred); + return err; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + struct inode *inode; + struct ovl_cattr attr = { + .rdev = rdev, + .link = link, + }; + + err = ovl_want_write(dentry); + if (err) + goto out; + + /* Preallocate inode to be used by ovl_get_inode() */ + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, rdev); + if (!inode) + goto out_drop_write; + + spin_lock(&inode->i_lock); + inode->i_state |= I_CREATING; + spin_unlock(&inode->i_lock); + + inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode); + attr.mode = inode->i_mode; + + err = ovl_create_or_link(dentry, inode, &attr, false); + /* Did we end up using the preallocated inode? */ + if (inode != d_inode(dentry)) + iput(inode); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) +{ + /* Don't allow creation of "whiteout" on overlay */ + if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) + return -EPERM; + + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_set_link_redirect(struct dentry *dentry) +{ + const struct cred *old_cred; + int err; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_set_redirect(dentry, false); + revert_creds(old_cred); + + return err; +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct inode *inode; + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + + if (ovl_is_metacopy_dentry(old)) { + err = ovl_set_link_redirect(old); + if (err) + goto out_drop_write; + } + + err = ovl_nlink_start(old); + if (err) + goto out_drop_write; + + inode = d_inode(old); + ihold(inode); + + err = ovl_create_or_link(new, inode, + &(struct ovl_cattr) {.hardlink = ovl_dentry_upper(old)}, + ovl_type_origin(old)); + if (err) + iput(inode); + + ovl_nlink_end(old); +out_drop_write: + ovl_drop_write(old); +out: + return err; +} + +static bool ovl_matches_upper(struct dentry *dentry, struct dentry *upper) +{ + return d_inode(ovl_dentry_upper(dentry)) == d_inode(upper); +} + +static int ovl_remove_and_whiteout(struct dentry *dentry, + struct list_head *list) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *workdir = ovl_workdir(dentry); + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (WARN_ON(!workdir)) + return -EROFS; + + if (!list_empty(list)) { + opaquedir = ovl_clear_empty(dentry, list); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out_dput; + + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_unlock; + + err = -ESTALE; + if ((opaquedir && upper != opaquedir) || + (!opaquedir && ovl_dentry_upper(dentry) && + !ovl_matches_upper(dentry, upper))) { + goto out_dput_upper; + } + + err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper); + if (err) + goto out_d_drop; + + ovl_dir_modified(dentry->d_parent, true); +out_d_drop: + d_drop(dentry); +out_dput_upper: + dput(upper); +out_unlock: + unlock_rename(workdir, upperdir); +out_dput: + dput(opaquedir); +out: + return err; +} + +static int ovl_remove_upper(struct dentry *dentry, bool is_dir, + struct list_head *list) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *dir = upperdir->d_inode; + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (!list_empty(list)) { + opaquedir = ovl_clear_empty(dentry, list); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + inode_lock_nested(dir, I_MUTEX_PARENT); + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_unlock; + + err = -ESTALE; + if ((opaquedir && upper != opaquedir) || + (!opaquedir && !ovl_matches_upper(dentry, upper))) + goto out_dput_upper; + + if (is_dir) + err = ovl_do_rmdir(ofs, dir, upper); + else + err = ovl_do_unlink(ofs, dir, upper); + ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry)); + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ + if (!err) + d_drop(dentry); +out_dput_upper: + dput(upper); +out_unlock: + inode_unlock(dir); + dput(opaquedir); +out: + return err; +} + +static bool ovl_pure_upper(struct dentry *dentry) +{ + return !ovl_dentry_lower(dentry) && + !ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); +} + +static void ovl_drop_nlink(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct dentry *alias; + + /* Try to find another, hashed alias */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + if (alias != dentry && !d_unhashed(alias)) + break; + } + spin_unlock(&inode->i_lock); + + /* + * Changes to underlying layers may cause i_nlink to lose sync with + * reality. In this case prevent the link count from going to zero + * prematurely. + */ + if (inode->i_nlink > !!alias) + drop_nlink(inode); +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + int err; + const struct cred *old_cred; + bool lower_positive = ovl_lower_positive(dentry); + LIST_HEAD(list); + + /* No need to clean pure upper removed by vfs_rmdir() */ + if (is_dir && (lower_positive || !ovl_pure_upper(dentry))) { + err = ovl_check_empty_dir(dentry, &list); + if (err) + goto out; + } + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_drop_write; + + err = ovl_nlink_start(dentry); + if (err) + goto out_drop_write; + + old_cred = ovl_override_creds(dentry->d_sb); + if (!lower_positive) + err = ovl_remove_upper(dentry, is_dir, &list); + else + err = ovl_remove_and_whiteout(dentry, &list); + revert_creds(old_cred); + if (!err) { + if (is_dir) + clear_nlink(dentry->d_inode); + else + ovl_drop_nlink(dentry); + } + ovl_nlink_end(dentry); + + /* + * Copy ctime + * + * Note: we fail to update ctime if there was no copy-up, only a + * whiteout + */ + if (ovl_dentry_upper(dentry)) + ovl_copyattr(d_inode(dentry)); + +out_drop_write: + ovl_drop_write(dentry); +out: + ovl_cache_free(&list); + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, true); +} + +static bool ovl_type_merge_or_lower(struct dentry *dentry) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + return OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type); +} + +static bool ovl_can_move(struct dentry *dentry) +{ + return ovl_redirect_dir(dentry->d_sb) || + !d_is_dir(dentry) || !ovl_type_merge_or_lower(dentry); +} + +static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect) +{ + char *buf, *ret; + struct dentry *d, *tmp; + int buflen = ovl_redirect_max + 1; + + if (!abs_redirect) { + ret = kstrndup(dentry->d_name.name, dentry->d_name.len, + GFP_KERNEL); + goto out; + } + + buf = ret = kmalloc(buflen, GFP_KERNEL); + if (!buf) + goto out; + + buflen--; + buf[buflen] = '\0'; + for (d = dget(dentry); !IS_ROOT(d);) { + const char *name; + int thislen; + + spin_lock(&d->d_lock); + name = ovl_dentry_get_redirect(d); + if (name) { + thislen = strlen(name); + } else { + name = d->d_name.name; + thislen = d->d_name.len; + } + + /* If path is too long, fall back to userspace move */ + if (thislen + (name[0] != '/') > buflen) { + ret = ERR_PTR(-EXDEV); + spin_unlock(&d->d_lock); + goto out_put; + } + + buflen -= thislen; + memcpy(&buf[buflen], name, thislen); + spin_unlock(&d->d_lock); + tmp = dget_parent(d); + + dput(d); + d = tmp; + + /* Absolute redirect: finished */ + if (buf[buflen] == '/') + break; + buflen--; + buf[buflen] = '/'; + } + ret = kstrdup(&buf[buflen], GFP_KERNEL); +out_put: + dput(d); + kfree(buf); +out: + return ret ? ret : ERR_PTR(-ENOMEM); +} + +static bool ovl_need_absolute_redirect(struct dentry *dentry, bool samedir) +{ + struct dentry *lowerdentry; + + if (!samedir) + return true; + + if (d_is_dir(dentry)) + return false; + + /* + * For non-dir hardlinked files, we need absolute redirects + * in general as two upper hardlinks could be in different + * dirs. We could put a relative redirect now and convert + * it to absolute redirect later. But when nlink > 1 and + * indexing is on, that means relative redirect needs to be + * converted to absolute during copy up of another lower + * hardllink as well. + * + * So without optimizing too much, just check if lower is + * a hard link or not. If lower is hard link, put absolute + * redirect. + */ + lowerdentry = ovl_dentry_lower(dentry); + return (d_inode(lowerdentry)->i_nlink > 1); +} + +static int ovl_set_redirect(struct dentry *dentry, bool samedir) +{ + int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + const char *redirect = ovl_dentry_get_redirect(dentry); + bool absolute_redirect = ovl_need_absolute_redirect(dentry, samedir); + + if (redirect && (!absolute_redirect || redirect[0] == '/')) + return 0; + + redirect = ovl_get_redirect(dentry, absolute_redirect); + if (IS_ERR(redirect)) + return PTR_ERR(redirect); + + err = ovl_check_setxattr(ofs, ovl_dentry_upper(dentry), + OVL_XATTR_REDIRECT, + redirect, strlen(redirect), -EXDEV); + if (!err) { + spin_lock(&dentry->d_lock); + ovl_dentry_set_redirect(dentry, redirect); + spin_unlock(&dentry->d_lock); + } else { + kfree(redirect); + pr_warn_ratelimited("failed to set redirect (%i)\n", + err); + /* Fall back to userspace copy-up */ + err = -EXDEV; + } + return err; +} + +static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, + struct dentry *old, struct inode *newdir, + struct dentry *new, unsigned int flags) +{ + int err; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool cleanup_whiteout = false; + bool update_nlink = false; + bool overwrite = !(flags & RENAME_EXCHANGE); + bool is_dir = d_is_dir(old); + bool new_is_dir = d_is_dir(new); + bool samedir = olddir == newdir; + struct dentry *opaquedir = NULL; + const struct cred *old_cred = NULL; + struct ovl_fs *ofs = OVL_FS(old->d_sb); + LIST_HEAD(list); + + err = -EINVAL; + if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + goto out; + + flags &= ~RENAME_NOREPLACE; + + /* Don't copy up directory trees */ + err = -EXDEV; + if (!ovl_can_move(old)) + goto out; + if (!overwrite && !ovl_can_move(new)) + goto out; + + if (overwrite && new_is_dir && !ovl_pure_upper(new)) { + err = ovl_check_empty_dir(new, &list); + if (err) + goto out; + } + + if (overwrite) { + if (ovl_lower_positive(old)) { + if (!ovl_dentry_is_whiteout(new)) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && ovl_dentry_is_whiteout(new)) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (!overwrite) { + err = ovl_copy_up(new); + if (err) + goto out_drop_write; + } else if (d_inode(new)) { + err = ovl_nlink_start(new); + if (err) + goto out_drop_write; + + update_nlink = true; + } + + old_cred = ovl_override_creds(old->d_sb); + + if (!list_empty(&list)) { + opaquedir = ovl_clear_empty(new, &list); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) { + opaquedir = NULL; + goto out_revert_creds; + } + } + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + if (!samedir) { + /* + * When moving a merge dir or non-dir with copy up origin into + * a new parent, we are marking the new parent dir "impure". + * When ovl_iterate() iterates an "impure" upper dir, it will + * lookup the origin inodes of the entries to fill d_ino. + */ + if (ovl_type_origin(old)) { + err = ovl_set_impure(new->d_parent, new_upperdir); + if (err) + goto out_revert_creds; + } + if (!overwrite && ovl_type_origin(new)) { + err = ovl_set_impure(old->d_parent, old_upperdir); + if (err) + goto out_revert_creds; + } + } + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, + old->d_name.len); + err = PTR_ERR(olddentry); + if (IS_ERR(olddentry)) + goto out_unlock; + + err = -ESTALE; + if (!ovl_matches_upper(old, olddentry)) + goto out_dput_old; + + newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_dput_old; + + old_opaque = ovl_dentry_is_opaque(old); + new_opaque = ovl_dentry_is_opaque(new); + + err = -ESTALE; + if (d_inode(new) && ovl_dentry_upper(new)) { + if (opaquedir) { + if (newdentry != opaquedir) + goto out_dput; + } else { + if (!ovl_matches_upper(new, newdentry)) + goto out_dput; + } + } else { + if (!d_is_negative(newdentry)) { + if (!new_opaque || !ovl_is_whiteout(newdentry)) + goto out_dput; + } else { + if (flags & RENAME_EXCHANGE) + goto out_dput; + } + } + + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + if (olddentry->d_inode == newdentry->d_inode) + goto out_dput; + + err = 0; + if (ovl_type_merge_or_lower(old)) + err = ovl_set_redirect(old, samedir); + else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent)) + err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); + if (err) + goto out_dput; + + if (!overwrite && ovl_type_merge_or_lower(new)) + err = ovl_set_redirect(new, samedir); + else if (!overwrite && new_is_dir && !new_opaque && + ovl_type_merge(old->d_parent)) + err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); + if (err) + goto out_dput; + + err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, flags); + if (err) + goto out_dput; + + if (cleanup_whiteout) + ovl_cleanup(ofs, old_upperdir->d_inode, newdentry); + + if (overwrite && d_inode(new)) { + if (new_is_dir) + clear_nlink(d_inode(new)); + else + ovl_drop_nlink(new); + } + + ovl_dir_modified(old->d_parent, ovl_type_origin(old) || + (!overwrite && ovl_type_origin(new))); + ovl_dir_modified(new->d_parent, ovl_type_origin(old) || + (d_inode(new) && ovl_type_origin(new))); + + /* copy ctime: */ + ovl_copyattr(d_inode(old)); + if (d_inode(new) && ovl_dentry_upper(new)) + ovl_copyattr(d_inode(new)); + +out_dput: + dput(newdentry); +out_dput_old: + dput(olddentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); +out_revert_creds: + revert_creds(old_cred); + if (update_nlink) + ovl_nlink_end(new); +out_drop_write: + ovl_drop_write(old); +out: + dput(opaquedir); + ovl_cache_free(&list); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename = ovl_rename, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, + .fileattr_get = ovl_fileattr_get, + .fileattr_set = ovl_fileattr_set, +}; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c new file mode 100644 index 000000000..e55363d34 --- /dev/null +++ b/fs/overlayfs/export.c @@ -0,0 +1,876 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Overlayfs NFS export support. + * + * Amir Goldstein <amir73il@gmail.com> + * + * Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved. + */ + +#include <linux/fs.h> +#include <linux/cred.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +static int ovl_encode_maybe_copy_up(struct dentry *dentry) +{ + int err; + + if (ovl_dentry_upper(dentry)) + return 0; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up(dentry); + ovl_drop_write(dentry); + } + + if (err) { + pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", + dentry, err); + } + + return err; +} + +/* + * Before encoding a non-upper directory file handle from real layer N, we need + * to check if it will be possible to reconnect an overlay dentry from the real + * lower decoded dentry. This is done by following the overlay ancestry up to a + * "layer N connected" ancestor and verifying that all parents along the way are + * "layer N connectable". If an ancestor that is NOT "layer N connectable" is + * found, we need to copy up an ancestor, which is "layer N connectable", thus + * making that ancestor "layer N connected". For example: + * + * layer 1: /a + * layer 2: /a/b/c + * + * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is + * copied up and renamed, upper dir /a will be indexed by lower dir /a from + * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*) + * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay + * dentry from the connected lower dentry /a/b/c. + * + * To avoid this problem on decode time, we need to copy up an ancestor of + * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is + * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected" + * and when the time comes to decode the file handle from lower dentry /a/b/c, + * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding + * a connected overlay dentry will be accomplished. + * + * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an + * entry /a in the lower layers above layer N and find the indexed dir /a from + * layer 1. If that improvement is made, then the check for "layer N connected" + * will need to verify there are no redirects in lower layers above N. In the + * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a + * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable": + * + * layer 1: /A (redirect = /a) + * layer 2: /a/b/c + */ + +/* Return the lowest layer for encoding a connectable file handle */ +static int ovl_connectable_layer(struct dentry *dentry) +{ + struct ovl_entry *oe = OVL_E(dentry); + + /* We can get overlay root from root of any layer */ + if (dentry == dentry->d_sb->s_root) + return oe->numlower; + + /* + * If it's an unindexed merge dir, then it's not connectable with any + * lower layer + */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return 0; + + /* We can get upper/overlay path from indexed/lower dentry */ + return oe->lowerstack[0].layer->idx; +} + +/* + * @dentry is "connected" if all ancestors up to root or a "connected" ancestor + * have the same uppermost lower layer as the origin's layer. We may need to + * copy up a "connectable" ancestor to make it "connected". A "connected" dentry + * cannot become non "connected", so cache positive result in dentry flags. + * + * Return the connected origin layer or < 0 on error. + */ +static int ovl_connect_layer(struct dentry *dentry) +{ + struct dentry *next, *parent = NULL; + int origin_layer; + int err = 0; + + if (WARN_ON(dentry == dentry->d_sb->s_root) || + WARN_ON(!ovl_dentry_lower(dentry))) + return -EIO; + + origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx; + if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) + return origin_layer; + + /* Find the topmost origin layer connectable ancestor of @dentry */ + next = dget(dentry); + for (;;) { + parent = dget_parent(next); + if (WARN_ON(parent == next)) { + err = -EIO; + break; + } + + /* + * If @parent is not origin layer connectable, then copy up + * @next which is origin layer connectable and we are done. + */ + if (ovl_connectable_layer(parent) < origin_layer) { + err = ovl_encode_maybe_copy_up(next); + break; + } + + /* If @parent is connected or indexed we are done */ + if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) || + ovl_test_flag(OVL_INDEX, d_inode(parent))) + break; + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + if (!err) + ovl_dentry_set_flag(OVL_E_CONNECTED, dentry); + + return err ?: origin_layer; +} + +/* + * We only need to encode origin if there is a chance that the same object was + * encoded pre copy up and then we need to stay consistent with the same + * encoding also after copy up. If non-pure upper is not indexed, then it was + * copied up before NFS export was enabled. In that case we don't need to worry + * about staying consistent with pre copy up encoding and we encode an upper + * file handle. Overlay root dentry is a private case of non-indexed upper. + * + * The following table summarizes the different file handle encodings used for + * different overlay object types: + * + * Object type | Encoding + * -------------------------------- + * Pure upper | U + * Non-indexed upper | U + * Indexed upper | L (*) + * Non-upper | L (*) + * + * U = upper file handle + * L = lower file handle + * + * (*) Connecting an overlay dir from real lower dentry is not always + * possible when there are redirects in lower layers and non-indexed merge dirs. + * To mitigate those case, we may copy up the lower dir ancestor before encode + * a lower dir file handle. + * + * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. + */ +static int ovl_check_encode_origin(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + /* Upper file handle for pure upper */ + if (!ovl_dentry_lower(dentry)) + return 0; + + /* + * Upper file handle for non-indexed upper. + * + * Root is never indexed, so if there's an upper layer, encode upper for + * root. + */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return 0; + + /* + * Decoding a merge dir, whose origin's ancestor is under a redirected + * lower dir or under a non-indexed upper is not always possible. + * ovl_connect_layer() will try to make origin's layer "connected" by + * copying up a "connectable" ancestor. + */ + if (d_is_dir(dentry) && ovl_upper_mnt(ofs)) + return ovl_connect_layer(dentry); + + /* Lower file handle for indexed and non-upper dir/non-dir */ + return 1; +} + +static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, + u32 *fid, int buflen) +{ + struct ovl_fh *fh = NULL; + int err, enc_lower; + int len; + + /* + * Check if we should encode a lower or upper file handle and maybe + * copy up an ancestor to make lower file handle connectable. + */ + err = enc_lower = ovl_check_encode_origin(dentry); + if (enc_lower < 0) + goto fail; + + /* Encode an upper or lower file handle */ + fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + len = OVL_FH_LEN(fh); + if (len <= buflen) + memcpy(fid, fh, len); + err = len; + +out: + kfree(fh); + return err; + +fail: + pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n", + dentry, err); + goto out; +} + +static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, + struct inode *parent) +{ + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + struct dentry *dentry; + int bytes, buflen = *max_len << 2; + + /* TODO: encode connectable file handles */ + if (parent) + return FILEID_INVALID; + + dentry = d_find_any_alias(inode); + if (!dentry) + return FILEID_INVALID; + + bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); + dput(dentry); + if (bytes <= 0) + return FILEID_INVALID; + + *max_len = bytes >> 2; + if (bytes > buflen) + return FILEID_INVALID; + + return OVL_FILEID_V1; +} + +/* + * Find or instantiate an overlay dentry from real dentries and index. + */ +static struct dentry *ovl_obtain_alias(struct super_block *sb, + struct dentry *upper_alias, + struct ovl_path *lowerpath, + struct dentry *index) +{ + struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; + struct dentry *upper = upper_alias ?: index; + struct dentry *dentry; + struct inode *inode; + struct ovl_entry *oe; + struct ovl_inode_params oip = { + .lowerpath = lowerpath, + .index = index, + .numlower = !!lower + }; + + /* We get overlay directory dentries with ovl_lookup_real() */ + if (d_is_dir(upper ?: lower)) + return ERR_PTR(-EIO); + + oip.upperdentry = dget(upper); + inode = ovl_get_inode(sb, &oip); + if (IS_ERR(inode)) { + dput(upper); + return ERR_CAST(inode); + } + + if (upper) + ovl_set_flag(OVL_UPPERDATA, inode); + + dentry = d_find_any_alias(inode); + if (dentry) + goto out_iput; + + dentry = d_alloc_anon(inode->i_sb); + if (unlikely(!dentry)) + goto nomem; + oe = ovl_alloc_entry(lower ? 1 : 0); + if (!oe) + goto nomem; + + if (lower) { + oe->lowerstack->dentry = dget(lower); + oe->lowerstack->layer = lowerpath->layer; + } + dentry->d_fsdata = oe; + if (upper_alias) + ovl_dentry_set_upper_alias(dentry); + + ovl_dentry_init_reval(dentry, upper); + + return d_instantiate_anon(dentry, inode); + +nomem: + dput(dentry); + dentry = ERR_PTR(-ENOMEM); +out_iput: + iput(inode); + return dentry; +} + +/* Get the upper or lower dentry in stach whose on layer @idx */ +static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) +{ + struct ovl_entry *oe = dentry->d_fsdata; + int i; + + if (!idx) + return ovl_dentry_upper(dentry); + + for (i = 0; i < oe->numlower; i++) { + if (oe->lowerstack[i].layer->idx == idx) + return oe->lowerstack[i].dentry; + } + + return NULL; +} + +/* + * Lookup a child overlay dentry to get a connected overlay dentry whose real + * dentry is @real. If @real is on upper layer, we lookup a child overlay + * dentry with the same name as the real dentry. Otherwise, we need to consult + * index for lookup. + */ +static struct dentry *ovl_lookup_real_one(struct dentry *connected, + struct dentry *real, + const struct ovl_layer *layer) +{ + struct inode *dir = d_inode(connected); + struct dentry *this, *parent = NULL; + struct name_snapshot name; + int err; + + /* + * Lookup child overlay dentry by real name. The dir mutex protects us + * from racing with overlay rename. If the overlay dentry that is above + * real has already been moved to a parent that is not under the + * connected overlay dir, we return -ECHILD and restart the lookup of + * connected real path from the top. + */ + inode_lock_nested(dir, I_MUTEX_PARENT); + err = -ECHILD; + parent = dget_parent(real); + if (ovl_dentry_real_at(connected, layer->idx) != parent) + goto fail; + + /* + * We also need to take a snapshot of real dentry name to protect us + * from racing with underlying layer rename. In this case, we don't + * care about returning ESTALE, only from dereferencing a free name + * pointer because we hold no lock on the real dentry. + */ + take_dentry_name_snapshot(&name, real); + /* + * No mnt_userns handling here: it's an internal lookup. Could skip + * permission checking altogether, but for now just use non-mnt_userns + * transformed ids. + */ + this = lookup_one_len(name.name.name, connected, name.name.len); + release_dentry_name_snapshot(&name); + err = PTR_ERR(this); + if (IS_ERR(this)) { + goto fail; + } else if (!this || !this->d_inode) { + dput(this); + err = -ENOENT; + goto fail; + } else if (ovl_dentry_real_at(this, layer->idx) != real) { + dput(this); + err = -ESTALE; + goto fail; + } + +out: + dput(parent); + inode_unlock(dir); + return this; + +fail: + pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + real, layer->idx, connected, err); + this = ERR_PTR(err); + goto out; +} + +static struct dentry *ovl_lookup_real(struct super_block *sb, + struct dentry *real, + const struct ovl_layer *layer); + +/* + * Lookup an indexed or hashed overlay dentry by real inode. + */ +static struct dentry *ovl_lookup_real_inode(struct super_block *sb, + struct dentry *real, + const struct ovl_layer *layer) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct dentry *index = NULL; + struct dentry *this = NULL; + struct inode *inode; + + /* + * Decoding upper dir from index is expensive, so first try to lookup + * overlay dentry in inode/dcache. + */ + inode = ovl_lookup_inode(sb, real, !layer->idx); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (inode) { + this = d_find_any_alias(inode); + iput(inode); + } + + /* + * For decoded lower dir file handle, lookup index by origin to check + * if lower dir was copied up and and/or removed. + */ + if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) { + index = ovl_lookup_index(ofs, NULL, real, false); + if (IS_ERR(index)) + return index; + } + + /* Get connected upper overlay dir from index */ + if (index) { + struct dentry *upper = ovl_index_upper(ofs, index); + + dput(index); + if (IS_ERR_OR_NULL(upper)) + return upper; + + /* + * ovl_lookup_real() in lower layer may call recursively once to + * ovl_lookup_real() in upper layer. The first level call walks + * back lower parents to the topmost indexed parent. The second + * recursive call walks back from indexed upper to the topmost + * connected/hashed upper parent (or up to root). + */ + this = ovl_lookup_real(sb, upper, &ofs->layers[0]); + dput(upper); + } + + if (IS_ERR_OR_NULL(this)) + return this; + + if (ovl_dentry_real_at(this, layer->idx) != real) { + dput(this); + this = ERR_PTR(-EIO); + } + + return this; +} + +/* + * Lookup an indexed or hashed overlay dentry, whose real dentry is an + * ancestor of @real. + */ +static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, + struct dentry *real, + const struct ovl_layer *layer) +{ + struct dentry *next, *parent = NULL; + struct dentry *ancestor = ERR_PTR(-EIO); + + if (real == layer->mnt->mnt_root) + return dget(sb->s_root); + + /* Find the topmost indexed or hashed ancestor */ + next = dget(real); + for (;;) { + parent = dget_parent(next); + + /* + * Lookup a matching overlay dentry in inode/dentry + * cache or in index by real inode. + */ + ancestor = ovl_lookup_real_inode(sb, next, layer); + if (ancestor) + break; + + if (parent == layer->mnt->mnt_root) { + ancestor = dget(sb->s_root); + break; + } + + /* + * If @real has been moved out of the layer root directory, + * we will eventully hit the real fs root. This cannot happen + * by legit overlay rename, so we return error in that case. + */ + if (parent == next) { + ancestor = ERR_PTR(-EXDEV); + break; + } + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + return ancestor; +} + +/* + * Lookup a connected overlay dentry whose real dentry is @real. + * If @real is on upper layer, we lookup a child overlay dentry with the same + * path the real dentry. Otherwise, we need to consult index for lookup. + */ +static struct dentry *ovl_lookup_real(struct super_block *sb, + struct dentry *real, + const struct ovl_layer *layer) +{ + struct dentry *connected; + int err = 0; + + connected = ovl_lookup_real_ancestor(sb, real, layer); + if (IS_ERR(connected)) + return connected; + + while (!err) { + struct dentry *next, *this; + struct dentry *parent = NULL; + struct dentry *real_connected = ovl_dentry_real_at(connected, + layer->idx); + + if (real_connected == real) + break; + + /* Find the topmost dentry not yet connected */ + next = dget(real); + for (;;) { + parent = dget_parent(next); + + if (parent == real_connected) + break; + + /* + * If real has been moved out of 'real_connected', + * we will not find 'real_connected' and hit the layer + * root. In that case, we need to restart connecting. + * This game can go on forever in the worst case. We + * may want to consider taking s_vfs_rename_mutex if + * this happens more than once. + */ + if (parent == layer->mnt->mnt_root) { + dput(connected); + connected = dget(sb->s_root); + break; + } + + /* + * If real file has been moved out of the layer root + * directory, we will eventully hit the real fs root. + * This cannot happen by legit overlay rename, so we + * return error in that case. + */ + if (parent == next) { + err = -EXDEV; + break; + } + + dput(next); + next = parent; + } + + if (!err) { + this = ovl_lookup_real_one(connected, next, layer); + if (IS_ERR(this)) + err = PTR_ERR(this); + + /* + * Lookup of child in overlay can fail when racing with + * overlay rename of child away from 'connected' parent. + * In this case, we need to restart the lookup from the + * top, because we cannot trust that 'real_connected' is + * still an ancestor of 'real'. There is a good chance + * that the renamed overlay ancestor is now in cache, so + * ovl_lookup_real_ancestor() will find it and we can + * continue to connect exactly from where lookup failed. + */ + if (err == -ECHILD) { + this = ovl_lookup_real_ancestor(sb, real, + layer); + err = PTR_ERR_OR_ZERO(this); + } + if (!err) { + dput(connected); + connected = this; + } + } + + dput(parent); + dput(next); + } + + if (err) + goto fail; + + return connected; + +fail: + pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + real, layer->idx, connected, err); + dput(connected); + return ERR_PTR(err); +} + +/* + * Get an overlay dentry from upper/lower real dentries and index. + */ +static struct dentry *ovl_get_dentry(struct super_block *sb, + struct dentry *upper, + struct ovl_path *lowerpath, + struct dentry *index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; + struct dentry *real = upper ?: (index ?: lowerpath->dentry); + + /* + * Obtain a disconnected overlay dentry from a non-dir real dentry + * and index. + */ + if (!d_is_dir(real)) + return ovl_obtain_alias(sb, upper, lowerpath, index); + + /* Removed empty directory? */ + if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real)) + return ERR_PTR(-ENOENT); + + /* + * If real dentry is connected and hashed, get a connected overlay + * dentry whose real dentry is @real. + */ + return ovl_lookup_real(sb, real, layer); +} + +static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, + struct ovl_fh *fh) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct dentry *dentry; + struct dentry *upper; + + if (!ovl_upper_mnt(ofs)) + return ERR_PTR(-EACCES); + + upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); + if (IS_ERR_OR_NULL(upper)) + return upper; + + dentry = ovl_get_dentry(sb, upper, NULL, NULL); + dput(upper); + + return dentry; +} + +static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, + struct ovl_fh *fh) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; + struct dentry *dentry = NULL; + struct dentry *index = NULL; + struct inode *inode; + int err; + + /* First lookup overlay inode in inode cache by origin fh */ + err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack); + if (err) + return ERR_PTR(err); + + if (!d_is_dir(origin.dentry) || + !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + inode = ovl_lookup_inode(sb, origin.dentry, false); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_err; + if (inode) { + dentry = d_find_any_alias(inode); + iput(inode); + if (dentry) + goto out; + } + } + + /* Then lookup indexed upper/whiteout by origin fh */ + if (ofs->indexdir) { + index = ovl_get_index_fh(ofs, fh); + err = PTR_ERR(index); + if (IS_ERR(index)) { + index = NULL; + goto out_err; + } + } + + /* Then try to get a connected upper dir by index */ + if (index && d_is_dir(index)) { + struct dentry *upper = ovl_index_upper(ofs, index); + + err = PTR_ERR(upper); + if (IS_ERR_OR_NULL(upper)) + goto out_err; + + dentry = ovl_get_dentry(sb, upper, NULL, NULL); + dput(upper); + goto out; + } + + /* Find origin.dentry again with ovl_acceptable() layer check */ + if (d_is_dir(origin.dentry)) { + dput(origin.dentry); + origin.dentry = NULL; + err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); + if (err) + goto out_err; + } + if (index) { + err = ovl_verify_origin(ofs, index, origin.dentry, false); + if (err) + goto out_err; + } + + /* Get a connected non-upper dir or disconnected non-dir */ + dentry = ovl_get_dentry(sb, NULL, &origin, index); + +out: + dput(origin.dentry); + dput(index); + return dentry; + +out_err: + dentry = ERR_PTR(err); + goto out; +} + +static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) +{ + struct ovl_fh *fh; + + /* If on-wire inner fid is aligned - nothing to do */ + if (fh_type == OVL_FILEID_V1) + return (struct ovl_fh *)fid; + + if (fh_type != OVL_FILEID_V0) + return ERR_PTR(-EINVAL); + + if (buflen <= OVL_FH_WIRE_OFFSET) + return ERR_PTR(-EINVAL); + + fh = kzalloc(buflen, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + /* Copy unaligned inner fh into aligned buffer */ + memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET); + return fh; +} + +static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct dentry *dentry = NULL; + struct ovl_fh *fh = NULL; + int len = fh_len << 2; + unsigned int flags = 0; + int err; + + fh = ovl_fid_to_fh(fid, len, fh_type); + err = PTR_ERR(fh); + if (IS_ERR(fh)) + goto out_err; + + err = ovl_check_fh_len(fh, len); + if (err) + goto out_err; + + flags = fh->fb.flags; + dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? + ovl_upper_fh_to_d(sb, fh) : + ovl_lower_fh_to_d(sb, fh); + err = PTR_ERR(dentry); + if (IS_ERR(dentry) && err != -ESTALE) + goto out_err; + +out: + /* We may have needed to re-align OVL_FILEID_V0 */ + if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) + kfree(fh); + + return dentry; + +out_err: + pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", + fh_len, fh_type, flags, err); + dentry = ERR_PTR(err); + goto out; +} + +static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); + return ERR_PTR(-EACCES); +} + +static int ovl_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + /* + * ovl_fh_to_dentry() returns connected dir overlay dentries and + * ovl_fh_to_parent() is not implemented, so we should not get here. + */ + WARN_ON_ONCE(1); + return -EIO; +} + +static struct dentry *ovl_get_parent(struct dentry *dentry) +{ + /* + * ovl_fh_to_dentry() returns connected dir overlay dentries, so we + * should not get here. + */ + WARN_ON_ONCE(1); + return ERR_PTR(-EIO); +} + +const struct export_operations ovl_export_operations = { + .encode_fh = ovl_encode_fh, + .fh_to_dentry = ovl_fh_to_dentry, + .fh_to_parent = ovl_fh_to_parent, + .get_name = ovl_get_name, + .get_parent = ovl_get_parent, +}; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c new file mode 100644 index 000000000..cc0c07716 --- /dev/null +++ b/fs/overlayfs/file.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 Red Hat, Inc. + */ + +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/xattr.h> +#include <linux/uio.h> +#include <linux/uaccess.h> +#include <linux/splice.h> +#include <linux/security.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include "overlayfs.h" + +struct ovl_aio_req { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; +}; + +static struct kmem_cache *ovl_aio_request_cachep; + +static char ovl_whatisit(struct inode *inode, struct inode *realinode) +{ + if (realinode != ovl_inode_upper(inode)) + return 'l'; + if (ovl_has_upperdata(inode)) + return 'u'; + else + return 'm'; +} + +/* No atime modificaton nor notify on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) + +static struct file *ovl_open_realfile(const struct file *file, + const struct path *realpath) +{ + struct inode *realinode = d_inode(realpath->dentry); + struct inode *inode = file_inode(file); + struct user_namespace *real_mnt_userns; + struct file *realfile; + const struct cred *old_cred; + int flags = file->f_flags | OVL_OPEN_FLAGS; + int acc_mode = ACC_MODE(flags); + int err; + + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + old_cred = ovl_override_creds(inode->i_sb); + real_mnt_userns = mnt_user_ns(realpath->mnt); + err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode); + if (err) { + realfile = ERR_PTR(err); + } else { + if (!inode_owner_or_capable(real_mnt_userns, realinode)) + flags &= ~O_NOATIME; + + realfile = open_with_fake_path(&file->f_path, flags, realinode, + current_cred()); + } + revert_creds(old_cred); + + pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", + file, file, ovl_whatisit(inode, realinode), file->f_flags, + realfile, IS_ERR(realfile) ? 0 : realfile->f_flags); + + return realfile; +} + +#define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT) + +static int ovl_change_flags(struct file *file, unsigned int flags) +{ + struct inode *inode = file_inode(file); + int err; + + flags &= OVL_SETFL_MASK; + + if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) + return -EPERM; + + if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + if (file->f_op->check_flags) { + err = file->f_op->check_flags(flags); + if (err) + return err; + } + + spin_lock(&file->f_lock); + file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; + file->f_iocb_flags = iocb_flags(file); + spin_unlock(&file->f_lock); + + return 0; +} + +static int ovl_real_fdget_meta(const struct file *file, struct fd *real, + bool allow_meta) +{ + struct dentry *dentry = file_dentry(file); + struct path realpath; + + real->flags = 0; + real->file = file->private_data; + + if (allow_meta) + ovl_path_real(dentry, &realpath); + else + ovl_path_realdata(dentry, &realpath); + + /* Has it been copied up since we'd opened it? */ + if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) { + real->flags = FDPUT_FPUT; + real->file = ovl_open_realfile(file, &realpath); + + return PTR_ERR_OR_ZERO(real->file); + } + + /* Did the flags change since open? */ + if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS)) + return ovl_change_flags(real->file, file->f_flags); + + return 0; +} + +static int ovl_real_fdget(const struct file *file, struct fd *real) +{ + if (d_is_dir(file_dentry(file))) { + real->flags = 0; + real->file = ovl_dir_real_file(file, false); + + return PTR_ERR_OR_ZERO(real->file); + } + + return ovl_real_fdget_meta(file, real, false); +} + +static int ovl_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file_dentry(file); + struct file *realfile; + struct path realpath; + int err; + + err = ovl_maybe_copy_up(dentry, file->f_flags); + if (err) + return err; + + /* No longer need these flags, so don't pass them on to underlying fs */ + file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + + ovl_path_realdata(dentry, &realpath); + realfile = ovl_open_realfile(file, &realpath); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + file->private_data = realfile; + + return 0; +} + +static int ovl_release(struct inode *inode, struct file *file) +{ + fput(file->private_data); + + return 0; +} + +static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + loff_t ret; + + /* + * The two special cases below do not need to involve real fs, + * so we can optimizing concurrent callers. + */ + if (offset == 0) { + if (whence == SEEK_CUR) + return file->f_pos; + + if (whence == SEEK_SET) + return vfs_setpos(file, 0, 0); + } + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + /* + * Overlay file f_pos is the master copy that is preserved + * through copy up and modified on read/write, but only real + * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose + * limitations that are more strict than ->s_maxbytes for specific + * files, so we use the real file to perform seeks. + */ + ovl_inode_lock(inode); + real.file->f_pos = file->f_pos; + + old_cred = ovl_override_creds(inode->i_sb); + ret = vfs_llseek(real.file, offset, whence); + revert_creds(old_cred); + + file->f_pos = real.file->f_pos; + ovl_inode_unlock(inode); + + fdput(real); + + return ret; +} + +static void ovl_file_accessed(struct file *file) +{ + struct inode *inode, *upperinode; + + if (file->f_flags & O_NOATIME) + return; + + inode = file_inode(file); + upperinode = ovl_inode_upper(inode); + + if (!upperinode) + return; + + if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) || + !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) { + inode->i_mtime = upperinode->i_mtime; + inode->i_ctime = upperinode->i_ctime; + } + + touch_atime(&file->f_path); +} + +static rwf_t ovl_iocb_to_rwf(int ifl) +{ + rwf_t flags = 0; + + if (ifl & IOCB_NOWAIT) + flags |= RWF_NOWAIT; + if (ifl & IOCB_HIPRI) + flags |= RWF_HIPRI; + if (ifl & IOCB_DSYNC) + flags |= RWF_DSYNC; + if (ifl & IOCB_SYNC) + flags |= RWF_SYNC; + + return flags; +} + +static inline void ovl_aio_put(struct ovl_aio_req *aio_req) +{ + if (refcount_dec_and_test(&aio_req->ref)) { + fput(aio_req->iocb.ki_filp); + kmem_cache_free(ovl_aio_request_cachep, aio_req); + } +} + +static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) +{ + struct kiocb *iocb = &aio_req->iocb; + struct kiocb *orig_iocb = aio_req->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(orig_iocb->ki_filp); + + /* Actually acquired in ovl_write_iter() */ + __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, + SB_FREEZE_WRITE); + file_end_write(iocb->ki_filp); + ovl_copyattr(inode); + } + + orig_iocb->ki_pos = iocb->ki_pos; + ovl_aio_put(aio_req); +} + +static void ovl_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct ovl_aio_req *aio_req = container_of(iocb, + struct ovl_aio_req, iocb); + struct kiocb *orig_iocb = aio_req->orig_iocb; + + ovl_aio_cleanup_handler(aio_req); + orig_iocb->ki_complete(orig_iocb, res); +} + +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + if (!iov_iter_count(iter)) + return 0; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + !(real.file->f_mode & FMODE_CAN_ODIRECT)) + goto out_fdput; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb->ki_flags)); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: + revert_creds(old_cred); + ovl_file_accessed(file); +out_fdput: + fdput(real); + + return ret; +} + +static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + ssize_t ret; + int ifl = iocb->ki_flags; + + if (!iov_iter_count(iter)) + return 0; + + inode_lock(inode); + /* Update mode */ + ovl_copyattr(inode); + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(file, &real); + if (ret) + goto out_unlock; + + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + !(real.file->f_mode & FMODE_CAN_ODIRECT)) + goto out_fdput; + + if (!ovl_should_sync(OVL_FS(inode->i_sb))) + ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + if (is_sync_kiocb(iocb)) { + file_start_write(real.file); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(ifl)); + file_end_write(real.file); + /* Update size */ + ovl_copyattr(inode); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + file_start_write(real.file); + /* Pacify lockdep, same trick as done in aio_write() */ + __sb_writers_release(file_inode(real.file)->i_sb, + SB_FREEZE_WRITE); + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); + aio_req->iocb.ki_flags = ifl; + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: + revert_creds(old_cred); +out_fdput: + fdput(real); + +out_unlock: + inode_unlock(inode); + + return ret; +} + +/* + * Calling iter_file_splice_write() directly from overlay's f_op may deadlock + * due to lock order inversion between pipe->mutex in iter_file_splice_write() + * and file_start_write(real.file) in ovl_write_iter(). + * + * So do everything ovl_write_iter() does and call iter_file_splice_write() on + * the real file. + */ +static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fd real; + const struct cred *old_cred; + struct inode *inode = file_inode(out); + ssize_t ret; + + inode_lock(inode); + /* Update mode */ + ovl_copyattr(inode); + ret = file_remove_privs(out); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(out, &real); + if (ret) + goto out_unlock; + + old_cred = ovl_override_creds(inode->i_sb); + file_start_write(real.file); + + ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); + + file_end_write(real.file); + /* Update size */ + ovl_copyattr(inode); + revert_creds(old_cred); + fdput(real); + +out_unlock: + inode_unlock(inode); + + return ret; +} + +static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); + if (ret <= 0) + return ret; + + ret = ovl_real_fdget_meta(file, &real, !datasync); + if (ret) + return ret; + + /* Don't sync lower file for fear of receiving EROFS error */ + if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) { + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fsync_range(real.file, start, end, datasync); + revert_creds(old_cred); + } + + fdput(real); + + return ret; +} + +static int ovl_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *realfile = file->private_data; + const struct cred *old_cred; + int ret; + + if (!realfile->f_op->mmap) + return -ENODEV; + + if (WARN_ON(file != vma->vm_file)) + return -EIO; + + vma_set_file(vma, realfile); + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + ovl_file_accessed(file); + + return ret; +} + +static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + int ret; + + inode_lock(inode); + /* Update mode */ + ovl_copyattr(inode); + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(file, &real); + if (ret) + goto out_unlock; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fallocate(real.file, mode, offset, len); + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(inode); + + fdput(real); + +out_unlock: + inode_unlock(inode); + + return ret; +} + +static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fadvise(real.file, offset, len, advice); + revert_creds(old_cred); + + fdput(real); + + return ret; +} + +enum ovl_copyop { + OVL_COPY, + OVL_CLONE, + OVL_DEDUPE, +}; + +static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int flags, enum ovl_copyop op) +{ + struct inode *inode_out = file_inode(file_out); + struct fd real_in, real_out; + const struct cred *old_cred; + loff_t ret; + + inode_lock(inode_out); + if (op != OVL_DEDUPE) { + /* Update mode */ + ovl_copyattr(inode_out); + ret = file_remove_privs(file_out); + if (ret) + goto out_unlock; + } + + ret = ovl_real_fdget(file_out, &real_out); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(file_in, &real_in); + if (ret) { + fdput(real_out); + goto out_unlock; + } + + old_cred = ovl_override_creds(file_inode(file_out)->i_sb); + switch (op) { + case OVL_COPY: + ret = vfs_copy_file_range(real_in.file, pos_in, + real_out.file, pos_out, len, flags); + break; + + case OVL_CLONE: + ret = vfs_clone_file_range(real_in.file, pos_in, + real_out.file, pos_out, len, flags); + break; + + case OVL_DEDUPE: + ret = vfs_dedupe_file_range_one(real_in.file, pos_in, + real_out.file, pos_out, len, + flags); + break; + } + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(inode_out); + + fdput(real_in); + fdput(real_out); + +out_unlock: + inode_unlock(inode_out); + + return ret; +} + +static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags, + OVL_COPY); +} + +static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + enum ovl_copyop op; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + op = OVL_DEDUPE; + else + op = OVL_CLONE; + + /* + * Don't copy up because of a dedupe request, this wouldn't make sense + * most of the time (data would be duplicated instead of deduplicated). + */ + if (op == OVL_DEDUPE && + (!ovl_inode_upper(file_inode(file_in)) || + !ovl_inode_upper(file_inode(file_out)))) + return -EPERM; + + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, + remap_flags, op); +} + +static int ovl_flush(struct file *file, fl_owner_t id) +{ + struct fd real; + const struct cred *old_cred; + int err; + + err = ovl_real_fdget(file, &real); + if (err) + return err; + + if (real.file->f_op->flush) { + old_cred = ovl_override_creds(file_inode(file)->i_sb); + err = real.file->f_op->flush(real.file, id); + revert_creds(old_cred); + } + fdput(real); + + return err; +} + +const struct file_operations ovl_file_operations = { + .open = ovl_open, + .release = ovl_release, + .llseek = ovl_llseek, + .read_iter = ovl_read_iter, + .write_iter = ovl_write_iter, + .fsync = ovl_fsync, + .mmap = ovl_mmap, + .fallocate = ovl_fallocate, + .fadvise = ovl_fadvise, + .flush = ovl_flush, + .splice_read = generic_file_splice_read, + .splice_write = ovl_splice_write, + + .copy_file_range = ovl_copy_file_range, + .remap_file_range = ovl_remap_file_range, +}; + +int __init ovl_aio_request_cache_init(void) +{ + ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", + sizeof(struct ovl_aio_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ovl_aio_request_cachep) + return -ENOMEM; + + return 0; +} + +void ovl_aio_request_cache_destroy(void) +{ + kmem_cache_destroy(ovl_aio_request_cachep); +} diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000..c062f7e2e --- /dev/null +++ b/fs/overlayfs/inode.c @@ -0,0 +1,1285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (C) 2011 Novell Inc. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/ratelimit.h> +#include <linux/fiemap.h> +#include <linux/fileattr.h> +#include <linux/security.h> +#include <linux/namei.h> +#include "overlayfs.h" + + +int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) +{ + int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + bool full_copy_up = false; + struct dentry *upperdentry; + const struct cred *old_cred; + + err = setattr_prepare(&init_user_ns, dentry, attr); + if (err) + return err; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (attr->ia_valid & ATTR_SIZE) { + /* Truncate should trigger data copy up as well */ + full_copy_up = true; + } + + if (!full_copy_up) + err = ovl_copy_up(dentry); + else + err = ovl_copy_up_with_data(dentry); + if (!err) { + struct inode *winode = NULL; + + upperdentry = ovl_dentry_upper(dentry); + + if (attr->ia_valid & ATTR_SIZE) { + winode = d_inode(upperdentry); + err = get_write_access(winode); + if (err) + goto out_drop_write; + } + + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + + /* + * We might have to translate ovl file into real file object + * once use cases emerge. For now, simply don't let underlying + * filesystem rely on attr->ia_file + */ + attr->ia_valid &= ~ATTR_FILE; + + /* + * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN + * set. Overlayfs does not pass O_TRUNC flag to underlying + * filesystem during open -> do not pass ATTR_OPEN. This + * disables optimization in fuse which assumes open(O_TRUNC) + * already set file size to 0. But we never passed O_TRUNC to + * fuse. So by clearing ATTR_OPEN, fuse will be forced to send + * setattr request to server. + */ + attr->ia_valid &= ~ATTR_OPEN; + + inode_lock(upperdentry->d_inode); + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_do_notify_change(ofs, upperdentry, attr); + revert_creds(old_cred); + if (!err) + ovl_copyattr(dentry->d_inode); + inode_unlock(upperdentry->d_inode); + + if (winode) + put_write_access(winode); + } +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) +{ + bool samefs = ovl_same_fs(dentry->d_sb); + unsigned int xinobits = ovl_xino_bits(dentry->d_sb); + unsigned int xinoshift = 64 - xinobits; + + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + return; + } else if (xinobits) { + /* + * All inode numbers of underlying fs should not be using the + * high xinobits, so we use high xinobits to partition the + * overlay st_ino address space. The high bits holds the fsid + * (upper fsid is 0). The lowest xinobit is reserved for mapping + * the non-persistent inode numbers range in case of overflow. + * This way all overlay inode numbers are unique and use the + * overlay st_dev. + */ + if (likely(!(stat->ino >> xinoshift))) { + stat->ino |= ((u64)fsid) << (xinoshift + 1); + stat->dev = dentry->d_sb->s_dev; + return; + } else if (ovl_xino_warn(dentry->d_sb)) { + pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + dentry, stat->ino, xinobits); + } + } + + /* The inode could not be mapped to a unified st_ino address space */ + if (S_ISDIR(dentry->d_inode->i_mode)) { + /* + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. + */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } else { + /* + * For non-samefs setup, if we cannot map all layers st_ino + * to a unified address space, we need to make sure that st_dev + * is unique per underlying fs, so we use the unique anonymous + * bdev assigned to the underlying fs. + */ + stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; + } +} + +int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) +{ + struct dentry *dentry = path->dentry; + enum ovl_path_type type; + struct path realpath; + const struct cred *old_cred; + struct inode *inode = d_inode(dentry); + bool is_dir = S_ISDIR(inode->i_mode); + int fsid = 0; + int err; + bool metacopy_blocks = false; + + metacopy_blocks = ovl_is_metacopy_dentry(dentry); + + type = ovl_path_real(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&realpath, stat, request_mask, flags); + if (err) + goto out; + + /* Report the effective immutable/append-only STATX flags */ + generic_fill_statx_attr(inode, stat); + + /* + * For non-dir or same fs, we use st_ino of the copy up origin. + * This guaranties constant st_dev/st_ino across copy up. + * With xino feature and non-samefs, we use st_ino of the copy up + * origin masked with high bits that represent the layer id. + * + * If lower filesystem supports NFS file handles, this also guaranties + * persistent st_ino across mount cycle. + */ + if (!is_dir || ovl_same_dev(dentry->d_sb)) { + if (!OVL_TYPE_UPPER(type)) { + fsid = ovl_layer_lower(dentry)->fsid; + } else if (OVL_TYPE_ORIGIN(type)) { + struct kstat lowerstat; + u32 lowermask = STATX_INO | STATX_BLOCKS | + (!is_dir ? STATX_NLINK : 0); + + ovl_path_lower(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerstat, + lowermask, flags); + if (err) + goto out; + + /* + * Lower hardlinks may be broken on copy up to different + * upper files, so we cannot use the lower origin st_ino + * for those different files, even for the same fs case. + * + * Similarly, several redirected dirs can point to the + * same dir on a lower layer. With the "verify_lower" + * feature, we do not use the lower origin st_ino, if + * we haven't verified that this redirect is unique. + * + * With inodes index enabled, it is safe to use st_ino + * of an indexed origin. The index validates that the + * upper hardlink is not broken and that a redirected + * dir is the only redirect to that origin. + */ + if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || + (!ovl_verify_lower(dentry->d_sb) && + (is_dir || lowerstat.nlink == 1))) { + fsid = ovl_layer_lower(dentry)->fsid; + stat->ino = lowerstat.ino; + } + + /* + * If we are querying a metacopy dentry and lower + * dentry is data dentry, then use the blocks we + * queried just now. We don't have to do additional + * vfs_getattr(). If lower itself is metacopy, then + * additional vfs_getattr() is unavoidable. + */ + if (metacopy_blocks && + realpath.dentry == ovl_dentry_lowerdata(dentry)) { + stat->blocks = lowerstat.blocks; + metacopy_blocks = false; + } + } + + if (metacopy_blocks) { + /* + * If lower is not same as lowerdata or if there was + * no origin on upper, we can end up here. + */ + struct kstat lowerdatastat; + u32 lowermask = STATX_BLOCKS; + + ovl_path_lowerdata(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerdatastat, + lowermask, flags); + if (err) + goto out; + stat->blocks = lowerdatastat.blocks; + } + } + + ovl_map_dev_ino(dentry, stat, fsid); + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (is_dir && OVL_TYPE_MERGE(type)) + stat->nlink = 1; + + /* + * Return the overlay inode nlinks for indexed upper inodes. + * Overlay inode nlink counts the union of the upper hardlinks + * and non-covered lower hardlinks. It does not include the upper + * index hardlink. + */ + if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry))) + stat->nlink = dentry->d_inode->i_nlink; + +out: + revert_creds(old_cred); + + return err; +} + +int ovl_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + struct inode *upperinode = ovl_inode_upper(inode); + struct inode *realinode; + struct path realpath; + const struct cred *old_cred; + int err; + + /* Careful in RCU walk mode */ + realinode = ovl_i_path_real(inode, &realpath); + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + return -ECHILD; + } + + /* + * Check overlay inode with the creds of task and underlying inode + * with creds of mounter + */ + err = generic_permission(&init_user_ns, inode, mask); + if (err) + return err; + + old_cred = ovl_override_creds(inode->i_sb); + if (!upperinode && + !special_file(realinode->i_mode) && mask & MAY_WRITE) { + mask &= ~(MAY_WRITE | MAY_APPEND); + /* Make sure mounter can read file for copy up later */ + mask |= MAY_READ; + } + err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask); + revert_creds(old_cred); + + return err; +} + +static const char *ovl_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + const struct cred *old_cred; + const char *p; + + if (!dentry) + return ERR_PTR(-ECHILD); + + old_cred = ovl_override_creds(dentry->d_sb); + p = vfs_get_link(ovl_dentry_real(dentry), done); + revert_creds(old_cred); + return p; +} + +bool ovl_is_private_xattr(struct super_block *sb, const char *name) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + if (ofs->config.userxattr) + return strncmp(name, OVL_XATTR_USER_PREFIX, + sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0; + else + return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, + sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0; +} + +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + struct path realpath; + const struct cred *old_cred; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (!value && !upperdentry) { + ovl_path_lower(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0); + revert_creds(old_cred); + if (err < 0) + goto out_drop_write; + } + + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + realdentry = ovl_dentry_upper(dentry); + } + + old_cred = ovl_override_creds(dentry->d_sb); + if (value) { + err = ovl_do_setxattr(ofs, realdentry, name, value, size, + flags); + } else { + WARN_ON(flags != XATTR_REPLACE); + err = ovl_do_removexattr(ofs, realdentry, name); + } + revert_creds(old_cred); + + /* copy c/mtime */ + ovl_copyattr(inode); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, + void *value, size_t size) +{ + ssize_t res; + const struct cred *old_cred; + struct path realpath; + + ovl_i_path_real(inode, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size); + revert_creds(old_cred); + return res; +} + +static bool ovl_can_list(struct super_block *sb, const char *s) +{ + /* Never list private (.overlay) */ + if (ovl_is_private_xattr(sb, s)) + return false; + + /* List all non-trusted xattrs */ + if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + return true; + + /* list other trusted for superuser only */ + return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + struct dentry *realdentry = ovl_dentry_real(dentry); + ssize_t res; + size_t len; + char *s; + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_listxattr(realdentry, list, size); + revert_creds(old_cred); + if (res <= 0 || size == 0) + return res; + + /* filter out private xattrs */ + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + + len -= slen; + if (!ovl_can_list(dentry->d_sb, s)) { + res -= slen; + memmove(s, s + slen, len); + } else { + s += slen; + } + } + + return res; +} + +#ifdef CONFIG_FS_POSIX_ACL +/* + * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone + * of the POSIX ACLs retrieved from the lower layer to this function to not + * alter the POSIX ACLs for the underlying filesystem. + */ +static void ovl_idmap_posix_acl(struct inode *realinode, + struct user_namespace *mnt_userns, + struct posix_acl *acl) +{ + struct user_namespace *fs_userns = i_user_ns(realinode); + + for (unsigned int i = 0; i < acl->a_count; i++) { + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + struct posix_acl_entry *e = &acl->a_entries[i]; + switch (e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid); + e->e_uid = vfsuid_into_kuid(vfsuid); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid); + e->e_gid = vfsgid_into_kgid(vfsgid); + break; + } + } +} + +/* + * When the relevant layer is an idmapped mount we need to take the idmapping + * of the layer into account and translate any ACL_{GROUP,USER} values + * according to the idmapped mount. + * + * We cannot alter the ACLs returned from the relevant layer as that would + * alter the cached values filesystem wide for the lower filesystem. Instead we + * can clone the ACLs and then apply the relevant idmapping of the layer. + * + * This is obviously only relevant when idmapped layers are used. + */ +struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) +{ + struct inode *realinode; + struct posix_acl *acl, *clone; + struct path realpath; + + /* Careful in RCU walk mode */ + realinode = ovl_i_path_real(inode, &realpath); + if (!realinode) { + WARN_ON(!rcu); + return ERR_PTR(-ECHILD); + } + + if (!IS_POSIXACL(realinode)) + return NULL; + + if (rcu) { + acl = get_cached_acl_rcu(realinode, type); + } else { + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + } + /* + * If there are no POSIX ACLs, or we encountered an error, + * or the layer isn't idmapped we don't need to do anything. + */ + if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) + return acl; + + /* + * We only get here if the layer is idmapped. So drop out of RCU path + * walk so we can clone the ACLs. There's no need to release the ACLs + * since get_cached_acl_rcu() doesn't take a reference on the ACLs. + */ + if (rcu) + return ERR_PTR(-ECHILD); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + clone = ERR_PTR(-ENOMEM); + else + ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone); + /* + * Since we're not in RCU path walk we always need to release the + * original ACLs. + */ + posix_acl_release(acl); + return clone; +} +#endif + +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) +{ + if (flags & S_ATIME) { + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct path upperpath = { + .mnt = ovl_upper_mnt(ofs), + .dentry = ovl_upperdentry_dereference(OVL_I(inode)), + }; + + if (upperpath.dentry) { + touch_atime(&upperpath); + inode->i_atime = d_inode(upperpath.dentry)->i_atime; + } + } + return 0; +} + +static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + int err; + struct inode *realinode = ovl_inode_realdata(inode); + const struct cred *old_cred; + + if (!realinode->i_op->fiemap) + return -EOPNOTSUPP; + + old_cred = ovl_override_creds(inode->i_sb); + err = realinode->i_op->fiemap(realinode, fieinfo, start, len); + revert_creds(old_cred); + + return err; +} + +/* + * Work around the fact that security_file_ioctl() takes a file argument. + * Introducing security_inode_fileattr_get/set() hooks would solve this issue + * properly. + */ +static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa, + bool set) +{ + struct file *file; + unsigned int cmd; + int err; + + file = dentry_open(realpath, O_RDONLY, current_cred()); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (set) + cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS; + else + cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS; + + err = security_file_ioctl(file, cmd, 0); + fput(file); + + return err; +} + +int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) +{ + int err; + + err = ovl_security_fileattr(realpath, fa, true); + if (err) + return err; + + return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa); +} + +int ovl_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct path upperpath; + const struct cred *old_cred; + unsigned int flags; + int err; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry); + if (!err) { + ovl_path_real(dentry, &upperpath); + + old_cred = ovl_override_creds(inode->i_sb); + /* + * Store immutable/append-only flags in xattr and clear them + * in upper fileattr (in case they were set by older kernel) + * so children of "ovl-immutable" directories lower aliases of + * "ovl-immutable" hardlinks could be copied up. + * Clear xattr when flags are cleared. + */ + err = ovl_set_protattr(inode, upperpath.dentry, fa); + if (!err) + err = ovl_real_fileattr_set(&upperpath, fa); + revert_creds(old_cred); + + /* + * Merge real inode flags with inode flags read from + * overlay.protattr xattr + */ + flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK; + + BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK); + flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK; + inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK); + + /* Update ctime */ + ovl_copyattr(inode); + } + ovl_drop_write(dentry); +out: + return err; +} + +/* Convert inode protection flags to fileattr flags */ +static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) +{ + BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL); + BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); + + if (inode->i_flags & S_APPEND) { + fa->flags |= FS_APPEND_FL; + fa->fsx_xflags |= FS_XFLAG_APPEND; + } + if (inode->i_flags & S_IMMUTABLE) { + fa->flags |= FS_IMMUTABLE_FL; + fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; + } +} + +int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa) +{ + int err; + + err = ovl_security_fileattr(realpath, fa, false); + if (err) + return err; + + err = vfs_fileattr_get(realpath->dentry, fa); + if (err == -ENOIOCTLCMD) + err = -ENOTTY; + return err; +} + +int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct path realpath; + const struct cred *old_cred; + int err; + + ovl_path_real(dentry, &realpath); + + old_cred = ovl_override_creds(inode->i_sb); + err = ovl_real_fileattr_get(&realpath, fa); + ovl_fileattr_prot_flags(inode, fa); + revert_creds(old_cred); + + return err; +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, + .fiemap = ovl_fiemap, + .fileattr_get = ovl_fileattr_get, + .fileattr_set = ovl_fileattr_set, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .get_link = ovl_get_link, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .update_time = ovl_update_time, +}; + +static const struct inode_operations ovl_special_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, +}; + +static const struct address_space_operations ovl_aops = { + /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ + .direct_IO = noop_direct_IO, +}; + +/* + * It is possible to stack overlayfs instance on top of another + * overlayfs instance as lower layer. We need to annotate the + * stackable i_mutex locks according to stack level of the super + * block instance. An overlayfs instance can never be in stack + * depth 0 (there is always a real fs below it). An overlayfs + * inode lock will use the lockdep annotation ovl_i_mutex_key[depth]. + * + * For example, here is a snip from /proc/lockdep_chains after + * dir_iterate of nested overlayfs: + * + * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) + * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) + * [...] &type->i_mutex_dir_key (stack_depth=0) + * + * Locking order w.r.t ovl_want_write() is important for nested overlayfs. + * + * This chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * And this chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - lowerinode->i_rwsem (inode_lock[1]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is + * held, because it is in reverse order of the non-nested case using the same + * upper fs: + * - inode->i_rwsem (inode_lock[1]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[1]) + */ +#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH + +static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) +{ +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; + + int depth = inode->i_sb->s_stack_depth - 1; + + if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) + depth = 0; + + if (S_ISDIR(inode->i_mode)) + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); + else + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); + + lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); +#endif +} + +static void ovl_next_ino(struct inode *inode) +{ + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + + inode->i_ino = atomic_long_inc_return(&ofs->last_ino); + if (unlikely(!inode->i_ino)) + inode->i_ino = atomic_long_inc_return(&ofs->last_ino); +} + +static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) +{ + int xinobits = ovl_xino_bits(inode->i_sb); + unsigned int xinoshift = 64 - xinobits; + + /* + * When d_ino is consistent with st_ino (samefs or i_ino has enough + * bits to encode layer), set the same value used for st_ino to i_ino, + * so inode number exposed via /proc/locks and a like will be + * consistent with d_ino and st_ino values. An i_ino value inconsistent + * with d_ino also causes nfsd readdirplus to fail. + */ + inode->i_ino = ino; + if (ovl_same_fs(inode->i_sb)) { + return; + } else if (xinobits && likely(!(ino >> xinoshift))) { + inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); + return; + } + + /* + * For directory inodes on non-samefs with xino disabled or xino + * overflow, we allocate a non-persistent inode number, to be used for + * resolving st_ino collisions in ovl_map_dev_ino(). + * + * To avoid ino collision with legitimate xino values from upper + * layer (fsid 0), use the lowest xinobit to map the non + * persistent inode numbers to the unified st_ino address space. + */ + if (S_ISDIR(inode->i_mode)) { + ovl_next_ino(inode); + if (xinobits) { + inode->i_ino &= ~0UL >> xinobits; + inode->i_ino |= 1UL << xinoshift; + } + } +} + +void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, + unsigned long ino, int fsid) +{ + struct inode *realinode; + struct ovl_inode *oi = OVL_I(inode); + + if (oip->upperdentry) + oi->__upperdentry = oip->upperdentry; + if (oip->lowerpath && oip->lowerpath->dentry) { + oi->lowerpath.dentry = dget(oip->lowerpath->dentry); + oi->lowerpath.layer = oip->lowerpath->layer; + } + if (oip->lowerdata) + oi->lowerdata = igrab(d_inode(oip->lowerdata)); + + realinode = ovl_inode_real(inode); + ovl_copyattr(inode); + ovl_copyflags(realinode, inode); + ovl_map_ino(inode, ino, fsid); +} + +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + inode->i_flags |= S_NOCMTIME; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; +#endif + + ovl_lockdep_annotate_inode_mutex_key(inode); + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_op = &ovl_file_inode_operations; + inode->i_fop = &ovl_file_operations; + inode->i_mapping->a_ops = &ovl_aops; + break; + + case S_IFDIR: + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + default: + inode->i_op = &ovl_special_inode_operations; + init_special_inode(inode, mode, rdev); + break; + } +} + +/* + * With inodes index enabled, an overlay inode nlink counts the union of upper + * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure + * upper inode, the following nlink modifying operations can happen: + * + * 1. Lower hardlink copy up + * 2. Upper hardlink created, unlinked or renamed over + * 3. Lower hardlink whiteout or renamed over + * + * For the first, copy up case, the union nlink does not change, whether the + * operation succeeds or fails, but the upper inode nlink may change. + * Therefore, before copy up, we store the union nlink value relative to the + * lower inode nlink in the index inode xattr .overlay.nlink. + * + * For the second, upper hardlink case, the union nlink should be incremented + * or decremented IFF the operation succeeds, aligned with nlink change of the + * upper inode. Therefore, before link/unlink/rename, we store the union nlink + * value relative to the upper inode nlink in the index inode. + * + * For the last, lower cover up case, we simplify things by preceding the + * whiteout or cover up with copy up. This makes sure that there is an index + * upper inode where the nlink xattr can be stored before the copied up upper + * entry is unlink. + */ +#define OVL_NLINK_ADD_UPPER (1 << 0) + +/* + * On-disk format for indexed nlink: + * + * nlink relative to the upper inode - "U[+-]NUM" + * nlink relative to the lower inode - "L[+-]NUM" + */ + +static int ovl_set_nlink_common(struct dentry *dentry, + struct dentry *realdentry, const char *format) +{ + struct inode *inode = d_inode(dentry); + struct inode *realinode = d_inode(realdentry); + char buf[13]; + int len; + + len = snprintf(buf, sizeof(buf), format, + (int) (inode->i_nlink - realinode->i_nlink)); + + if (WARN_ON(len >= sizeof(buf))) + return -EIO; + + return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), + OVL_XATTR_NLINK, buf, len); +} + +int ovl_set_nlink_upper(struct dentry *dentry) +{ + return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i"); +} + +int ovl_set_nlink_lower(struct dentry *dentry) +{ + return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i"); +} + +unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, + struct dentry *upperdentry, + unsigned int fallback) +{ + int nlink_diff; + int nlink; + char buf[13]; + int err; + + if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) + return fallback; + + err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK, + &buf, sizeof(buf) - 1); + if (err < 0) + goto fail; + + buf[err] = '\0'; + if ((buf[0] != 'L' && buf[0] != 'U') || + (buf[1] != '+' && buf[1] != '-')) + goto fail; + + err = kstrtoint(buf + 1, 10, &nlink_diff); + if (err < 0) + goto fail; + + nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink; + nlink += nlink_diff; + + if (nlink <= 0) + goto fail; + + return nlink; + +fail: + pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", + upperdentry, err); + return fallback; +} + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) + ovl_fill_inode(inode, mode, rdev); + + return inode; +} + +static int ovl_inode_test(struct inode *inode, void *data) +{ + return inode->i_private == data; +} + +static int ovl_inode_set(struct inode *inode, void *data) +{ + inode->i_private = data; + return 0; +} + +static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, + struct dentry *upperdentry, bool strict) +{ + /* + * For directories, @strict verify from lookup path performs consistency + * checks, so NULL lower/upper in dentry must match NULL lower/upper in + * inode. Non @strict verify from NFS handle decode path passes NULL for + * 'unknown' lower/upper. + */ + if (S_ISDIR(inode->i_mode) && strict) { + /* Real lower dir moved to upper layer under us? */ + if (!lowerdentry && ovl_inode_lower(inode)) + return false; + + /* Lookup of an uncovered redirect origin? */ + if (!upperdentry && ovl_inode_upper(inode)) + return false; + } + + /* + * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. + * This happens when finding a copied up overlay inode for a renamed + * or hardlinked overlay dentry and lower dentry cannot be followed + * by origin because lower fs does not support file handles. + */ + if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) + return false; + + /* + * Allow non-NULL __upperdentry in inode even if upperdentry is NULL. + * This happens when finding a lower alias for a copied up hard link. + */ + if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry)) + return false; + + return true; +} + +struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, + bool is_upper) +{ + struct inode *inode, *key = d_inode(real); + + inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); + if (!inode) + return NULL; + + if (!ovl_verify_inode(inode, is_upper ? NULL : real, + is_upper ? real : NULL, false)) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir) +{ + struct inode *key = d_inode(dir); + struct inode *trap; + bool res; + + trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); + if (!trap) + return false; + + res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) && + !ovl_inode_lower(trap); + + iput(trap); + return res; +} + +/* + * Create an inode cache entry for layer root dir, that will intentionally + * fail ovl_verify_inode(), so any lookup that will find some layer root + * will fail. + */ +struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) +{ + struct inode *key = d_inode(dir); + struct inode *trap; + + if (!d_is_dir(dir)) + return ERR_PTR(-ENOTDIR); + + trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test, + ovl_inode_set, key); + if (!trap) + return ERR_PTR(-ENOMEM); + + if (!(trap->i_state & I_NEW)) { + /* Conflicting layer roots? */ + iput(trap); + return ERR_PTR(-ELOOP); + } + + trap->i_mode = S_IFDIR; + trap->i_flags = S_DEAD; + unlock_new_inode(trap); + + return trap; +} + +/* + * Does overlay inode need to be hashed by lower inode? + */ +static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, + struct dentry *lower, bool index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + /* No, if pure upper */ + if (!lower) + return false; + + /* Yes, if already indexed */ + if (index) + return true; + + /* Yes, if won't be copied up */ + if (!ovl_upper_mnt(ofs)) + return true; + + /* No, if lower hardlink is or will be broken on copy up */ + if ((upper || !ovl_indexdir(sb)) && + !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return false; + + /* No, if non-indexed upper with NFS export */ + if (sb->s_export_op && upper) + return false; + + /* Otherwise, hash by lower inode for fsnotify */ + return true; +} + +static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, + struct inode *key) +{ + return newinode ? inode_insert5(newinode, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key) : + iget5_locked(sb, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key); +} + +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip) +{ + struct ovl_fs *ofs = OVL_FS(sb); + struct dentry *upperdentry = oip->upperdentry; + struct ovl_path *lowerpath = oip->lowerpath; + struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; + struct inode *inode; + struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; + struct path realpath = { + .dentry = upperdentry ?: lowerdentry, + .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt, + }; + bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, + oip->index); + int fsid = bylower ? lowerpath->layer->fsid : 0; + bool is_dir; + unsigned long ino = 0; + int err = oip->newinode ? -EEXIST : -ENOMEM; + + if (!realinode) + realinode = d_inode(lowerdentry); + + /* + * Copy up origin (lower) may exist for non-indexed upper, but we must + * not use lower as hash key if this is a broken hardlink. + */ + is_dir = S_ISDIR(realinode->i_mode); + if (upperdentry || bylower) { + struct inode *key = d_inode(bylower ? lowerdentry : + upperdentry); + unsigned int nlink = is_dir ? 1 : realinode->i_nlink; + + inode = ovl_iget5(sb, oip->newinode, key); + if (!inode) + goto out_err; + if (!(inode->i_state & I_NEW)) { + /* + * Verify that the underlying files stored in the inode + * match those in the dentry. + */ + if (!ovl_verify_inode(inode, lowerdentry, upperdentry, + true)) { + iput(inode); + err = -ESTALE; + goto out_err; + } + + dput(upperdentry); + kfree(oip->redirect); + goto out; + } + + /* Recalculate nlink for non-dir due to indexing */ + if (!is_dir) + nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry, + nlink); + set_nlink(inode, nlink); + ino = key->i_ino; + } else { + /* Lower hardlink that will be broken on copy up */ + inode = new_inode(sb); + if (!inode) { + err = -ENOMEM; + goto out_err; + } + ino = realinode->i_ino; + fsid = lowerpath->layer->fsid; + } + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); + ovl_inode_init(inode, oip, ino, fsid); + + if (upperdentry && ovl_is_impuredir(sb, upperdentry)) + ovl_set_flag(OVL_IMPURE, inode); + + if (oip->index) + ovl_set_flag(OVL_INDEX, inode); + + OVL_I(inode)->redirect = oip->redirect; + + if (bylower) + ovl_set_flag(OVL_CONST_INO, inode); + + /* Check for non-merge dir that may have whiteouts */ + if (is_dir) { + if (((upperdentry && lowerdentry) || oip->numlower > 1) || + ovl_path_check_origin_xattr(ofs, &realpath)) { + ovl_set_flag(OVL_WHITEOUTS, inode); + } + } + + /* Check for immutable/append-only inode flags in xattr */ + if (upperdentry) + ovl_check_protattr(inode, upperdentry); + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); +out: + return inode; + +out_err: + pr_warn_ratelimited("failed to get inode (%i)\n", err); + inode = ERR_PTR(err); + goto out; +} diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c new file mode 100644 index 000000000..655d08d6f --- /dev/null +++ b/fs/overlayfs/namei.c @@ -0,0 +1,1203 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + */ + +#include <linux/fs.h> +#include <linux/cred.h> +#include <linux/ctype.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/ratelimit.h> +#include <linux/mount.h> +#include <linux/exportfs.h> +#include "overlayfs.h" + +struct ovl_lookup_data { + struct super_block *sb; + struct vfsmount *mnt; + struct qstr name; + bool is_dir; + bool opaque; + bool stop; + bool last; + char *redirect; + bool metacopy; +}; + +static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d, + size_t prelen, const char *post) +{ + int res; + char *buf; + struct ovl_fs *ofs = OVL_FS(d->sb); + + buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post)); + if (IS_ERR_OR_NULL(buf)) + return PTR_ERR(buf); + + if (buf[0] == '/') { + /* + * One of the ancestor path elements in an absolute path + * lookup in ovl_lookup_layer() could have been opaque and + * that will stop further lookup in lower layers (d->stop=true) + * But we have found an absolute redirect in descendant path + * element and that should force continue lookup in lower + * layers (reset d->stop). + */ + d->stop = false; + } else { + res = strlen(buf) + 1; + memmove(buf + prelen, buf, res); + memcpy(buf, d->name.name, prelen); + } + + strcat(buf, post); + kfree(d->redirect); + d->redirect = buf; + d->name.name = d->redirect; + d->name.len = strlen(d->redirect); + + return 0; +} + +static int ovl_acceptable(void *ctx, struct dentry *dentry) +{ + /* + * A non-dir origin may be disconnected, which is fine, because + * we only need it for its unique inode number. + */ + if (!d_is_dir(dentry)) + return 1; + + /* Don't decode a deleted empty directory */ + if (d_unhashed(dentry)) + return 0; + + /* Check if directory belongs to the layer we are decoding from */ + return is_subdir(dentry, ((struct vfsmount *)ctx)->mnt_root); +} + +/* + * Check validity of an overlay file handle buffer. + * + * Return 0 for a valid file handle. + * Return -ENODATA for "origin unknown". + * Return <0 for an invalid file handle. + */ +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) +{ + if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) + return -EINVAL; + + if (fb->magic != OVL_FH_MAGIC) + return -EINVAL; + + /* Treat larger version and unknown flags as "origin unknown" */ + if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) + return -ENODATA; + + /* Treat endianness mismatch as "origin unknown" */ + if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + return -ENODATA; + + return 0; +} + +static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry, + enum ovl_xattr ox) +{ + int res, err; + struct ovl_fh *fh = NULL; + + res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return NULL; + goto fail; + } + /* Zero size value means "copied up but origin unknown" */ + if (res == 0) + return NULL; + + fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res); + if (res < 0) + goto fail; + + err = ovl_check_fb_len(&fh->fb, res); + if (err < 0) { + if (err == -ENODATA) + goto out; + goto invalid; + } + + return fh; + +out: + kfree(fh); + return NULL; + +fail: + pr_warn_ratelimited("failed to get origin (%i)\n", res); + goto out; +invalid: + pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh); + goto out; +} + +struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, + struct vfsmount *mnt, bool connected) +{ + struct dentry *real; + int bytes; + + if (!capable(CAP_DAC_READ_SEARCH)) + return NULL; + + /* + * Make sure that the stored uuid matches the uuid of the lower + * layer where file handle will be decoded. + * In case of uuid=off option just make sure that stored uuid is null. + */ + if (ofs->config.uuid ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : + !uuid_is_null(&fh->fb.uuid)) + return NULL; + + bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); + real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, + bytes >> 2, (int)fh->fb.type, + connected ? ovl_acceptable : NULL, mnt); + if (IS_ERR(real)) { + /* + * Treat stale file handle to lower file as "origin unknown". + * upper file handle could become stale when upper file is + * unlinked and this information is needed to handle stale + * index entries correctly. + */ + if (real == ERR_PTR(-ESTALE) && + !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) + real = NULL; + return real; + } + + if (ovl_dentry_weird(real)) { + dput(real); + return NULL; + } + + return real; +} + +static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path) +{ + return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); +} + +static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, + const char *name, + struct dentry *base, int len, + bool drop_negative) +{ + struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len); + + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + if (drop_negative && ret->d_lockref.count == 1) { + spin_lock(&ret->d_lock); + /* Recheck condition under lock */ + if (d_is_negative(ret) && ret->d_lockref.count == 1) + __d_drop(ret); + spin_unlock(&ret->d_lock); + } + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} + +static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, + const char *name, unsigned int namelen, + size_t prelen, const char *post, + struct dentry **ret, bool drop_negative) +{ + struct dentry *this; + struct path path; + int err; + bool last_element = !post[0]; + + this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + if (err == -ENOENT || err == -ENAMETOOLONG) + goto out; + goto out_err; + } + + if (ovl_dentry_weird(this)) { + /* Don't support traversing automounts and other weirdness */ + err = -EREMOTE; + goto out_err; + } + if (ovl_is_whiteout(this)) { + d->stop = d->opaque = true; + goto put_and_out; + } + /* + * This dentry should be a regular file if previous layer lookup + * found a metacopy dentry. + */ + if (last_element && d->metacopy && !d_is_reg(this)) { + d->stop = true; + goto put_and_out; + } + + path.dentry = this; + path.mnt = d->mnt; + if (!d_can_lookup(this)) { + if (d->is_dir || !last_element) { + d->stop = true; + goto put_and_out; + } + err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path); + if (err < 0) + goto out_err; + + d->metacopy = err; + d->stop = !d->metacopy; + if (!d->metacopy || d->last) + goto out; + } else { + if (ovl_lookup_trap_inode(d->sb, this)) { + /* Caught in a trap of overlapping layers */ + err = -ELOOP; + goto out_err; + } + + if (last_element) + d->is_dir = true; + if (d->last) + goto out; + + if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { + d->stop = true; + if (last_element) + d->opaque = true; + goto out; + } + } + err = ovl_check_redirect(&path, d, prelen, post); + if (err) + goto out_err; +out: + *ret = this; + return 0; + +put_and_out: + dput(this); + this = NULL; + goto out; + +out_err: + dput(this); + return err; +} + +static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, + struct dentry **ret, bool drop_negative) +{ + /* Counting down from the end, since the prefix can change */ + size_t rem = d->name.len - 1; + struct dentry *dentry = NULL; + int err; + + if (d->name.name[0] != '/') + return ovl_lookup_single(base, d, d->name.name, d->name.len, + 0, "", ret, drop_negative); + + while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { + const char *s = d->name.name + d->name.len - rem; + const char *next = strchrnul(s, '/'); + size_t thislen = next - s; + bool end = !next[0]; + + /* Verify we did not go off the rails */ + if (WARN_ON(s[-1] != '/')) + return -EIO; + + err = ovl_lookup_single(base, d, s, thislen, + d->name.len - rem, next, &base, + drop_negative); + dput(dentry); + if (err) + return err; + dentry = base; + if (end) + break; + + rem -= thislen + 1; + + if (WARN_ON(rem >= d->name.len)) + return -EIO; + } + *ret = dentry; + return 0; +} + + +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, + struct dentry *upperdentry, struct ovl_path **stackp) +{ + struct dentry *origin = NULL; + int i; + + for (i = 1; i < ofs->numlayer; i++) { + /* + * If lower fs uuid is not unique among lower fs we cannot match + * fh->uuid to layer. + */ + if (ofs->layers[i].fsid && + ofs->layers[i].fs->bad_uuid) + continue; + + origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt, + connected); + if (origin) + break; + } + + if (!origin) + return -ESTALE; + else if (IS_ERR(origin)) + return PTR_ERR(origin); + + if (upperdentry && !ovl_is_whiteout(upperdentry) && + inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) + goto invalid; + + if (!*stackp) + *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); + if (!*stackp) { + dput(origin); + return -ENOMEM; + } + **stackp = (struct ovl_path){ + .dentry = origin, + .layer = &ofs->layers[i] + }; + + return 0; + +invalid: + pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", + upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, + d_inode(origin)->i_mode & S_IFMT); + dput(origin); + return -ESTALE; +} + +static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, + struct ovl_path **stackp) +{ + struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN); + int err; + + if (IS_ERR_OR_NULL(fh)) + return PTR_ERR(fh); + + err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); + kfree(fh); + + if (err) { + if (err == -ESTALE) + return 0; + return err; + } + + return 0; +} + +/* + * Verify that @fh matches the file handle stored in xattr @name. + * Return 0 on match, -ESTALE on mismatch, < 0 on error. + */ +static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const struct ovl_fh *fh) +{ + struct ovl_fh *ofh = ovl_get_fh(ofs, dentry, ox); + int err = 0; + + if (!ofh) + return -ENODATA; + + if (IS_ERR(ofh)) + return PTR_ERR(ofh); + + if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) + err = -ESTALE; + + kfree(ofh); + return err; +} + +/* + * Verify that @real dentry matches the file handle stored in xattr @name. + * + * If @set is true and there is no stored file handle, encode @real and store + * file handle in xattr @name. + * + * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. + */ +int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, bool is_upper, + bool set) +{ + struct inode *inode; + struct ovl_fh *fh; + int err; + + fh = ovl_encode_real_fh(ofs, real, is_upper); + err = PTR_ERR(fh); + if (IS_ERR(fh)) { + fh = NULL; + goto fail; + } + + err = ovl_verify_fh(ofs, dentry, ox, fh); + if (set && err == -ENODATA) + err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + if (err) + goto fail; + +out: + kfree(fh); + return err; + +fail: + inode = d_inode(real); + pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", + is_upper ? "upper" : "origin", real, + inode ? inode->i_ino : 0, err); + goto out; +} + +/* Get upper dentry from index */ +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) +{ + struct ovl_fh *fh; + struct dentry *upper; + + if (!d_is_dir(index)) + return dget(index); + + fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER); + if (IS_ERR_OR_NULL(fh)) + return ERR_CAST(fh); + + upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); + kfree(fh); + + if (IS_ERR_OR_NULL(upper)) + return upper ?: ERR_PTR(-ESTALE); + + if (!d_is_dir(upper)) { + pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n", + index, upper); + dput(upper); + return ERR_PTR(-EIO); + } + + return upper; +} + +/* + * Verify that an index entry name matches the origin file handle stored in + * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. + * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. + */ +int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) +{ + struct ovl_fh *fh = NULL; + size_t len; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; + struct dentry *upper = NULL; + int err; + + if (!d_inode(index)) + return 0; + + err = -EINVAL; + if (index->d_name.len < sizeof(struct ovl_fb)*2) + goto fail; + + err = -ENOMEM; + len = index->d_name.len / 2; + fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); + if (!fh) + goto fail; + + err = -EINVAL; + if (hex2bin(fh->buf, index->d_name.name, len)) + goto fail; + + err = ovl_check_fb_len(&fh->fb, len); + if (err) + goto fail; + + /* + * Whiteout index entries are used as an indication that an exported + * overlay file handle should be treated as stale (i.e. after unlink + * of the overlay inode). These entries contain no origin xattr. + */ + if (ovl_is_whiteout(index)) + goto out; + + /* + * Verifying directory index entries are not stale is expensive, so + * only verify stale dir index if NFS export is enabled. + */ + if (d_is_dir(index) && !ofs->config.nfs_export) + goto out; + + /* + * Directory index entries should have 'upper' xattr pointing to the + * real upper dir. Non-dir index entries are hardlinks to the upper + * real inode. For non-dir index, we can read the copy up origin xattr + * directly from the index dentry, but for dir index we first need to + * decode the upper directory. + */ + upper = ovl_index_upper(ofs, index); + if (IS_ERR_OR_NULL(upper)) { + err = PTR_ERR(upper); + /* + * Directory index entries with no 'upper' xattr need to be + * removed. When dir index entry has a stale 'upper' xattr, + * we assume that upper dir was removed and we treat the dir + * index as orphan entry that needs to be whited out. + */ + if (err == -ESTALE) + goto orphan; + else if (!err) + err = -ESTALE; + goto fail; + } + + err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh); + dput(upper); + if (err) + goto fail; + + /* Check if non-dir index is orphan and don't warn before cleaning it */ + if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { + err = ovl_check_origin_fh(ofs, fh, false, index, &stack); + if (err) + goto fail; + + if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0) + goto orphan; + } + +out: + dput(origin.dentry); + kfree(fh); + return err; + +fail: + pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n", + index, d_inode(index)->i_mode & S_IFMT, err); + goto out; + +orphan: + pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n", + index, d_inode(index)->i_mode & S_IFMT, + d_inode(index)->i_nlink); + err = -ENOENT; + goto out; +} + +static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) +{ + char *n, *s; + + n = kcalloc(fh->fb.len, 2, GFP_KERNEL); + if (!n) + return -ENOMEM; + + s = bin2hex(n, fh->buf, fh->fb.len); + *name = (struct qstr) QSTR_INIT(n, s - n); + + return 0; + +} + +/* + * Lookup in indexdir for the index entry of a lower real inode or a copy up + * origin inode. The index entry name is the hex representation of the lower + * inode file handle. + * + * If the index dentry in negative, then either no lower aliases have been + * copied up yet, or aliases have been copied up in older kernels and are + * not indexed. + * + * If the index dentry for a copy up origin inode is positive, but points + * to an inode different than the upper inode, then either the upper inode + * has been copied up and not indexed or it was indexed, but since then + * index dir was cleared. Either way, that index cannot be used to identify + * the overlay inode. + */ +int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, + struct qstr *name) +{ + struct ovl_fh *fh; + int err; + + fh = ovl_encode_real_fh(ofs, origin, false); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + err = ovl_get_index_name_fh(fh, name); + + kfree(fh); + return err; +} + +/* Lookup index by file handle for NFS export */ +struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) +{ + struct dentry *index; + struct qstr name; + int err; + + err = ovl_get_index_name_fh(fh, &name); + if (err) + return ERR_PTR(err); + + index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); + kfree(name.name); + if (IS_ERR(index)) { + if (PTR_ERR(index) == -ENOENT) + index = NULL; + return index; + } + + if (ovl_is_whiteout(index)) + err = -ESTALE; + else if (ovl_dentry_weird(index)) + err = -EIO; + else + return index; + + dput(index); + return ERR_PTR(err); +} + +struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *origin, bool verify) +{ + struct dentry *index; + struct inode *inode; + struct qstr name; + bool is_dir = d_is_dir(origin); + int err; + + err = ovl_get_index_name(ofs, origin, &name); + if (err) + return ERR_PTR(err); + + index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name, + ofs->indexdir, name.len); + if (IS_ERR(index)) { + err = PTR_ERR(index); + if (err == -ENOENT) { + index = NULL; + goto out; + } + pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" + "overlayfs: mount with '-o index=off' to disable inodes index.\n", + d_inode(origin)->i_ino, name.len, name.name, + err); + goto out; + } + + inode = d_inode(index); + if (ovl_is_whiteout(index) && !verify) { + /* + * When index lookup is called with !verify for decoding an + * overlay file handle, a whiteout index implies that decode + * should treat file handle as stale and no need to print a + * warning about it. + */ + dput(index); + index = ERR_PTR(-ESTALE); + goto out; + } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || + inode_wrong_type(inode, d_inode(origin)->i_mode)) { + /* + * Index should always be of the same file type as origin + * except for the case of a whiteout index. A whiteout + * index should only exist if all lower aliases have been + * unlinked, which means that finding a lower origin on lookup + * whose index is a whiteout should be treated as an error. + */ + pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", + index, d_inode(index)->i_mode & S_IFMT, + d_inode(origin)->i_mode & S_IFMT); + goto fail; + } else if (is_dir && verify) { + if (!upper) { + pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", + origin, index); + goto fail; + } + + /* Verify that dir index 'upper' xattr points to upper dir */ + err = ovl_verify_upper(ofs, index, upper, false); + if (err) { + if (err == -ESTALE) { + pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", + upper, origin, index); + } + goto fail; + } + } else if (upper && d_inode(upper) != inode) { + goto out_dput; + } +out: + kfree(name.name); + return index; + +out_dput: + dput(index); + index = NULL; + goto out; + +fail: + dput(index); + index = ERR_PTR(-EIO); + goto out; +} + +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. + */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + path->dentry = oe->lowerstack[idx - 1].dentry; + path->mnt = oe->lowerstack[idx - 1].layer->mnt; + + return (idx < oe->numlower) ? idx + 1 : -1; +} + +/* Fix missing 'origin' xattr */ +static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, + struct dentry *lower, struct dentry *upper) +{ + int err; + + if (ovl_check_origin_xattr(ofs, upper)) + return 0; + + err = ovl_want_write(dentry); + if (err) + return err; + + err = ovl_set_origin(ofs, lower, upper); + if (!err) + err = ovl_set_impure(dentry->d_parent, upper->d_parent); + + ovl_drop_write(dentry); + return err; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + const struct cred *old_cred; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; + struct ovl_path *stack = NULL, *origin_path = NULL; + struct dentry *upperdir, *upperdentry = NULL; + struct dentry *origin = NULL; + struct dentry *index = NULL; + unsigned int ctr = 0; + struct inode *inode = NULL; + bool upperopaque = false; + char *upperredirect = NULL; + struct dentry *this; + unsigned int i; + int err; + bool uppermetacopy = false; + struct ovl_lookup_data d = { + .sb = dentry->d_sb, + .name = dentry->d_name, + .is_dir = false, + .opaque = false, + .stop = false, + .last = ofs->config.redirect_follow ? false : !poe->numlower, + .redirect = NULL, + .metacopy = false, + }; + + if (dentry->d_name.len > ofs->namelen) + return ERR_PTR(-ENAMETOOLONG); + + old_cred = ovl_override_creds(dentry->d_sb); + upperdir = ovl_dentry_upper(dentry->d_parent); + if (upperdir) { + d.mnt = ovl_upper_mnt(ofs); + err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); + if (err) + goto out; + + if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) { + dput(upperdentry); + err = -EREMOTE; + goto out; + } + if (upperdentry && !d.is_dir) { + /* + * Lookup copy up origin by decoding origin file handle. + * We may get a disconnected dentry, which is fine, + * because we only need to hold the origin inode in + * cache and use its inode number. We may even get a + * connected dentry, that is not under any of the lower + * layers root. That is also fine for using it's inode + * number - it's the same as if we held a reference + * to a dentry in lower layer that was moved under us. + */ + err = ovl_check_origin(ofs, upperdentry, &origin_path); + if (err) + goto out_put_upper; + + if (d.metacopy) + uppermetacopy = true; + } + + if (d.redirect) { + err = -ENOMEM; + upperredirect = kstrdup(d.redirect, GFP_KERNEL); + if (!upperredirect) + goto out_put_upper; + if (d.redirect[0] == '/') + poe = roe; + } + upperopaque = d.opaque; + } + + if (!d.stop && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(ofs->numlayer - 1, sizeof(struct ovl_path), + GFP_KERNEL); + if (!stack) + goto out_put_upper; + } + + for (i = 0; !d.stop && i < poe->numlower; i++) { + struct ovl_path lower = poe->lowerstack[i]; + + if (!ofs->config.redirect_follow) + d.last = i == poe->numlower - 1; + else + d.last = lower.layer->idx == roe->numlower; + + d.mnt = lower.layer->mnt; + err = ovl_lookup_layer(lower.dentry, &d, &this, false); + if (err) + goto out_put; + + if (!this) + continue; + + if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { + dput(this); + err = -EPERM; + pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); + goto out_put; + } + + /* + * If no origin fh is stored in upper of a merge dir, store fh + * of lower dir and set upper parent "impure". + */ + if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { + err = ovl_fix_origin(ofs, dentry, this, upperdentry); + if (err) { + dput(this); + goto out_put; + } + } + + /* + * When "verify_lower" feature is enabled, do not merge with a + * lower dir that does not match a stored origin xattr. In any + * case, only verified origin is used for index lookup. + * + * For non-dir dentry, if index=on, then ensure origin + * matches the dentry found using path based lookup, + * otherwise error out. + */ + if (upperdentry && !ctr && + ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || + (!d.is_dir && ofs->config.index && origin_path))) { + err = ovl_verify_origin(ofs, upperdentry, this, false); + if (err) { + dput(this); + if (d.is_dir) + break; + goto out_put; + } + origin = this; + } + + if (d.metacopy && ctr) { + /* + * Do not store intermediate metacopy dentries in + * lower chain, except top most lower metacopy dentry. + * Continue the loop so that if there is an absolute + * redirect on this dentry, poe can be reset to roe. + */ + dput(this); + this = NULL; + } else { + stack[ctr].dentry = this; + stack[ctr].layer = lower.layer; + ctr++; + } + + /* + * Following redirects can have security consequences: it's like + * a symlink into the lower layer without the permission checks. + * This is only a problem if the upper layer is untrusted (e.g + * comes from an USB drive). This can allow a non-readable file + * or directory to become readable. + * + * Only following redirects when redirects are enabled disables + * this attack vector when not necessary. + */ + err = -EPERM; + if (d.redirect && !ofs->config.redirect_follow) { + pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", + dentry); + goto out_put; + } + + if (d.stop) + break; + + if (d.redirect && d.redirect[0] == '/' && poe != roe) { + poe = roe; + /* Find the current layer on the root dentry */ + i = lower.layer->idx - 1; + } + } + + /* + * For regular non-metacopy upper dentries, there is no lower + * path based lookup, hence ctr will be zero. If a dentry is found + * using ORIGIN xattr on upper, install it in stack. + * + * For metacopy dentry, path based lookup will find lower dentries. + * Just make sure a corresponding data dentry has been found. + */ + if (d.metacopy || (uppermetacopy && !ctr)) { + pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", + dentry); + err = -EIO; + goto out_put; + } else if (!d.is_dir && upperdentry && !ctr && origin_path) { + if (WARN_ON(stack != NULL)) { + err = -EIO; + goto out_put; + } + stack = origin_path; + ctr = 1; + origin = origin_path->dentry; + origin_path = NULL; + } + + /* + * Always lookup index if there is no-upperdentry. + * + * For the case of upperdentry, we have set origin by now if it + * needed to be set. There are basically three cases. + * + * For directories, lookup index by lower inode and verify it matches + * upper inode. We only trust dir index if we verified that lower dir + * matches origin, otherwise dir index entries may be inconsistent + * and we ignore them. + * + * For regular upper, we already set origin if upper had ORIGIN + * xattr. There is no verification though as there is no path + * based dentry lookup in lower in this case. + * + * For metacopy upper, we set a verified origin already if index + * is enabled and if upper had an ORIGIN xattr. + * + */ + if (!upperdentry && ctr) + origin = stack[0].dentry; + + if (origin && ovl_indexdir(dentry->d_sb) && + (!d.is_dir || ovl_index_all(dentry->d_sb))) { + index = ovl_lookup_index(ofs, upperdentry, origin, true); + if (IS_ERR(index)) { + err = PTR_ERR(index); + index = NULL; + goto out_put; + } + } + + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr); + dentry->d_fsdata = oe; + + if (upperopaque) + ovl_dentry_set_opaque(dentry); + + if (upperdentry) + ovl_dentry_set_upper_alias(dentry); + else if (index) { + struct path upperpath = { + .dentry = upperdentry = dget(index), + .mnt = ovl_upper_mnt(ofs), + }; + + upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); + if (IS_ERR(upperredirect)) { + err = PTR_ERR(upperredirect); + upperredirect = NULL; + goto out_free_oe; + } + err = ovl_check_metacopy_xattr(ofs, &upperpath); + if (err < 0) + goto out_free_oe; + uppermetacopy = err; + } + + if (upperdentry || ctr) { + struct ovl_inode_params oip = { + .upperdentry = upperdentry, + .lowerpath = stack, + .index = index, + .numlower = ctr, + .redirect = upperredirect, + .lowerdata = (ctr > 1 && !d.is_dir) ? + stack[ctr - 1].dentry : NULL, + }; + + inode = ovl_get_inode(dentry->d_sb, &oip); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_free_oe; + if (upperdentry && !uppermetacopy) + ovl_set_flag(OVL_UPPERDATA, inode); + } + + ovl_dentry_init_reval(dentry, upperdentry); + + revert_creds(old_cred); + if (origin_path) { + dput(origin_path->dentry); + kfree(origin_path); + } + dput(index); + kfree(stack); + kfree(d.redirect); + return d_splice_alias(inode, dentry); + +out_free_oe: + dentry->d_fsdata = NULL; + kfree(oe); +out_put: + dput(index); + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + if (origin_path) { + dput(origin_path->dentry); + kfree(origin_path); + } + dput(upperdentry); + kfree(upperredirect); +out: + kfree(d.redirect); + revert_creds(old_cred); + return ERR_PTR(err); +} + +bool ovl_lower_positive(struct dentry *dentry) +{ + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + const struct qstr *name = &dentry->d_name; + const struct cred *old_cred; + unsigned int i; + bool positive = false; + bool done = false; + + /* + * If dentry is negative, then lower is positive iff this is a + * whiteout. + */ + if (!dentry->d_inode) + return ovl_dentry_is_opaque(dentry); + + /* Negative upper -> positive lower */ + if (!ovl_dentry_upper(dentry)) + return true; + + old_cred = ovl_override_creds(dentry->d_sb); + /* Positive upper -> have to look up lower to see whether it exists */ + for (i = 0; !done && !positive && i < poe->numlower; i++) { + struct dentry *this; + struct dentry *lowerdir = poe->lowerstack[i].dentry; + + this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt), + name->name, lowerdir, name->len); + if (IS_ERR(this)) { + switch (PTR_ERR(this)) { + case -ENOENT: + case -ENAMETOOLONG: + break; + + default: + /* + * Assume something is there, we just couldn't + * access it. + */ + positive = true; + break; + } + } else { + positive = !ovl_is_whiteout(this); + done = true; + dput(this); + } + } + revert_creds(old_cred); + + return positive; +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000..a3c59ac01 --- /dev/null +++ b/fs/overlayfs/overlayfs.h @@ -0,0 +1,698 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * + * Copyright (C) 2011 Novell Inc. + */ + +#include <linux/kernel.h> +#include <linux/uuid.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include "ovl_entry.h" + +#undef pr_fmt +#define pr_fmt(fmt) "overlayfs: " fmt + +enum ovl_path_type { + __OVL_PATH_UPPER = (1 << 0), + __OVL_PATH_MERGE = (1 << 1), + __OVL_PATH_ORIGIN = (1 << 2), +}; + +#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) +#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) + +#define OVL_XATTR_NAMESPACE "overlay." +#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE + +enum ovl_xattr { + OVL_XATTR_OPAQUE, + OVL_XATTR_REDIRECT, + OVL_XATTR_ORIGIN, + OVL_XATTR_IMPURE, + OVL_XATTR_NLINK, + OVL_XATTR_UPPER, + OVL_XATTR_METACOPY, + OVL_XATTR_PROTATTR, +}; + +enum ovl_inode_flag { + /* Pure upper dir that may contain non pure upper entries */ + OVL_IMPURE, + /* Non-merge dir that may contain whiteout entries */ + OVL_WHITEOUTS, + OVL_INDEX, + OVL_UPPERDATA, + /* Inode number will remain constant over copy up. */ + OVL_CONST_INO, +}; + +enum ovl_entry_flag { + OVL_E_UPPER_ALIAS, + OVL_E_OPAQUE, + OVL_E_CONNECTED, +}; + +enum { + OVL_XINO_OFF, + OVL_XINO_AUTO, + OVL_XINO_ON, +}; + +/* + * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, + * where: + * origin.fh - exported file handle of the lower file + * origin.uuid - uuid of the lower filesystem + */ +#define OVL_FH_VERSION 0 +#define OVL_FH_MAGIC 0xfb + +/* CPU byte order required for fid decoding: */ +#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0) +#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1) +/* Is the real inode encoded in fid an upper inode? */ +#define OVL_FH_FLAG_PATH_UPPER (1 << 2) + +#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \ + OVL_FH_FLAG_PATH_UPPER) + +#if defined(__LITTLE_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN 0 +#elif defined(__BIG_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN +#else +#error Endianness not defined +#endif + +/* The type used to be returned by overlay exportfs for misaligned fid */ +#define OVL_FILEID_V0 0xfb +/* The type returned by overlay exportfs for 32bit aligned fid */ +#define OVL_FILEID_V1 0xf8 + +/* On-disk format for "origin" file handle */ +struct ovl_fb { + u8 version; /* 0 */ + u8 magic; /* 0xfb */ + u8 len; /* size of this header + size of fid */ + u8 flags; /* OVL_FH_FLAG_* */ + u8 type; /* fid_type of fid */ + uuid_t uuid; /* uuid of filesystem */ + u32 fid[]; /* file identifier should be 32bit aligned in-memory */ +} __packed; + +/* In-memory and on-wire format for overlay file handle */ +struct ovl_fh { + u8 padding[3]; /* make sure fb.fid is 32bit aligned */ + union { + struct ovl_fb fb; + DECLARE_FLEX_ARRAY(u8, buf); + }; +} __packed; + +#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) +#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) +#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ + offsetof(struct ovl_fb, fid)) + +extern const char *const ovl_xattr_table[][2]; +static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) +{ + return ovl_xattr_table[ox][ofs->config.userxattr]; +} + +/* + * When changing ownership of an upper object map the intended ownership + * according to the upper layer's idmapping. When an upper mount idmaps files + * that are stored on-disk as owned by id 1001 to id 1000 this means stat on + * this object will report it as being owned by id 1000 when calling stat via + * the upper mount. + * In order to change ownership of an object so stat reports id 1000 when + * called on an idmapped upper mount the value written to disk - i.e., the + * value stored in ia_*id - must 1001. The mount mapping helper will thus take + * care to map 1000 to 1001. + * The mnt idmapping helpers are nops if the upper layer isn't idmapped. + */ +static inline int ovl_do_notify_change(struct ovl_fs *ofs, + struct dentry *upperdentry, + struct iattr *attr) +{ + return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); +} + +static inline int ovl_do_rmdir(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry) +{ + int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry); + + pr_debug("rmdir(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, + struct dentry *dentry) +{ + int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL); + + pr_debug("unlink(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, + struct inode *dir, struct dentry *new_dentry) +{ + int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL); + + pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); + return err; +} + +static inline int ovl_do_create(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true); + + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mkdir(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mknod(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev) +{ + int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev); + + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); + return err; +} + +static inline int ovl_do_symlink(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, + const char *oldname) +{ + int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname); + + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + return err; +} + +static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name, + void *value, size_t size) +{ + int err, len; + + WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); + + err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry, + name, value, size); + len = (value && err > 0) ? err : 0; + + pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", + path->dentry, name, min(len, 48), value, size, err); + return err; +} + +static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs, + struct dentry *upperdentry, + enum ovl_xattr ox, void *value, + size_t size) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + + return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size); +} + +static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs, + const struct path *path, + enum ovl_xattr ox, void *value, + size_t size) +{ + return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size); +} + +static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, + const char *name, const void *value, + size_t size, int flags) +{ + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + value, size, flags); + + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", + dentry, name, min((int)size, 48), value, size, flags, err); + return err; +} + +static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const void *value, + size_t size) +{ + return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0); +} + +static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, + const char *name) +{ + int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name); + pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); + return err; +} + +static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox) +{ + return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); +} + +static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, + struct dentry *olddentry, struct inode *newdir, + struct dentry *newdentry, unsigned int flags) +{ + int err; + struct renamedata rd = { + .old_mnt_userns = ovl_upper_mnt_userns(ofs), + .old_dir = olddir, + .old_dentry = olddentry, + .new_mnt_userns = ovl_upper_mnt_userns(ofs), + .new_dir = newdir, + .new_dentry = newdentry, + .flags = flags, + }; + + pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); + err = vfs_rename(&rd); + if (err) { + pr_debug("...rename(%pd2, %pd2, ...) = %i\n", + olddentry, newdentry, err); + } + return err; +} + +static inline int ovl_do_whiteout(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry) +{ + int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry); + pr_debug("whiteout(%pd2) = %i\n", dentry, err); + return err; +} + +static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, + struct dentry *dentry, umode_t mode) +{ + struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; + struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode, + O_LARGEFILE | O_WRONLY, current_cred()); + int err = PTR_ERR_OR_ZERO(file); + + pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); + return file; +} + +static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, + const char *name, + struct dentry *base, int len) +{ + return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len); +} + +static inline bool ovl_open_flags_need_copy_up(int flags) +{ + if (!flags) + return false; + + return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); +} + +static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs) +{ + /* + * To avoid regressions in existing setups with overlay lower offline + * changes, we allow lower changes only if none of the new features + * are used. + */ + return (!ofs->config.index && !ofs->config.metacopy && + !ofs->config.redirect_dir && ofs->config.xino != OVL_XINO_ON); +} + + +/* util.c */ +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +struct dentry *ovl_workdir(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); +int ovl_can_decode_fh(struct super_block *sb); +struct dentry *ovl_indexdir(struct super_block *sb); +bool ovl_index_all(struct super_block *sb); +bool ovl_verify_lower(struct super_block *sb); +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); +bool ovl_dentry_remote(struct dentry *dentry); +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry); +void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry); +void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, + unsigned int mask); +bool ovl_dentry_weird(struct dentry *dentry); +enum ovl_path_type ovl_path_type(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +void ovl_path_lowerdata(struct dentry *dentry, struct path *path); +struct inode *ovl_i_path_real(struct inode *inode, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); +const struct ovl_layer *ovl_i_layer_lower(struct inode *inode); +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_i_dentry_upper(struct inode *inode); +struct inode *ovl_inode_upper(struct inode *inode); +struct inode *ovl_inode_lower(struct inode *inode); +struct inode *ovl_inode_lowerdata(struct inode *inode); +struct inode *ovl_inode_real(struct inode *inode); +struct inode *ovl_inode_realdata(struct inode *inode); +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); +void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry); +void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry); +bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); +bool ovl_dentry_is_opaque(struct dentry *dentry); +bool ovl_dentry_is_whiteout(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_dentry_has_upper_alias(struct dentry *dentry); +void ovl_dentry_set_upper_alias(struct dentry *dentry); +bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); +bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags); +bool ovl_has_upperdata(struct inode *inode); +void ovl_set_upperdata(struct inode *inode); +bool ovl_redirect_dir(struct super_block *sb); +const char *ovl_dentry_get_redirect(struct dentry *dentry); +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); +void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); +void ovl_dir_modified(struct dentry *dentry, bool impurity); +u64 ovl_dentry_version_get(struct dentry *dentry); +bool ovl_is_whiteout(struct dentry *dentry); +struct file *ovl_path_open(const struct path *path, int flags); +int ovl_copy_up_start(struct dentry *dentry, int flags); +void ovl_copy_up_end(struct dentry *dentry); +bool ovl_already_copied_up(struct dentry *dentry, int flags); +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox); +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); + +static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, + struct dentry *upperdentry) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + return ovl_path_check_origin_xattr(ofs, &upperpath); +} + +int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, + enum ovl_xattr ox, const void *value, size_t size, + int xerr); +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); +bool ovl_inuse_trylock(struct dentry *dentry); +void ovl_inuse_unlock(struct dentry *dentry); +bool ovl_is_inuse(struct dentry *dentry); +bool ovl_need_index(struct dentry *dentry); +int ovl_nlink_start(struct dentry *dentry); +void ovl_nlink_end(struct dentry *dentry); +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path); +bool ovl_is_metacopy_dentry(struct dentry *dentry); +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); +int ovl_sync_status(struct ovl_fs *ofs); + +static inline void ovl_set_flag(unsigned long flag, struct inode *inode) +{ + set_bit(flag, &OVL_I(inode)->flags); +} + +static inline void ovl_clear_flag(unsigned long flag, struct inode *inode) +{ + clear_bit(flag, &OVL_I(inode)->flags); +} + +static inline bool ovl_test_flag(unsigned long flag, struct inode *inode) +{ + return test_bit(flag, &OVL_I(inode)->flags); +} + +static inline bool ovl_is_impuredir(struct super_block *sb, + struct dentry *upperdentry) +{ + struct ovl_fs *ofs = OVL_FS(sb); + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + + return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); +} + +/* + * With xino=auto, we do best effort to keep all inodes on same st_dev and + * d_ino consistent with st_ino. + * With xino=on, we do the same effort but we warn if we failed. + */ +static inline bool ovl_xino_warn(struct super_block *sb) +{ + return OVL_FS(sb)->config.xino == OVL_XINO_ON; +} + +/* All layers on same fs? */ +static inline bool ovl_same_fs(struct super_block *sb) +{ + return OVL_FS(sb)->xino_mode == 0; +} + +/* All overlay inodes have same st_dev? */ +static inline bool ovl_same_dev(struct super_block *sb) +{ + return OVL_FS(sb)->xino_mode >= 0; +} + +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0; +} + +static inline void ovl_inode_lock(struct inode *inode) +{ + mutex_lock(&OVL_I(inode)->lock); +} + +static inline int ovl_inode_lock_interruptible(struct inode *inode) +{ + return mutex_lock_interruptible(&OVL_I(inode)->lock); +} + +static inline void ovl_inode_unlock(struct inode *inode) +{ + mutex_unlock(&OVL_I(inode)->lock); +} + + +/* namei.c */ +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); + +static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +{ + if (fh_len < sizeof(struct ovl_fh)) + return -EINVAL; + + return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); +} + +struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, + struct vfsmount *mnt, bool connected); +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, + struct dentry *upperdentry, struct ovl_path **stackp); +int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, bool is_upper, + bool set); +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); +int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); +int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, + struct qstr *name); +struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); +struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *origin, bool verify); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +bool ovl_lower_positive(struct dentry *dentry); + +static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *origin, bool set) +{ + return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin, + false, set); +} + +static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, + struct dentry *upper, bool set) +{ + return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set); +} + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +struct file *ovl_dir_real_file(const struct file *file, bool want_upper); +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); +void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, + struct list_head *list); +void ovl_cache_free(struct list_head *list); +void ovl_dir_cache_free(struct inode *inode); +int ovl_check_d_type_supported(const struct path *realpath); +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, + struct vfsmount *mnt, struct dentry *dentry, int level); +int ovl_indexdir_cleanup(struct ovl_fs *ofs); + +/* + * Can we iterate real dir directly? + * + * Non-merge dir may contain whiteouts from a time it was a merge upper, before + * lower dir was removed under it and possibly before it was rotated from upper + * to lower layer. + */ +static inline bool ovl_dir_is_real(struct dentry *dir) +{ + return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); +} + +/* inode.c */ +int ovl_set_nlink_upper(struct dentry *dentry); +int ovl_set_nlink_lower(struct dentry *dentry); +unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, + struct dentry *upperdentry, + unsigned int fallback); +int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); + +#ifdef CONFIG_FS_POSIX_ACL +struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); +#else +#define ovl_get_acl NULL +#endif + +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); +bool ovl_is_private_xattr(struct super_block *sb, const char *name); + +struct ovl_inode_params { + struct inode *newinode; + struct dentry *upperdentry; + struct ovl_path *lowerpath; + bool index; + unsigned int numlower; + char *redirect; + struct dentry *lowerdata; +}; +void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, + unsigned long ino, int fsid); +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); +struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, + bool is_upper); +bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir); +struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir); +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip); +void ovl_copyattr(struct inode *to); + +/* vfs inode flags copied from real to ovl inode */ +#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) +/* vfs inode flags read from overlay.protattr xattr to ovl inode */ +#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) + +/* + * fileattr flags copied from lower to upper inode on copy up. + * We cannot copy up immutable/append-only flags, because that would prevent + * linking temp inode to upper dir, so we store them in xattr instead. + */ +#define OVL_COPY_FS_FLAGS_MASK (FS_SYNC_FL | FS_NOATIME_FL) +#define OVL_COPY_FSX_FLAGS_MASK (FS_XFLAG_SYNC | FS_XFLAG_NOATIME) +#define OVL_PROT_FS_FLAGS_MASK (FS_APPEND_FL | FS_IMMUTABLE_FL) +#define OVL_PROT_FSX_FLAGS_MASK (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE) + +void ovl_check_protattr(struct inode *inode, struct dentry *upper); +int ovl_set_protattr(struct inode *inode, struct dentry *upper, + struct fileattr *fa); + +static inline void ovl_copyflags(struct inode *from, struct inode *to) +{ + unsigned int mask = OVL_COPY_I_FLAGS_MASK; + + inode_set_flags(to, from->i_flags & mask, mask); +} + +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, + struct dentry *dentry); +struct ovl_cattr { + dev_t rdev; + umode_t mode; + const char *link; + struct dentry *hardlink; +}; + +#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) + +int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry **newdentry, umode_t mode); +struct dentry *ovl_create_real(struct ovl_fs *ofs, + struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr); +int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry); +struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); +struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, + struct ovl_cattr *attr); + +/* file.c */ +extern const struct file_operations ovl_file_operations; +int __init ovl_aio_request_cache_init(void); +void ovl_aio_request_cache_destroy(void); +int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); +int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); +int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ovl_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_with_data(struct dentry *dentry); +int ovl_maybe_copy_up(struct dentry *dentry, int flags); +int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); +int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, + bool is_upper); +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, + struct dentry *upper); + +/* export.c */ +extern const struct export_operations ovl_export_operations; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h new file mode 100644 index 000000000..a479680a5 --- /dev/null +++ b/fs/overlayfs/ovl_entry.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + */ + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; + bool default_permissions; + bool redirect_dir; + bool redirect_follow; + const char *redirect_mode; + bool index; + bool uuid; + bool nfs_export; + int xino; + bool metacopy; + bool userxattr; + bool ovl_volatile; +}; + +struct ovl_sb { + struct super_block *sb; + dev_t pseudo_dev; + /* Unusable (conflicting) uuid */ + bool bad_uuid; + /* Used as a lower layer (but maybe also as upper) */ + bool is_lower; +}; + +struct ovl_layer { + /* ovl_free_fs() relies on @mnt being the first member! */ + struct vfsmount *mnt; + /* Trap in ovl inode cache */ + struct inode *trap; + struct ovl_sb *fs; + /* Index of this layer in fs root (upper idx == 0) */ + int idx; + /* One fsid per unique underlying sb (upper fsid == 0) */ + int fsid; +}; + +/* + * ovl_free_fs() relies on @mnt being the first member when unmounting + * the private mounts created for each layer. Let's check both the + * offset and type. + */ +static_assert(offsetof(struct ovl_layer, mnt) == 0); +static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *)); + +struct ovl_path { + const struct ovl_layer *layer; + struct dentry *dentry; +}; + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + unsigned int numlayer; + /* Number of unique fs among layers including upper fs */ + unsigned int numfs; + const struct ovl_layer *layers; + struct ovl_sb *fs; + /* workbasedir is the path at workdir= mount option */ + struct dentry *workbasedir; + /* workdir is the 'work' directory under workbasedir */ + struct dentry *workdir; + /* index directory listing overlay inodes by origin file handle */ + struct dentry *indexdir; + long namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; + bool tmpfile; + bool noxattr; + /* Did we take the inuse lock? */ + bool upperdir_locked; + bool workdir_locked; + bool share_whiteout; + /* Traps in ovl inode cache */ + struct inode *workbasedir_trap; + struct inode *workdir_trap; + struct inode *indexdir_trap; + /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ + int xino_mode; + /* For allocation of non-persistent inode numbers */ + atomic_long_t last_ino; + /* Whiteout dentry cache */ + struct dentry *whiteout; + /* r/o snapshot of upperdir sb's only taken on volatile mounts */ + errseq_t errseq; +}; + +static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) +{ + return ofs->layers[0].mnt; +} + +static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs) +{ + return mnt_user_ns(ovl_upper_mnt(ofs)); +} + +static inline struct ovl_fs *OVL_FS(struct super_block *sb) +{ + return (struct ovl_fs *)sb->s_fs_info; +} + +static inline bool ovl_should_sync(struct ovl_fs *ofs) +{ + return !ofs->config.ovl_volatile; +} + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + union { + struct { + unsigned long flags; + }; + struct rcu_head rcu; + }; + unsigned numlower; + struct ovl_path lowerstack[]; +}; + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); + +static inline struct ovl_entry *OVL_E(struct dentry *dentry) +{ + return (struct ovl_entry *) dentry->d_fsdata; +} + +struct ovl_inode { + union { + struct ovl_dir_cache *cache; /* directory */ + struct inode *lowerdata; /* regular file */ + }; + const char *redirect; + u64 version; + unsigned long flags; + struct inode vfs_inode; + struct dentry *__upperdentry; + struct ovl_path lowerpath; + + /* synchronize copy up and more */ + struct mutex lock; +}; + +static inline struct ovl_inode *OVL_I(struct inode *inode) +{ + return container_of(inode, struct ovl_inode, vfs_inode); +} + +static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) +{ + return READ_ONCE(oi->__upperdentry); +} diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000..2b2106400 --- /dev/null +++ b/fs/overlayfs/readdir.c @@ -0,0 +1,1235 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (C) 2011 Novell Inc. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/xattr.h> +#include <linux/rbtree.h> +#include <linux/security.h> +#include <linux/cred.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +struct ovl_cache_entry { + unsigned int len; + unsigned int type; + u64 real_ino; + u64 ino; + struct list_head l_node; + struct rb_node node; + struct ovl_cache_entry *next_maybe_whiteout; + bool is_upper; + bool is_whiteout; + char name[]; +}; + +struct ovl_dir_cache { + long refcount; + u64 version; + struct list_head entries; + struct rb_root root; +}; + +struct ovl_readdir_data { + struct dir_context ctx; + struct dentry *dentry; + bool is_lowest; + struct rb_root *root; + struct list_head *list; + struct list_head middle; + struct ovl_cache_entry *first_maybe_whiteout; + int count; + int err; + bool is_upper; + bool d_type_supported; +}; + +struct ovl_dir_file { + bool is_real; + bool is_upper; + struct ovl_dir_cache *cache; + struct list_head *cursor; + struct file *realfile; + struct file *upperfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return rb_entry(n, struct ovl_cache_entry, node); +} + +static bool ovl_cache_entry_find_link(const char *name, int len, + struct rb_node ***link, + struct rb_node **parent) +{ + bool found = false; + struct rb_node **newp = *link; + + while (!found && *newp) { + int cmp; + struct ovl_cache_entry *tmp; + + *parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + found = true; + } + *link = newp; + + return found; +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, + struct ovl_cache_entry *p) +{ + /* Don't care if not doing ovl_iter() */ + if (!rdd->dentry) + return false; + + /* Always recalc d_ino when remapping lower inode numbers */ + if (ovl_xino_bits(rdd->dentry->d_sb)) + return true; + + /* Always recalc d_ino for parent */ + if (strcmp(p->name, "..") == 0) + return true; + + /* If this is lower, then native d_ino will do */ + if (!rdd->is_upper) + return false; + + /* + * Recalc d_ino for '.' and for all entries if dir is impure (contains + * copied up entries) + */ + if ((p->name[0] == '.' && p->len == 1) || + ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) + return true; + + return false; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, + const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); + + p = kmalloc(size, GFP_KERNEL); + if (!p) + return NULL; + + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->real_ino = ino; + p->ino = ino; + /* Defer setting d_ino for upper entry to ovl_iterate() */ + if (ovl_calc_d_ino(rdd, p)) + p->ino = 0; + p->is_upper = rdd->is_upper; + p->is_whiteout = false; + + if (d_type == DT_CHR) { + p->next_maybe_whiteout = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p; + } + return p; +} + +static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root->rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + if (ovl_cache_entry_find_link(name, len, &newp, &parent)) + return true; + + p = ovl_cache_entry_new(rdd, name, len, ino, d_type); + if (p == NULL) { + rdd->err = -ENOMEM; + return false; + } + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, rdd->root); + + return true; +} + +static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, &rdd->middle); + } else { + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, &rdd->middle); + } + + return rdd->err == 0; +} + +void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +void ovl_dir_cache_free(struct inode *inode) +{ + struct ovl_dir_cache *cache = ovl_dir_cache(inode); + + if (cache) { + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +{ + struct ovl_dir_cache *cache = od->cache; + + WARN_ON(cache->refcount <= 0); + cache->refcount--; + if (!cache->refcount) { + if (ovl_dir_cache(d_inode(dentry)) == cache) + ovl_set_dir_cache(d_inode(dentry), NULL); + + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static bool ovl_fill_merge(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + if (!rdd->is_lowest) + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + else + return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); +} + +static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) +{ + int err; + struct ovl_cache_entry *p; + struct dentry *dentry, *dir = path->dentry; + const struct cred *old_cred; + + old_cred = ovl_override_creds(rdd->dentry->d_sb); + + err = down_write_killable(&dir->d_inode->i_rwsem); + if (!err) { + while (rdd->first_maybe_whiteout) { + p = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + } + inode_unlock(dir->d_inode); + } + revert_creds(old_cred); + + return err; +} + +static inline int ovl_dir_read(const struct path *realpath, + struct ovl_readdir_data *rdd) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + rdd->first_maybe_whiteout = NULL; + rdd->ctx.pos = 0; + do { + rdd->count = 0; + rdd->err = 0; + err = iterate_dir(realfile, &rdd->ctx); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + + if (!err && rdd->first_maybe_whiteout && rdd->dentry) + err = ovl_check_whiteouts(realpath, rdd); + + fput(realfile); + + return err; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_dir_cache *cache = od->cache; + struct dentry *dentry = file->f_path.dentry; + bool is_real; + + if (cache && ovl_dentry_version_get(dentry) != cache->version) { + ovl_cache_put(od, dentry); + od->cache = NULL; + od->cursor = NULL; + } + is_real = ovl_dir_is_real(dentry); + if (od->is_real != is_real) { + /* is_real can only become false when dir is copied up */ + if (WARN_ON(is_real)) + return; + od->is_real = false; + } +} + +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, + struct rb_root *root) +{ + int err; + struct path realpath; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = dentry, + .list = list, + .root = root, + .is_lowest = false, + }; + int idx, next; + + for (idx = 0; idx != -1; idx = next) { + next = ovl_path_next(idx, dentry, &realpath); + rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; + + if (next != -1) { + err = ovl_dir_read(&realpath, &rdd); + if (err) + break; + } else { + /* + * Insert lowest layer entries before upper ones, this + * allows offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_lowest = true; + err = ovl_dir_read(&realpath, &rdd); + list_del(&rdd.middle); + } + } + return err; +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct list_head *p; + loff_t off = 0; + + list_for_each(p, &od->cache->entries) { + if (off >= pos) + break; + off++; + } + /* Cursor is safe since the cache is stable */ + od->cursor = p; +} + +static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) +{ + int res; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(d_inode(dentry)); + if (cache && ovl_dentry_version_get(dentry) == cache->version) { + WARN_ON(!cache->refcount); + cache->refcount++; + return cache; + } + ovl_set_dir_cache(d_inode(dentry), NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + cache->refcount = 1; + INIT_LIST_HEAD(&cache->entries); + cache->root = RB_ROOT; + + res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(d_inode(dentry), cache); + + return cache; +} + +/* Map inode number to lower fs unique range */ +static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, + const char *name, int namelen, bool warn) +{ + unsigned int xinoshift = 64 - xinobits; + + if (unlikely(ino >> xinoshift)) { + if (warn) { + pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + namelen, name, ino, xinobits); + } + return ino; + } + + /* + * The lowest xinobit is reserved for mapping the non-peresistent inode + * numbers range, but this range is only exposed via st_ino, not here. + */ + return ino | ((u64)fsid) << (xinoshift + 1); +} + +/* + * Set d_ino for upper entries. Non-upper entries should always report + * the uppermost real inode ino and should not call this function. + * + * When not all layer are on same fs, report real ino also for upper. + * + * When all layers are on the same fs, and upper has a reference to + * copy up origin, call vfs_getattr() on the overlay entry to make + * sure that d_ino will be consistent with st_ino from stat(2). + */ +static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p) + +{ + struct dentry *dir = path->dentry; + struct dentry *this = NULL; + enum ovl_path_type type; + u64 ino = p->real_ino; + int xinobits = ovl_xino_bits(dir->d_sb); + int err = 0; + + if (!ovl_same_dev(dir->d_sb)) + goto out; + + if (p->name[0] == '.') { + if (p->len == 1) { + this = dget(dir); + goto get; + } + if (p->len == 2 && p->name[1] == '.') { + /* we shall not be moved */ + this = dget(dir->d_parent); + goto get; + } + } + this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + if (IS_ERR_OR_NULL(this) || !this->d_inode) { + /* Mark a stale entry */ + p->is_whiteout = true; + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + goto fail; + } + goto out; + } + +get: + type = ovl_path_type(this); + if (OVL_TYPE_ORIGIN(type)) { + struct kstat stat; + struct path statpath = *path; + + statpath.dentry = this; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + goto fail; + + /* + * Directory inode is always on overlay st_dev. + * Non-dir with ovl_same_dev() could be on pseudo st_dev in case + * of xino bits overflow. + */ + WARN_ON_ONCE(S_ISDIR(stat.mode) && + dir->d_sb->s_dev != stat.dev); + ino = stat.ino; + } else if (xinobits && !OVL_TYPE_UPPER(type)) { + ino = ovl_remap_lower_ino(ino, xinobits, + ovl_layer_lower(this)->fsid, + p->name, p->len, + ovl_xino_warn(dir->d_sb)); + } + +out: + p->ino = ino; + dput(this); + return err; + +fail: + pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", + p->name, err); + goto out; +} + +static bool ovl_fill_plain(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_cache_entry *p; + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + if (p == NULL) { + rdd->err = -ENOMEM; + return false; + } + list_add_tail(&p->l_node, rdd->list); + + return true; +} + +static int ovl_dir_read_impure(const struct path *path, struct list_head *list, + struct rb_root *root) +{ + int err; + struct path realpath; + struct ovl_cache_entry *p, *n; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_plain, + .list = list, + .root = root, + }; + + INIT_LIST_HEAD(list); + *root = RB_ROOT; + ovl_path_upper(path->dentry, &realpath); + + err = ovl_dir_read(&realpath, &rdd); + if (err) + return err; + + list_for_each_entry_safe(p, n, list, l_node) { + if (strcmp(p->name, ".") != 0 && + strcmp(p->name, "..") != 0) { + err = ovl_cache_update_ino(path, p); + if (err) + return err; + } + if (p->ino == p->real_ino) { + list_del(&p->l_node); + kfree(p); + } else { + struct rb_node **newp = &root->rb_node; + struct rb_node *parent = NULL; + + if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, + &newp, &parent))) + return -EIO; + + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, root); + } + } + return 0; +} + +static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) +{ + int res; + struct dentry *dentry = path->dentry; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(d_inode(dentry)); + if (cache && ovl_dentry_version_get(dentry) == cache->version) + return cache; + + /* Impure cache is not refcounted, free it here */ + ovl_dir_cache_free(d_inode(dentry)); + ovl_set_dir_cache(d_inode(dentry), NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + res = ovl_dir_read_impure(path, &cache->entries, &cache->root); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + if (list_empty(&cache->entries)) { + /* + * A good opportunity to get rid of an unneeded "impure" flag. + * Removing the "impure" xattr is best effort. + */ + if (!ovl_want_write(dentry)) { + ovl_removexattr(ofs, ovl_dentry_upper(dentry), + OVL_XATTR_IMPURE); + ovl_drop_write(dentry); + } + ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + kfree(cache); + return NULL; + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(d_inode(dentry), cache); + + return cache; +} + +struct ovl_readdir_translate { + struct dir_context *orig_ctx; + struct ovl_dir_cache *cache; + struct dir_context ctx; + u64 parent_ino; + int fsid; + int xinobits; + bool xinowarn; +}; + +static bool ovl_fill_real(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_translate *rdt = + container_of(ctx, struct ovl_readdir_translate, ctx); + struct dir_context *orig_ctx = rdt->orig_ctx; + + if (rdt->parent_ino && strcmp(name, "..") == 0) { + ino = rdt->parent_ino; + } else if (rdt->cache) { + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); + if (p) + ino = p->ino; + } else if (rdt->xinobits) { + ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, + name, namelen, rdt->xinowarn); + } + + return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); +} + +static bool ovl_is_impure_dir(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct inode *dir = d_inode(file->f_path.dentry); + + /* + * Only upper dir can be impure, but if we are in the middle of + * iterating a lower real dir, dir could be copied up and marked + * impure. We only want the impure cache if we started iterating + * a real upper dir to begin with. + */ + return od->is_upper && ovl_test_flag(OVL_IMPURE, dir); + +} + +static int ovl_iterate_real(struct file *file, struct dir_context *ctx) +{ + int err; + struct ovl_dir_file *od = file->private_data; + struct dentry *dir = file->f_path.dentry; + const struct ovl_layer *lower_layer = ovl_layer_lower(dir); + struct ovl_readdir_translate rdt = { + .ctx.actor = ovl_fill_real, + .orig_ctx = ctx, + .xinobits = ovl_xino_bits(dir->d_sb), + .xinowarn = ovl_xino_warn(dir->d_sb), + }; + + if (rdt.xinobits && lower_layer) + rdt.fsid = lower_layer->fsid; + + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { + struct kstat stat; + struct path statpath = file->f_path; + + statpath.dentry = dir->d_parent; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + return err; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + rdt.parent_ino = stat.ino; + } + + if (ovl_is_impure_dir(file)) { + rdt.cache = ovl_cache_get_impure(&file->f_path); + if (IS_ERR(rdt.cache)) + return PTR_ERR(rdt.cache); + } + + err = iterate_dir(od->realfile, &rdt.ctx); + ctx->pos = rdt.ctx.pos; + + return err; +} + + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct ovl_cache_entry *p; + const struct cred *old_cred; + int err; + + old_cred = ovl_override_creds(dentry->d_sb); + if (!ctx->pos) + ovl_dir_reset(file); + + if (od->is_real) { + /* + * If parent is merge, then need to adjust d_ino for '..', if + * dir is impure then need to adjust d_ino for copied up + * entries. + */ + if (ovl_xino_bits(dentry->d_sb) || + (ovl_same_fs(dentry->d_sb) && + (ovl_is_impure_dir(file) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { + err = ovl_iterate_real(file, ctx); + } else { + err = iterate_dir(od->realfile, ctx); + } + goto out; + } + + if (!od->cache) { + struct ovl_dir_cache *cache; + + cache = ovl_cache_get(dentry); + err = PTR_ERR(cache); + if (IS_ERR(cache)) + goto out; + + od->cache = cache; + ovl_seek_cursor(od, ctx->pos); + } + + while (od->cursor != &od->cache->entries) { + p = list_entry(od->cursor, struct ovl_cache_entry, l_node); + if (!p->is_whiteout) { + if (!p->ino) { + err = ovl_cache_update_ino(&file->f_path, p); + if (err) + goto out; + } + } + /* ovl_cache_update_ino() sets is_whiteout on stale entry */ + if (!p->is_whiteout) { + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + } + od->cursor = p->l_node.next; + ctx->pos++; + } + err = 0; +out: + revert_creds(old_cred); + return err; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + inode_lock(file_inode(file)); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->cache) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + inode_unlock(file_inode(file)); + + return res; +} + +static struct file *ovl_dir_open_realfile(const struct file *file, + const struct path *realpath) +{ + struct file *res; + const struct cred *old_cred; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); + revert_creds(old_cred); + + return res; +} + +/* + * Like ovl_real_fdget(), returns upperfile if dir was copied up since open. + * Unlike ovl_real_fdget(), this caches upperfile in file->private_data. + * + * TODO: use same abstract type for file->private_data of dir and file so + * upperfile could also be cached for files as well. + */ +struct file *ovl_dir_real_file(const struct file *file, bool want_upper) +{ + + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct file *old, *realfile = od->realfile; + + if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) + return want_upper ? NULL : realfile; + + /* + * Need to check if we started out being a lower dir, but got copied up + */ + if (!od->is_upper) { + realfile = READ_ONCE(od->upperfile); + if (!realfile) { + struct path upperpath; + + ovl_path_upper(dentry, &upperpath); + realfile = ovl_dir_open_realfile(file, &upperpath); + if (IS_ERR(realfile)) + return realfile; + + old = cmpxchg_release(&od->upperfile, NULL, realfile); + if (old) { + fput(realfile); + realfile = old; + } + } + } + + return realfile; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct file *realfile; + int err; + + err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); + if (err <= 0) + return err; + + realfile = ovl_dir_real_file(file, true); + err = PTR_ERR_OR_ZERO(realfile); + + /* Nothing to sync for lower */ + if (!realfile || err) + return err; + + return vfs_fsync_range(realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + if (od->cache) { + inode_lock(inode); + ovl_cache_put(od, file->f_path.dentry); + inode_unlock(inode); + } + fput(od->realfile); + if (od->upperfile) + fput(od->upperfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_dir_open_realfile(file, &realpath); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + od->realfile = realfile; + od->is_real = ovl_dir_is_real(file->f_path.dentry); + od->is_upper = OVL_TYPE_UPPER(type); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .iterate = ovl_iterate, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct ovl_cache_entry *p, *n; + struct rb_root root = RB_ROOT; + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_dir_read_merged(dentry, list, &root); + revert_creds(old_cred); + if (err) + return err; + + err = 0; + + list_for_each_entry_safe(p, n, list, l_node) { + /* + * Select whiteouts in upperdir, they should + * be cleared when deleting this directory. + */ + if (p->is_whiteout) { + if (p->is_upper) + continue; + goto del_entry; + } + + if (p->name[0] == '.') { + if (p->len == 1) + goto del_entry; + if (p->len == 2 && p->name[1] == '.') + goto del_entry; + } + err = -ENOTEMPTY; + break; + +del_entry: + list_del(&p->l_node); + kfree(p); + } + + return err; +} + +void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, + struct list_head *list) +{ + struct ovl_cache_entry *p; + + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + + if (WARN_ON(!p->is_whiteout || !p->is_upper)) + continue; + + dentry = ovl_lookup_upper(ofs, p->name, upper, p->len); + if (IS_ERR(dentry)) { + pr_err("lookup '%s/%.*s' failed (%i)\n", + upper->d_name.name, p->len, p->name, + (int) PTR_ERR(dentry)); + continue; + } + if (dentry->d_inode) + ovl_cleanup(ofs, upper->d_inode, dentry); + dput(dentry); + } + inode_unlock(upper->d_inode); +} + +static bool ovl_check_d_type(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + /* Even if d_type is not supported, DT_DIR is returned for . and .. */ + if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) + return true; + + if (d_type != DT_UNKNOWN) + rdd->d_type_supported = true; + + return true; +} + +/* + * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values + * if error is encountered. + */ +int ovl_check_d_type_supported(const struct path *realpath) +{ + int err; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_check_d_type, + .d_type_supported = false, + }; + + err = ovl_dir_read(realpath, &rdd); + if (err) + return err; + + return rdd.d_type_supported; +} + +#define OVL_INCOMPATDIR_NAME "incompat" + +static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path, + int level) +{ + int err; + struct inode *dir = path->dentry->d_inode; + LIST_HEAD(list); + struct rb_root root = RB_ROOT; + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = &root, + .is_lowest = false, + }; + bool incompat = false; + + /* + * The "work/incompat" directory is treated specially - if it is not + * empty, instead of printing a generic error and mounting read-only, + * we will error about incompat features and fail the mount. + * + * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name + * starts with '#'. + */ + if (level == 2 && + !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME)) + incompat = true; + + err = ovl_dir_read(path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + struct dentry *dentry; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } else if (incompat) { + pr_err("overlay with incompat feature '%s' cannot be mounted\n", + p->name); + err = -EINVAL; + break; + } + dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len); + if (IS_ERR(dentry)) + continue; + if (dentry->d_inode) + err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level); + dput(dentry); + if (err) + break; + } + inode_unlock(dir); +out: + ovl_cache_free(&list); + return err; +} + +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, + struct vfsmount *mnt, struct dentry *dentry, int level) +{ + int err; + + if (!d_is_dir(dentry) || level > 1) { + return ovl_cleanup(ofs, dir, dentry); + } + + err = ovl_do_rmdir(ofs, dir, dentry); + if (err) { + struct path path = { .mnt = mnt, .dentry = dentry }; + + inode_unlock(dir); + err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); + inode_lock_nested(dir, I_MUTEX_PARENT); + if (!err) + err = ovl_cleanup(ofs, dir, dentry); + } + + return err; +} + +int ovl_indexdir_cleanup(struct ovl_fs *ofs) +{ + int err; + struct dentry *indexdir = ofs->indexdir; + struct dentry *index = NULL; + struct inode *dir = indexdir->d_inode; + struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; + LIST_HEAD(list); + struct rb_root root = RB_ROOT; + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = &root, + .is_lowest = false, + }; + + err = ovl_dir_read(&path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + index = ovl_lookup_upper(ofs, p->name, indexdir, p->len); + if (IS_ERR(index)) { + err = PTR_ERR(index); + index = NULL; + break; + } + /* Cleanup leftover from index create/cleanup attempt */ + if (index->d_name.name[0] == '#') { + err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1); + if (err) + break; + goto next; + } + err = ovl_verify_index(ofs, index); + if (!err) { + goto next; + } else if (err == -ESTALE) { + /* Cleanup stale index entries */ + err = ovl_cleanup(ofs, dir, index); + } else if (err != -ENOENT) { + /* + * Abort mount to avoid corrupting the index if + * an incompatible index entry was found or on out + * of memory. + */ + break; + } else if (ofs->config.nfs_export) { + /* + * Whiteout orphan index to block future open by + * handle after overlay nlink dropped to zero. + */ + err = ovl_cleanup_and_whiteout(ofs, dir, index); + } else { + /* Cleanup orphan index entries */ + err = ovl_cleanup(ofs, dir, index); + } + + if (err) + break; + +next: + dput(index); + index = NULL; + } + dput(index); + inode_unlock(dir); +out: + ovl_cache_free(&list); + if (err) + pr_err("failed index dir cleanup (%i)\n", err); + return err; +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000..51eec4a8e --- /dev/null +++ b/fs/overlayfs/super.c @@ -0,0 +1,2244 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (C) 2011 Novell Inc. + */ + +#include <uapi/linux/magic.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/mount.h> +#include <linux/parser.h> +#include <linux/module.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include <linux/posix_acl_xattr.h> +#include <linux/exportfs.h> +#include <linux/file.h> +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + + +struct ovl_dir_cache; + +#define OVL_MAX_STACK 500 + +static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR); +module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); +MODULE_PARM_DESC(redirect_dir, + "Default to on or off for the redirect_dir feature"); + +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + +static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); +module_param_named(index, ovl_index_def, bool, 0644); +MODULE_PARM_DESC(index, + "Default to on or off for the inodes index feature"); + +static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT); +module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); +MODULE_PARM_DESC(nfs_export, + "Default to on or off for the NFS export feature"); + +static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); +module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); +MODULE_PARM_DESC(xino_auto, + "Auto enable xino feature"); + +static void ovl_entry_stack_free(struct ovl_entry *oe) +{ + unsigned int i; + + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); +} + +static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY); +module_param_named(metacopy, ovl_metacopy_def, bool, 0644); +MODULE_PARM_DESC(metacopy, + "Default to on or off for the metadata only copy up feature"); + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + ovl_entry_stack_free(oe); + kfree_rcu(oe, rcu); + } +} + +static struct dentry *ovl_d_real(struct dentry *dentry, + const struct inode *inode) +{ + struct dentry *real = NULL, *lower; + + /* It's an overlay file */ + if (inode && d_inode(dentry) == inode) + return dentry; + + if (!d_is_reg(dentry)) { + if (!inode || inode == d_inode(dentry)) + return dentry; + goto bug; + } + + real = ovl_dentry_upper(dentry); + if (real && (inode == d_inode(real))) + return real; + + if (real && !inode && ovl_has_upperdata(d_inode(dentry))) + return real; + + lower = ovl_dentry_lowerdata(dentry); + if (!lower) + goto bug; + real = lower; + + /* Handle recursion */ + real = d_real(real, inode); + + if (!inode || inode == d_inode(real)) + return real; +bug: + WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", + __func__, dentry, inode ? inode->i_sb->s_id : "NULL", + inode ? inode->i_ino : 0, real, + real && d_inode(real) ? d_inode(real)->i_ino : 0); + return dentry; +} + +static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) +{ + int ret = 1; + + if (weak) { + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) + ret = d->d_op->d_weak_revalidate(d, flags); + } else if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + ret = -ESTALE; + } + } + return ret; +} + +static int ovl_dentry_revalidate_common(struct dentry *dentry, + unsigned int flags, bool weak) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct inode *inode = d_inode_rcu(dentry); + struct dentry *upper; + unsigned int i; + int ret = 1; + + /* Careful in RCU mode */ + if (!inode) + return -ECHILD; + + upper = ovl_i_dentry_upper(inode); + if (upper) + ret = ovl_revalidate_real(upper, flags, weak); + + for (i = 0; ret > 0 && i < oe->numlower; i++) { + ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags, + weak); + } + return ret; +} + +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + return ovl_dentry_revalidate_common(dentry, flags, false); +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + return ovl_dentry_revalidate_common(dentry, flags, true); +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, + .d_real = ovl_d_real, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, +}; + +static struct kmem_cache *ovl_inode_cachep; + +static struct inode *ovl_alloc_inode(struct super_block *sb) +{ + struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL); + + if (!oi) + return NULL; + + oi->cache = NULL; + oi->redirect = NULL; + oi->version = 0; + oi->flags = 0; + oi->__upperdentry = NULL; + oi->lowerpath.dentry = NULL; + oi->lowerpath.layer = NULL; + oi->lowerdata = NULL; + mutex_init(&oi->lock); + + return &oi->vfs_inode; +} + +static void ovl_free_inode(struct inode *inode) +{ + struct ovl_inode *oi = OVL_I(inode); + + kfree(oi->redirect); + mutex_destroy(&oi->lock); + kmem_cache_free(ovl_inode_cachep, oi); +} + +static void ovl_destroy_inode(struct inode *inode) +{ + struct ovl_inode *oi = OVL_I(inode); + + dput(oi->__upperdentry); + dput(oi->lowerpath.dentry); + if (S_ISDIR(inode->i_mode)) + ovl_dir_cache_free(inode); + else + iput(oi->lowerdata); +} + +static void ovl_free_fs(struct ovl_fs *ofs) +{ + struct vfsmount **mounts; + unsigned i; + + iput(ofs->workbasedir_trap); + iput(ofs->indexdir_trap); + iput(ofs->workdir_trap); + dput(ofs->whiteout); + dput(ofs->indexdir); + dput(ofs->workdir); + if (ofs->workdir_locked) + ovl_inuse_unlock(ofs->workbasedir); + dput(ofs->workbasedir); + if (ofs->upperdir_locked) + ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); + + /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->layers; + for (i = 0; i < ofs->numlayer; i++) { + iput(ofs->layers[i].trap); + mounts[i] = ofs->layers[i].mnt; + } + kern_unmount_array(mounts, ofs->numlayer); + kfree(ofs->layers); + for (i = 0; i < ofs->numfs; i++) + free_anon_bdev(ofs->fs[i].pseudo_dev); + kfree(ofs->fs); + + kfree(ofs->config.lowerdir); + kfree(ofs->config.upperdir); + kfree(ofs->config.workdir); + kfree(ofs->config.redirect_mode); + if (ofs->creator_cred) + put_cred(ofs->creator_cred); + kfree(ofs); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + ovl_free_fs(ofs); +} + +/* Sync real dirty inodes in upper filesystem (if it exists) */ +static int ovl_sync_fs(struct super_block *sb, int wait) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct super_block *upper_sb; + int ret; + + ret = ovl_sync_status(ofs); + /* + * We have to always set the err, because the return value isn't + * checked in syncfs, and instead indirectly return an error via + * the sb's writeback errseq, which VFS inspects after this call. + */ + if (ret < 0) { + errseq_set(&sb->s_wb_err, -EIO); + return -EIO; + } + + if (!ret) + return ret; + + /* + * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). + * All the super blocks will be iterated, including upper_sb. + * + * If this is a syncfs(2) call, then we do need to call + * sync_filesystem() on upper_sb, but enough if we do it when being + * called with wait == 1. + */ + if (!wait) + return 0; + + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; + + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + + return ret; +} + +/** + * ovl_statfs + * @dentry: The dentry to query + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the upper filesystem (if it exists) + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + int err; + + ovl_path_real(root_dentry, &path); + + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = ofs->namelen; + buf->f_type = OVERLAYFS_SUPER_MAGIC; + } + + return err; +} + +/* Will this overlay be forced to mount/remount ro? */ +static bool ovl_force_readonly(struct ovl_fs *ofs) +{ + return (!ovl_upper_mnt(ofs) || !ofs->workdir); +} + +static const char *ovl_redirect_mode_def(void) +{ + return ovl_redirect_dir_def ? "on" : "off"; +} + +static const char * const ovl_xino_str[] = { + "off", + "auto", + "on", +}; + +static inline int ovl_xino_def(void) +{ + return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF; +} + +/** + * ovl_show_options + * @m: the seq_file handle + * @dentry: The dentry to query + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ofs = sb->s_fs_info; + + seq_show_option(m, "lowerdir", ofs->config.lowerdir); + if (ofs->config.upperdir) { + seq_show_option(m, "upperdir", ofs->config.upperdir); + seq_show_option(m, "workdir", ofs->config.workdir); + } + if (ofs->config.default_permissions) + seq_puts(m, ",default_permissions"); + if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) + seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); + if (ofs->config.index != ovl_index_def) + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); + if (!ofs->config.uuid) + seq_puts(m, ",uuid=off"); + if (ofs->config.nfs_export != ovl_nfs_export_def) + seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? + "on" : "off"); + if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb)) + seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); + if (ofs->config.metacopy != ovl_metacopy_def) + seq_printf(m, ",metacopy=%s", + ofs->config.metacopy ? "on" : "off"); + if (ofs->config.ovl_volatile) + seq_puts(m, ",volatile"); + if (ofs->config.userxattr) + seq_puts(m, ",userxattr"); + return 0; +} + +static int ovl_remount(struct super_block *sb, int *flags, char *data) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct super_block *upper_sb; + int ret = 0; + + if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) + return -EROFS; + + if (*flags & SB_RDONLY && !sb_rdonly(sb)) { + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; + if (ovl_should_sync(ofs)) { + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + } + } + + return ret; +} + +static const struct super_operations ovl_super_operations = { + .alloc_inode = ovl_alloc_inode, + .free_inode = ovl_free_inode, + .destroy_inode = ovl_destroy_inode, + .drop_inode = generic_delete_inode, + .put_super = ovl_put_super, + .sync_fs = ovl_sync_fs, + .statfs = ovl_statfs, + .show_options = ovl_show_options, + .remount_fs = ovl_remount, +}; + +enum { + OPT_LOWERDIR, + OPT_UPPERDIR, + OPT_WORKDIR, + OPT_DEFAULT_PERMISSIONS, + OPT_REDIRECT_DIR, + OPT_INDEX_ON, + OPT_INDEX_OFF, + OPT_UUID_ON, + OPT_UUID_OFF, + OPT_NFS_EXPORT_ON, + OPT_USERXATTR, + OPT_NFS_EXPORT_OFF, + OPT_XINO_ON, + OPT_XINO_OFF, + OPT_XINO_AUTO, + OPT_METACOPY_ON, + OPT_METACOPY_OFF, + OPT_VOLATILE, + OPT_ERR, +}; + +static const match_table_t ovl_tokens = { + {OPT_LOWERDIR, "lowerdir=%s"}, + {OPT_UPPERDIR, "upperdir=%s"}, + {OPT_WORKDIR, "workdir=%s"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_REDIRECT_DIR, "redirect_dir=%s"}, + {OPT_INDEX_ON, "index=on"}, + {OPT_INDEX_OFF, "index=off"}, + {OPT_USERXATTR, "userxattr"}, + {OPT_UUID_ON, "uuid=on"}, + {OPT_UUID_OFF, "uuid=off"}, + {OPT_NFS_EXPORT_ON, "nfs_export=on"}, + {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, + {OPT_XINO_ON, "xino=on"}, + {OPT_XINO_OFF, "xino=off"}, + {OPT_XINO_AUTO, "xino=auto"}, + {OPT_METACOPY_ON, "metacopy=on"}, + {OPT_METACOPY_OFF, "metacopy=off"}, + {OPT_VOLATILE, "volatile"}, + {OPT_ERR, NULL} +}; + +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + +static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) +{ + if (strcmp(mode, "on") == 0) { + config->redirect_dir = true; + /* + * Does not make sense to have redirect creation without + * redirect following. + */ + config->redirect_follow = true; + } else if (strcmp(mode, "follow") == 0) { + config->redirect_follow = true; + } else if (strcmp(mode, "off") == 0) { + if (ovl_redirect_always_follow) + config->redirect_follow = true; + } else if (strcmp(mode, "nofollow") != 0) { + pr_err("bad mount option \"redirect_dir=%s\"\n", + mode); + return -EINVAL; + } + + return 0; +} + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + int err; + bool metacopy_opt = false, redirect_opt = false; + bool nfs_export_opt = false, index_opt = false; + + config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); + if (!config->redirect_mode) + return -ENOMEM; + + while ((p = ovl_next_opt(&opt)) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case OPT_UPPERDIR: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case OPT_LOWERDIR: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + case OPT_WORKDIR: + kfree(config->workdir); + config->workdir = match_strdup(&args[0]); + if (!config->workdir) + return -ENOMEM; + break; + + case OPT_DEFAULT_PERMISSIONS: + config->default_permissions = true; + break; + + case OPT_REDIRECT_DIR: + kfree(config->redirect_mode); + config->redirect_mode = match_strdup(&args[0]); + if (!config->redirect_mode) + return -ENOMEM; + redirect_opt = true; + break; + + case OPT_INDEX_ON: + config->index = true; + index_opt = true; + break; + + case OPT_INDEX_OFF: + config->index = false; + index_opt = true; + break; + + case OPT_UUID_ON: + config->uuid = true; + break; + + case OPT_UUID_OFF: + config->uuid = false; + break; + + case OPT_NFS_EXPORT_ON: + config->nfs_export = true; + nfs_export_opt = true; + break; + + case OPT_NFS_EXPORT_OFF: + config->nfs_export = false; + nfs_export_opt = true; + break; + + case OPT_XINO_ON: + config->xino = OVL_XINO_ON; + break; + + case OPT_XINO_OFF: + config->xino = OVL_XINO_OFF; + break; + + case OPT_XINO_AUTO: + config->xino = OVL_XINO_AUTO; + break; + + case OPT_METACOPY_ON: + config->metacopy = true; + metacopy_opt = true; + break; + + case OPT_METACOPY_OFF: + config->metacopy = false; + metacopy_opt = true; + break; + + case OPT_VOLATILE: + config->ovl_volatile = true; + break; + + case OPT_USERXATTR: + config->userxattr = true; + break; + + default: + pr_err("unrecognized mount option \"%s\" or missing value\n", + p); + return -EINVAL; + } + } + + /* Workdir/index are useless in non-upper mount */ + if (!config->upperdir) { + if (config->workdir) { + pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + if (config->index && index_opt) { + pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n"); + index_opt = false; + } + config->index = false; + } + + if (!config->upperdir && config->ovl_volatile) { + pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); + config->ovl_volatile = false; + } + + err = ovl_parse_redirect_mode(config, config->redirect_mode); + if (err) + return err; + + /* + * This is to make the logic below simpler. It doesn't make any other + * difference, since config->redirect_dir is only used for upper. + */ + if (!config->upperdir && config->redirect_follow) + config->redirect_dir = true; + + /* Resolve metacopy -> redirect_dir dependency */ + if (config->metacopy && !config->redirect_dir) { + if (metacopy_opt && redirect_opt) { + pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", + config->redirect_mode); + return -EINVAL; + } + if (redirect_opt) { + /* + * There was an explicit redirect_dir=... that resulted + * in this conflict. + */ + pr_info("disabling metacopy due to redirect_dir=%s\n", + config->redirect_mode); + config->metacopy = false; + } else { + /* Automatically enable redirect otherwise. */ + config->redirect_follow = config->redirect_dir = true; + } + } + + /* Resolve nfs_export -> index dependency */ + if (config->nfs_export && !config->index) { + if (!config->upperdir && config->redirect_follow) { + pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + config->nfs_export = false; + } else if (nfs_export_opt && index_opt) { + pr_err("conflicting options: nfs_export=on,index=off\n"); + return -EINVAL; + } else if (index_opt) { + /* + * There was an explicit index=off that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to index=off\n"); + config->nfs_export = false; + } else { + /* Automatically enable index otherwise. */ + config->index = true; + } + } + + /* Resolve nfs_export -> !metacopy dependency */ + if (config->nfs_export && config->metacopy) { + if (nfs_export_opt && metacopy_opt) { + pr_err("conflicting options: nfs_export=on,metacopy=on\n"); + return -EINVAL; + } + if (metacopy_opt) { + /* + * There was an explicit metacopy=on that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to metacopy=on\n"); + config->nfs_export = false; + } else { + /* + * There was an explicit nfs_export=on that resulted + * in this conflict. + */ + pr_info("disabling metacopy due to nfs_export=on\n"); + config->metacopy = false; + } + } + + + /* Resolve userxattr -> !redirect && !metacopy dependency */ + if (config->userxattr) { + if (config->redirect_follow && redirect_opt) { + pr_err("conflicting options: userxattr,redirect_dir=%s\n", + config->redirect_mode); + return -EINVAL; + } + if (config->metacopy && metacopy_opt) { + pr_err("conflicting options: userxattr,metacopy=on\n"); + return -EINVAL; + } + /* + * Silently disable default setting of redirect and metacopy. + * This shall be the default in the future as well: these + * options must be explicitly enabled if used together with + * userxattr. + */ + config->redirect_dir = config->redirect_follow = false; + config->metacopy = false; + } + + return 0; +} + +#define OVL_WORKDIR_NAME "work" +#define OVL_INDEXDIR_NAME "index" + +static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, + const char *name, bool persist) +{ + struct inode *dir = ofs->workbasedir->d_inode; + struct vfsmount *mnt = ovl_upper_mnt(ofs); + struct dentry *work; + int err; + bool retried = false; + + inode_lock_nested(dir, I_MUTEX_PARENT); +retry: + work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); + + if (!IS_ERR(work)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = S_IFDIR | 0, + }; + + if (work->d_inode) { + err = -EEXIST; + if (retried) + goto out_dput; + + if (persist) + goto out_unlock; + + retried = true; + err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0); + dput(work); + if (err == -EINVAL) { + work = ERR_PTR(err); + goto out_unlock; + } + goto retry; + } + + err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode); + if (err) + goto out_dput; + + /* Weird filesystem returning with hashed negative (kernfs)? */ + err = -EINVAL; + if (d_really_is_negative(work)) + goto out_dput; + + /* + * Try to remove POSIX ACL xattrs from workdir. We are good if: + * + * a) success (there was a POSIX ACL xattr and was removed) + * b) -ENODATA (there was no POSIX ACL xattr) + * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported) + * + * There are various other error values that could effectively + * mean that the xattr doesn't exist (e.g. -ERANGE is returned + * if the xattr name is too long), but the set of filesystems + * allowed as upper are limited to "normal" ones, where checking + * for the above two errors is sufficient. + */ + err = ovl_do_removexattr(ofs, work, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + err = ovl_do_removexattr(ofs, work, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + /* Clear any inherited mode bits */ + inode_lock(work->d_inode); + err = ovl_do_notify_change(ofs, work, &attr); + inode_unlock(work->d_inode); + if (err) + goto out_dput; + } else { + err = PTR_ERR(work); + goto out_err; + } +out_unlock: + inode_unlock(dir); + return work; + +out_dput: + dput(work); +out_err: + pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n", + ofs->config.workdir, name, -err); + work = NULL; + goto out_unlock; +} + +static void ovl_unescape(char *s) +{ + char *d = s; + + for (;; s++, d++) { + if (*s == '\\') + s++; + *d = *s; + if (!*s) + break; + } +} + +static int ovl_mount_dir_noesc(const char *name, struct path *path) +{ + int err = -EINVAL; + + if (!*name) { + pr_err("empty lowerdir\n"); + goto out; + } + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("failed to resolve '%s': %i\n", name, err); + goto out; + } + err = -EINVAL; + if (ovl_dentry_weird(path->dentry)) { + pr_err("filesystem on '%s' not supported\n", name); + goto out_put; + } + if (!d_is_dir(path->dentry)) { + pr_err("'%s' not a directory\n", name); + goto out_put; + } + return 0; + +out_put: + path_put_init(path); +out: + return err; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err = -ENOMEM; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (tmp) { + ovl_unescape(tmp); + err = ovl_mount_dir_noesc(tmp, path); + + if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { + pr_err("filesystem on '%s' not supported as upperdir\n", + tmp); + path_put_init(path); + err = -EINVAL; + } + kfree(tmp); + } + return err; +} + +static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, + const char *name) +{ + struct kstatfs statfs; + int err = vfs_statfs(path, &statfs); + + if (err) + pr_err("statfs failed on '%s'\n", name); + else + ofs->namelen = max(ofs->namelen, statfs.f_namelen); + + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, + struct ovl_fs *ofs, int *stack_depth) +{ + int fh_type; + int err; + + err = ovl_mount_dir_noesc(name, path); + if (err) + return err; + + err = ovl_check_namelen(path, ofs, name); + if (err) + return err; + + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + + /* + * The inodes index feature and NFS export need to encode and decode + * file handles, so they require that all layers support them. + */ + fh_type = ovl_can_decode_fh(path->dentry->d_sb); + if ((ofs->config.nfs_export || + (ofs->config.index && ofs->config.upperdir)) && !fh_type) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", + name); + } + /* + * Decoding origin file handle is required for persistent st_ino. + * Without persistent st_ino, xino=auto falls back to xino=off. + */ + if (ofs->config.xino == OVL_XINO_AUTO && + ofs->config.upperdir && !fh_type) { + ofs->config.xino = OVL_XINO_OFF; + pr_warn("fs on '%s' does not support file handles, falling back to xino=off.\n", + name); + } + + /* Check if lower fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_mode = -1; + + return 0; +} + +/* Workdir should not be subdir of upperdir and vice versa */ +static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) +{ + bool ok = false; + + if (workdir != upperdir) { + ok = (lock_rename(workdir, upperdir) == NULL); + unlock_rename(workdir, upperdir); + } + return ok; +} + +static unsigned int ovl_split_lowerdirs(char *str) +{ + unsigned int ctr = 1; + char *s, *d; + + for (s = d = str;; s++, d++) { + if (*s == '\\') { + s++; + } else if (*s == ':') { + *d = '\0'; + ctr++; + continue; + } + *d = *s; + if (!*s) + break; + } + return ctr; +} + +static int __maybe_unused +ovl_posix_acl_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, inode, handler->name, buffer, size); +} + +static int __maybe_unused +ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *realinode = ovl_inode_real(inode); + struct posix_acl *acl = NULL; + int err; + + /* Check that everything is OK before copy-up */ + if (value) { + /* The above comment can be understood in two ways: + * + * 1. We just want to check whether the basic POSIX ACL format + * is ok. For example, if the header is correct and the size + * is sane. + * 2. We want to know whether the ACL_{GROUP,USER} entries can + * be mapped according to the underlying filesystem. + * + * Currently, we only check 1. If we wanted to check 2. we + * would need to pass the mnt_userns and the fs_userns of the + * underlying filesystem. But frankly, I think checking 1. is + * enough to start the copy-up. + */ + acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + err = -EOPNOTSUPP; + if (!IS_POSIXACL(d_inode(workdir))) + goto out_acl_release; + if (!realinode->i_op->set_acl) + goto out_acl_release; + if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) { + err = acl ? -EACCES : 0; + goto out_acl_release; + } + err = -EPERM; + if (!inode_owner_or_capable(&init_user_ns, inode)) + goto out_acl_release; + + posix_acl_release(acl); + + /* + * Check if sgid bit needs to be cleared (actual setacl operation will + * be done with mounter's capabilities and so that won't do it for us). + */ + if (unlikely(inode->i_mode & S_ISGID) && + handler->flags == ACL_TYPE_ACCESS && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { + struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; + + err = ovl_setattr(&init_user_ns, dentry, &iattr); + if (err) + return err; + } + + err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); + return err; + +out_acl_release: + posix_acl_release(acl); + return err; +} + +static int ovl_own_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return -EOPNOTSUPP; +} + +static int ovl_own_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static int ovl_other_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, inode, name, buffer, size); +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_xattr_set(dentry, inode, name, value, size, flags); +} + +static const struct xattr_handler __maybe_unused +ovl_posix_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = ovl_posix_acl_xattr_get, + .set = ovl_posix_acl_xattr_set, +}; + +static const struct xattr_handler __maybe_unused +ovl_posix_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = ovl_posix_acl_xattr_get, + .set = ovl_posix_acl_xattr_set, +}; + +static const struct xattr_handler ovl_own_trusted_xattr_handler = { + .prefix = OVL_XATTR_TRUSTED_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_own_user_xattr_handler = { + .prefix = OVL_XATTR_USER_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_other_xattr_handler = { + .prefix = "", /* catch all */ + .get = ovl_other_xattr_get, + .set = ovl_other_xattr_set, +}; + +static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &ovl_posix_acl_access_xattr_handler, + &ovl_posix_acl_default_xattr_handler, +#endif + &ovl_own_trusted_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +static const struct xattr_handler *ovl_user_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &ovl_posix_acl_access_xattr_handler, + &ovl_posix_acl_default_xattr_handler, +#endif + &ovl_own_user_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, + struct inode **ptrap, const char *name) +{ + struct inode *trap; + int err; + + trap = ovl_get_trap_inode(sb, dir); + err = PTR_ERR_OR_ZERO(trap); + if (err) { + if (err == -ELOOP) + pr_err("conflicting %s path\n", name); + return err; + } + + *ptrap = trap; + return 0; +} + +/* + * Determine how we treat concurrent use of upperdir/workdir based on the + * index feature. This is papering over mount leaks of container runtimes, + * for example, an old overlay mount is leaked and now its upperdir is + * attempted to be used as a lower layer in a new overlay mount. + */ +static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) +{ + if (ofs->config.index) { + pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", + name); + return -EBUSY; + } else { + pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", + name); + return 0; + } +} + +static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, + struct ovl_layer *upper_layer, struct path *upperpath) +{ + struct vfsmount *upper_mnt; + int err; + + err = ovl_mount_dir(ofs->config.upperdir, upperpath); + if (err) + goto out; + + /* Upperdir path should not be r/o */ + if (__mnt_is_readonly(upperpath->mnt)) { + pr_err("upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out; + } + + err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir); + if (err) + goto out; + + err = ovl_setup_trap(sb, upperpath->dentry, &upper_layer->trap, + "upperdir"); + if (err) + goto out; + + upper_mnt = clone_private_mount(upperpath); + err = PTR_ERR(upper_mnt); + if (IS_ERR(upper_mnt)) { + pr_err("failed to clone upperpath\n"); + goto out; + } + + /* Don't inherit atime flags */ + upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + upper_layer->mnt = upper_mnt; + upper_layer->idx = 0; + upper_layer->fsid = 0; + + /* + * Inherit SB_NOSEC flag from upperdir. + * + * This optimization changes behavior when a security related attribute + * (suid/sgid/security.*) is changed on an underlying layer. This is + * okay because we don't yet have guarantees in that case, but it will + * need careful treatment once we want to honour changes to underlying + * filesystems. + */ + if (upper_mnt->mnt_sb->s_flags & SB_NOSEC) + sb->s_flags |= SB_NOSEC; + + if (ovl_inuse_trylock(ovl_upper_mnt(ofs)->mnt_root)) { + ofs->upperdir_locked = true; + } else { + err = ovl_report_in_use(ofs, "upperdir"); + if (err) + goto out; + } + + err = 0; +out: + return err; +} + +/* + * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and + * negative values if error is encountered. + */ +static int ovl_check_rename_whiteout(struct ovl_fs *ofs) +{ + struct dentry *workdir = ofs->workdir; + struct inode *dir = d_inode(workdir); + struct dentry *temp; + struct dentry *dest; + struct dentry *whiteout; + struct name_snapshot name; + int err; + + inode_lock_nested(dir, I_MUTEX_PARENT); + + temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); + err = PTR_ERR(temp); + if (IS_ERR(temp)) + goto out_unlock; + + dest = ovl_lookup_temp(ofs, workdir); + err = PTR_ERR(dest); + if (IS_ERR(dest)) { + dput(temp); + goto out_unlock; + } + + /* Name is inline and stable - using snapshot as a copy helper */ + take_dentry_name_snapshot(&name, temp); + err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT); + if (err) { + if (err == -EINVAL) + err = 0; + goto cleanup_temp; + } + + whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto cleanup_temp; + + err = ovl_is_whiteout(whiteout); + + /* Best effort cleanup of whiteout and temp file */ + if (err) + ovl_cleanup(ofs, dir, whiteout); + dput(whiteout); + +cleanup_temp: + ovl_cleanup(ofs, dir, temp); + release_dentry_name_snapshot(&name); + dput(temp); + dput(dest); + +out_unlock: + inode_unlock(dir); + + return err; +} + +static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, + struct dentry *parent, + const char *name, umode_t mode) +{ + size_t len = strlen(name); + struct dentry *child; + + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + child = ovl_lookup_upper(ofs, name, parent, len); + if (!IS_ERR(child) && !child->d_inode) + child = ovl_create_real(ofs, parent->d_inode, child, + OVL_CATTR(mode)); + inode_unlock(parent->d_inode); + dput(parent); + + return child; +} + +/* + * Creates $workdir/work/incompat/volatile/dirty file if it is not already + * present. + */ +static int ovl_create_volatile_dirty(struct ovl_fs *ofs) +{ + unsigned int ctr; + struct dentry *d = dget(ofs->workbasedir); + static const char *const volatile_path[] = { + OVL_WORKDIR_NAME, "incompat", "volatile", "dirty" + }; + const char *const *name = volatile_path; + + for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) { + d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? S_IFDIR : S_IFREG); + if (IS_ERR(d)) + return PTR_ERR(d); + } + dput(d); + return 0; +} + +static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, + const struct path *workpath) +{ + struct vfsmount *mnt = ovl_upper_mnt(ofs); + struct dentry *workdir; + struct file *tmpfile; + bool rename_whiteout; + bool d_type; + int fh_type; + int err; + + err = mnt_want_write(mnt); + if (err) + return err; + + workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); + err = PTR_ERR(workdir); + if (IS_ERR_OR_NULL(workdir)) + goto out; + + ofs->workdir = workdir; + + err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir"); + if (err) + goto out; + + /* + * Upper should support d_type, else whiteouts are visible. Given + * workdir and upper are on same fs, we can do iterate_dir() on + * workdir. This check requires successful creation of workdir in + * previous step. + */ + err = ovl_check_d_type_supported(workpath); + if (err < 0) + goto out; + + d_type = err; + if (!d_type) + pr_warn("upper fs needs to support d_type.\n"); + + /* Check if upper/work fs supports O_TMPFILE */ + tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(tmpfile); + if (ofs->tmpfile) + fput(tmpfile); + else + pr_warn("upper fs does not support tmpfile.\n"); + + + /* Check if upper/work fs supports RENAME_WHITEOUT */ + err = ovl_check_rename_whiteout(ofs); + if (err < 0) + goto out; + + rename_whiteout = err; + if (!rename_whiteout) + pr_warn("upper fs does not support RENAME_WHITEOUT.\n"); + + /* + * Check if upper/work fs supports (trusted|user).overlay.* xattr + */ + err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); + if (err) { + pr_warn("failed to set xattr on upper\n"); + ofs->noxattr = true; + if (ofs->config.index || ofs->config.metacopy) { + ofs->config.index = false; + ofs->config.metacopy = false; + pr_warn("...falling back to index=off,metacopy=off.\n"); + } + /* + * xattr support is required for persistent st_ino. + * Without persistent st_ino, xino=auto falls back to xino=off. + */ + if (ofs->config.xino == OVL_XINO_AUTO) { + ofs->config.xino = OVL_XINO_OFF; + pr_warn("...falling back to xino=off.\n"); + } + if (err == -EPERM && !ofs->config.userxattr) + pr_info("try mounting with 'userxattr' option\n"); + err = 0; + } else { + ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); + } + + /* + * We allowed sub-optimal upper fs configuration and don't want to break + * users over kernel upgrade, but we never allowed remote upper fs, so + * we can enforce strict requirements for remote upper fs. + */ + if (ovl_dentry_remote(ofs->workdir) && + (!d_type || !rename_whiteout || ofs->noxattr)) { + pr_err("upper fs missing required features.\n"); + err = -EINVAL; + goto out; + } + + /* + * For volatile mount, create a incompat/volatile/dirty file to keep + * track of it. + */ + if (ofs->config.ovl_volatile) { + err = ovl_create_volatile_dirty(ofs); + if (err < 0) { + pr_err("Failed to create volatile/dirty file.\n"); + goto out; + } + } + + /* Check if upper/work fs supports file handles */ + fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); + if (ofs->config.index && !fh_type) { + ofs->config.index = false; + pr_warn("upper fs does not support file handles, falling back to index=off.\n"); + } + + /* Check if upper fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_mode = -1; + + /* NFS export of r/w mount depends on index */ + if (ofs->config.nfs_export && !ofs->config.index) { + pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } +out: + mnt_drop_write(mnt); + return err; +} + +static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, + const struct path *upperpath) +{ + int err; + struct path workpath = { }; + + err = ovl_mount_dir(ofs->config.workdir, &workpath); + if (err) + goto out; + + err = -EINVAL; + if (upperpath->mnt != workpath.mnt) { + pr_err("workdir and upperdir must reside under the same mount\n"); + goto out; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { + pr_err("workdir and upperdir must be separate subtrees\n"); + goto out; + } + + ofs->workbasedir = dget(workpath.dentry); + + if (ovl_inuse_trylock(ofs->workbasedir)) { + ofs->workdir_locked = true; + } else { + err = ovl_report_in_use(ofs, "workdir"); + if (err) + goto out; + } + + err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap, + "workdir"); + if (err) + goto out; + + err = ovl_make_workdir(sb, ofs, &workpath); + +out: + path_put(&workpath); + + return err; +} + +static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, + struct ovl_entry *oe, const struct path *upperpath) +{ + struct vfsmount *mnt = ovl_upper_mnt(ofs); + struct dentry *indexdir; + int err; + + err = mnt_want_write(mnt); + if (err) + return err; + + /* Verify lower root is upper root origin */ + err = ovl_verify_origin(ofs, upperpath->dentry, + oe->lowerstack[0].dentry, true); + if (err) { + pr_err("failed to verify upper root origin\n"); + goto out; + } + + /* index dir will act also as workdir */ + iput(ofs->workdir_trap); + ofs->workdir_trap = NULL; + dput(ofs->workdir); + ofs->workdir = NULL; + indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); + if (IS_ERR(indexdir)) { + err = PTR_ERR(indexdir); + } else if (indexdir) { + ofs->indexdir = indexdir; + ofs->workdir = dget(indexdir); + + err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap, + "indexdir"); + if (err) + goto out; + + /* + * Verify upper root is exclusively associated with index dir. + * Older kernels stored upper fh in ".overlay.origin" + * xattr. If that xattr exists, verify that it is a match to + * upper dir file handle. In any case, verify or set xattr + * ".overlay.upper" to indicate that index may have + * directory entries. + */ + if (ovl_check_origin_xattr(ofs, ofs->indexdir)) { + err = ovl_verify_set_fh(ofs, ofs->indexdir, + OVL_XATTR_ORIGIN, + upperpath->dentry, true, false); + if (err) + pr_err("failed to verify index dir 'origin' xattr\n"); + } + err = ovl_verify_upper(ofs, ofs->indexdir, upperpath->dentry, + true); + if (err) + pr_err("failed to verify index dir 'upper' xattr\n"); + + /* Cleanup bad/stale/orphan index entries */ + if (!err) + err = ovl_indexdir_cleanup(ofs); + } + if (err || !ofs->indexdir) + pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + +out: + mnt_drop_write(mnt); + return err; +} + +static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) +{ + unsigned int i; + + if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs)) + return true; + + /* + * We allow using single lower with null uuid for index and nfs_export + * for example to support those features with single lower squashfs. + * To avoid regressions in setups of overlay with re-formatted lower + * squashfs, do not allow decoding origin with lower null uuid unless + * user opted-in to one of the new features that require following the + * lower inode of non-dir upper. + */ + if (ovl_allow_offline_changes(ofs) && uuid_is_null(uuid)) + return false; + + for (i = 0; i < ofs->numfs; i++) { + /* + * We use uuid to associate an overlay lower file handle with a + * lower layer, so we can accept lower fs with null uuid as long + * as all lower layers with null uuid are on the same fs. + * if we detect multiple lower fs with the same uuid, we + * disable lower file handle decoding on all of them. + */ + if (ofs->fs[i].is_lower && + uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) { + ofs->fs[i].bad_uuid = true; + return false; + } + } + return true; +} + +/* Get a unique fsid for the layer */ +static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) +{ + struct super_block *sb = path->mnt->mnt_sb; + unsigned int i; + dev_t dev; + int err; + bool bad_uuid = false; + bool warn = false; + + for (i = 0; i < ofs->numfs; i++) { + if (ofs->fs[i].sb == sb) + return i; + } + + if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { + bad_uuid = true; + if (ofs->config.xino == OVL_XINO_AUTO) { + ofs->config.xino = OVL_XINO_OFF; + warn = true; + } + if (ofs->config.index || ofs->config.nfs_export) { + ofs->config.index = false; + ofs->config.nfs_export = false; + warn = true; + } + if (warn) { + pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? "null" : + "conflicting", + path->dentry, ovl_xino_str[ofs->config.xino]); + } + } + + err = get_anon_bdev(&dev); + if (err) { + pr_err("failed to get anonymous bdev for lowerpath\n"); + return err; + } + + ofs->fs[ofs->numfs].sb = sb; + ofs->fs[ofs->numfs].pseudo_dev = dev; + ofs->fs[ofs->numfs].bad_uuid = bad_uuid; + + return ofs->numfs++; +} + +static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, + struct path *stack, unsigned int numlower, + struct ovl_layer *layers) +{ + int err; + unsigned int i; + + err = -ENOMEM; + ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL); + if (ofs->fs == NULL) + goto out; + + /* idx/fsid 0 are reserved for upper fs even with lower only overlay */ + ofs->numfs++; + + /* + * All lower layers that share the same fs as upper layer, use the same + * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower + * only overlay to simplify ovl_fs_free(). + * is_lower will be set if upper fs is shared with a lower layer. + */ + err = get_anon_bdev(&ofs->fs[0].pseudo_dev); + if (err) { + pr_err("failed to get anonymous bdev for upper fs\n"); + goto out; + } + + if (ovl_upper_mnt(ofs)) { + ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb; + ofs->fs[0].is_lower = false; + } + + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt; + struct inode *trap; + int fsid; + + err = fsid = ovl_get_fsid(ofs, &stack[i]); + if (err < 0) + goto out; + + /* + * Check if lower root conflicts with this overlay layers before + * checking if it is in-use as upperdir/workdir of "another" + * mount, because we do not bother to check in ovl_is_inuse() if + * the upperdir/workdir is in fact in-use by our + * upperdir/workdir. + */ + err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir"); + if (err) + goto out; + + if (ovl_is_inuse(stack[i].dentry)) { + err = ovl_report_in_use(ofs, "lowerdir"); + if (err) { + iput(trap); + goto out; + } + } + + mnt = clone_private_mount(&stack[i]); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("failed to clone lowerpath\n"); + iput(trap); + goto out; + } + + /* + * Make lower layers R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; + + layers[ofs->numlayer].trap = trap; + layers[ofs->numlayer].mnt = mnt; + layers[ofs->numlayer].idx = ofs->numlayer; + layers[ofs->numlayer].fsid = fsid; + layers[ofs->numlayer].fs = &ofs->fs[fsid]; + ofs->numlayer++; + ofs->fs[fsid].is_lower = true; + } + + /* + * When all layers on same fs, overlay can use real inode numbers. + * With mount option "xino=<on|auto>", mounter declares that there are + * enough free high bits in underlying fs to hold the unique fsid. + * If overlayfs does encounter underlying inodes using the high xino + * bits reserved for fsid, it emits a warning and uses the original + * inode number or a non persistent inode number allocated from a + * dedicated range. + */ + if (ofs->numfs - !ovl_upper_mnt(ofs) == 1) { + if (ofs->config.xino == OVL_XINO_ON) + pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); + ofs->xino_mode = 0; + } else if (ofs->config.xino == OVL_XINO_OFF) { + ofs->xino_mode = -1; + } else if (ofs->xino_mode < 0) { + /* + * This is a roundup of number of bits needed for encoding + * fsid, where fsid 0 is reserved for upper fs (even with + * lower only overlay) +1 extra bit is reserved for the non + * persistent inode number range that is used for resolving + * xino lower bits overflow. + */ + BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30); + ofs->xino_mode = ilog2(ofs->numfs - 1) + 2; + } + + if (ofs->xino_mode > 0) { + pr_info("\"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_mode); + } + + err = 0; +out: + return err; +} + +static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, + const char *lower, unsigned int numlower, + struct ovl_fs *ofs, struct ovl_layer *layers) +{ + int err; + struct path *stack = NULL; + unsigned int i; + struct ovl_entry *oe; + + if (!ofs->config.upperdir && numlower == 1) { + pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); + return ERR_PTR(-EINVAL); + } + + stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL); + if (!stack) + return ERR_PTR(-ENOMEM); + + err = -EINVAL; + for (i = 0; i < numlower; i++) { + err = ovl_lower_dir(lower, &stack[i], ofs, &sb->s_stack_depth); + if (err) + goto out_err; + + lower = strchr(lower, '\0') + 1; + } + + err = -EINVAL; + sb->s_stack_depth++; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("maximum fs stacking depth exceeded\n"); + goto out_err; + } + + err = ovl_get_layers(sb, ofs, stack, numlower, layers); + if (err) + goto out_err; + + err = -ENOMEM; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_err; + + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = dget(stack[i].dentry); + oe->lowerstack[i].layer = &ofs->layers[i+1]; + } + +out: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); + + return oe; + +out_err: + oe = ERR_PTR(err); + goto out; +} + +/* + * Check if this layer root is a descendant of: + * - another layer of this overlayfs instance + * - upper/work dir of any overlayfs instance + */ +static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, + struct dentry *dentry, const char *name, + bool is_lower) +{ + struct dentry *next = dentry, *parent; + int err = 0; + + if (!dentry) + return 0; + + parent = dget_parent(next); + + /* Walk back ancestors to root (inclusive) looking for traps */ + while (!err && parent != next) { + if (is_lower && ovl_lookup_trap_inode(sb, parent)) { + err = -ELOOP; + pr_err("overlapping %s path\n", name); + } else if (ovl_is_inuse(parent)) { + err = ovl_report_in_use(ofs, name); + } + next = parent; + parent = dget_parent(next); + dput(next); + } + + dput(parent); + + return err; +} + +/* + * Check if any of the layers or work dirs overlap. + */ +static int ovl_check_overlapping_layers(struct super_block *sb, + struct ovl_fs *ofs) +{ + int i, err; + + if (ovl_upper_mnt(ofs)) { + err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root, + "upperdir", false); + if (err) + return err; + + /* + * Checking workbasedir avoids hitting ovl_is_inuse(parent) of + * this instance and covers overlapping work and index dirs, + * unless work or index dir have been moved since created inside + * workbasedir. In that case, we already have their traps in + * inode cache and we will catch that case on lookup. + */ + err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir", + false); + if (err) + return err; + } + + for (i = 1; i < ofs->numlayer; i++) { + err = ovl_check_layer(sb, ofs, + ofs->layers[i].mnt->mnt_root, + "lowerdir", true); + if (err) + return err; + } + + return 0; +} + +static struct dentry *ovl_get_root(struct super_block *sb, + struct dentry *upperdentry, + struct ovl_entry *oe) +{ + struct dentry *root; + struct ovl_path *lowerpath = &oe->lowerstack[0]; + unsigned long ino = d_inode(lowerpath->dentry)->i_ino; + int fsid = lowerpath->layer->fsid; + struct ovl_inode_params oip = { + .upperdentry = upperdentry, + .lowerpath = lowerpath, + }; + + root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); + if (!root) + return NULL; + + root->d_fsdata = oe; + + if (upperdentry) { + /* Root inode uses upper st_ino/i_ino */ + ino = d_inode(upperdentry)->i_ino; + fsid = 0; + ovl_dentry_set_upper_alias(root); + if (ovl_is_impuredir(sb, upperdentry)) + ovl_set_flag(OVL_IMPURE, d_inode(root)); + } + + /* Root is always merge -> can have whiteouts */ + ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); + ovl_dentry_set_flag(OVL_E_CONNECTED, root); + ovl_set_upperdata(d_inode(root)); + ovl_inode_init(d_inode(root), &oip, ino, fsid); + ovl_dentry_init_flags(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE); + + return root; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path upperpath = { }; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ofs; + struct ovl_layer *layers; + struct cred *cred; + char *splitlower = NULL; + unsigned int numlower; + int err; + + err = -EIO; + if (WARN_ON(sb->s_user_ns != current_user_ns())) + goto out; + + sb->s_d_op = &ovl_dentry_operations; + + err = -ENOMEM; + ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ofs) + goto out; + + err = -ENOMEM; + ofs->creator_cred = cred = prepare_creds(); + if (!cred) + goto out_err; + + /* Is there a reason anyone would want not to share whiteouts? */ + ofs->share_whiteout = true; + + ofs->config.index = ovl_index_def; + ofs->config.uuid = true; + ofs->config.nfs_export = ovl_nfs_export_def; + ofs->config.xino = ovl_xino_def(); + ofs->config.metacopy = ovl_metacopy_def; + err = ovl_parse_opt((char *) data, &ofs->config); + if (err) + goto out_err; + + err = -EINVAL; + if (!ofs->config.lowerdir) { + if (!silent) + pr_err("missing 'lowerdir'\n"); + goto out_err; + } + + err = -ENOMEM; + splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL); + if (!splitlower) + goto out_err; + + err = -EINVAL; + numlower = ovl_split_lowerdirs(splitlower); + if (numlower > OVL_MAX_STACK) { + pr_err("too many lower directories, limit is %d\n", + OVL_MAX_STACK); + goto out_err; + } + + err = -ENOMEM; + layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); + if (!layers) + goto out_err; + + ofs->layers = layers; + /* Layer 0 is reserved for upper even if there's no upper */ + ofs->numlayer = 1; + + sb->s_stack_depth = 0; + sb->s_maxbytes = MAX_LFS_FILESIZE; + atomic_long_set(&ofs->last_ino, 1); + /* Assume underlying fs uses 32bit inodes unless proven otherwise */ + if (ofs->config.xino != OVL_XINO_OFF) { + ofs->xino_mode = BITS_PER_LONG - 32; + if (!ofs->xino_mode) { + pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n"); + ofs->config.xino = OVL_XINO_OFF; + } + } + + /* alloc/destroy_inode needed for setting up traps in inode cache */ + sb->s_op = &ovl_super_operations; + + if (ofs->config.upperdir) { + struct super_block *upper_sb; + + err = -EINVAL; + if (!ofs->config.workdir) { + pr_err("missing 'workdir'\n"); + goto out_err; + } + + err = ovl_get_upper(sb, ofs, &layers[0], &upperpath); + if (err) + goto out_err; + + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; + if (!ovl_should_sync(ofs)) { + ofs->errseq = errseq_sample(&upper_sb->s_wb_err); + if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { + err = -EIO; + pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n"); + goto out_err; + } + } + + err = ovl_get_workdir(sb, ofs, &upperpath); + if (err) + goto out_err; + + if (!ofs->workdir) + sb->s_flags |= SB_RDONLY; + + sb->s_stack_depth = upper_sb->s_stack_depth; + sb->s_time_gran = upper_sb->s_time_gran; + } + oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers); + err = PTR_ERR(oe); + if (IS_ERR(oe)) + goto out_err; + + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ovl_upper_mnt(ofs)) + sb->s_flags |= SB_RDONLY; + + if (!ofs->config.uuid && ofs->numfs > 1) { + pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n"); + ofs->config.uuid = true; + } + + if (!ovl_force_readonly(ofs) && ofs->config.index) { + err = ovl_get_indexdir(sb, ofs, oe, &upperpath); + if (err) + goto out_free_oe; + + /* Force r/o mount with no index dir */ + if (!ofs->indexdir) + sb->s_flags |= SB_RDONLY; + } + + err = ovl_check_overlapping_layers(sb, ofs); + if (err) + goto out_free_oe; + + /* Show index=off in /proc/mounts for forced r/o mount */ + if (!ofs->indexdir) { + ofs->config.index = false; + if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) { + pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + } + + if (ofs->config.metacopy && ofs->config.nfs_export) { + pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + + if (ofs->config.nfs_export) + sb->s_export_op = &ovl_export_operations; + + /* Never override disk quota limits or use reserved space */ + cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); + + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers : + ovl_trusted_xattr_handlers; + sb->s_fs_info = ofs; + sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_SKIP_SYNC; + + err = -ENOMEM; + root_dentry = ovl_get_root(sb, upperpath.dentry, oe); + if (!root_dentry) + goto out_free_oe; + + mntput(upperpath.mnt); + kfree(splitlower); + + sb->s_root = root_dentry; + + return 0; + +out_free_oe: + ovl_entry_stack_free(oe); + kfree(oe); +out_err: + kfree(splitlower); + path_put(&upperpath); + ovl_free_fs(ofs); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlay", + .fs_flags = FS_USERNS_MOUNT, + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("overlay"); + +static void ovl_inode_init_once(void *foo) +{ + struct ovl_inode *oi = foo; + + inode_init_once(&oi->vfs_inode); +} + +static int __init ovl_init(void) +{ + int err; + + ovl_inode_cachep = kmem_cache_create("ovl_inode", + sizeof(struct ovl_inode), 0, + (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + ovl_inode_init_once); + if (ovl_inode_cachep == NULL) + return -ENOMEM; + + err = ovl_aio_request_cache_init(); + if (!err) { + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; + + ovl_aio_request_cache_destroy(); + } + kmem_cache_destroy(ovl_inode_cachep); + + return err; +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ovl_inode_cachep); + ovl_aio_request_cache_destroy(); +} + +module_init(ovl_init); +module_exit(ovl_exit); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c new file mode 100644 index 000000000..0d8f96168 --- /dev/null +++ b/fs/overlayfs/util.c @@ -0,0 +1,1136 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/fileattr.h> +#include <linux/uuid.h> +#include <linux/namei.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ovl_upper_mnt(ofs)); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ovl_upper_mnt(ofs)); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + +/* + * Check if underlying fs supports file handles and try to determine encoding + * type, in order to deduce maximum inode number used by fs. + * + * Return 0 if file handles are not supported. + * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding. + * Return -1 if fs uses a non default encoding with unknown inode size. + */ +int ovl_can_decode_fh(struct super_block *sb) +{ + if (!capable(CAP_DAC_READ_SEARCH)) + return 0; + + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry) + return 0; + + return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; +} + +struct dentry *ovl_indexdir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->indexdir; +} + +/* Index all files on copy up. For now only enabled for NFS export */ +bool ovl_index_all(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.nfs_export && ofs->config.index; +} + +/* Verify lower origin on lookup. For now only enabled for NFS export */ +bool ovl_verify_lower(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.nfs_export && ofs->config.index; +} + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower) +{ + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; +} + +#define OVL_D_REVALIDATE (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE) + +bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & OVL_D_REVALIDATE; +} + +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry) +{ + if (!ovl_dentry_remote(realdentry)) + return; + + spin_lock(&dentry->d_lock); + dentry->d_flags |= realdentry->d_flags & OVL_D_REVALIDATE; + spin_unlock(&dentry->d_lock); +} + +void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry) +{ + return ovl_dentry_init_flags(dentry, upperdentry, OVL_D_REVALIDATE); +} + +void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, + unsigned int mask) +{ + struct ovl_entry *oe = OVL_E(dentry); + unsigned int i, flags = 0; + + if (upperdentry) + flags |= upperdentry->d_flags; + for (i = 0; i < oe->numlower; i++) + flags |= oe->lowerstack[i].dentry->d_flags; + + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~mask; + dentry->d_flags |= flags & mask; + spin_unlock(&dentry->d_lock); +} + +bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; + + if (ovl_dentry_upper(dentry)) { + type = __OVL_PATH_UPPER; + + /* + * Non-dir dentry can hold lower dentry of its copy up origin. + */ + if (oe->numlower) { + if (ovl_test_flag(OVL_CONST_INO, d_inode(dentry))) + type |= __OVL_PATH_ORIGIN; + if (d_is_dir(dentry) || + !ovl_has_upperdata(d_inode(dentry))) + type |= __OVL_PATH_MERGE; + } + } else { + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; + } + return type; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + path->mnt = ovl_upper_mnt(ofs); + path->dentry = ovl_dentry_upper(dentry); +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->numlower) { + path->mnt = oe->lowerstack[0].layer->mnt; + path->dentry = oe->lowerstack[0].dentry; + } else { + *path = (struct path) { }; + } +} + +void ovl_path_lowerdata(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->numlower) { + path->mnt = oe->lowerstack[oe->numlower - 1].layer->mnt; + path->dentry = oe->lowerstack[oe->numlower - 1].dentry; + } else { + *path = (struct path) { }; + } +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + if (!OVL_TYPE_UPPER(type)) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + WARN_ON_ONCE(d_is_dir(dentry)); + + if (!OVL_TYPE_UPPER(type) || OVL_TYPE_MERGE(type)) + ovl_path_lowerdata(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + return ovl_upperdentry_dereference(OVL_I(d_inode(dentry))); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[0].dentry : NULL; +} + +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[0].layer : NULL; +} + +/* + * ovl_dentry_lower() could return either a data dentry or metacopy dentry + * depending on what is stored in lowerstack[0]. At times we need to find + * lower dentry which has data (and not metacopy dentry). This helper + * returns the lower data dentry. + */ +struct dentry *ovl_dentry_lowerdata(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[oe->numlower - 1].dentry : NULL; +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); +} + +struct dentry *ovl_i_dentry_upper(struct inode *inode) +{ + return ovl_upperdentry_dereference(OVL_I(inode)); +} + +struct inode *ovl_i_path_real(struct inode *inode, struct path *path) +{ + path->dentry = ovl_i_dentry_upper(inode); + if (!path->dentry) { + path->dentry = OVL_I(inode)->lowerpath.dentry; + path->mnt = OVL_I(inode)->lowerpath.layer->mnt; + } else { + path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb)); + } + + return path->dentry ? d_inode_rcu(path->dentry) : NULL; +} + +struct inode *ovl_inode_upper(struct inode *inode) +{ + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + + return upperdentry ? d_inode(upperdentry) : NULL; +} + +struct inode *ovl_inode_lower(struct inode *inode) +{ + struct dentry *lowerdentry = OVL_I(inode)->lowerpath.dentry; + + return lowerdentry ? d_inode(lowerdentry) : NULL; +} + +struct inode *ovl_inode_real(struct inode *inode) +{ + return ovl_inode_upper(inode) ?: ovl_inode_lower(inode); +} + +/* Return inode which contains lower data. Do not return metacopy */ +struct inode *ovl_inode_lowerdata(struct inode *inode) +{ + if (WARN_ON(!S_ISREG(inode->i_mode))) + return NULL; + + return OVL_I(inode)->lowerdata ?: ovl_inode_lower(inode); +} + +/* Return real inode which contains data. Does not return metacopy inode */ +struct inode *ovl_inode_realdata(struct inode *inode) +{ + struct inode *upperinode; + + upperinode = ovl_inode_upper(inode); + if (upperinode && ovl_has_upperdata(inode)) + return upperinode; + + return ovl_inode_lowerdata(inode); +} + +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) +{ + return OVL_I(inode)->cache; +} + +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) +{ + OVL_I(inode)->cache = cache; +} + +void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry) +{ + set_bit(flag, &OVL_E(dentry)->flags); +} + +void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry) +{ + clear_bit(flag, &OVL_E(dentry)->flags); +} + +bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry) +{ + return test_bit(flag, &OVL_E(dentry)->flags); +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_OPAQUE, dentry); +} + +bool ovl_dentry_is_whiteout(struct dentry *dentry) +{ + return !dentry->d_inode && ovl_dentry_is_opaque(dentry); +} + +void ovl_dentry_set_opaque(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); +} + +/* + * For hard links and decoded file handles, it's possible for ovl_dentry_upper() + * to return positive, while there's no actual upper alias for the inode. + * Copy up code needs to know about the existence of the upper alias, so it + * can't use ovl_dentry_upper(). + */ +bool ovl_dentry_has_upper_alias(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_UPPER_ALIAS, dentry); +} + +void ovl_dentry_set_upper_alias(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry); +} + +static bool ovl_should_check_upperdata(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return false; + + if (!ovl_inode_lower(inode)) + return false; + + return true; +} + +bool ovl_has_upperdata(struct inode *inode) +{ + if (!ovl_should_check_upperdata(inode)) + return true; + + if (!ovl_test_flag(OVL_UPPERDATA, inode)) + return false; + /* + * Pairs with smp_wmb() in ovl_set_upperdata(). Main user of + * ovl_has_upperdata() is ovl_copy_up_meta_inode_data(). Make sure + * if setting of OVL_UPPERDATA is visible, then effects of writes + * before that are visible too. + */ + smp_rmb(); + return true; +} + +void ovl_set_upperdata(struct inode *inode) +{ + /* + * Pairs with smp_rmb() in ovl_has_upperdata(). Make sure + * if OVL_UPPERDATA flag is visible, then effects of write operations + * before it are visible as well. + */ + smp_wmb(); + ovl_set_flag(OVL_UPPERDATA, inode); +} + +/* Caller should hold ovl_inode->lock */ +bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags) +{ + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return !ovl_test_flag(OVL_UPPERDATA, d_inode(dentry)); +} + +bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags) +{ + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return !ovl_has_upperdata(d_inode(dentry)); +} + +bool ovl_redirect_dir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.redirect_dir && !ofs->noxattr; +} + +const char *ovl_dentry_get_redirect(struct dentry *dentry) +{ + return OVL_I(d_inode(dentry))->redirect; +} + +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) +{ + struct ovl_inode *oi = OVL_I(d_inode(dentry)); + + kfree(oi->redirect); + oi->redirect = redirect; +} + +void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) +{ + struct inode *upperinode = d_inode(upperdentry); + + WARN_ON(OVL_I(inode)->__upperdentry); + + /* + * Make sure upperdentry is consistent before making it visible + */ + smp_wmb(); + OVL_I(inode)->__upperdentry = upperdentry; + if (inode_unhashed(inode)) { + inode->i_private = upperinode; + __insert_inode_hash(inode, (unsigned long) upperinode); + } +} + +static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) +{ + struct inode *inode = d_inode(dentry); + + WARN_ON(!inode_is_locked(inode)); + WARN_ON(!d_is_dir(dentry)); + /* + * Version is used by readdir code to keep cache consistent. + * For merge dirs (or dirs with origin) all changes need to be noted. + * For non-merge dirs, cache contains only impure entries (i.e. ones + * which have been copied up and have origins), so only need to note + * changes to impure entries. + */ + if (!ovl_dir_is_real(dentry) || impurity) + OVL_I(inode)->version++; +} + +void ovl_dir_modified(struct dentry *dentry, bool impurity) +{ + /* Copy mtime/ctime */ + ovl_copyattr(d_inode(dentry)); + + ovl_dir_version_inc(dentry, impurity); +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + WARN_ON(!inode_is_locked(inode)); + return OVL_I(inode)->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +struct file *ovl_path_open(const struct path *path, int flags) +{ + struct inode *inode = d_inode(path->dentry); + struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); + int err, acc_mode; + + if (flags & ~(O_ACCMODE | O_LARGEFILE)) + BUG(); + + switch (flags & O_ACCMODE) { + case O_RDONLY: + acc_mode = MAY_READ; + break; + case O_WRONLY: + acc_mode = MAY_WRITE; + break; + default: + BUG(); + } + + err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN); + if (err) + return ERR_PTR(err); + + /* O_NOATIME is an optimization, don't fail if not permitted */ + if (inode_owner_or_capable(real_mnt_userns, inode)) + flags |= O_NOATIME; + + return dentry_open(path, flags, current_cred()); +} + +/* Caller should hold ovl_inode->lock */ +static bool ovl_already_copied_up_locked(struct dentry *dentry, int flags) +{ + bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; + + if (ovl_dentry_upper(dentry) && + (ovl_dentry_has_upper_alias(dentry) || disconnected) && + !ovl_dentry_needs_data_copy_up_locked(dentry, flags)) + return true; + + return false; +} + +bool ovl_already_copied_up(struct dentry *dentry, int flags) +{ + bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; + + /* + * Check if copy-up has happened as well as for upper alias (in + * case of hard links) is there. + * + * Both checks are lockless: + * - false negatives: will recheck under oi->lock + * - false positives: + * + ovl_dentry_upper() uses memory barriers to ensure the + * upper dentry is up-to-date + * + ovl_dentry_has_upper_alias() relies on locking of + * upper parent i_rwsem to prevent reordering copy-up + * with rename. + */ + if (ovl_dentry_upper(dentry) && + (ovl_dentry_has_upper_alias(dentry) || disconnected) && + !ovl_dentry_needs_data_copy_up(dentry, flags)) + return true; + + return false; +} + +int ovl_copy_up_start(struct dentry *dentry, int flags) +{ + struct inode *inode = d_inode(dentry); + int err; + + err = ovl_inode_lock_interruptible(inode); + if (!err && ovl_already_copied_up_locked(dentry, flags)) { + err = 1; /* Already copied up */ + ovl_inode_unlock(inode); + } + + return err; +} + +void ovl_copy_up_end(struct dentry *dentry) +{ + ovl_inode_unlock(d_inode(dentry)); +} + +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path) +{ + int res; + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0); + + /* Zero size value means "copied up but origin unknown" */ + if (res >= 0) + return true; + + return false; +} + +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox) +{ + int res; + char val; + + if (!d_is_dir(path->dentry)) + return false; + + res = ovl_path_getxattr(ofs, path, ox, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +#define OVL_XATTR_OPAQUE_POSTFIX "opaque" +#define OVL_XATTR_REDIRECT_POSTFIX "redirect" +#define OVL_XATTR_ORIGIN_POSTFIX "origin" +#define OVL_XATTR_IMPURE_POSTFIX "impure" +#define OVL_XATTR_NLINK_POSTFIX "nlink" +#define OVL_XATTR_UPPER_POSTFIX "upper" +#define OVL_XATTR_METACOPY_POSTFIX "metacopy" +#define OVL_XATTR_PROTATTR_POSTFIX "protattr" + +#define OVL_XATTR_TAB_ENTRY(x) \ + [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ + [true] = OVL_XATTR_USER_PREFIX x ## _POSTFIX } + +const char *const ovl_xattr_table[][2] = { + OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), +}; + +int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, + enum ovl_xattr ox, const void *value, size_t size, + int xerr) +{ + int err; + + if (ofs->noxattr) + return xerr; + + err = ovl_setxattr(ofs, upperdentry, ox, value, size); + + if (err == -EOPNOTSUPP) { + pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox)); + ofs->noxattr = true; + return xerr; + } + + return err; +} + +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + int err; + + if (ovl_test_flag(OVL_IMPURE, d_inode(dentry))) + return 0; + + /* + * Do not fail when upper doesn't support xattrs. + * Upper inodes won't have origin nor redirect xattr anyway. + */ + err = ovl_check_setxattr(ofs, upperdentry, OVL_XATTR_IMPURE, "y", 1, 0); + if (!err) + ovl_set_flag(OVL_IMPURE, d_inode(dentry)); + + return err; +} + + +#define OVL_PROTATTR_MAX 32 /* Reserved for future flags */ + +void ovl_check_protattr(struct inode *inode, struct dentry *upper) +{ + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + u32 iflags = inode->i_flags & OVL_PROT_I_FLAGS_MASK; + char buf[OVL_PROTATTR_MAX+1]; + int res, n; + + res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf, + OVL_PROTATTR_MAX); + if (res < 0) + return; + + /* + * Initialize inode flags from overlay.protattr xattr and upper inode + * flags. If upper inode has those fileattr flags set (i.e. from old + * kernel), we do not clear them on ovl_get_inode(), but we will clear + * them on next fileattr_set(). + */ + for (n = 0; n < res; n++) { + if (buf[n] == 'a') + iflags |= S_APPEND; + else if (buf[n] == 'i') + iflags |= S_IMMUTABLE; + else + break; + } + + if (!res || n < res) { + pr_warn_ratelimited("incompatible overlay.protattr format (%pd2, len=%d)\n", + upper, res); + } else { + inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); + } +} + +int ovl_set_protattr(struct inode *inode, struct dentry *upper, + struct fileattr *fa) +{ + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + char buf[OVL_PROTATTR_MAX]; + int len = 0, err = 0; + u32 iflags = 0; + + BUILD_BUG_ON(HWEIGHT32(OVL_PROT_FS_FLAGS_MASK) > OVL_PROTATTR_MAX); + + if (fa->flags & FS_APPEND_FL) { + buf[len++] = 'a'; + iflags |= S_APPEND; + } + if (fa->flags & FS_IMMUTABLE_FL) { + buf[len++] = 'i'; + iflags |= S_IMMUTABLE; + } + + /* + * Do not allow to set protection flags when upper doesn't support + * xattrs, because we do not set those fileattr flags on upper inode. + * Remove xattr if it exist and all protection flags are cleared. + */ + if (len) { + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR, + buf, len, -EPERM); + } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) { + err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR); + if (err == -EOPNOTSUPP || err == -ENODATA) + err = 0; + } + if (err) + return err; + + inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); + + /* Mask out the fileattr flags that should not be set in upper inode */ + fa->flags &= ~OVL_PROT_FS_FLAGS_MASK; + fa->fsx_xflags &= ~OVL_PROT_FSX_FLAGS_MASK; + + return 0; +} + +/** + * Caller must hold a reference to inode to prevent it from being freed while + * it is marked inuse. + */ +bool ovl_inuse_trylock(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + bool locked = false; + + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_OVL_INUSE)) { + inode->i_state |= I_OVL_INUSE; + locked = true; + } + spin_unlock(&inode->i_lock); + + return locked; +} + +void ovl_inuse_unlock(struct dentry *dentry) +{ + if (dentry) { + struct inode *inode = d_inode(dentry); + + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_OVL_INUSE)); + inode->i_state &= ~I_OVL_INUSE; + spin_unlock(&inode->i_lock); + } +} + +bool ovl_is_inuse(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + bool inuse; + + spin_lock(&inode->i_lock); + inuse = (inode->i_state & I_OVL_INUSE); + spin_unlock(&inode->i_lock); + + return inuse; +} + +/* + * Does this overlay dentry need to be indexed on copy up? + */ +bool ovl_need_index(struct dentry *dentry) +{ + struct dentry *lower = ovl_dentry_lower(dentry); + + if (!lower || !ovl_indexdir(dentry->d_sb)) + return false; + + /* Index all files for NFS export and consistency verification */ + if (ovl_index_all(dentry->d_sb)) + return true; + + /* Index only lower hardlinks on copy up */ + if (!d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return true; + + return false; +} + +/* Caller must hold OVL_I(inode)->lock */ +static void ovl_cleanup_index(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *indexdir = ovl_indexdir(dentry->d_sb); + struct inode *dir = indexdir->d_inode; + struct dentry *lowerdentry = ovl_dentry_lower(dentry); + struct dentry *upperdentry = ovl_dentry_upper(dentry); + struct dentry *index = NULL; + struct inode *inode; + struct qstr name = { }; + int err; + + err = ovl_get_index_name(ofs, lowerdentry, &name); + if (err) + goto fail; + + inode = d_inode(upperdentry); + if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { + pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", + upperdentry, inode->i_ino, inode->i_nlink); + /* + * We either have a bug with persistent union nlink or a lower + * hardlink was added while overlay is mounted. Adding a lower + * hardlink and then unlinking all overlay hardlinks would drop + * overlay nlink to zero before all upper inodes are unlinked. + * As a safety measure, when that situation is detected, set + * the overlay nlink to the index inode nlink minus one for the + * index entry itself. + */ + set_nlink(d_inode(dentry), inode->i_nlink - 1); + ovl_set_nlink_upper(dentry); + goto out; + } + + inode_lock_nested(dir, I_MUTEX_PARENT); + index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); + err = PTR_ERR(index); + if (IS_ERR(index)) { + index = NULL; + } else if (ovl_index_all(dentry->d_sb)) { + /* Whiteout orphan index to block future open by handle */ + err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb), + dir, index); + } else { + /* Cleanup orphan index entries */ + err = ovl_cleanup(ofs, dir, index); + } + + inode_unlock(dir); + if (err) + goto fail; + +out: + kfree(name.name); + dput(index); + return; + +fail: + pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err); + goto out; +} + +/* + * Operations that change overlay inode and upper inode nlink need to be + * synchronized with copy up for persistent nlink accounting. + */ +int ovl_nlink_start(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + const struct cred *old_cred; + int err; + + if (WARN_ON(!inode)) + return -ENOENT; + + /* + * With inodes index is enabled, we store the union overlay nlink + * in an xattr on the index inode. When whiting out an indexed lower, + * we need to decrement the overlay persistent nlink, but before the + * first copy up, we have no upper index inode to store the xattr. + * + * As a workaround, before whiteout/rename over an indexed lower, + * copy up to create the upper index. Creating the upper index will + * initialize the overlay nlink, so it could be dropped if unlink + * or rename succeeds. + * + * TODO: implement metadata only index copy up when called with + * ovl_copy_up_flags(dentry, O_PATH). + */ + if (ovl_need_index(dentry) && !ovl_dentry_has_upper_alias(dentry)) { + err = ovl_copy_up(dentry); + if (err) + return err; + } + + err = ovl_inode_lock_interruptible(inode); + if (err) + return err; + + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) + goto out; + + old_cred = ovl_override_creds(dentry->d_sb); + /* + * The overlay inode nlink should be incremented/decremented IFF the + * upper operation succeeds, along with nlink change of upper inode. + * Therefore, before link/unlink/rename, we store the union nlink + * value relative to the upper inode nlink in an upper inode xattr. + */ + err = ovl_set_nlink_upper(dentry); + revert_creds(old_cred); + +out: + if (err) + ovl_inode_unlock(inode); + + return err; +} + +void ovl_nlink_end(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + ovl_cleanup_index(dentry); + revert_creds(old_cred); + } + + ovl_inode_unlock(inode); +} + +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("failed to lock workdir+upperdir\n"); + return -EIO; +} + +/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) +{ + int res; + + /* Only regular files can have metacopy xattr */ + if (!S_ISREG(d_inode(path->dentry)->i_mode)) + return 0; + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return 0; + /* + * getxattr on user.* may fail with EACCES in case there's no + * read permission on the inode. Not much we can do, other than + * tell the caller that this is not a metacopy inode. + */ + if (ofs->config.userxattr && res == -EACCES) + return 0; + goto out; + } + + return 1; +out: + pr_warn_ratelimited("failed to get metacopy (%i)\n", res); + return res; +} + +bool ovl_is_metacopy_dentry(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (!d_is_reg(dentry)) + return false; + + if (ovl_dentry_upper(dentry)) { + if (!ovl_has_upperdata(d_inode(dentry))) + return true; + return false; + } + + return (oe->numlower > 1); +} + +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0); + if (res == -ENODATA || res == -EOPNOTSUPP) + return NULL; + if (res < 0) + goto fail; + if (res == 0) + goto invalid; + + buf = kzalloc(res + padding + 1, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res); + if (res < 0) + goto fail; + if (res == 0) + goto invalid; + + if (buf[0] == '/') { + for (s = buf; *s++ == '/'; s = next) { + next = strchrnul(s, '/'); + if (s == next) + goto invalid; + } + } else { + if (strchr(buf, '/') != NULL) + goto invalid; + } + + return buf; +invalid: + pr_warn_ratelimited("invalid redirect (%s)\n", buf); + res = -EINVAL; + goto err_free; +fail: + pr_warn_ratelimited("failed to get redirect (%i)\n", res); +err_free: + kfree(buf); + return ERR_PTR(res); +} + +/* + * ovl_sync_status() - Check fs sync status for volatile mounts + * + * Returns 1 if this is not a volatile mount and a real sync is required. + * + * Returns 0 if syncing can be skipped because mount is volatile, and no errors + * have occurred on the upperdir since the mount. + * + * Returns -errno if it is a volatile mount, and the error that occurred since + * the last mount. If the error code changes, it'll return the latest error + * code. + */ + +int ovl_sync_status(struct ovl_fs *ofs) +{ + struct vfsmount *mnt; + + if (ovl_should_sync(ofs)) + return 1; + + mnt = ovl_upper_mnt(ofs); + if (!mnt) + return 0; + + return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); +} + +/* + * ovl_copyattr() - copy inode attributes from layer to ovl inode + * + * When overlay copies inode information from an upper or lower layer to the + * relevant overlay inode it will apply the idmapping of the upper or lower + * layer when doing so ensuring that the ovl inode ownership will correctly + * reflect the ownership of the idmapped upper or lower layer. For example, an + * idmapped upper or lower layer mapping id 1001 to id 1000 will take care to + * map any lower or upper inode owned by id 1001 to id 1000. These mapping + * helpers are nops when the relevant layer isn't idmapped. + */ +void ovl_copyattr(struct inode *inode) +{ + struct path realpath; + struct inode *realinode; + struct user_namespace *real_mnt_userns; + + realinode = ovl_i_path_real(inode, &realpath); + real_mnt_userns = mnt_user_ns(realpath.mnt); + + inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); + inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + inode->i_mode = realinode->i_mode; + inode->i_atime = realinode->i_atime; + inode->i_mtime = realinode->i_mtime; + inode->i_ctime = realinode->i_ctime; + i_size_write(inode, i_size_read(realinode)); +} |