summaryrefslogtreecommitdiffstats
path: root/fs/overlayfs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /fs/overlayfs
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/overlayfs')
-rw-r--r--fs/overlayfs/Kconfig126
-rw-r--r--fs/overlayfs/Makefile9
-rw-r--r--fs/overlayfs/copy_up.c1108
-rw-r--r--fs/overlayfs/dir.c1331
-rw-r--r--fs/overlayfs/export.c876
-rw-r--r--fs/overlayfs/file.c716
-rw-r--r--fs/overlayfs/inode.c1285
-rw-r--r--fs/overlayfs/namei.c1203
-rw-r--r--fs/overlayfs/overlayfs.h698
-rw-r--r--fs/overlayfs/ovl_entry.h160
-rw-r--r--fs/overlayfs/readdir.c1235
-rw-r--r--fs/overlayfs/super.c2244
-rw-r--r--fs/overlayfs/util.c1136
13 files changed, 12127 insertions, 0 deletions
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
new file mode 100644
index 000000000..dd188c799
--- /dev/null
+++ b/fs/overlayfs/Kconfig
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config OVERLAY_FS
+ tristate "Overlay filesystem support"
+ select EXPORTFS
+ help
+ An overlay filesystem combines two filesystems - an 'upper' filesystem
+ and a 'lower' filesystem. When a name exists in both filesystems, the
+ object in the 'upper' filesystem is visible while the object in the
+ 'lower' filesystem is either hidden or, in the case of directories,
+ merged with the 'upper' object.
+
+ For more information see Documentation/filesystems/overlayfs.rst
+
+config OVERLAY_FS_REDIRECT_DIR
+ bool "Overlayfs: turn on redirect directory feature by default"
+ depends on OVERLAY_FS
+ help
+ If this config option is enabled then overlay filesystems will use
+ redirects when renaming directories by default. In this case it is
+ still possible to turn off redirects globally with the
+ "redirect_dir=off" module option or on a filesystem instance basis
+ with the "redirect_dir=off" mount option.
+
+ Note, that redirects are not backward compatible. That is, mounting
+ an overlay which has redirects on a kernel that doesn't support this
+ feature will have unexpected results.
+
+ If unsure, say N.
+
+config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
+ bool "Overlayfs: follow redirects even if redirects are turned off"
+ default y
+ depends on OVERLAY_FS
+ help
+ Disable this to get a possibly more secure configuration, but that
+ might not be backward compatible with previous kernels.
+
+ If backward compatibility is not an issue, then it is safe and
+ recommended to say N here.
+
+ For more information, see Documentation/filesystems/overlayfs.rst
+
+ If unsure, say Y.
+
+config OVERLAY_FS_INDEX
+ bool "Overlayfs: turn on inodes index feature by default"
+ depends on OVERLAY_FS
+ help
+ If this config option is enabled then overlay filesystems will use
+ the index directory to map lower inodes to upper inodes by default.
+ In this case it is still possible to turn off index globally with the
+ "index=off" module option or on a filesystem instance basis with the
+ "index=off" mount option.
+
+ The inodes index feature prevents breaking of lower hardlinks on copy
+ up.
+
+ Note, that the inodes index feature is not backward compatible.
+ That is, mounting an overlay which has an inodes index on a kernel
+ that doesn't support this feature will have unexpected results.
+
+ If unsure, say N.
+
+config OVERLAY_FS_NFS_EXPORT
+ bool "Overlayfs: turn on NFS export feature by default"
+ depends on OVERLAY_FS
+ depends on OVERLAY_FS_INDEX
+ depends on !OVERLAY_FS_METACOPY
+ help
+ If this config option is enabled then overlay filesystems will use
+ the index directory to decode overlay NFS file handles by default.
+ In this case, it is still possible to turn off NFS export support
+ globally with the "nfs_export=off" module option or on a filesystem
+ instance basis with the "nfs_export=off" mount option.
+
+ The NFS export feature creates an index on copy up of every file and
+ directory. This full index is used to detect overlay filesystems
+ inconsistencies on lookup, like redirect from multiple upper dirs to
+ the same lower dir. The full index may incur some overhead on mount
+ time, especially when verifying that directory file handles are not
+ stale.
+
+ Note, that the NFS export feature is not backward compatible.
+ That is, mounting an overlay which has a full index on a kernel
+ that doesn't support this feature will have unexpected results.
+
+ Most users should say N here and enable this feature on a case-by-
+ case basis with the "nfs_export=on" mount option.
+
+ Say N unless you fully understand the consequences.
+
+config OVERLAY_FS_XINO_AUTO
+ bool "Overlayfs: auto enable inode number mapping"
+ default n
+ depends on OVERLAY_FS
+ depends on 64BIT
+ help
+ If this config option is enabled then overlay filesystems will use
+ unused high bits in undelying filesystem inode numbers to map all
+ inodes to a unified address space. The mapped 64bit inode numbers
+ might not be compatible with applications that expect 32bit inodes.
+
+ If compatibility with applications that expect 32bit inodes is not an
+ issue, then it is safe and recommended to say Y here.
+
+ For more information, see Documentation/filesystems/overlayfs.rst
+
+ If unsure, say N.
+
+config OVERLAY_FS_METACOPY
+ bool "Overlayfs: turn on metadata only copy up feature by default"
+ depends on OVERLAY_FS
+ select OVERLAY_FS_REDIRECT_DIR
+ help
+ If this config option is enabled then overlay filesystems will
+ copy up only metadata where appropriate and data copy up will
+ happen when a file is opened for WRITE operation. It is still
+ possible to turn off this feature globally with the "metacopy=off"
+ module option or on a filesystem instance basis with the
+ "metacopy=off" mount option.
+
+ Note, that this feature is not backward compatible. That is,
+ mounting an overlay which has metacopy only inodes on a kernel
+ that doesn't support this feature will have unexpected results.
+
+ If unsure, say N.
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
new file mode 100644
index 000000000..9164c585e
--- /dev/null
+++ b/fs/overlayfs/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the overlay filesystem.
+#
+
+obj-$(CONFIG_OVERLAY_FS) += overlay.o
+
+overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \
+ copy_up.o export.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
new file mode 100644
index 000000000..86d4b6975
--- /dev/null
+++ b/fs/overlayfs/copy_up.c
@@ -0,0 +1,1108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fileattr.h>
+#include <linux/splice.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/cred.h>
+#include <linux/namei.h>
+#include <linux/fdtable.h>
+#include <linux/ratelimit.h>
+#include <linux/exportfs.h>
+#include "overlayfs.h"
+
+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+
+static int ovl_ccup_set(const char *buf, const struct kernel_param *param)
+{
+ pr_warn("\"check_copy_up\" module option is obsolete\n");
+ return 0;
+}
+
+static int ovl_ccup_get(char *buf, const struct kernel_param *param)
+{
+ return sprintf(buf, "N\n");
+}
+
+module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644);
+MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing");
+
+static bool ovl_must_copy_xattr(const char *name)
+{
+ return !strcmp(name, XATTR_POSIX_ACL_ACCESS) ||
+ !strcmp(name, XATTR_POSIX_ACL_DEFAULT) ||
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+}
+
+int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new)
+{
+ struct dentry *old = oldpath->dentry;
+ ssize_t list_size, size, value_size = 0;
+ char *buf, *name, *value = NULL;
+ int error = 0;
+ size_t slen;
+
+ if (!(old->d_inode->i_opflags & IOP_XATTR) ||
+ !(new->d_inode->i_opflags & IOP_XATTR))
+ return 0;
+
+ list_size = vfs_listxattr(old, NULL, 0);
+ if (list_size <= 0) {
+ if (list_size == -EOPNOTSUPP)
+ return 0;
+ return list_size;
+ }
+
+ buf = kvzalloc(list_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ list_size = vfs_listxattr(old, buf, list_size);
+ if (list_size <= 0) {
+ error = list_size;
+ goto out;
+ }
+
+ for (name = buf; list_size; name += slen) {
+ slen = strnlen(name, list_size) + 1;
+
+ /* underlying fs providing us with an broken xattr list? */
+ if (WARN_ON(slen > list_size)) {
+ error = -EIO;
+ break;
+ }
+ list_size -= slen;
+
+ if (ovl_is_private_xattr(sb, name))
+ continue;
+
+ error = security_inode_copy_up_xattr(name);
+ if (error < 0 && error != -EOPNOTSUPP)
+ break;
+ if (error == 1) {
+ error = 0;
+ continue; /* Discard */
+ }
+retry:
+ size = ovl_do_getxattr(oldpath, name, value, value_size);
+ if (size == -ERANGE)
+ size = ovl_do_getxattr(oldpath, name, NULL, 0);
+
+ if (size < 0) {
+ error = size;
+ break;
+ }
+
+ if (size > value_size) {
+ void *new;
+
+ new = kvmalloc(size, GFP_KERNEL);
+ if (!new) {
+ error = -ENOMEM;
+ break;
+ }
+ kvfree(value);
+ value = new;
+ value_size = size;
+ goto retry;
+ }
+
+ error = ovl_do_setxattr(OVL_FS(sb), new, name, value, size, 0);
+ if (error) {
+ if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
+ break;
+
+ /* Ignore failure to copy unknown xattrs */
+ error = 0;
+ }
+ }
+ kvfree(value);
+out:
+ kvfree(buf);
+ return error;
+}
+
+static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
+ const struct path *new)
+{
+ struct fileattr oldfa = { .flags_valid = true };
+ struct fileattr newfa = { .flags_valid = true };
+ int err;
+
+ err = ovl_real_fileattr_get(old, &oldfa);
+ if (err) {
+ /* Ntfs-3g returns -EINVAL for "no fileattr support" */
+ if (err == -ENOTTY || err == -EINVAL)
+ return 0;
+ pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
+ old->dentry, err);
+ return err;
+ }
+
+ /*
+ * We cannot set immutable and append-only flags on upper inode,
+ * because we would not be able to link upper inode to upper dir
+ * not set overlay private xattr on upper inode.
+ * Store these flags in overlay.protattr xattr instead.
+ */
+ if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) {
+ err = ovl_set_protattr(inode, new->dentry, &oldfa);
+ if (err == -EPERM)
+ pr_warn_once("copying fileattr: no xattr on upper\n");
+ else if (err)
+ return err;
+ }
+
+ /* Don't bother copying flags if none are set */
+ if (!(oldfa.flags & OVL_COPY_FS_FLAGS_MASK))
+ return 0;
+
+ err = ovl_real_fileattr_get(new, &newfa);
+ if (err) {
+ /*
+ * Returning an error if upper doesn't support fileattr will
+ * result in a regression, so revert to the old behavior.
+ */
+ if (err == -ENOTTY || err == -EINVAL) {
+ pr_warn_once("copying fileattr: no support on upper\n");
+ return 0;
+ }
+ pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n",
+ new->dentry, err);
+ return err;
+ }
+
+ BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL);
+ newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK;
+ newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK);
+
+ BUILD_BUG_ON(OVL_COPY_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);
+ newfa.fsx_xflags &= ~OVL_COPY_FSX_FLAGS_MASK;
+ newfa.fsx_xflags |= (oldfa.fsx_xflags & OVL_COPY_FSX_FLAGS_MASK);
+
+ return ovl_real_fileattr_set(new, &newfa);
+}
+
+static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
+ struct file *new_file, loff_t len)
+{
+ struct path datapath;
+ struct file *old_file;
+ loff_t old_pos = 0;
+ loff_t new_pos = 0;
+ loff_t cloned;
+ loff_t data_pos = -1;
+ loff_t hole_len;
+ bool skip_hole = false;
+ int error = 0;
+
+ ovl_path_lowerdata(dentry, &datapath);
+ if (WARN_ON(datapath.dentry == NULL))
+ return -EIO;
+
+ old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY);
+ if (IS_ERR(old_file))
+ return PTR_ERR(old_file);
+
+ /* Try to use clone_file_range to clone up within the same fs */
+ cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+ if (cloned == len)
+ goto out_fput;
+ /* Couldn't clone, so now we try to copy the data */
+
+ /* Check if lower fs supports seek operation */
+ if (old_file->f_mode & FMODE_LSEEK)
+ skip_hole = true;
+
+ while (len) {
+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
+ long bytes;
+
+ if (len < this_len)
+ this_len = len;
+
+ if (signal_pending_state(TASK_KILLABLE, current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /*
+ * Fill zero for hole will cost unnecessary disk space
+ * and meanwhile slow down the copy-up speed, so we do
+ * an optimization for hole during copy-up, it relies
+ * on SEEK_DATA implementation in lower fs so if lower
+ * fs does not support it, copy-up will behave as before.
+ *
+ * Detail logic of hole detection as below:
+ * When we detect next data position is larger than current
+ * position we will skip that hole, otherwise we copy
+ * data in the size of OVL_COPY_UP_CHUNK_SIZE. Actually,
+ * it may not recognize all kind of holes and sometimes
+ * only skips partial of hole area. However, it will be
+ * enough for most of the use cases.
+ */
+
+ if (skip_hole && data_pos < old_pos) {
+ data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA);
+ if (data_pos > old_pos) {
+ hole_len = data_pos - old_pos;
+ len -= hole_len;
+ old_pos = new_pos = data_pos;
+ continue;
+ } else if (data_pos == -ENXIO) {
+ break;
+ } else if (data_pos < 0) {
+ skip_hole = false;
+ }
+ }
+
+ bytes = do_splice_direct(old_file, &old_pos,
+ new_file, &new_pos,
+ this_len, SPLICE_F_MOVE);
+ if (bytes <= 0) {
+ error = bytes;
+ break;
+ }
+ WARN_ON(old_pos != new_pos);
+
+ len -= bytes;
+ }
+ if (!error && ovl_should_sync(ofs))
+ error = vfs_fsync(new_file, 0);
+out_fput:
+ fput(old_file);
+ return error;
+}
+
+static int ovl_set_size(struct ovl_fs *ofs,
+ struct dentry *upperdentry, struct kstat *stat)
+{
+ struct iattr attr = {
+ .ia_valid = ATTR_SIZE,
+ .ia_size = stat->size,
+ };
+
+ return ovl_do_notify_change(ofs, upperdentry, &attr);
+}
+
+static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
+ struct kstat *stat)
+{
+ struct iattr attr = {
+ .ia_valid =
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME,
+ .ia_atime = stat->atime,
+ .ia_mtime = stat->mtime,
+ };
+
+ return ovl_do_notify_change(ofs, upperdentry, &attr);
+}
+
+int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
+ struct kstat *stat)
+{
+ int err = 0;
+
+ if (!S_ISLNK(stat->mode)) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = stat->mode,
+ };
+ err = ovl_do_notify_change(ofs, upperdentry, &attr);
+ }
+ if (!err) {
+ struct iattr attr = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_vfsuid = VFSUIDT_INIT(stat->uid),
+ .ia_vfsgid = VFSGIDT_INIT(stat->gid),
+ };
+ err = ovl_do_notify_change(ofs, upperdentry, &attr);
+ }
+ if (!err)
+ ovl_set_timestamps(ofs, upperdentry, stat);
+
+ return err;
+}
+
+struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+ bool is_upper)
+{
+ struct ovl_fh *fh;
+ int fh_type, dwords;
+ int buflen = MAX_HANDLE_SZ;
+ uuid_t *uuid = &real->d_sb->s_uuid;
+ int err;
+
+ /* Make sure the real fid stays 32bit aligned */
+ BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4);
+ BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255);
+
+ fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL);
+ if (!fh)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * We encode a non-connectable file handle for non-dir, because we
+ * only need to find the lower inode number and we don't want to pay
+ * the price or reconnecting the dentry.
+ */
+ dwords = buflen >> 2;
+ fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0);
+ buflen = (dwords << 2);
+
+ err = -EIO;
+ if (WARN_ON(fh_type < 0) ||
+ WARN_ON(buflen > MAX_HANDLE_SZ) ||
+ WARN_ON(fh_type == FILEID_INVALID))
+ goto out_err;
+
+ fh->fb.version = OVL_FH_VERSION;
+ fh->fb.magic = OVL_FH_MAGIC;
+ fh->fb.type = fh_type;
+ fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN;
+ /*
+ * When we will want to decode an overlay dentry from this handle
+ * and all layers are on the same fs, if we get a disconncted real
+ * dentry when we decode fid, the only way to tell if we should assign
+ * it to upperdentry or to lowerstack is by checking this flag.
+ */
+ if (is_upper)
+ fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER;
+ fh->fb.len = sizeof(fh->fb) + buflen;
+ if (ofs->config.uuid)
+ fh->fb.uuid = *uuid;
+
+ return fh;
+
+out_err:
+ kfree(fh);
+ return ERR_PTR(err);
+}
+
+int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
+ struct dentry *upper)
+{
+ const struct ovl_fh *fh = NULL;
+ int err;
+
+ /*
+ * When lower layer doesn't support export operations store a 'null' fh,
+ * so we can use the overlay.origin xattr to distignuish between a copy
+ * up and a pure upper inode.
+ */
+ if (ovl_can_decode_fh(lower->d_sb)) {
+ fh = ovl_encode_real_fh(ofs, lower, false);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+ }
+
+ /*
+ * Do not fail when upper doesn't support xattrs.
+ */
+ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf,
+ fh ? fh->fb.len : 0, 0);
+ kfree(fh);
+
+ /* Ignore -EPERM from setting "user.*" on symlink/special */
+ return err == -EPERM ? 0 : err;
+}
+
+/* Store file handle of @upper dir in @index dir entry */
+static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
+ struct dentry *index)
+{
+ const struct ovl_fh *fh;
+ int err;
+
+ fh = ovl_encode_real_fh(ofs, upper, true);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+ err = ovl_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len);
+
+ kfree(fh);
+ return err;
+}
+
+/*
+ * Create and install index entry.
+ *
+ * Caller must hold i_mutex on indexdir.
+ */
+static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
+ struct dentry *upper)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
+ struct inode *dir = d_inode(indexdir);
+ struct dentry *index = NULL;
+ struct dentry *temp = NULL;
+ struct qstr name = { };
+ int err;
+
+ /*
+ * For now this is only used for creating index entry for directories,
+ * because non-dir are copied up directly to index and then hardlinked
+ * to upper dir.
+ *
+ * TODO: implement create index for non-dir, so we can call it when
+ * encoding file handle for non-dir in case index does not exist.
+ */
+ if (WARN_ON(!d_is_dir(dentry)))
+ return -EIO;
+
+ /* Directory not expected to be indexed before copy up */
+ if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
+ return -EIO;
+
+ err = ovl_get_index_name(ofs, origin, &name);
+ if (err)
+ return err;
+
+ temp = ovl_create_temp(ofs, indexdir, OVL_CATTR(S_IFDIR | 0));
+ err = PTR_ERR(temp);
+ if (IS_ERR(temp))
+ goto free_name;
+
+ err = ovl_set_upper_fh(ofs, upper, temp);
+ if (err)
+ goto out;
+
+ index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
+ if (IS_ERR(index)) {
+ err = PTR_ERR(index);
+ } else {
+ err = ovl_do_rename(ofs, dir, temp, dir, index, 0);
+ dput(index);
+ }
+out:
+ if (err)
+ ovl_cleanup(ofs, dir, temp);
+ dput(temp);
+free_name:
+ kfree(name.name);
+ return err;
+}
+
+struct ovl_copy_up_ctx {
+ struct dentry *parent;
+ struct dentry *dentry;
+ struct path lowerpath;
+ struct kstat stat;
+ struct kstat pstat;
+ const char *link;
+ struct dentry *destdir;
+ struct qstr destname;
+ struct dentry *workdir;
+ bool origin;
+ bool indexed;
+ bool metacopy;
+};
+
+static int ovl_link_up(struct ovl_copy_up_ctx *c)
+{
+ int err;
+ struct dentry *upper;
+ struct dentry *upperdir = ovl_dentry_upper(c->parent);
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct inode *udir = d_inode(upperdir);
+
+ /* Mark parent "impure" because it may now contain non-pure upper */
+ err = ovl_set_impure(c->parent, upperdir);
+ if (err)
+ return err;
+
+ err = ovl_set_nlink_lower(c->dentry);
+ if (err)
+ return err;
+
+ inode_lock_nested(udir, I_MUTEX_PARENT);
+ upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
+ c->dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (!IS_ERR(upper)) {
+ err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper);
+ dput(upper);
+
+ if (!err) {
+ /* Restore timestamps on parent (best effort) */
+ ovl_set_timestamps(ofs, upperdir, &c->pstat);
+ ovl_dentry_set_upper_alias(c->dentry);
+ ovl_dentry_update_reval(c->dentry, upper);
+ }
+ }
+ inode_unlock(udir);
+ if (err)
+ return err;
+
+ err = ovl_set_nlink_upper(c->dentry);
+
+ return err;
+}
+
+static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp)
+{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct file *new_file;
+ int err;
+
+ if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size)
+ return 0;
+
+ new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY);
+ if (IS_ERR(new_file))
+ return PTR_ERR(new_file);
+
+ err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size);
+ fput(new_file);
+
+ return err;
+}
+
+static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
+{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct inode *inode = d_inode(c->dentry);
+ struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp };
+ int err;
+
+ err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp);
+ if (err)
+ return err;
+
+ if (inode->i_flags & OVL_COPY_I_FLAGS_MASK &&
+ (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) {
+ /*
+ * Copy the fileattr inode flags that are the source of already
+ * copied i_flags
+ */
+ err = ovl_copy_fileattr(inode, &c->lowerpath, &upperpath);
+ if (err)
+ return err;
+ }
+
+ /*
+ * Store identifier of lower inode in upper inode xattr to
+ * allow lookup of the copy up origin inode.
+ *
+ * Don't set origin when we are breaking the association with a lower
+ * hard link.
+ */
+ if (c->origin) {
+ err = ovl_set_origin(ofs, c->lowerpath.dentry, temp);
+ if (err)
+ return err;
+ }
+
+ if (c->metacopy) {
+ err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY,
+ NULL, 0, -EOPNOTSUPP);
+ if (err)
+ return err;
+ }
+
+ inode_lock(temp->d_inode);
+ if (S_ISREG(c->stat.mode))
+ err = ovl_set_size(ofs, temp, &c->stat);
+ if (!err)
+ err = ovl_set_attr(ofs, temp, &c->stat);
+ inode_unlock(temp->d_inode);
+
+ return err;
+}
+
+struct ovl_cu_creds {
+ const struct cred *old;
+ struct cred *new;
+};
+
+static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc)
+{
+ int err;
+
+ cc->old = cc->new = NULL;
+ err = security_inode_copy_up(dentry, &cc->new);
+ if (err < 0)
+ return err;
+
+ if (cc->new)
+ cc->old = override_creds(cc->new);
+
+ return 0;
+}
+
+static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
+{
+ if (cc->new) {
+ revert_creds(cc->old);
+ put_cred(cc->new);
+ }
+}
+
+/*
+ * Copyup using workdir to prepare temp file. Used when copying up directories,
+ * special files or when upper fs doesn't support O_TMPFILE.
+ */
+static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
+{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct inode *inode;
+ struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
+ struct path path = { .mnt = ovl_upper_mnt(ofs) };
+ struct dentry *temp, *upper;
+ struct ovl_cu_creds cc;
+ int err;
+ struct ovl_cattr cattr = {
+ /* Can't properly set mode on creation because of the umask */
+ .mode = c->stat.mode & S_IFMT,
+ .rdev = c->stat.rdev,
+ .link = c->link
+ };
+
+ /* workdir and destdir could be the same when copying up to indexdir */
+ err = -EIO;
+ if (lock_rename(c->workdir, c->destdir) != NULL)
+ goto unlock;
+
+ err = ovl_prep_cu_creds(c->dentry, &cc);
+ if (err)
+ goto unlock;
+
+ temp = ovl_create_temp(ofs, c->workdir, &cattr);
+ ovl_revert_cu_creds(&cc);
+
+ err = PTR_ERR(temp);
+ if (IS_ERR(temp))
+ goto unlock;
+
+ /*
+ * Copy up data first and then xattrs. Writing data after
+ * xattrs will remove security.capability xattr automatically.
+ */
+ path.dentry = temp;
+ err = ovl_copy_up_data(c, &path);
+ if (err)
+ goto cleanup;
+
+ err = ovl_copy_up_metadata(c, temp);
+ if (err)
+ goto cleanup;
+
+ if (S_ISDIR(c->stat.mode) && c->indexed) {
+ err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
+ if (err)
+ goto cleanup;
+ }
+
+ upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
+ c->destname.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto cleanup;
+
+ err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0);
+ dput(upper);
+ if (err)
+ goto cleanup;
+
+ if (!c->metacopy)
+ ovl_set_upperdata(d_inode(c->dentry));
+ inode = d_inode(c->dentry);
+ ovl_inode_update(inode, temp);
+ if (S_ISDIR(inode->i_mode))
+ ovl_set_flag(OVL_WHITEOUTS, inode);
+unlock:
+ unlock_rename(c->workdir, c->destdir);
+
+ return err;
+
+cleanup:
+ ovl_cleanup(ofs, wdir, temp);
+ dput(temp);
+ goto unlock;
+}
+
+/* Copyup using O_TMPFILE which does not require cross dir locking */
+static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
+{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct inode *udir = d_inode(c->destdir);
+ struct dentry *temp, *upper;
+ struct file *tmpfile;
+ struct ovl_cu_creds cc;
+ int err;
+
+ err = ovl_prep_cu_creds(c->dentry, &cc);
+ if (err)
+ return err;
+
+ tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+ ovl_revert_cu_creds(&cc);
+
+ if (IS_ERR(tmpfile))
+ return PTR_ERR(tmpfile);
+
+ temp = tmpfile->f_path.dentry;
+ if (!c->metacopy && c->stat.size) {
+ err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size);
+ if (err)
+ goto out_fput;
+ }
+
+ err = ovl_copy_up_metadata(c, temp);
+ if (err)
+ goto out_fput;
+
+ inode_lock_nested(udir, I_MUTEX_PARENT);
+
+ upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
+ c->destname.len);
+ err = PTR_ERR(upper);
+ if (!IS_ERR(upper)) {
+ err = ovl_do_link(ofs, temp, udir, upper);
+ dput(upper);
+ }
+ inode_unlock(udir);
+
+ if (err)
+ goto out_fput;
+
+ if (!c->metacopy)
+ ovl_set_upperdata(d_inode(c->dentry));
+ ovl_inode_update(d_inode(c->dentry), dget(temp));
+
+out_fput:
+ fput(tmpfile);
+ return err;
+}
+
+/*
+ * Copy up a single dentry
+ *
+ * All renames start with copy up of source if necessary. The actual
+ * rename will only proceed once the copy up was successful. Copy up uses
+ * upper parent i_mutex for exclusion. Since rename can change d_parent it
+ * is possible that the copy up will lock the old parent. At that point
+ * the file will have already been copied up anyway.
+ */
+static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
+{
+ int err;
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ bool to_index = false;
+
+ /*
+ * Indexed non-dir is copied up directly to the index entry and then
+ * hardlinked to upper dir. Indexed dir is copied up to indexdir,
+ * then index entry is created and then copied up dir installed.
+ * Copying dir up to indexdir instead of workdir simplifies locking.
+ */
+ if (ovl_need_index(c->dentry)) {
+ c->indexed = true;
+ if (S_ISDIR(c->stat.mode))
+ c->workdir = ovl_indexdir(c->dentry->d_sb);
+ else
+ to_index = true;
+ }
+
+ if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
+ c->origin = true;
+
+ if (to_index) {
+ c->destdir = ovl_indexdir(c->dentry->d_sb);
+ err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname);
+ if (err)
+ return err;
+ } else if (WARN_ON(!c->parent)) {
+ /* Disconnected dentry must be copied up to index dir */
+ return -EIO;
+ } else {
+ /*
+ * Mark parent "impure" because it may now contain non-pure
+ * upper
+ */
+ err = ovl_set_impure(c->parent, c->destdir);
+ if (err)
+ return err;
+ }
+
+ /* Should we copyup with O_TMPFILE or with workdir? */
+ if (S_ISREG(c->stat.mode) && ofs->tmpfile)
+ err = ovl_copy_up_tmpfile(c);
+ else
+ err = ovl_copy_up_workdir(c);
+ if (err)
+ goto out;
+
+ if (c->indexed)
+ ovl_set_flag(OVL_INDEX, d_inode(c->dentry));
+
+ if (to_index) {
+ /* Initialize nlink for copy up of disconnected dentry */
+ err = ovl_set_nlink_upper(c->dentry);
+ } else {
+ struct inode *udir = d_inode(c->destdir);
+
+ /* Restore timestamps on parent (best effort) */
+ inode_lock(udir);
+ ovl_set_timestamps(ofs, c->destdir, &c->pstat);
+ inode_unlock(udir);
+
+ ovl_dentry_set_upper_alias(c->dentry);
+ ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry));
+ }
+
+out:
+ if (to_index)
+ kfree(c->destname.name);
+ return err;
+}
+
+static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
+ int flags)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+
+ if (!ofs->config.metacopy)
+ return false;
+
+ if (!S_ISREG(mode))
+ return false;
+
+ if (flags && ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)))
+ return false;
+
+ return true;
+}
+
+static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **value)
+{
+ ssize_t res;
+ char *buf;
+
+ res = ovl_do_getxattr(path, name, NULL, 0);
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ res = 0;
+
+ if (res > 0) {
+ buf = kzalloc(res, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ res = ovl_do_getxattr(path, name, buf, res);
+ if (res < 0)
+ kfree(buf);
+ else
+ *value = buf;
+ }
+ return res;
+}
+
+/* Copy up data of an inode which was copied up metadata only in the past. */
+static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
+{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct path upperpath;
+ int err;
+ char *capability = NULL;
+ ssize_t cap_size;
+
+ ovl_path_upper(c->dentry, &upperpath);
+ if (WARN_ON(upperpath.dentry == NULL))
+ return -EIO;
+
+ if (c->stat.size) {
+ err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS,
+ &capability);
+ if (cap_size < 0)
+ goto out;
+ }
+
+ err = ovl_copy_up_data(c, &upperpath);
+ if (err)
+ goto out_free;
+
+ /*
+ * Writing to upper file will clear security.capability xattr. We
+ * don't want that to happen for normal copy-up operation.
+ */
+ if (capability) {
+ err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS,
+ capability, cap_size, 0);
+ if (err)
+ goto out_free;
+ }
+
+
+ err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
+ if (err)
+ goto out_free;
+
+ ovl_set_upperdata(d_inode(c->dentry));
+out_free:
+ kfree(capability);
+out:
+ return err;
+}
+
+static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ int flags)
+{
+ int err;
+ DEFINE_DELAYED_CALL(done);
+ struct path parentpath;
+ struct ovl_copy_up_ctx ctx = {
+ .parent = parent,
+ .dentry = dentry,
+ .workdir = ovl_workdir(dentry),
+ };
+
+ if (WARN_ON(!ctx.workdir))
+ return -EROFS;
+
+ ovl_path_lower(dentry, &ctx.lowerpath);
+ err = vfs_getattr(&ctx.lowerpath, &ctx.stat,
+ STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+ if (err)
+ return err;
+
+ if (!kuid_has_mapping(current_user_ns(), ctx.stat.uid) ||
+ !kgid_has_mapping(current_user_ns(), ctx.stat.gid))
+ return -EOVERFLOW;
+
+ ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags);
+
+ if (parent) {
+ ovl_path_upper(parent, &parentpath);
+ ctx.destdir = parentpath.dentry;
+ ctx.destname = dentry->d_name;
+
+ err = vfs_getattr(&parentpath, &ctx.pstat,
+ STATX_ATIME | STATX_MTIME,
+ AT_STATX_SYNC_AS_STAT);
+ if (err)
+ return err;
+ }
+
+ /* maybe truncate regular file. this has no effect on dirs */
+ if (flags & O_TRUNC)
+ ctx.stat.size = 0;
+
+ if (S_ISLNK(ctx.stat.mode)) {
+ ctx.link = vfs_get_link(ctx.lowerpath.dentry, &done);
+ if (IS_ERR(ctx.link))
+ return PTR_ERR(ctx.link);
+ }
+
+ err = ovl_copy_up_start(dentry, flags);
+ /* err < 0: interrupted, err > 0: raced with another copy-up */
+ if (unlikely(err)) {
+ if (err > 0)
+ err = 0;
+ } else {
+ if (!ovl_dentry_upper(dentry))
+ err = ovl_do_copy_up(&ctx);
+ if (!err && parent && !ovl_dentry_has_upper_alias(dentry))
+ err = ovl_link_up(&ctx);
+ if (!err && ovl_dentry_needs_data_copy_up_locked(dentry, flags))
+ err = ovl_copy_up_meta_inode_data(&ctx);
+ ovl_copy_up_end(dentry);
+ }
+ do_delayed_call(&done);
+
+ return err;
+}
+
+static int ovl_copy_up_flags(struct dentry *dentry, int flags)
+{
+ int err = 0;
+ const struct cred *old_cred;
+ bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
+
+ /*
+ * With NFS export, copy up can get called for a disconnected non-dir.
+ * In this case, we will copy up lower inode to index dir without
+ * linking it to upper dir.
+ */
+ if (WARN_ON(disconnected && d_is_dir(dentry)))
+ return -EIO;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ while (!err) {
+ struct dentry *next;
+ struct dentry *parent = NULL;
+
+ if (ovl_already_copied_up(dentry, flags))
+ break;
+
+ next = dget(dentry);
+ /* find the topmost dentry not yet copied up */
+ for (; !disconnected;) {
+ parent = dget_parent(next);
+
+ if (ovl_dentry_upper(parent))
+ break;
+
+ dput(next);
+ next = parent;
+ }
+
+ err = ovl_copy_up_one(parent, next, flags);
+
+ dput(parent);
+ dput(next);
+ }
+ revert_creds(old_cred);
+
+ return err;
+}
+
+static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
+{
+ /* Copy up of disconnected dentry does not set upper alias */
+ if (ovl_already_copied_up(dentry, flags))
+ return false;
+
+ if (special_file(d_inode(dentry)->i_mode))
+ return false;
+
+ if (!ovl_open_flags_need_copy_up(flags))
+ return false;
+
+ return true;
+}
+
+int ovl_maybe_copy_up(struct dentry *dentry, int flags)
+{
+ int err = 0;
+
+ if (ovl_open_need_copy_up(dentry, flags)) {
+ err = ovl_want_write(dentry);
+ if (!err) {
+ err = ovl_copy_up_flags(dentry, flags);
+ ovl_drop_write(dentry);
+ }
+ }
+
+ return err;
+}
+
+int ovl_copy_up_with_data(struct dentry *dentry)
+{
+ return ovl_copy_up_flags(dentry, O_WRONLY);
+}
+
+int ovl_copy_up(struct dentry *dentry)
+{
+ return ovl_copy_up_flags(dentry, 0);
+}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
new file mode 100644
index 000000000..5339ff08b
--- /dev/null
+++ b/fs/overlayfs/dir.c
@@ -0,0 +1,1331 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include <linux/module.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/atomic.h>
+#include <linux/ratelimit.h>
+#include "overlayfs.h"
+
+static unsigned short ovl_redirect_max = 256;
+module_param_named(redirect_max, ovl_redirect_max, ushort, 0644);
+MODULE_PARM_DESC(redirect_max,
+ "Maximum length of absolute redirect xattr value");
+
+static int ovl_set_redirect(struct dentry *dentry, bool samedir);
+
+int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry)
+{
+ int err;
+
+ dget(wdentry);
+ if (d_is_dir(wdentry))
+ err = ovl_do_rmdir(ofs, wdir, wdentry);
+ else
+ err = ovl_do_unlink(ofs, wdir, wdentry);
+ dput(wdentry);
+
+ if (err) {
+ pr_err("cleanup of '%pd2' failed (%i)\n",
+ wdentry, err);
+ }
+
+ return err;
+}
+
+struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
+{
+ struct dentry *temp;
+ char name[20];
+ static atomic_t temp_id = ATOMIC_INIT(0);
+
+ /* counter is allowed to wrap, since temp dentries are ephemeral */
+ snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id));
+
+ temp = ovl_lookup_upper(ofs, name, workdir, strlen(name));
+ if (!IS_ERR(temp) && temp->d_inode) {
+ pr_err("workdir/%s already exists\n", name);
+ dput(temp);
+ temp = ERR_PTR(-EIO);
+ }
+
+ return temp;
+}
+
+/* caller holds i_mutex on workdir */
+static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
+{
+ int err;
+ struct dentry *whiteout;
+ struct dentry *workdir = ofs->workdir;
+ struct inode *wdir = workdir->d_inode;
+
+ if (!ofs->whiteout) {
+ whiteout = ovl_lookup_temp(ofs, workdir);
+ if (IS_ERR(whiteout))
+ goto out;
+
+ err = ovl_do_whiteout(ofs, wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ goto out;
+ }
+ ofs->whiteout = whiteout;
+ }
+
+ if (ofs->share_whiteout) {
+ whiteout = ovl_lookup_temp(ofs, workdir);
+ if (IS_ERR(whiteout))
+ goto out;
+
+ err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
+ if (!err)
+ goto out;
+
+ if (err != -EMLINK) {
+ pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n",
+ ofs->whiteout->d_inode->i_nlink, err);
+ ofs->share_whiteout = false;
+ }
+ dput(whiteout);
+ }
+ whiteout = ofs->whiteout;
+ ofs->whiteout = NULL;
+out:
+ return whiteout;
+}
+
+/* Caller must hold i_mutex on both workdir and dir */
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *wdir = ofs->workdir->d_inode;
+ struct dentry *whiteout;
+ int err;
+ int flags = 0;
+
+ whiteout = ovl_whiteout(ofs);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ return err;
+
+ if (d_is_dir(dentry))
+ flags = RENAME_EXCHANGE;
+
+ err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags);
+ if (err)
+ goto kill_whiteout;
+ if (flags)
+ ovl_cleanup(ofs, wdir, dentry);
+
+out:
+ dput(whiteout);
+ return err;
+
+kill_whiteout:
+ ovl_cleanup(ofs, wdir, whiteout);
+ goto out;
+}
+
+int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry **newdentry, umode_t mode)
+{
+ int err;
+ struct dentry *d, *dentry = *newdentry;
+
+ err = ovl_do_mkdir(ofs, dir, dentry, mode);
+ if (err)
+ return err;
+
+ if (likely(!d_unhashed(dentry)))
+ return 0;
+
+ /*
+ * vfs_mkdir() may succeed and leave the dentry passed
+ * to it unhashed and negative. If that happens, try to
+ * lookup a new hashed and positive dentry.
+ */
+ d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent,
+ dentry->d_name.len);
+ if (IS_ERR(d)) {
+ pr_warn("failed lookup after mkdir (%pd2, err=%i).\n",
+ dentry, err);
+ return PTR_ERR(d);
+ }
+ dput(dentry);
+ *newdentry = d;
+
+ return 0;
+}
+
+struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *newdentry, struct ovl_cattr *attr)
+{
+ int err;
+
+ if (IS_ERR(newdentry))
+ return newdentry;
+
+ err = -ESTALE;
+ if (newdentry->d_inode)
+ goto out;
+
+ if (attr->hardlink) {
+ err = ovl_do_link(ofs, attr->hardlink, dir, newdentry);
+ } else {
+ switch (attr->mode & S_IFMT) {
+ case S_IFREG:
+ err = ovl_do_create(ofs, dir, newdentry, attr->mode);
+ break;
+
+ case S_IFDIR:
+ /* mkdir is special... */
+ err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode);
+ break;
+
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ err = ovl_do_mknod(ofs, dir, newdentry, attr->mode,
+ attr->rdev);
+ break;
+
+ case S_IFLNK:
+ err = ovl_do_symlink(ofs, dir, newdentry, attr->link);
+ break;
+
+ default:
+ err = -EPERM;
+ }
+ }
+ if (!err && WARN_ON(!newdentry->d_inode)) {
+ /*
+ * Not quite sure if non-instantiated dentry is legal or not.
+ * VFS doesn't seem to care so check and warn here.
+ */
+ err = -EIO;
+ }
+out:
+ if (err) {
+ dput(newdentry);
+ return ERR_PTR(err);
+ }
+ return newdentry;
+}
+
+struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
+ struct ovl_cattr *attr)
+{
+ return ovl_create_real(ofs, d_inode(workdir),
+ ovl_lookup_temp(ofs, workdir), attr);
+}
+
+static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
+ int xerr)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ int err;
+
+ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_OPAQUE, "y", 1, xerr);
+ if (!err)
+ ovl_dentry_set_opaque(dentry);
+
+ return err;
+}
+
+static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
+{
+ /*
+ * Fail with -EIO when trying to create opaque dir and upper doesn't
+ * support xattrs. ovl_rename() calls ovl_set_opaque_xerr(-EXDEV) to
+ * return a specific error for noxattr case.
+ */
+ return ovl_set_opaque_xerr(dentry, upperdentry, -EIO);
+}
+
+/*
+ * Common operations required to be done after creation of file on upper.
+ * If @hardlink is false, then @inode is a pre-allocated inode, we may or
+ * may not use to instantiate the new dentry.
+ */
+static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
+ struct dentry *newdentry, bool hardlink)
+{
+ struct ovl_inode_params oip = {
+ .upperdentry = newdentry,
+ .newinode = inode,
+ };
+
+ ovl_dir_modified(dentry->d_parent, false);
+ ovl_dentry_set_upper_alias(dentry);
+ ovl_dentry_init_reval(dentry, newdentry);
+
+ if (!hardlink) {
+ /*
+ * ovl_obtain_alias() can be called after ovl_create_real()
+ * and before we get here, so we may get an inode from cache
+ * with the same real upperdentry that is not the inode we
+ * pre-allocated. In this case we will use the cached inode
+ * to instantiate the new dentry.
+ *
+ * XXX: if we ever use ovl_obtain_alias() to decode directory
+ * file handles, need to use ovl_get_inode_locked() and
+ * d_instantiate_new() here to prevent from creating two
+ * hashed directory inode aliases.
+ */
+ inode = ovl_get_inode(dentry->d_sb, &oip);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ if (inode == oip.newinode)
+ ovl_set_flag(OVL_UPPERDATA, inode);
+ } else {
+ WARN_ON(ovl_inode_real(inode) != d_inode(newdentry));
+ dput(newdentry);
+ inc_nlink(inode);
+ }
+
+ d_instantiate(dentry, inode);
+ if (inode != oip.newinode) {
+ pr_warn_ratelimited("newly created inode found in cache (%pd2)\n",
+ dentry);
+ }
+
+ /* Force lookup of new upper hardlink to find its lower */
+ if (hardlink)
+ d_drop(dentry);
+
+ return 0;
+}
+
+static bool ovl_type_merge(struct dentry *dentry)
+{
+ return OVL_TYPE_MERGE(ovl_path_type(dentry));
+}
+
+static bool ovl_type_origin(struct dentry *dentry)
+{
+ return OVL_TYPE_ORIGIN(ovl_path_type(dentry));
+}
+
+static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
+ struct ovl_cattr *attr)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *newdentry;
+ int err;
+
+ if (!attr->hardlink && !IS_POSIXACL(udir))
+ attr->mode &= ~current_umask();
+
+ inode_lock_nested(udir, I_MUTEX_PARENT);
+ newdentry = ovl_create_real(ofs, udir,
+ ovl_lookup_upper(ofs, dentry->d_name.name,
+ upperdir, dentry->d_name.len),
+ attr);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+
+ if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
+ !ovl_allow_offline_changes(ofs)) {
+ /* Setting opaque here is just an optimization, allow to fail */
+ ovl_set_opaque(dentry, newdentry);
+ }
+
+ err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink);
+ if (err)
+ goto out_cleanup;
+out_unlock:
+ inode_unlock(udir);
+ return err;
+
+out_cleanup:
+ ovl_cleanup(ofs, udir, newdentry);
+ dput(newdentry);
+ goto out_unlock;
+}
+
+static struct dentry *ovl_clear_empty(struct dentry *dentry,
+ struct list_head *list)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct path upperpath;
+ struct dentry *upper;
+ struct dentry *opaquedir;
+ struct kstat stat;
+ int err;
+
+ if (WARN_ON(!workdir))
+ return ERR_PTR(-EROFS);
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ ovl_path_upper(dentry, &upperpath);
+ err = vfs_getattr(&upperpath, &stat,
+ STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+ if (err)
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (!S_ISDIR(stat.mode))
+ goto out_unlock;
+ upper = upperpath.dentry;
+ if (upper->d_parent->d_inode != udir)
+ goto out_unlock;
+
+ opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode));
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out_unlock;
+
+ err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_set_opaque(dentry, opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ inode_lock(opaquedir->d_inode);
+ err = ovl_set_attr(ofs, opaquedir, &stat);
+ inode_unlock(opaquedir->d_inode);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup_whiteouts(ofs, upper, list);
+ ovl_cleanup(ofs, wdir, upper);
+ unlock_rename(workdir, upperdir);
+
+ /* dentry's upper doesn't match now, get rid of it */
+ d_drop(dentry);
+
+ return opaquedir;
+
+out_cleanup:
+ ovl_cleanup(ofs, wdir, opaquedir);
+ dput(opaquedir);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ return ERR_PTR(err);
+}
+
+static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry,
+ const char *name, const struct posix_acl *acl)
+{
+ void *buffer;
+ size_t size;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
+ return 0;
+
+ size = posix_acl_xattr_size(acl->a_count);
+ buffer = kmalloc(size, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+ if (err < 0)
+ goto out_free;
+
+ err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE);
+out_free:
+ kfree(buffer);
+ return err;
+}
+
+static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
+ struct ovl_cattr *cattr)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *upper;
+ struct dentry *newdentry;
+ int err;
+ struct posix_acl *acl, *default_acl;
+ bool hardlink = !!cattr->hardlink;
+
+ if (WARN_ON(!workdir))
+ return -EROFS;
+
+ if (!hardlink) {
+ err = posix_acl_create(dentry->d_parent->d_inode,
+ &cattr->mode, &default_acl, &acl);
+ if (err)
+ return err;
+ }
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
+ goto out_dput;
+
+ newdentry = ovl_create_temp(ofs, workdir, cattr);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_dput;
+
+ /*
+ * mode could have been mutilated due to umask (e.g. sgid directory)
+ */
+ if (!hardlink &&
+ !S_ISLNK(cattr->mode) &&
+ newdentry->d_inode->i_mode != cattr->mode) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = cattr->mode,
+ };
+ inode_lock(newdentry->d_inode);
+ err = ovl_do_notify_change(ofs, newdentry, &attr);
+ inode_unlock(newdentry->d_inode);
+ if (err)
+ goto out_cleanup;
+ }
+ if (!hardlink) {
+ err = ovl_set_upper_acl(ofs, newdentry,
+ XATTR_NAME_POSIX_ACL_ACCESS, acl);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_set_upper_acl(ofs, newdentry,
+ XATTR_NAME_POSIX_ACL_DEFAULT, default_acl);
+ if (err)
+ goto out_cleanup;
+ }
+
+ if (!hardlink && S_ISDIR(cattr->mode)) {
+ err = ovl_set_opaque(dentry, newdentry);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(ofs, wdir, newdentry, udir, upper,
+ RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup(ofs, wdir, upper);
+ } else {
+ err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0);
+ if (err)
+ goto out_cleanup;
+ }
+ err = ovl_instantiate(dentry, inode, newdentry, hardlink);
+ if (err) {
+ ovl_cleanup(ofs, udir, newdentry);
+ dput(newdentry);
+ }
+out_dput:
+ dput(upper);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ if (!hardlink) {
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+ }
+ return err;
+
+out_cleanup:
+ ovl_cleanup(ofs, wdir, newdentry);
+ dput(newdentry);
+ goto out_dput;
+}
+
+static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
+ struct ovl_cattr *attr, bool origin)
+{
+ int err;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+ struct dentry *parent = dentry->d_parent;
+
+ err = ovl_copy_up(parent);
+ if (err)
+ return err;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+
+ /*
+ * When linking a file with copy up origin into a new parent, mark the
+ * new parent dir "impure".
+ */
+ if (origin) {
+ err = ovl_set_impure(parent, ovl_dentry_upper(parent));
+ if (err)
+ goto out_revert_creds;
+ }
+
+ if (!attr->hardlink) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_revert_creds;
+ /*
+ * In the creation cases(create, mkdir, mknod, symlink),
+ * ovl should transfer current's fs{u,g}id to underlying
+ * fs. Because underlying fs want to initialize its new
+ * inode owner using current's fs{u,g}id. And in this
+ * case, the @inode is a new inode that is initialized
+ * in inode_init_owner() to current's fs{u,g}id. So use
+ * the inode's i_{u,g}id to override the cred's fs{u,g}id.
+ *
+ * But in the other hardlink case, ovl_link() does not
+ * create a new inode, so just use the ovl mounter's
+ * fs{u,g}id.
+ */
+ override_cred->fsuid = inode->i_uid;
+ override_cred->fsgid = inode->i_gid;
+ err = security_dentry_create_files_as(dentry,
+ attr->mode, &dentry->d_name, old_cred,
+ override_cred);
+ if (err) {
+ put_cred(override_cred);
+ goto out_revert_creds;
+ }
+ put_cred(override_creds(override_cred));
+ put_cred(override_cred);
+ }
+
+ if (!ovl_dentry_is_whiteout(dentry))
+ err = ovl_create_upper(dentry, inode, attr);
+ else
+ err = ovl_create_over_whiteout(dentry, inode, attr);
+
+out_revert_creds:
+ revert_creds(old_cred);
+ return err;
+}
+
+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
+ const char *link)
+{
+ int err;
+ struct inode *inode;
+ struct ovl_cattr attr = {
+ .rdev = rdev,
+ .link = link,
+ };
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ /* Preallocate inode to be used by ovl_get_inode() */
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode, rdev);
+ if (!inode)
+ goto out_drop_write;
+
+ spin_lock(&inode->i_lock);
+ inode->i_state |= I_CREATING;
+ spin_unlock(&inode->i_lock);
+
+ inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode);
+ attr.mode = inode->i_mode;
+
+ err = ovl_create_or_link(dentry, inode, &attr, false);
+ /* Did we end up using the preallocated inode? */
+ if (inode != d_inode(dentry))
+ iput(inode);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
+}
+
+static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+}
+
+static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+ /* Don't allow creation of "whiteout" on overlay */
+ if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
+ return -EPERM;
+
+ return ovl_create_object(dentry, mode, rdev, NULL);
+}
+
+static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *link)
+{
+ return ovl_create_object(dentry, S_IFLNK, 0, link);
+}
+
+static int ovl_set_link_redirect(struct dentry *dentry)
+{
+ const struct cred *old_cred;
+ int err;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = ovl_set_redirect(dentry, false);
+ revert_creds(old_cred);
+
+ return err;
+}
+
+static int ovl_link(struct dentry *old, struct inode *newdir,
+ struct dentry *new)
+{
+ int err;
+ struct inode *inode;
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ err = ovl_copy_up(new->d_parent);
+ if (err)
+ goto out_drop_write;
+
+ if (ovl_is_metacopy_dentry(old)) {
+ err = ovl_set_link_redirect(old);
+ if (err)
+ goto out_drop_write;
+ }
+
+ err = ovl_nlink_start(old);
+ if (err)
+ goto out_drop_write;
+
+ inode = d_inode(old);
+ ihold(inode);
+
+ err = ovl_create_or_link(new, inode,
+ &(struct ovl_cattr) {.hardlink = ovl_dentry_upper(old)},
+ ovl_type_origin(old));
+ if (err)
+ iput(inode);
+
+ ovl_nlink_end(old);
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ return err;
+}
+
+static bool ovl_matches_upper(struct dentry *dentry, struct dentry *upper)
+{
+ return d_inode(ovl_dentry_upper(dentry)) == d_inode(upper);
+}
+
+static int ovl_remove_and_whiteout(struct dentry *dentry,
+ struct list_head *list)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct dentry *upper;
+ struct dentry *opaquedir = NULL;
+ int err;
+
+ if (WARN_ON(!workdir))
+ return -EROFS;
+
+ if (!list_empty(list)) {
+ opaquedir = ovl_clear_empty(dentry, list);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
+ }
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out_dput;
+
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_unlock;
+
+ err = -ESTALE;
+ if ((opaquedir && upper != opaquedir) ||
+ (!opaquedir && ovl_dentry_upper(dentry) &&
+ !ovl_matches_upper(dentry, upper))) {
+ goto out_dput_upper;
+ }
+
+ err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper);
+ if (err)
+ goto out_d_drop;
+
+ ovl_dir_modified(dentry->d_parent, true);
+out_d_drop:
+ d_drop(dentry);
+out_dput_upper:
+ dput(upper);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out_dput:
+ dput(opaquedir);
+out:
+ return err;
+}
+
+static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
+ struct list_head *list)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *dir = upperdir->d_inode;
+ struct dentry *upper;
+ struct dentry *opaquedir = NULL;
+ int err;
+
+ if (!list_empty(list)) {
+ opaquedir = ovl_clear_empty(dentry, list);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
+ }
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_unlock;
+
+ err = -ESTALE;
+ if ((opaquedir && upper != opaquedir) ||
+ (!opaquedir && !ovl_matches_upper(dentry, upper)))
+ goto out_dput_upper;
+
+ if (is_dir)
+ err = ovl_do_rmdir(ofs, dir, upper);
+ else
+ err = ovl_do_unlink(ofs, dir, upper);
+ ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry));
+
+ /*
+ * Keeping this dentry hashed would mean having to release
+ * upperpath/lowerpath, which could only be done if we are the
+ * sole user of this dentry. Too tricky... Just unhash for
+ * now.
+ */
+ if (!err)
+ d_drop(dentry);
+out_dput_upper:
+ dput(upper);
+out_unlock:
+ inode_unlock(dir);
+ dput(opaquedir);
+out:
+ return err;
+}
+
+static bool ovl_pure_upper(struct dentry *dentry)
+{
+ return !ovl_dentry_lower(dentry) &&
+ !ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry));
+}
+
+static void ovl_drop_nlink(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct dentry *alias;
+
+ /* Try to find another, hashed alias */
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ if (alias != dentry && !d_unhashed(alias))
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * Changes to underlying layers may cause i_nlink to lose sync with
+ * reality. In this case prevent the link count from going to zero
+ * prematurely.
+ */
+ if (inode->i_nlink > !!alias)
+ drop_nlink(inode);
+}
+
+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
+{
+ int err;
+ const struct cred *old_cred;
+ bool lower_positive = ovl_lower_positive(dentry);
+ LIST_HEAD(list);
+
+ /* No need to clean pure upper removed by vfs_rmdir() */
+ if (is_dir && (lower_positive || !ovl_pure_upper(dentry))) {
+ err = ovl_check_empty_dir(dentry, &list);
+ if (err)
+ goto out;
+ }
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ goto out_drop_write;
+
+ err = ovl_nlink_start(dentry);
+ if (err)
+ goto out_drop_write;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (!lower_positive)
+ err = ovl_remove_upper(dentry, is_dir, &list);
+ else
+ err = ovl_remove_and_whiteout(dentry, &list);
+ revert_creds(old_cred);
+ if (!err) {
+ if (is_dir)
+ clear_nlink(dentry->d_inode);
+ else
+ ovl_drop_nlink(dentry);
+ }
+ ovl_nlink_end(dentry);
+
+ /*
+ * Copy ctime
+ *
+ * Note: we fail to update ctime if there was no copy-up, only a
+ * whiteout
+ */
+ if (ovl_dentry_upper(dentry))
+ ovl_copyattr(d_inode(dentry));
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ ovl_cache_free(&list);
+ return err;
+}
+
+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, false);
+}
+
+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, true);
+}
+
+static bool ovl_type_merge_or_lower(struct dentry *dentry)
+{
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ return OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type);
+}
+
+static bool ovl_can_move(struct dentry *dentry)
+{
+ return ovl_redirect_dir(dentry->d_sb) ||
+ !d_is_dir(dentry) || !ovl_type_merge_or_lower(dentry);
+}
+
+static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect)
+{
+ char *buf, *ret;
+ struct dentry *d, *tmp;
+ int buflen = ovl_redirect_max + 1;
+
+ if (!abs_redirect) {
+ ret = kstrndup(dentry->d_name.name, dentry->d_name.len,
+ GFP_KERNEL);
+ goto out;
+ }
+
+ buf = ret = kmalloc(buflen, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ buflen--;
+ buf[buflen] = '\0';
+ for (d = dget(dentry); !IS_ROOT(d);) {
+ const char *name;
+ int thislen;
+
+ spin_lock(&d->d_lock);
+ name = ovl_dentry_get_redirect(d);
+ if (name) {
+ thislen = strlen(name);
+ } else {
+ name = d->d_name.name;
+ thislen = d->d_name.len;
+ }
+
+ /* If path is too long, fall back to userspace move */
+ if (thislen + (name[0] != '/') > buflen) {
+ ret = ERR_PTR(-EXDEV);
+ spin_unlock(&d->d_lock);
+ goto out_put;
+ }
+
+ buflen -= thislen;
+ memcpy(&buf[buflen], name, thislen);
+ spin_unlock(&d->d_lock);
+ tmp = dget_parent(d);
+
+ dput(d);
+ d = tmp;
+
+ /* Absolute redirect: finished */
+ if (buf[buflen] == '/')
+ break;
+ buflen--;
+ buf[buflen] = '/';
+ }
+ ret = kstrdup(&buf[buflen], GFP_KERNEL);
+out_put:
+ dput(d);
+ kfree(buf);
+out:
+ return ret ? ret : ERR_PTR(-ENOMEM);
+}
+
+static bool ovl_need_absolute_redirect(struct dentry *dentry, bool samedir)
+{
+ struct dentry *lowerdentry;
+
+ if (!samedir)
+ return true;
+
+ if (d_is_dir(dentry))
+ return false;
+
+ /*
+ * For non-dir hardlinked files, we need absolute redirects
+ * in general as two upper hardlinks could be in different
+ * dirs. We could put a relative redirect now and convert
+ * it to absolute redirect later. But when nlink > 1 and
+ * indexing is on, that means relative redirect needs to be
+ * converted to absolute during copy up of another lower
+ * hardllink as well.
+ *
+ * So without optimizing too much, just check if lower is
+ * a hard link or not. If lower is hard link, put absolute
+ * redirect.
+ */
+ lowerdentry = ovl_dentry_lower(dentry);
+ return (d_inode(lowerdentry)->i_nlink > 1);
+}
+
+static int ovl_set_redirect(struct dentry *dentry, bool samedir)
+{
+ int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ const char *redirect = ovl_dentry_get_redirect(dentry);
+ bool absolute_redirect = ovl_need_absolute_redirect(dentry, samedir);
+
+ if (redirect && (!absolute_redirect || redirect[0] == '/'))
+ return 0;
+
+ redirect = ovl_get_redirect(dentry, absolute_redirect);
+ if (IS_ERR(redirect))
+ return PTR_ERR(redirect);
+
+ err = ovl_check_setxattr(ofs, ovl_dentry_upper(dentry),
+ OVL_XATTR_REDIRECT,
+ redirect, strlen(redirect), -EXDEV);
+ if (!err) {
+ spin_lock(&dentry->d_lock);
+ ovl_dentry_set_redirect(dentry, redirect);
+ spin_unlock(&dentry->d_lock);
+ } else {
+ kfree(redirect);
+ pr_warn_ratelimited("failed to set redirect (%i)\n",
+ err);
+ /* Fall back to userspace copy-up */
+ err = -EXDEV;
+ }
+ return err;
+}
+
+static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
+ struct dentry *old, struct inode *newdir,
+ struct dentry *new, unsigned int flags)
+{
+ int err;
+ struct dentry *old_upperdir;
+ struct dentry *new_upperdir;
+ struct dentry *olddentry;
+ struct dentry *newdentry;
+ struct dentry *trap;
+ bool old_opaque;
+ bool new_opaque;
+ bool cleanup_whiteout = false;
+ bool update_nlink = false;
+ bool overwrite = !(flags & RENAME_EXCHANGE);
+ bool is_dir = d_is_dir(old);
+ bool new_is_dir = d_is_dir(new);
+ bool samedir = olddir == newdir;
+ struct dentry *opaquedir = NULL;
+ const struct cred *old_cred = NULL;
+ struct ovl_fs *ofs = OVL_FS(old->d_sb);
+ LIST_HEAD(list);
+
+ err = -EINVAL;
+ if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
+ goto out;
+
+ flags &= ~RENAME_NOREPLACE;
+
+ /* Don't copy up directory trees */
+ err = -EXDEV;
+ if (!ovl_can_move(old))
+ goto out;
+ if (!overwrite && !ovl_can_move(new))
+ goto out;
+
+ if (overwrite && new_is_dir && !ovl_pure_upper(new)) {
+ err = ovl_check_empty_dir(new, &list);
+ if (err)
+ goto out;
+ }
+
+ if (overwrite) {
+ if (ovl_lower_positive(old)) {
+ if (!ovl_dentry_is_whiteout(new)) {
+ /* Whiteout source */
+ flags |= RENAME_WHITEOUT;
+ } else {
+ /* Switch whiteouts */
+ flags |= RENAME_EXCHANGE;
+ }
+ } else if (is_dir && ovl_dentry_is_whiteout(new)) {
+ flags |= RENAME_EXCHANGE;
+ cleanup_whiteout = true;
+ }
+ }
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ err = ovl_copy_up(new->d_parent);
+ if (err)
+ goto out_drop_write;
+ if (!overwrite) {
+ err = ovl_copy_up(new);
+ if (err)
+ goto out_drop_write;
+ } else if (d_inode(new)) {
+ err = ovl_nlink_start(new);
+ if (err)
+ goto out_drop_write;
+
+ update_nlink = true;
+ }
+
+ old_cred = ovl_override_creds(old->d_sb);
+
+ if (!list_empty(&list)) {
+ opaquedir = ovl_clear_empty(new, &list);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir)) {
+ opaquedir = NULL;
+ goto out_revert_creds;
+ }
+ }
+
+ old_upperdir = ovl_dentry_upper(old->d_parent);
+ new_upperdir = ovl_dentry_upper(new->d_parent);
+
+ if (!samedir) {
+ /*
+ * When moving a merge dir or non-dir with copy up origin into
+ * a new parent, we are marking the new parent dir "impure".
+ * When ovl_iterate() iterates an "impure" upper dir, it will
+ * lookup the origin inodes of the entries to fill d_ino.
+ */
+ if (ovl_type_origin(old)) {
+ err = ovl_set_impure(new->d_parent, new_upperdir);
+ if (err)
+ goto out_revert_creds;
+ }
+ if (!overwrite && ovl_type_origin(new)) {
+ err = ovl_set_impure(old->d_parent, old_upperdir);
+ if (err)
+ goto out_revert_creds;
+ }
+ }
+
+ trap = lock_rename(new_upperdir, old_upperdir);
+
+ olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
+ old->d_name.len);
+ err = PTR_ERR(olddentry);
+ if (IS_ERR(olddentry))
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (!ovl_matches_upper(old, olddentry))
+ goto out_dput_old;
+
+ newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_dput_old;
+
+ old_opaque = ovl_dentry_is_opaque(old);
+ new_opaque = ovl_dentry_is_opaque(new);
+
+ err = -ESTALE;
+ if (d_inode(new) && ovl_dentry_upper(new)) {
+ if (opaquedir) {
+ if (newdentry != opaquedir)
+ goto out_dput;
+ } else {
+ if (!ovl_matches_upper(new, newdentry))
+ goto out_dput;
+ }
+ } else {
+ if (!d_is_negative(newdentry)) {
+ if (!new_opaque || !ovl_is_whiteout(newdentry))
+ goto out_dput;
+ } else {
+ if (flags & RENAME_EXCHANGE)
+ goto out_dput;
+ }
+ }
+
+ if (olddentry == trap)
+ goto out_dput;
+ if (newdentry == trap)
+ goto out_dput;
+
+ if (olddentry->d_inode == newdentry->d_inode)
+ goto out_dput;
+
+ err = 0;
+ if (ovl_type_merge_or_lower(old))
+ err = ovl_set_redirect(old, samedir);
+ else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent))
+ err = ovl_set_opaque_xerr(old, olddentry, -EXDEV);
+ if (err)
+ goto out_dput;
+
+ if (!overwrite && ovl_type_merge_or_lower(new))
+ err = ovl_set_redirect(new, samedir);
+ else if (!overwrite && new_is_dir && !new_opaque &&
+ ovl_type_merge(old->d_parent))
+ err = ovl_set_opaque_xerr(new, newdentry, -EXDEV);
+ if (err)
+ goto out_dput;
+
+ err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry, flags);
+ if (err)
+ goto out_dput;
+
+ if (cleanup_whiteout)
+ ovl_cleanup(ofs, old_upperdir->d_inode, newdentry);
+
+ if (overwrite && d_inode(new)) {
+ if (new_is_dir)
+ clear_nlink(d_inode(new));
+ else
+ ovl_drop_nlink(new);
+ }
+
+ ovl_dir_modified(old->d_parent, ovl_type_origin(old) ||
+ (!overwrite && ovl_type_origin(new)));
+ ovl_dir_modified(new->d_parent, ovl_type_origin(old) ||
+ (d_inode(new) && ovl_type_origin(new)));
+
+ /* copy ctime: */
+ ovl_copyattr(d_inode(old));
+ if (d_inode(new) && ovl_dentry_upper(new))
+ ovl_copyattr(d_inode(new));
+
+out_dput:
+ dput(newdentry);
+out_dput_old:
+ dput(olddentry);
+out_unlock:
+ unlock_rename(new_upperdir, old_upperdir);
+out_revert_creds:
+ revert_creds(old_cred);
+ if (update_nlink)
+ ovl_nlink_end(new);
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ dput(opaquedir);
+ ovl_cache_free(&list);
+ return err;
+}
+
+const struct inode_operations ovl_dir_inode_operations = {
+ .lookup = ovl_lookup,
+ .mkdir = ovl_mkdir,
+ .symlink = ovl_symlink,
+ .unlink = ovl_unlink,
+ .rmdir = ovl_rmdir,
+ .rename = ovl_rename,
+ .link = ovl_link,
+ .setattr = ovl_setattr,
+ .create = ovl_create,
+ .mknod = ovl_mknod,
+ .permission = ovl_permission,
+ .getattr = ovl_getattr,
+ .listxattr = ovl_listxattr,
+ .get_acl = ovl_get_acl,
+ .update_time = ovl_update_time,
+ .fileattr_get = ovl_fileattr_get,
+ .fileattr_set = ovl_fileattr_set,
+};
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
new file mode 100644
index 000000000..e55363d34
--- /dev/null
+++ b/fs/overlayfs/export.c
@@ -0,0 +1,876 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Overlayfs NFS export support.
+ *
+ * Amir Goldstein <amir73il@gmail.com>
+ *
+ * Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/cred.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/exportfs.h>
+#include <linux/ratelimit.h>
+#include "overlayfs.h"
+
+static int ovl_encode_maybe_copy_up(struct dentry *dentry)
+{
+ int err;
+
+ if (ovl_dentry_upper(dentry))
+ return 0;
+
+ err = ovl_want_write(dentry);
+ if (!err) {
+ err = ovl_copy_up(dentry);
+ ovl_drop_write(dentry);
+ }
+
+ if (err) {
+ pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n",
+ dentry, err);
+ }
+
+ return err;
+}
+
+/*
+ * Before encoding a non-upper directory file handle from real layer N, we need
+ * to check if it will be possible to reconnect an overlay dentry from the real
+ * lower decoded dentry. This is done by following the overlay ancestry up to a
+ * "layer N connected" ancestor and verifying that all parents along the way are
+ * "layer N connectable". If an ancestor that is NOT "layer N connectable" is
+ * found, we need to copy up an ancestor, which is "layer N connectable", thus
+ * making that ancestor "layer N connected". For example:
+ *
+ * layer 1: /a
+ * layer 2: /a/b/c
+ *
+ * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
+ * copied up and renamed, upper dir /a will be indexed by lower dir /a from
+ * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
+ * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
+ * dentry from the connected lower dentry /a/b/c.
+ *
+ * To avoid this problem on decode time, we need to copy up an ancestor of
+ * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
+ * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
+ * and when the time comes to decode the file handle from lower dentry /a/b/c,
+ * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
+ * a connected overlay dentry will be accomplished.
+ *
+ * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
+ * entry /a in the lower layers above layer N and find the indexed dir /a from
+ * layer 1. If that improvement is made, then the check for "layer N connected"
+ * will need to verify there are no redirects in lower layers above N. In the
+ * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
+ * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
+ *
+ * layer 1: /A (redirect = /a)
+ * layer 2: /a/b/c
+ */
+
+/* Return the lowest layer for encoding a connectable file handle */
+static int ovl_connectable_layer(struct dentry *dentry)
+{
+ struct ovl_entry *oe = OVL_E(dentry);
+
+ /* We can get overlay root from root of any layer */
+ if (dentry == dentry->d_sb->s_root)
+ return oe->numlower;
+
+ /*
+ * If it's an unindexed merge dir, then it's not connectable with any
+ * lower layer
+ */
+ if (ovl_dentry_upper(dentry) &&
+ !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+ return 0;
+
+ /* We can get upper/overlay path from indexed/lower dentry */
+ return oe->lowerstack[0].layer->idx;
+}
+
+/*
+ * @dentry is "connected" if all ancestors up to root or a "connected" ancestor
+ * have the same uppermost lower layer as the origin's layer. We may need to
+ * copy up a "connectable" ancestor to make it "connected". A "connected" dentry
+ * cannot become non "connected", so cache positive result in dentry flags.
+ *
+ * Return the connected origin layer or < 0 on error.
+ */
+static int ovl_connect_layer(struct dentry *dentry)
+{
+ struct dentry *next, *parent = NULL;
+ int origin_layer;
+ int err = 0;
+
+ if (WARN_ON(dentry == dentry->d_sb->s_root) ||
+ WARN_ON(!ovl_dentry_lower(dentry)))
+ return -EIO;
+
+ origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
+ if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
+ return origin_layer;
+
+ /* Find the topmost origin layer connectable ancestor of @dentry */
+ next = dget(dentry);
+ for (;;) {
+ parent = dget_parent(next);
+ if (WARN_ON(parent == next)) {
+ err = -EIO;
+ break;
+ }
+
+ /*
+ * If @parent is not origin layer connectable, then copy up
+ * @next which is origin layer connectable and we are done.
+ */
+ if (ovl_connectable_layer(parent) < origin_layer) {
+ err = ovl_encode_maybe_copy_up(next);
+ break;
+ }
+
+ /* If @parent is connected or indexed we are done */
+ if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
+ ovl_test_flag(OVL_INDEX, d_inode(parent)))
+ break;
+
+ dput(next);
+ next = parent;
+ }
+
+ dput(parent);
+ dput(next);
+
+ if (!err)
+ ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);
+
+ return err ?: origin_layer;
+}
+
+/*
+ * We only need to encode origin if there is a chance that the same object was
+ * encoded pre copy up and then we need to stay consistent with the same
+ * encoding also after copy up. If non-pure upper is not indexed, then it was
+ * copied up before NFS export was enabled. In that case we don't need to worry
+ * about staying consistent with pre copy up encoding and we encode an upper
+ * file handle. Overlay root dentry is a private case of non-indexed upper.
+ *
+ * The following table summarizes the different file handle encodings used for
+ * different overlay object types:
+ *
+ * Object type | Encoding
+ * --------------------------------
+ * Pure upper | U
+ * Non-indexed upper | U
+ * Indexed upper | L (*)
+ * Non-upper | L (*)
+ *
+ * U = upper file handle
+ * L = lower file handle
+ *
+ * (*) Connecting an overlay dir from real lower dentry is not always
+ * possible when there are redirects in lower layers and non-indexed merge dirs.
+ * To mitigate those case, we may copy up the lower dir ancestor before encode
+ * a lower dir file handle.
+ *
+ * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
+ */
+static int ovl_check_encode_origin(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+
+ /* Upper file handle for pure upper */
+ if (!ovl_dentry_lower(dentry))
+ return 0;
+
+ /*
+ * Upper file handle for non-indexed upper.
+ *
+ * Root is never indexed, so if there's an upper layer, encode upper for
+ * root.
+ */
+ if (ovl_dentry_upper(dentry) &&
+ !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+ return 0;
+
+ /*
+ * Decoding a merge dir, whose origin's ancestor is under a redirected
+ * lower dir or under a non-indexed upper is not always possible.
+ * ovl_connect_layer() will try to make origin's layer "connected" by
+ * copying up a "connectable" ancestor.
+ */
+ if (d_is_dir(dentry) && ovl_upper_mnt(ofs))
+ return ovl_connect_layer(dentry);
+
+ /* Lower file handle for indexed and non-upper dir/non-dir */
+ return 1;
+}
+
+static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+ u32 *fid, int buflen)
+{
+ struct ovl_fh *fh = NULL;
+ int err, enc_lower;
+ int len;
+
+ /*
+ * Check if we should encode a lower or upper file handle and maybe
+ * copy up an ancestor to make lower file handle connectable.
+ */
+ err = enc_lower = ovl_check_encode_origin(dentry);
+ if (enc_lower < 0)
+ goto fail;
+
+ /* Encode an upper or lower file handle */
+ fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) :
+ ovl_dentry_upper(dentry), !enc_lower);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+ len = OVL_FH_LEN(fh);
+ if (len <= buflen)
+ memcpy(fid, fh, len);
+ err = len;
+
+out:
+ kfree(fh);
+ return err;
+
+fail:
+ pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n",
+ dentry, err);
+ goto out;
+}
+
+static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
+ struct inode *parent)
+{
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+ struct dentry *dentry;
+ int bytes, buflen = *max_len << 2;
+
+ /* TODO: encode connectable file handles */
+ if (parent)
+ return FILEID_INVALID;
+
+ dentry = d_find_any_alias(inode);
+ if (!dentry)
+ return FILEID_INVALID;
+
+ bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen);
+ dput(dentry);
+ if (bytes <= 0)
+ return FILEID_INVALID;
+
+ *max_len = bytes >> 2;
+ if (bytes > buflen)
+ return FILEID_INVALID;
+
+ return OVL_FILEID_V1;
+}
+
+/*
+ * Find or instantiate an overlay dentry from real dentries and index.
+ */
+static struct dentry *ovl_obtain_alias(struct super_block *sb,
+ struct dentry *upper_alias,
+ struct ovl_path *lowerpath,
+ struct dentry *index)
+{
+ struct dentry *lower = lowerpath ? lowerpath->dentry : NULL;
+ struct dentry *upper = upper_alias ?: index;
+ struct dentry *dentry;
+ struct inode *inode;
+ struct ovl_entry *oe;
+ struct ovl_inode_params oip = {
+ .lowerpath = lowerpath,
+ .index = index,
+ .numlower = !!lower
+ };
+
+ /* We get overlay directory dentries with ovl_lookup_real() */
+ if (d_is_dir(upper ?: lower))
+ return ERR_PTR(-EIO);
+
+ oip.upperdentry = dget(upper);
+ inode = ovl_get_inode(sb, &oip);
+ if (IS_ERR(inode)) {
+ dput(upper);
+ return ERR_CAST(inode);
+ }
+
+ if (upper)
+ ovl_set_flag(OVL_UPPERDATA, inode);
+
+ dentry = d_find_any_alias(inode);
+ if (dentry)
+ goto out_iput;
+
+ dentry = d_alloc_anon(inode->i_sb);
+ if (unlikely(!dentry))
+ goto nomem;
+ oe = ovl_alloc_entry(lower ? 1 : 0);
+ if (!oe)
+ goto nomem;
+
+ if (lower) {
+ oe->lowerstack->dentry = dget(lower);
+ oe->lowerstack->layer = lowerpath->layer;
+ }
+ dentry->d_fsdata = oe;
+ if (upper_alias)
+ ovl_dentry_set_upper_alias(dentry);
+
+ ovl_dentry_init_reval(dentry, upper);
+
+ return d_instantiate_anon(dentry, inode);
+
+nomem:
+ dput(dentry);
+ dentry = ERR_PTR(-ENOMEM);
+out_iput:
+ iput(inode);
+ return dentry;
+}
+
+/* Get the upper or lower dentry in stach whose on layer @idx */
+static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ int i;
+
+ if (!idx)
+ return ovl_dentry_upper(dentry);
+
+ for (i = 0; i < oe->numlower; i++) {
+ if (oe->lowerstack[i].layer->idx == idx)
+ return oe->lowerstack[i].dentry;
+ }
+
+ return NULL;
+}
+
+/*
+ * Lookup a child overlay dentry to get a connected overlay dentry whose real
+ * dentry is @real. If @real is on upper layer, we lookup a child overlay
+ * dentry with the same name as the real dentry. Otherwise, we need to consult
+ * index for lookup.
+ */
+static struct dentry *ovl_lookup_real_one(struct dentry *connected,
+ struct dentry *real,
+ const struct ovl_layer *layer)
+{
+ struct inode *dir = d_inode(connected);
+ struct dentry *this, *parent = NULL;
+ struct name_snapshot name;
+ int err;
+
+ /*
+ * Lookup child overlay dentry by real name. The dir mutex protects us
+ * from racing with overlay rename. If the overlay dentry that is above
+ * real has already been moved to a parent that is not under the
+ * connected overlay dir, we return -ECHILD and restart the lookup of
+ * connected real path from the top.
+ */
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ err = -ECHILD;
+ parent = dget_parent(real);
+ if (ovl_dentry_real_at(connected, layer->idx) != parent)
+ goto fail;
+
+ /*
+ * We also need to take a snapshot of real dentry name to protect us
+ * from racing with underlying layer rename. In this case, we don't
+ * care about returning ESTALE, only from dereferencing a free name
+ * pointer because we hold no lock on the real dentry.
+ */
+ take_dentry_name_snapshot(&name, real);
+ /*
+ * No mnt_userns handling here: it's an internal lookup. Could skip
+ * permission checking altogether, but for now just use non-mnt_userns
+ * transformed ids.
+ */
+ this = lookup_one_len(name.name.name, connected, name.name.len);
+ release_dentry_name_snapshot(&name);
+ err = PTR_ERR(this);
+ if (IS_ERR(this)) {
+ goto fail;
+ } else if (!this || !this->d_inode) {
+ dput(this);
+ err = -ENOENT;
+ goto fail;
+ } else if (ovl_dentry_real_at(this, layer->idx) != real) {
+ dput(this);
+ err = -ESTALE;
+ goto fail;
+ }
+
+out:
+ dput(parent);
+ inode_unlock(dir);
+ return this;
+
+fail:
+ pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
+ real, layer->idx, connected, err);
+ this = ERR_PTR(err);
+ goto out;
+}
+
+static struct dentry *ovl_lookup_real(struct super_block *sb,
+ struct dentry *real,
+ const struct ovl_layer *layer);
+
+/*
+ * Lookup an indexed or hashed overlay dentry by real inode.
+ */
+static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
+ struct dentry *real,
+ const struct ovl_layer *layer)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+ struct dentry *index = NULL;
+ struct dentry *this = NULL;
+ struct inode *inode;
+
+ /*
+ * Decoding upper dir from index is expensive, so first try to lookup
+ * overlay dentry in inode/dcache.
+ */
+ inode = ovl_lookup_inode(sb, real, !layer->idx);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ if (inode) {
+ this = d_find_any_alias(inode);
+ iput(inode);
+ }
+
+ /*
+ * For decoded lower dir file handle, lookup index by origin to check
+ * if lower dir was copied up and and/or removed.
+ */
+ if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) {
+ index = ovl_lookup_index(ofs, NULL, real, false);
+ if (IS_ERR(index))
+ return index;
+ }
+
+ /* Get connected upper overlay dir from index */
+ if (index) {
+ struct dentry *upper = ovl_index_upper(ofs, index);
+
+ dput(index);
+ if (IS_ERR_OR_NULL(upper))
+ return upper;
+
+ /*
+ * ovl_lookup_real() in lower layer may call recursively once to
+ * ovl_lookup_real() in upper layer. The first level call walks
+ * back lower parents to the topmost indexed parent. The second
+ * recursive call walks back from indexed upper to the topmost
+ * connected/hashed upper parent (or up to root).
+ */
+ this = ovl_lookup_real(sb, upper, &ofs->layers[0]);
+ dput(upper);
+ }
+
+ if (IS_ERR_OR_NULL(this))
+ return this;
+
+ if (ovl_dentry_real_at(this, layer->idx) != real) {
+ dput(this);
+ this = ERR_PTR(-EIO);
+ }
+
+ return this;
+}
+
+/*
+ * Lookup an indexed or hashed overlay dentry, whose real dentry is an
+ * ancestor of @real.
+ */
+static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb,
+ struct dentry *real,
+ const struct ovl_layer *layer)
+{
+ struct dentry *next, *parent = NULL;
+ struct dentry *ancestor = ERR_PTR(-EIO);
+
+ if (real == layer->mnt->mnt_root)
+ return dget(sb->s_root);
+
+ /* Find the topmost indexed or hashed ancestor */
+ next = dget(real);
+ for (;;) {
+ parent = dget_parent(next);
+
+ /*
+ * Lookup a matching overlay dentry in inode/dentry
+ * cache or in index by real inode.
+ */
+ ancestor = ovl_lookup_real_inode(sb, next, layer);
+ if (ancestor)
+ break;
+
+ if (parent == layer->mnt->mnt_root) {
+ ancestor = dget(sb->s_root);
+ break;
+ }
+
+ /*
+ * If @real has been moved out of the layer root directory,
+ * we will eventully hit the real fs root. This cannot happen
+ * by legit overlay rename, so we return error in that case.
+ */
+ if (parent == next) {
+ ancestor = ERR_PTR(-EXDEV);
+ break;
+ }
+
+ dput(next);
+ next = parent;
+ }
+
+ dput(parent);
+ dput(next);
+
+ return ancestor;
+}
+
+/*
+ * Lookup a connected overlay dentry whose real dentry is @real.
+ * If @real is on upper layer, we lookup a child overlay dentry with the same
+ * path the real dentry. Otherwise, we need to consult index for lookup.
+ */
+static struct dentry *ovl_lookup_real(struct super_block *sb,
+ struct dentry *real,
+ const struct ovl_layer *layer)
+{
+ struct dentry *connected;
+ int err = 0;
+
+ connected = ovl_lookup_real_ancestor(sb, real, layer);
+ if (IS_ERR(connected))
+ return connected;
+
+ while (!err) {
+ struct dentry *next, *this;
+ struct dentry *parent = NULL;
+ struct dentry *real_connected = ovl_dentry_real_at(connected,
+ layer->idx);
+
+ if (real_connected == real)
+ break;
+
+ /* Find the topmost dentry not yet connected */
+ next = dget(real);
+ for (;;) {
+ parent = dget_parent(next);
+
+ if (parent == real_connected)
+ break;
+
+ /*
+ * If real has been moved out of 'real_connected',
+ * we will not find 'real_connected' and hit the layer
+ * root. In that case, we need to restart connecting.
+ * This game can go on forever in the worst case. We
+ * may want to consider taking s_vfs_rename_mutex if
+ * this happens more than once.
+ */
+ if (parent == layer->mnt->mnt_root) {
+ dput(connected);
+ connected = dget(sb->s_root);
+ break;
+ }
+
+ /*
+ * If real file has been moved out of the layer root
+ * directory, we will eventully hit the real fs root.
+ * This cannot happen by legit overlay rename, so we
+ * return error in that case.
+ */
+ if (parent == next) {
+ err = -EXDEV;
+ break;
+ }
+
+ dput(next);
+ next = parent;
+ }
+
+ if (!err) {
+ this = ovl_lookup_real_one(connected, next, layer);
+ if (IS_ERR(this))
+ err = PTR_ERR(this);
+
+ /*
+ * Lookup of child in overlay can fail when racing with
+ * overlay rename of child away from 'connected' parent.
+ * In this case, we need to restart the lookup from the
+ * top, because we cannot trust that 'real_connected' is
+ * still an ancestor of 'real'. There is a good chance
+ * that the renamed overlay ancestor is now in cache, so
+ * ovl_lookup_real_ancestor() will find it and we can
+ * continue to connect exactly from where lookup failed.
+ */
+ if (err == -ECHILD) {
+ this = ovl_lookup_real_ancestor(sb, real,
+ layer);
+ err = PTR_ERR_OR_ZERO(this);
+ }
+ if (!err) {
+ dput(connected);
+ connected = this;
+ }
+ }
+
+ dput(parent);
+ dput(next);
+ }
+
+ if (err)
+ goto fail;
+
+ return connected;
+
+fail:
+ pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
+ real, layer->idx, connected, err);
+ dput(connected);
+ return ERR_PTR(err);
+}
+
+/*
+ * Get an overlay dentry from upper/lower real dentries and index.
+ */
+static struct dentry *ovl_get_dentry(struct super_block *sb,
+ struct dentry *upper,
+ struct ovl_path *lowerpath,
+ struct dentry *index)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+ const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer;
+ struct dentry *real = upper ?: (index ?: lowerpath->dentry);
+
+ /*
+ * Obtain a disconnected overlay dentry from a non-dir real dentry
+ * and index.
+ */
+ if (!d_is_dir(real))
+ return ovl_obtain_alias(sb, upper, lowerpath, index);
+
+ /* Removed empty directory? */
+ if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real))
+ return ERR_PTR(-ENOENT);
+
+ /*
+ * If real dentry is connected and hashed, get a connected overlay
+ * dentry whose real dentry is @real.
+ */
+ return ovl_lookup_real(sb, real, layer);
+}
+
+static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
+ struct ovl_fh *fh)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+ struct dentry *dentry;
+ struct dentry *upper;
+
+ if (!ovl_upper_mnt(ofs))
+ return ERR_PTR(-EACCES);
+
+ upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
+ if (IS_ERR_OR_NULL(upper))
+ return upper;
+
+ dentry = ovl_get_dentry(sb, upper, NULL, NULL);
+ dput(upper);
+
+ return dentry;
+}
+
+static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
+ struct ovl_fh *fh)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_path origin = { };
+ struct ovl_path *stack = &origin;
+ struct dentry *dentry = NULL;
+ struct dentry *index = NULL;
+ struct inode *inode;
+ int err;
+
+ /* First lookup overlay inode in inode cache by origin fh */
+ err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack);
+ if (err)
+ return ERR_PTR(err);
+
+ if (!d_is_dir(origin.dentry) ||
+ !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
+ inode = ovl_lookup_inode(sb, origin.dentry, false);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_err;
+ if (inode) {
+ dentry = d_find_any_alias(inode);
+ iput(inode);
+ if (dentry)
+ goto out;
+ }
+ }
+
+ /* Then lookup indexed upper/whiteout by origin fh */
+ if (ofs->indexdir) {
+ index = ovl_get_index_fh(ofs, fh);
+ err = PTR_ERR(index);
+ if (IS_ERR(index)) {
+ index = NULL;
+ goto out_err;
+ }
+ }
+
+ /* Then try to get a connected upper dir by index */
+ if (index && d_is_dir(index)) {
+ struct dentry *upper = ovl_index_upper(ofs, index);
+
+ err = PTR_ERR(upper);
+ if (IS_ERR_OR_NULL(upper))
+ goto out_err;
+
+ dentry = ovl_get_dentry(sb, upper, NULL, NULL);
+ dput(upper);
+ goto out;
+ }
+
+ /* Find origin.dentry again with ovl_acceptable() layer check */
+ if (d_is_dir(origin.dentry)) {
+ dput(origin.dentry);
+ origin.dentry = NULL;
+ err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
+ if (err)
+ goto out_err;
+ }
+ if (index) {
+ err = ovl_verify_origin(ofs, index, origin.dentry, false);
+ if (err)
+ goto out_err;
+ }
+
+ /* Get a connected non-upper dir or disconnected non-dir */
+ dentry = ovl_get_dentry(sb, NULL, &origin, index);
+
+out:
+ dput(origin.dentry);
+ dput(index);
+ return dentry;
+
+out_err:
+ dentry = ERR_PTR(err);
+ goto out;
+}
+
+static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type)
+{
+ struct ovl_fh *fh;
+
+ /* If on-wire inner fid is aligned - nothing to do */
+ if (fh_type == OVL_FILEID_V1)
+ return (struct ovl_fh *)fid;
+
+ if (fh_type != OVL_FILEID_V0)
+ return ERR_PTR(-EINVAL);
+
+ if (buflen <= OVL_FH_WIRE_OFFSET)
+ return ERR_PTR(-EINVAL);
+
+ fh = kzalloc(buflen, GFP_KERNEL);
+ if (!fh)
+ return ERR_PTR(-ENOMEM);
+
+ /* Copy unaligned inner fh into aligned buffer */
+ memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET);
+ return fh;
+}
+
+static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct dentry *dentry = NULL;
+ struct ovl_fh *fh = NULL;
+ int len = fh_len << 2;
+ unsigned int flags = 0;
+ int err;
+
+ fh = ovl_fid_to_fh(fid, len, fh_type);
+ err = PTR_ERR(fh);
+ if (IS_ERR(fh))
+ goto out_err;
+
+ err = ovl_check_fh_len(fh, len);
+ if (err)
+ goto out_err;
+
+ flags = fh->fb.flags;
+ dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ?
+ ovl_upper_fh_to_d(sb, fh) :
+ ovl_lower_fh_to_d(sb, fh);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry) && err != -ESTALE)
+ goto out_err;
+
+out:
+ /* We may have needed to re-align OVL_FILEID_V0 */
+ if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid)
+ kfree(fh);
+
+ return dentry;
+
+out_err:
+ pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
+ fh_len, fh_type, flags, err);
+ dentry = ERR_PTR(err);
+ goto out;
+}
+
+static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n");
+ return ERR_PTR(-EACCES);
+}
+
+static int ovl_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ /*
+ * ovl_fh_to_dentry() returns connected dir overlay dentries and
+ * ovl_fh_to_parent() is not implemented, so we should not get here.
+ */
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+
+static struct dentry *ovl_get_parent(struct dentry *dentry)
+{
+ /*
+ * ovl_fh_to_dentry() returns connected dir overlay dentries, so we
+ * should not get here.
+ */
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EIO);
+}
+
+const struct export_operations ovl_export_operations = {
+ .encode_fh = ovl_encode_fh,
+ .fh_to_dentry = ovl_fh_to_dentry,
+ .fh_to_parent = ovl_fh_to_parent,
+ .get_name = ovl_get_name,
+ .get_parent = ovl_get_parent,
+};
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
new file mode 100644
index 000000000..cc0c07716
--- /dev/null
+++ b/fs/overlayfs/file.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 Red Hat, Inc.
+ */
+
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/xattr.h>
+#include <linux/uio.h>
+#include <linux/uaccess.h>
+#include <linux/splice.h>
+#include <linux/security.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include "overlayfs.h"
+
+struct ovl_aio_req {
+ struct kiocb iocb;
+ refcount_t ref;
+ struct kiocb *orig_iocb;
+};
+
+static struct kmem_cache *ovl_aio_request_cachep;
+
+static char ovl_whatisit(struct inode *inode, struct inode *realinode)
+{
+ if (realinode != ovl_inode_upper(inode))
+ return 'l';
+ if (ovl_has_upperdata(inode))
+ return 'u';
+ else
+ return 'm';
+}
+
+/* No atime modificaton nor notify on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
+
+static struct file *ovl_open_realfile(const struct file *file,
+ const struct path *realpath)
+{
+ struct inode *realinode = d_inode(realpath->dentry);
+ struct inode *inode = file_inode(file);
+ struct user_namespace *real_mnt_userns;
+ struct file *realfile;
+ const struct cred *old_cred;
+ int flags = file->f_flags | OVL_OPEN_FLAGS;
+ int acc_mode = ACC_MODE(flags);
+ int err;
+
+ if (flags & O_APPEND)
+ acc_mode |= MAY_APPEND;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ real_mnt_userns = mnt_user_ns(realpath->mnt);
+ err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode);
+ if (err) {
+ realfile = ERR_PTR(err);
+ } else {
+ if (!inode_owner_or_capable(real_mnt_userns, realinode))
+ flags &= ~O_NOATIME;
+
+ realfile = open_with_fake_path(&file->f_path, flags, realinode,
+ current_cred());
+ }
+ revert_creds(old_cred);
+
+ pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
+ file, file, ovl_whatisit(inode, realinode), file->f_flags,
+ realfile, IS_ERR(realfile) ? 0 : realfile->f_flags);
+
+ return realfile;
+}
+
+#define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)
+
+static int ovl_change_flags(struct file *file, unsigned int flags)
+{
+ struct inode *inode = file_inode(file);
+ int err;
+
+ flags &= OVL_SETFL_MASK;
+
+ if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
+ return -EPERM;
+
+ if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+
+ if (file->f_op->check_flags) {
+ err = file->f_op->check_flags(flags);
+ if (err)
+ return err;
+ }
+
+ spin_lock(&file->f_lock);
+ file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags;
+ file->f_iocb_flags = iocb_flags(file);
+ spin_unlock(&file->f_lock);
+
+ return 0;
+}
+
+static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
+ bool allow_meta)
+{
+ struct dentry *dentry = file_dentry(file);
+ struct path realpath;
+
+ real->flags = 0;
+ real->file = file->private_data;
+
+ if (allow_meta)
+ ovl_path_real(dentry, &realpath);
+ else
+ ovl_path_realdata(dentry, &realpath);
+
+ /* Has it been copied up since we'd opened it? */
+ if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) {
+ real->flags = FDPUT_FPUT;
+ real->file = ovl_open_realfile(file, &realpath);
+
+ return PTR_ERR_OR_ZERO(real->file);
+ }
+
+ /* Did the flags change since open? */
+ if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS))
+ return ovl_change_flags(real->file, file->f_flags);
+
+ return 0;
+}
+
+static int ovl_real_fdget(const struct file *file, struct fd *real)
+{
+ if (d_is_dir(file_dentry(file))) {
+ real->flags = 0;
+ real->file = ovl_dir_real_file(file, false);
+
+ return PTR_ERR_OR_ZERO(real->file);
+ }
+
+ return ovl_real_fdget_meta(file, real, false);
+}
+
+static int ovl_open(struct inode *inode, struct file *file)
+{
+ struct dentry *dentry = file_dentry(file);
+ struct file *realfile;
+ struct path realpath;
+ int err;
+
+ err = ovl_maybe_copy_up(dentry, file->f_flags);
+ if (err)
+ return err;
+
+ /* No longer need these flags, so don't pass them on to underlying fs */
+ file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+ ovl_path_realdata(dentry, &realpath);
+ realfile = ovl_open_realfile(file, &realpath);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
+
+ file->private_data = realfile;
+
+ return 0;
+}
+
+static int ovl_release(struct inode *inode, struct file *file)
+{
+ fput(file->private_data);
+
+ return 0;
+}
+
+static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file_inode(file);
+ struct fd real;
+ const struct cred *old_cred;
+ loff_t ret;
+
+ /*
+ * The two special cases below do not need to involve real fs,
+ * so we can optimizing concurrent callers.
+ */
+ if (offset == 0) {
+ if (whence == SEEK_CUR)
+ return file->f_pos;
+
+ if (whence == SEEK_SET)
+ return vfs_setpos(file, 0, 0);
+ }
+
+ ret = ovl_real_fdget(file, &real);
+ if (ret)
+ return ret;
+
+ /*
+ * Overlay file f_pos is the master copy that is preserved
+ * through copy up and modified on read/write, but only real
+ * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose
+ * limitations that are more strict than ->s_maxbytes for specific
+ * files, so we use the real file to perform seeks.
+ */
+ ovl_inode_lock(inode);
+ real.file->f_pos = file->f_pos;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ ret = vfs_llseek(real.file, offset, whence);
+ revert_creds(old_cred);
+
+ file->f_pos = real.file->f_pos;
+ ovl_inode_unlock(inode);
+
+ fdput(real);
+
+ return ret;
+}
+
+static void ovl_file_accessed(struct file *file)
+{
+ struct inode *inode, *upperinode;
+
+ if (file->f_flags & O_NOATIME)
+ return;
+
+ inode = file_inode(file);
+ upperinode = ovl_inode_upper(inode);
+
+ if (!upperinode)
+ return;
+
+ if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
+ !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) {
+ inode->i_mtime = upperinode->i_mtime;
+ inode->i_ctime = upperinode->i_ctime;
+ }
+
+ touch_atime(&file->f_path);
+}
+
+static rwf_t ovl_iocb_to_rwf(int ifl)
+{
+ rwf_t flags = 0;
+
+ if (ifl & IOCB_NOWAIT)
+ flags |= RWF_NOWAIT;
+ if (ifl & IOCB_HIPRI)
+ flags |= RWF_HIPRI;
+ if (ifl & IOCB_DSYNC)
+ flags |= RWF_DSYNC;
+ if (ifl & IOCB_SYNC)
+ flags |= RWF_SYNC;
+
+ return flags;
+}
+
+static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
+{
+ if (refcount_dec_and_test(&aio_req->ref)) {
+ fput(aio_req->iocb.ki_filp);
+ kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ }
+}
+
+static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
+{
+ struct kiocb *iocb = &aio_req->iocb;
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ if (iocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(orig_iocb->ki_filp);
+
+ /* Actually acquired in ovl_write_iter() */
+ __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
+ SB_FREEZE_WRITE);
+ file_end_write(iocb->ki_filp);
+ ovl_copyattr(inode);
+ }
+
+ orig_iocb->ki_pos = iocb->ki_pos;
+ ovl_aio_put(aio_req);
+}
+
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
+{
+ struct ovl_aio_req *aio_req = container_of(iocb,
+ struct ovl_aio_req, iocb);
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ ovl_aio_cleanup_handler(aio_req);
+ orig_iocb->ki_complete(orig_iocb, res);
+}
+
+static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct fd real;
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ ret = ovl_real_fdget(file, &real);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
+ goto out_fdput;
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ if (is_sync_kiocb(iocb)) {
+ ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
+ ovl_iocb_to_rwf(iocb->ki_flags));
+ } else {
+ struct ovl_aio_req *aio_req;
+
+ ret = -ENOMEM;
+ aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ real.flags = 0;
+ aio_req->orig_iocb = iocb;
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
+ aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
+ ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
+ if (ret != -EIOCBQUEUED)
+ ovl_aio_cleanup_handler(aio_req);
+ }
+out:
+ revert_creds(old_cred);
+ ovl_file_accessed(file);
+out_fdput:
+ fdput(real);
+
+ return ret;
+}
+
+static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct fd real;
+ const struct cred *old_cred;
+ ssize_t ret;
+ int ifl = iocb->ki_flags;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(file, &real);
+ if (ret)
+ goto out_unlock;
+
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
+ goto out_fdput;
+
+ if (!ovl_should_sync(OVL_FS(inode->i_sb)))
+ ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ if (is_sync_kiocb(iocb)) {
+ file_start_write(real.file);
+ ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
+ ovl_iocb_to_rwf(ifl));
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(inode);
+ } else {
+ struct ovl_aio_req *aio_req;
+
+ ret = -ENOMEM;
+ aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ file_start_write(real.file);
+ /* Pacify lockdep, same trick as done in aio_write() */
+ __sb_writers_release(file_inode(real.file)->i_sb,
+ SB_FREEZE_WRITE);
+ real.flags = 0;
+ aio_req->orig_iocb = iocb;
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
+ aio_req->iocb.ki_flags = ifl;
+ aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
+ ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
+ if (ret != -EIOCBQUEUED)
+ ovl_aio_cleanup_handler(aio_req);
+ }
+out:
+ revert_creds(old_cred);
+out_fdput:
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
+/*
+ * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
+ * due to lock order inversion between pipe->mutex in iter_file_splice_write()
+ * and file_start_write(real.file) in ovl_write_iter().
+ *
+ * So do everything ovl_write_iter() does and call iter_file_splice_write() on
+ * the real file.
+ */
+static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ struct inode *inode = file_inode(out);
+ ssize_t ret;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(inode);
+ ret = file_remove_privs(out);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(out, &real);
+ if (ret)
+ goto out_unlock;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ file_start_write(real.file);
+
+ ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(inode);
+ revert_creds(old_cred);
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
+static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ int ret;
+
+ ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
+ if (ret <= 0)
+ return ret;
+
+ ret = ovl_real_fdget_meta(file, &real, !datasync);
+ if (ret)
+ return ret;
+
+ /* Don't sync lower file for fear of receiving EROFS error */
+ if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ ret = vfs_fsync_range(real.file, start, end, datasync);
+ revert_creds(old_cred);
+ }
+
+ fdput(real);
+
+ return ret;
+}
+
+static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct file *realfile = file->private_data;
+ const struct cred *old_cred;
+ int ret;
+
+ if (!realfile->f_op->mmap)
+ return -ENODEV;
+
+ if (WARN_ON(file != vma->vm_file))
+ return -EIO;
+
+ vma_set_file(vma, realfile);
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ ret = call_mmap(vma->vm_file, vma);
+ revert_creds(old_cred);
+ ovl_file_accessed(file);
+
+ return ret;
+}
+
+static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct fd real;
+ const struct cred *old_cred;
+ int ret;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(file, &real);
+ if (ret)
+ goto out_unlock;
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ ret = vfs_fallocate(real.file, mode, offset, len);
+ revert_creds(old_cred);
+
+ /* Update size */
+ ovl_copyattr(inode);
+
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
+static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ int ret;
+
+ ret = ovl_real_fdget(file, &real);
+ if (ret)
+ return ret;
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ ret = vfs_fadvise(real.file, offset, len, advice);
+ revert_creds(old_cred);
+
+ fdput(real);
+
+ return ret;
+}
+
+enum ovl_copyop {
+ OVL_COPY,
+ OVL_CLONE,
+ OVL_DEDUPE,
+};
+
+static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int flags, enum ovl_copyop op)
+{
+ struct inode *inode_out = file_inode(file_out);
+ struct fd real_in, real_out;
+ const struct cred *old_cred;
+ loff_t ret;
+
+ inode_lock(inode_out);
+ if (op != OVL_DEDUPE) {
+ /* Update mode */
+ ovl_copyattr(inode_out);
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
+ ret = ovl_real_fdget(file_out, &real_out);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(file_in, &real_in);
+ if (ret) {
+ fdput(real_out);
+ goto out_unlock;
+ }
+
+ old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
+ switch (op) {
+ case OVL_COPY:
+ ret = vfs_copy_file_range(real_in.file, pos_in,
+ real_out.file, pos_out, len, flags);
+ break;
+
+ case OVL_CLONE:
+ ret = vfs_clone_file_range(real_in.file, pos_in,
+ real_out.file, pos_out, len, flags);
+ break;
+
+ case OVL_DEDUPE:
+ ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
+ real_out.file, pos_out, len,
+ flags);
+ break;
+ }
+ revert_creds(old_cred);
+
+ /* Update size */
+ ovl_copyattr(inode_out);
+
+ fdput(real_in);
+ fdput(real_out);
+
+out_unlock:
+ inode_unlock(inode_out);
+
+ return ret;
+}
+
+static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
+ OVL_COPY);
+}
+
+static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
+{
+ enum ovl_copyop op;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ op = OVL_DEDUPE;
+ else
+ op = OVL_CLONE;
+
+ /*
+ * Don't copy up because of a dedupe request, this wouldn't make sense
+ * most of the time (data would be duplicated instead of deduplicated).
+ */
+ if (op == OVL_DEDUPE &&
+ (!ovl_inode_upper(file_inode(file_in)) ||
+ !ovl_inode_upper(file_inode(file_out))))
+ return -EPERM;
+
+ return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+ remap_flags, op);
+}
+
+static int ovl_flush(struct file *file, fl_owner_t id)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ int err;
+
+ err = ovl_real_fdget(file, &real);
+ if (err)
+ return err;
+
+ if (real.file->f_op->flush) {
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ err = real.file->f_op->flush(real.file, id);
+ revert_creds(old_cred);
+ }
+ fdput(real);
+
+ return err;
+}
+
+const struct file_operations ovl_file_operations = {
+ .open = ovl_open,
+ .release = ovl_release,
+ .llseek = ovl_llseek,
+ .read_iter = ovl_read_iter,
+ .write_iter = ovl_write_iter,
+ .fsync = ovl_fsync,
+ .mmap = ovl_mmap,
+ .fallocate = ovl_fallocate,
+ .fadvise = ovl_fadvise,
+ .flush = ovl_flush,
+ .splice_read = generic_file_splice_read,
+ .splice_write = ovl_splice_write,
+
+ .copy_file_range = ovl_copy_file_range,
+ .remap_file_range = ovl_remap_file_range,
+};
+
+int __init ovl_aio_request_cache_init(void)
+{
+ ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
+ sizeof(struct ovl_aio_req),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!ovl_aio_request_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ovl_aio_request_cache_destroy(void)
+{
+ kmem_cache_destroy(ovl_aio_request_cachep);
+}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
new file mode 100644
index 000000000..c062f7e2e
--- /dev/null
+++ b/fs/overlayfs/inode.c
@@ -0,0 +1,1285 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/ratelimit.h>
+#include <linux/fiemap.h>
+#include <linux/fileattr.h>
+#include <linux/security.h>
+#include <linux/namei.h>
+#include "overlayfs.h"
+
+
+int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
+{
+ int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ bool full_copy_up = false;
+ struct dentry *upperdentry;
+ const struct cred *old_cred;
+
+ err = setattr_prepare(&init_user_ns, dentry, attr);
+ if (err)
+ return err;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ /* Truncate should trigger data copy up as well */
+ full_copy_up = true;
+ }
+
+ if (!full_copy_up)
+ err = ovl_copy_up(dentry);
+ else
+ err = ovl_copy_up_with_data(dentry);
+ if (!err) {
+ struct inode *winode = NULL;
+
+ upperdentry = ovl_dentry_upper(dentry);
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ winode = d_inode(upperdentry);
+ err = get_write_access(winode);
+ if (err)
+ goto out_drop_write;
+ }
+
+ if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+ attr->ia_valid &= ~ATTR_MODE;
+
+ /*
+ * We might have to translate ovl file into real file object
+ * once use cases emerge. For now, simply don't let underlying
+ * filesystem rely on attr->ia_file
+ */
+ attr->ia_valid &= ~ATTR_FILE;
+
+ /*
+ * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN
+ * set. Overlayfs does not pass O_TRUNC flag to underlying
+ * filesystem during open -> do not pass ATTR_OPEN. This
+ * disables optimization in fuse which assumes open(O_TRUNC)
+ * already set file size to 0. But we never passed O_TRUNC to
+ * fuse. So by clearing ATTR_OPEN, fuse will be forced to send
+ * setattr request to server.
+ */
+ attr->ia_valid &= ~ATTR_OPEN;
+
+ inode_lock(upperdentry->d_inode);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = ovl_do_notify_change(ofs, upperdentry, attr);
+ revert_creds(old_cred);
+ if (!err)
+ ovl_copyattr(dentry->d_inode);
+ inode_unlock(upperdentry->d_inode);
+
+ if (winode)
+ put_write_access(winode);
+ }
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
+{
+ bool samefs = ovl_same_fs(dentry->d_sb);
+ unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
+ unsigned int xinoshift = 64 - xinobits;
+
+ if (samefs) {
+ /*
+ * When all layers are on the same fs, all real inode
+ * number are unique, so we use the overlay st_dev,
+ * which is friendly to du -x.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ return;
+ } else if (xinobits) {
+ /*
+ * All inode numbers of underlying fs should not be using the
+ * high xinobits, so we use high xinobits to partition the
+ * overlay st_ino address space. The high bits holds the fsid
+ * (upper fsid is 0). The lowest xinobit is reserved for mapping
+ * the non-persistent inode numbers range in case of overflow.
+ * This way all overlay inode numbers are unique and use the
+ * overlay st_dev.
+ */
+ if (likely(!(stat->ino >> xinoshift))) {
+ stat->ino |= ((u64)fsid) << (xinoshift + 1);
+ stat->dev = dentry->d_sb->s_dev;
+ return;
+ } else if (ovl_xino_warn(dentry->d_sb)) {
+ pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+ dentry, stat->ino, xinobits);
+ }
+ }
+
+ /* The inode could not be mapped to a unified st_ino address space */
+ if (S_ISDIR(dentry->d_inode->i_mode)) {
+ /*
+ * Always use the overlay st_dev for directories, so 'find
+ * -xdev' will scan the entire overlay mount and won't cross the
+ * overlay mount boundaries.
+ *
+ * If not all layers are on the same fs the pair {real st_ino;
+ * overlay st_dev} is not unique, so use the non persistent
+ * overlay st_ino for directories.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+ } else {
+ /*
+ * For non-samefs setup, if we cannot map all layers st_ino
+ * to a unified address space, we need to make sure that st_dev
+ * is unique per underlying fs, so we use the unique anonymous
+ * bdev assigned to the underlying fs.
+ */
+ stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev;
+ }
+}
+
+int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
+{
+ struct dentry *dentry = path->dentry;
+ enum ovl_path_type type;
+ struct path realpath;
+ const struct cred *old_cred;
+ struct inode *inode = d_inode(dentry);
+ bool is_dir = S_ISDIR(inode->i_mode);
+ int fsid = 0;
+ int err;
+ bool metacopy_blocks = false;
+
+ metacopy_blocks = ovl_is_metacopy_dentry(dentry);
+
+ type = ovl_path_real(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getattr(&realpath, stat, request_mask, flags);
+ if (err)
+ goto out;
+
+ /* Report the effective immutable/append-only STATX flags */
+ generic_fill_statx_attr(inode, stat);
+
+ /*
+ * For non-dir or same fs, we use st_ino of the copy up origin.
+ * This guaranties constant st_dev/st_ino across copy up.
+ * With xino feature and non-samefs, we use st_ino of the copy up
+ * origin masked with high bits that represent the layer id.
+ *
+ * If lower filesystem supports NFS file handles, this also guaranties
+ * persistent st_ino across mount cycle.
+ */
+ if (!is_dir || ovl_same_dev(dentry->d_sb)) {
+ if (!OVL_TYPE_UPPER(type)) {
+ fsid = ovl_layer_lower(dentry)->fsid;
+ } else if (OVL_TYPE_ORIGIN(type)) {
+ struct kstat lowerstat;
+ u32 lowermask = STATX_INO | STATX_BLOCKS |
+ (!is_dir ? STATX_NLINK : 0);
+
+ ovl_path_lower(dentry, &realpath);
+ err = vfs_getattr(&realpath, &lowerstat,
+ lowermask, flags);
+ if (err)
+ goto out;
+
+ /*
+ * Lower hardlinks may be broken on copy up to different
+ * upper files, so we cannot use the lower origin st_ino
+ * for those different files, even for the same fs case.
+ *
+ * Similarly, several redirected dirs can point to the
+ * same dir on a lower layer. With the "verify_lower"
+ * feature, we do not use the lower origin st_ino, if
+ * we haven't verified that this redirect is unique.
+ *
+ * With inodes index enabled, it is safe to use st_ino
+ * of an indexed origin. The index validates that the
+ * upper hardlink is not broken and that a redirected
+ * dir is the only redirect to that origin.
+ */
+ if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
+ (!ovl_verify_lower(dentry->d_sb) &&
+ (is_dir || lowerstat.nlink == 1))) {
+ fsid = ovl_layer_lower(dentry)->fsid;
+ stat->ino = lowerstat.ino;
+ }
+
+ /*
+ * If we are querying a metacopy dentry and lower
+ * dentry is data dentry, then use the blocks we
+ * queried just now. We don't have to do additional
+ * vfs_getattr(). If lower itself is metacopy, then
+ * additional vfs_getattr() is unavoidable.
+ */
+ if (metacopy_blocks &&
+ realpath.dentry == ovl_dentry_lowerdata(dentry)) {
+ stat->blocks = lowerstat.blocks;
+ metacopy_blocks = false;
+ }
+ }
+
+ if (metacopy_blocks) {
+ /*
+ * If lower is not same as lowerdata or if there was
+ * no origin on upper, we can end up here.
+ */
+ struct kstat lowerdatastat;
+ u32 lowermask = STATX_BLOCKS;
+
+ ovl_path_lowerdata(dentry, &realpath);
+ err = vfs_getattr(&realpath, &lowerdatastat,
+ lowermask, flags);
+ if (err)
+ goto out;
+ stat->blocks = lowerdatastat.blocks;
+ }
+ }
+
+ ovl_map_dev_ino(dentry, stat, fsid);
+
+ /*
+ * It's probably not worth it to count subdirs to get the
+ * correct link count. nlink=1 seems to pacify 'find' and
+ * other utilities.
+ */
+ if (is_dir && OVL_TYPE_MERGE(type))
+ stat->nlink = 1;
+
+ /*
+ * Return the overlay inode nlinks for indexed upper inodes.
+ * Overlay inode nlink counts the union of the upper hardlinks
+ * and non-covered lower hardlinks. It does not include the upper
+ * index hardlink.
+ */
+ if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+ stat->nlink = dentry->d_inode->i_nlink;
+
+out:
+ revert_creds(old_cred);
+
+ return err;
+}
+
+int ovl_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ struct inode *upperinode = ovl_inode_upper(inode);
+ struct inode *realinode;
+ struct path realpath;
+ const struct cred *old_cred;
+ int err;
+
+ /* Careful in RCU walk mode */
+ realinode = ovl_i_path_real(inode, &realpath);
+ if (!realinode) {
+ WARN_ON(!(mask & MAY_NOT_BLOCK));
+ return -ECHILD;
+ }
+
+ /*
+ * Check overlay inode with the creds of task and underlying inode
+ * with creds of mounter
+ */
+ err = generic_permission(&init_user_ns, inode, mask);
+ if (err)
+ return err;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ if (!upperinode &&
+ !special_file(realinode->i_mode) && mask & MAY_WRITE) {
+ mask &= ~(MAY_WRITE | MAY_APPEND);
+ /* Make sure mounter can read file for copy up later */
+ mask |= MAY_READ;
+ }
+ err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask);
+ revert_creds(old_cred);
+
+ return err;
+}
+
+static const char *ovl_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ const struct cred *old_cred;
+ const char *p;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ p = vfs_get_link(ovl_dentry_real(dentry), done);
+ revert_creds(old_cred);
+ return p;
+}
+
+bool ovl_is_private_xattr(struct super_block *sb, const char *name)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_USER_PREFIX,
+ sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0;
+ else
+ return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
+ sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0;
+}
+
+int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdentry = ovl_i_dentry_upper(inode);
+ struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+ struct path realpath;
+ const struct cred *old_cred;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ if (!value && !upperdentry) {
+ ovl_path_lower(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0);
+ revert_creds(old_cred);
+ if (err < 0)
+ goto out_drop_write;
+ }
+
+ if (!upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ realdentry = ovl_dentry_upper(dentry);
+ }
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size,
+ flags);
+ } else {
+ WARN_ON(flags != XATTR_REPLACE);
+ err = ovl_do_removexattr(ofs, realdentry, name);
+ }
+ revert_creds(old_cred);
+
+ /* copy c/mtime */
+ ovl_copyattr(inode);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ ssize_t res;
+ const struct cred *old_cred;
+ struct path realpath;
+
+ ovl_i_path_real(inode, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size);
+ revert_creds(old_cred);
+ return res;
+}
+
+static bool ovl_can_list(struct super_block *sb, const char *s)
+{
+ /* Never list private (.overlay) */
+ if (ovl_is_private_xattr(sb, s))
+ return false;
+
+ /* List all non-trusted xattrs */
+ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
+ return true;
+
+ /* list other trusted for superuser only */
+ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ struct dentry *realdentry = ovl_dentry_real(dentry);
+ ssize_t res;
+ size_t len;
+ char *s;
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_listxattr(realdentry, list, size);
+ revert_creds(old_cred);
+ if (res <= 0 || size == 0)
+ return res;
+
+ /* filter out private xattrs */
+ for (s = list, len = res; len;) {
+ size_t slen = strnlen(s, len) + 1;
+
+ /* underlying fs providing us with an broken xattr list? */
+ if (WARN_ON(slen > len))
+ return -EIO;
+
+ len -= slen;
+ if (!ovl_can_list(dentry->d_sb, s)) {
+ res -= slen;
+ memmove(s, s + slen, len);
+ } else {
+ s += slen;
+ }
+ }
+
+ return res;
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+/*
+ * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
+ * of the POSIX ACLs retrieved from the lower layer to this function to not
+ * alter the POSIX ACLs for the underlying filesystem.
+ */
+static void ovl_idmap_posix_acl(struct inode *realinode,
+ struct user_namespace *mnt_userns,
+ struct posix_acl *acl)
+{
+ struct user_namespace *fs_userns = i_user_ns(realinode);
+
+ for (unsigned int i = 0; i < acl->a_count; i++) {
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
+
+ struct posix_acl_entry *e = &acl->a_entries[i];
+ switch (e->e_tag) {
+ case ACL_USER:
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid);
+ e->e_uid = vfsuid_into_kuid(vfsuid);
+ break;
+ case ACL_GROUP:
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid);
+ e->e_gid = vfsgid_into_kgid(vfsgid);
+ break;
+ }
+ }
+}
+
+/*
+ * When the relevant layer is an idmapped mount we need to take the idmapping
+ * of the layer into account and translate any ACL_{GROUP,USER} values
+ * according to the idmapped mount.
+ *
+ * We cannot alter the ACLs returned from the relevant layer as that would
+ * alter the cached values filesystem wide for the lower filesystem. Instead we
+ * can clone the ACLs and then apply the relevant idmapping of the layer.
+ *
+ * This is obviously only relevant when idmapped layers are used.
+ */
+struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
+{
+ struct inode *realinode;
+ struct posix_acl *acl, *clone;
+ struct path realpath;
+
+ /* Careful in RCU walk mode */
+ realinode = ovl_i_path_real(inode, &realpath);
+ if (!realinode) {
+ WARN_ON(!rcu);
+ return ERR_PTR(-ECHILD);
+ }
+
+ if (!IS_POSIXACL(realinode))
+ return NULL;
+
+ if (rcu) {
+ acl = get_cached_acl_rcu(realinode, type);
+ } else {
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ acl = get_acl(realinode, type);
+ revert_creds(old_cred);
+ }
+ /*
+ * If there are no POSIX ACLs, or we encountered an error,
+ * or the layer isn't idmapped we don't need to do anything.
+ */
+ if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl))
+ return acl;
+
+ /*
+ * We only get here if the layer is idmapped. So drop out of RCU path
+ * walk so we can clone the ACLs. There's no need to release the ACLs
+ * since get_cached_acl_rcu() doesn't take a reference on the ACLs.
+ */
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ if (!clone)
+ clone = ERR_PTR(-ENOMEM);
+ else
+ ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone);
+ /*
+ * Since we're not in RCU path walk we always need to release the
+ * original ACLs.
+ */
+ posix_acl_release(acl);
+ return clone;
+}
+#endif
+
+int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
+{
+ if (flags & S_ATIME) {
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+ struct path upperpath = {
+ .mnt = ovl_upper_mnt(ofs),
+ .dentry = ovl_upperdentry_dereference(OVL_I(inode)),
+ };
+
+ if (upperpath.dentry) {
+ touch_atime(&upperpath);
+ inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+ }
+ }
+ return 0;
+}
+
+static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ int err;
+ struct inode *realinode = ovl_inode_realdata(inode);
+ const struct cred *old_cred;
+
+ if (!realinode->i_op->fiemap)
+ return -EOPNOTSUPP;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
+ revert_creds(old_cred);
+
+ return err;
+}
+
+/*
+ * Work around the fact that security_file_ioctl() takes a file argument.
+ * Introducing security_inode_fileattr_get/set() hooks would solve this issue
+ * properly.
+ */
+static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
+ bool set)
+{
+ struct file *file;
+ unsigned int cmd;
+ int err;
+
+ file = dentry_open(realpath, O_RDONLY, current_cred());
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (set)
+ cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS;
+ else
+ cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS;
+
+ err = security_file_ioctl(file, cmd, 0);
+ fput(file);
+
+ return err;
+}
+
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
+{
+ int err;
+
+ err = ovl_security_fileattr(realpath, fa, true);
+ if (err)
+ return err;
+
+ return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa);
+}
+
+int ovl_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct path upperpath;
+ const struct cred *old_cred;
+ unsigned int flags;
+ int err;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(dentry);
+ if (!err) {
+ ovl_path_real(dentry, &upperpath);
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ /*
+ * Store immutable/append-only flags in xattr and clear them
+ * in upper fileattr (in case they were set by older kernel)
+ * so children of "ovl-immutable" directories lower aliases of
+ * "ovl-immutable" hardlinks could be copied up.
+ * Clear xattr when flags are cleared.
+ */
+ err = ovl_set_protattr(inode, upperpath.dentry, fa);
+ if (!err)
+ err = ovl_real_fileattr_set(&upperpath, fa);
+ revert_creds(old_cred);
+
+ /*
+ * Merge real inode flags with inode flags read from
+ * overlay.protattr xattr
+ */
+ flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK;
+
+ BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK);
+ flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK;
+ inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK);
+
+ /* Update ctime */
+ ovl_copyattr(inode);
+ }
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+/* Convert inode protection flags to fileattr flags */
+static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
+{
+ BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
+ BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);
+
+ if (inode->i_flags & S_APPEND) {
+ fa->flags |= FS_APPEND_FL;
+ fa->fsx_xflags |= FS_XFLAG_APPEND;
+ }
+ if (inode->i_flags & S_IMMUTABLE) {
+ fa->flags |= FS_IMMUTABLE_FL;
+ fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
+ }
+}
+
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
+{
+ int err;
+
+ err = ovl_security_fileattr(realpath, fa, false);
+ if (err)
+ return err;
+
+ err = vfs_fileattr_get(realpath->dentry, fa);
+ if (err == -ENOIOCTLCMD)
+ err = -ENOTTY;
+ return err;
+}
+
+int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct path realpath;
+ const struct cred *old_cred;
+ int err;
+
+ ovl_path_real(dentry, &realpath);
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ err = ovl_real_fileattr_get(&realpath, fa);
+ ovl_fileattr_prot_flags(inode, fa);
+ revert_creds(old_cred);
+
+ return err;
+}
+
+static const struct inode_operations ovl_file_inode_operations = {
+ .setattr = ovl_setattr,
+ .permission = ovl_permission,
+ .getattr = ovl_getattr,
+ .listxattr = ovl_listxattr,
+ .get_acl = ovl_get_acl,
+ .update_time = ovl_update_time,
+ .fiemap = ovl_fiemap,
+ .fileattr_get = ovl_fileattr_get,
+ .fileattr_set = ovl_fileattr_set,
+};
+
+static const struct inode_operations ovl_symlink_inode_operations = {
+ .setattr = ovl_setattr,
+ .get_link = ovl_get_link,
+ .getattr = ovl_getattr,
+ .listxattr = ovl_listxattr,
+ .update_time = ovl_update_time,
+};
+
+static const struct inode_operations ovl_special_inode_operations = {
+ .setattr = ovl_setattr,
+ .permission = ovl_permission,
+ .getattr = ovl_getattr,
+ .listxattr = ovl_listxattr,
+ .get_acl = ovl_get_acl,
+ .update_time = ovl_update_time,
+};
+
+static const struct address_space_operations ovl_aops = {
+ /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
+ .direct_IO = noop_direct_IO,
+};
+
+/*
+ * It is possible to stack overlayfs instance on top of another
+ * overlayfs instance as lower layer. We need to annotate the
+ * stackable i_mutex locks according to stack level of the super
+ * block instance. An overlayfs instance can never be in stack
+ * depth 0 (there is always a real fs below it). An overlayfs
+ * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
+ *
+ * For example, here is a snip from /proc/lockdep_chains after
+ * dir_iterate of nested overlayfs:
+ *
+ * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
+ * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
+ * [...] &type->i_mutex_dir_key (stack_depth=0)
+ *
+ * Locking order w.r.t ovl_want_write() is important for nested overlayfs.
+ *
+ * This chain is valid:
+ * - inode->i_rwsem (inode_lock[2])
+ * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0])
+ * - OVL_I(inode)->lock (ovl_inode_lock[2])
+ * - OVL_I(lowerinode)->lock (ovl_inode_lock[1])
+ *
+ * And this chain is valid:
+ * - inode->i_rwsem (inode_lock[2])
+ * - OVL_I(inode)->lock (ovl_inode_lock[2])
+ * - lowerinode->i_rwsem (inode_lock[1])
+ * - OVL_I(lowerinode)->lock (ovl_inode_lock[1])
+ *
+ * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is
+ * held, because it is in reverse order of the non-nested case using the same
+ * upper fs:
+ * - inode->i_rwsem (inode_lock[1])
+ * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0])
+ * - OVL_I(inode)->lock (ovl_inode_lock[1])
+ */
+#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
+
+static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+ static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
+ static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
+ static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];
+
+ int depth = inode->i_sb->s_stack_depth - 1;
+
+ if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
+ depth = 0;
+
+ if (S_ISDIR(inode->i_mode))
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
+ else
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
+
+ lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
+#endif
+}
+
+static void ovl_next_ino(struct inode *inode)
+{
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+ if (unlikely(!inode->i_ino))
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+}
+
+static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
+{
+ int xinobits = ovl_xino_bits(inode->i_sb);
+ unsigned int xinoshift = 64 - xinobits;
+
+ /*
+ * When d_ino is consistent with st_ino (samefs or i_ino has enough
+ * bits to encode layer), set the same value used for st_ino to i_ino,
+ * so inode number exposed via /proc/locks and a like will be
+ * consistent with d_ino and st_ino values. An i_ino value inconsistent
+ * with d_ino also causes nfsd readdirplus to fail.
+ */
+ inode->i_ino = ino;
+ if (ovl_same_fs(inode->i_sb)) {
+ return;
+ } else if (xinobits && likely(!(ino >> xinoshift))) {
+ inode->i_ino |= (unsigned long)fsid << (xinoshift + 1);
+ return;
+ }
+
+ /*
+ * For directory inodes on non-samefs with xino disabled or xino
+ * overflow, we allocate a non-persistent inode number, to be used for
+ * resolving st_ino collisions in ovl_map_dev_ino().
+ *
+ * To avoid ino collision with legitimate xino values from upper
+ * layer (fsid 0), use the lowest xinobit to map the non
+ * persistent inode numbers to the unified st_ino address space.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ ovl_next_ino(inode);
+ if (xinobits) {
+ inode->i_ino &= ~0UL >> xinobits;
+ inode->i_ino |= 1UL << xinoshift;
+ }
+ }
+}
+
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid)
+{
+ struct inode *realinode;
+ struct ovl_inode *oi = OVL_I(inode);
+
+ if (oip->upperdentry)
+ oi->__upperdentry = oip->upperdentry;
+ if (oip->lowerpath && oip->lowerpath->dentry) {
+ oi->lowerpath.dentry = dget(oip->lowerpath->dentry);
+ oi->lowerpath.layer = oip->lowerpath->layer;
+ }
+ if (oip->lowerdata)
+ oi->lowerdata = igrab(d_inode(oip->lowerdata));
+
+ realinode = ovl_inode_real(inode);
+ ovl_copyattr(inode);
+ ovl_copyflags(realinode, inode);
+ ovl_map_ino(inode, ino, fsid);
+}
+
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ inode->i_flags |= S_NOCMTIME;
+#ifdef CONFIG_FS_POSIX_ACL
+ inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
+#endif
+
+ ovl_lockdep_annotate_inode_mutex_key(inode);
+
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &ovl_file_inode_operations;
+ inode->i_fop = &ovl_file_operations;
+ inode->i_mapping->a_ops = &ovl_aops;
+ break;
+
+ case S_IFDIR:
+ inode->i_op = &ovl_dir_inode_operations;
+ inode->i_fop = &ovl_dir_operations;
+ break;
+
+ case S_IFLNK:
+ inode->i_op = &ovl_symlink_inode_operations;
+ break;
+
+ default:
+ inode->i_op = &ovl_special_inode_operations;
+ init_special_inode(inode, mode, rdev);
+ break;
+ }
+}
+
+/*
+ * With inodes index enabled, an overlay inode nlink counts the union of upper
+ * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure
+ * upper inode, the following nlink modifying operations can happen:
+ *
+ * 1. Lower hardlink copy up
+ * 2. Upper hardlink created, unlinked or renamed over
+ * 3. Lower hardlink whiteout or renamed over
+ *
+ * For the first, copy up case, the union nlink does not change, whether the
+ * operation succeeds or fails, but the upper inode nlink may change.
+ * Therefore, before copy up, we store the union nlink value relative to the
+ * lower inode nlink in the index inode xattr .overlay.nlink.
+ *
+ * For the second, upper hardlink case, the union nlink should be incremented
+ * or decremented IFF the operation succeeds, aligned with nlink change of the
+ * upper inode. Therefore, before link/unlink/rename, we store the union nlink
+ * value relative to the upper inode nlink in the index inode.
+ *
+ * For the last, lower cover up case, we simplify things by preceding the
+ * whiteout or cover up with copy up. This makes sure that there is an index
+ * upper inode where the nlink xattr can be stored before the copied up upper
+ * entry is unlink.
+ */
+#define OVL_NLINK_ADD_UPPER (1 << 0)
+
+/*
+ * On-disk format for indexed nlink:
+ *
+ * nlink relative to the upper inode - "U[+-]NUM"
+ * nlink relative to the lower inode - "L[+-]NUM"
+ */
+
+static int ovl_set_nlink_common(struct dentry *dentry,
+ struct dentry *realdentry, const char *format)
+{
+ struct inode *inode = d_inode(dentry);
+ struct inode *realinode = d_inode(realdentry);
+ char buf[13];
+ int len;
+
+ len = snprintf(buf, sizeof(buf), format,
+ (int) (inode->i_nlink - realinode->i_nlink));
+
+ if (WARN_ON(len >= sizeof(buf)))
+ return -EIO;
+
+ return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
+ OVL_XATTR_NLINK, buf, len);
+}
+
+int ovl_set_nlink_upper(struct dentry *dentry)
+{
+ return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
+}
+
+int ovl_set_nlink_lower(struct dentry *dentry)
+{
+ return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
+}
+
+unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
+ struct dentry *upperdentry,
+ unsigned int fallback)
+{
+ int nlink_diff;
+ int nlink;
+ char buf[13];
+ int err;
+
+ if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
+ return fallback;
+
+ err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK,
+ &buf, sizeof(buf) - 1);
+ if (err < 0)
+ goto fail;
+
+ buf[err] = '\0';
+ if ((buf[0] != 'L' && buf[0] != 'U') ||
+ (buf[1] != '+' && buf[1] != '-'))
+ goto fail;
+
+ err = kstrtoint(buf + 1, 10, &nlink_diff);
+ if (err < 0)
+ goto fail;
+
+ nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
+ nlink += nlink_diff;
+
+ if (nlink <= 0)
+ goto fail;
+
+ return nlink;
+
+fail:
+ pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n",
+ upperdentry, err);
+ return fallback;
+}
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (inode)
+ ovl_fill_inode(inode, mode, rdev);
+
+ return inode;
+}
+
+static int ovl_inode_test(struct inode *inode, void *data)
+{
+ return inode->i_private == data;
+}
+
+static int ovl_inode_set(struct inode *inode, void *data)
+{
+ inode->i_private = data;
+ return 0;
+}
+
+static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
+ struct dentry *upperdentry, bool strict)
+{
+ /*
+ * For directories, @strict verify from lookup path performs consistency
+ * checks, so NULL lower/upper in dentry must match NULL lower/upper in
+ * inode. Non @strict verify from NFS handle decode path passes NULL for
+ * 'unknown' lower/upper.
+ */
+ if (S_ISDIR(inode->i_mode) && strict) {
+ /* Real lower dir moved to upper layer under us? */
+ if (!lowerdentry && ovl_inode_lower(inode))
+ return false;
+
+ /* Lookup of an uncovered redirect origin? */
+ if (!upperdentry && ovl_inode_upper(inode))
+ return false;
+ }
+
+ /*
+ * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
+ * This happens when finding a copied up overlay inode for a renamed
+ * or hardlinked overlay dentry and lower dentry cannot be followed
+ * by origin because lower fs does not support file handles.
+ */
+ if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
+ return false;
+
+ /*
+ * Allow non-NULL __upperdentry in inode even if upperdentry is NULL.
+ * This happens when finding a lower alias for a copied up hard link.
+ */
+ if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
+ return false;
+
+ return true;
+}
+
+struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
+ bool is_upper)
+{
+ struct inode *inode, *key = d_inode(real);
+
+ inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
+ if (!inode)
+ return NULL;
+
+ if (!ovl_verify_inode(inode, is_upper ? NULL : real,
+ is_upper ? real : NULL, false)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return inode;
+}
+
+bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir)
+{
+ struct inode *key = d_inode(dir);
+ struct inode *trap;
+ bool res;
+
+ trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
+ if (!trap)
+ return false;
+
+ res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) &&
+ !ovl_inode_lower(trap);
+
+ iput(trap);
+ return res;
+}
+
+/*
+ * Create an inode cache entry for layer root dir, that will intentionally
+ * fail ovl_verify_inode(), so any lookup that will find some layer root
+ * will fail.
+ */
+struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
+{
+ struct inode *key = d_inode(dir);
+ struct inode *trap;
+
+ if (!d_is_dir(dir))
+ return ERR_PTR(-ENOTDIR);
+
+ trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test,
+ ovl_inode_set, key);
+ if (!trap)
+ return ERR_PTR(-ENOMEM);
+
+ if (!(trap->i_state & I_NEW)) {
+ /* Conflicting layer roots? */
+ iput(trap);
+ return ERR_PTR(-ELOOP);
+ }
+
+ trap->i_mode = S_IFDIR;
+ trap->i_flags = S_DEAD;
+ unlock_new_inode(trap);
+
+ return trap;
+}
+
+/*
+ * Does overlay inode need to be hashed by lower inode?
+ */
+static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
+ struct dentry *lower, bool index)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ /* No, if pure upper */
+ if (!lower)
+ return false;
+
+ /* Yes, if already indexed */
+ if (index)
+ return true;
+
+ /* Yes, if won't be copied up */
+ if (!ovl_upper_mnt(ofs))
+ return true;
+
+ /* No, if lower hardlink is or will be broken on copy up */
+ if ((upper || !ovl_indexdir(sb)) &&
+ !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
+ return false;
+
+ /* No, if non-indexed upper with NFS export */
+ if (sb->s_export_op && upper)
+ return false;
+
+ /* Otherwise, hash by lower inode for fsnotify */
+ return true;
+}
+
+static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
+ struct inode *key)
+{
+ return newinode ? inode_insert5(newinode, (unsigned long) key,
+ ovl_inode_test, ovl_inode_set, key) :
+ iget5_locked(sb, (unsigned long) key,
+ ovl_inode_test, ovl_inode_set, key);
+}
+
+struct inode *ovl_get_inode(struct super_block *sb,
+ struct ovl_inode_params *oip)
+{
+ struct ovl_fs *ofs = OVL_FS(sb);
+ struct dentry *upperdentry = oip->upperdentry;
+ struct ovl_path *lowerpath = oip->lowerpath;
+ struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
+ struct inode *inode;
+ struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
+ struct path realpath = {
+ .dentry = upperdentry ?: lowerdentry,
+ .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt,
+ };
+ bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
+ oip->index);
+ int fsid = bylower ? lowerpath->layer->fsid : 0;
+ bool is_dir;
+ unsigned long ino = 0;
+ int err = oip->newinode ? -EEXIST : -ENOMEM;
+
+ if (!realinode)
+ realinode = d_inode(lowerdentry);
+
+ /*
+ * Copy up origin (lower) may exist for non-indexed upper, but we must
+ * not use lower as hash key if this is a broken hardlink.
+ */
+ is_dir = S_ISDIR(realinode->i_mode);
+ if (upperdentry || bylower) {
+ struct inode *key = d_inode(bylower ? lowerdentry :
+ upperdentry);
+ unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
+
+ inode = ovl_iget5(sb, oip->newinode, key);
+ if (!inode)
+ goto out_err;
+ if (!(inode->i_state & I_NEW)) {
+ /*
+ * Verify that the underlying files stored in the inode
+ * match those in the dentry.
+ */
+ if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
+ true)) {
+ iput(inode);
+ err = -ESTALE;
+ goto out_err;
+ }
+
+ dput(upperdentry);
+ kfree(oip->redirect);
+ goto out;
+ }
+
+ /* Recalculate nlink for non-dir due to indexing */
+ if (!is_dir)
+ nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry,
+ nlink);
+ set_nlink(inode, nlink);
+ ino = key->i_ino;
+ } else {
+ /* Lower hardlink that will be broken on copy up */
+ inode = new_inode(sb);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+ ino = realinode->i_ino;
+ fsid = lowerpath->layer->fsid;
+ }
+ ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
+ ovl_inode_init(inode, oip, ino, fsid);
+
+ if (upperdentry && ovl_is_impuredir(sb, upperdentry))
+ ovl_set_flag(OVL_IMPURE, inode);
+
+ if (oip->index)
+ ovl_set_flag(OVL_INDEX, inode);
+
+ OVL_I(inode)->redirect = oip->redirect;
+
+ if (bylower)
+ ovl_set_flag(OVL_CONST_INO, inode);
+
+ /* Check for non-merge dir that may have whiteouts */
+ if (is_dir) {
+ if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
+ ovl_path_check_origin_xattr(ofs, &realpath)) {
+ ovl_set_flag(OVL_WHITEOUTS, inode);
+ }
+ }
+
+ /* Check for immutable/append-only inode flags in xattr */
+ if (upperdentry)
+ ovl_check_protattr(inode, upperdentry);
+
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+out:
+ return inode;
+
+out_err:
+ pr_warn_ratelimited("failed to get inode (%i)\n", err);
+ inode = ERR_PTR(err);
+ goto out;
+}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
new file mode 100644
index 000000000..655d08d6f
--- /dev/null
+++ b/fs/overlayfs/namei.c
@@ -0,0 +1,1203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2011 Novell Inc.
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/ratelimit.h>
+#include <linux/mount.h>
+#include <linux/exportfs.h>
+#include "overlayfs.h"
+
+struct ovl_lookup_data {
+ struct super_block *sb;
+ struct vfsmount *mnt;
+ struct qstr name;
+ bool is_dir;
+ bool opaque;
+ bool stop;
+ bool last;
+ char *redirect;
+ bool metacopy;
+};
+
+static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d,
+ size_t prelen, const char *post)
+{
+ int res;
+ char *buf;
+ struct ovl_fs *ofs = OVL_FS(d->sb);
+
+ buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post));
+ if (IS_ERR_OR_NULL(buf))
+ return PTR_ERR(buf);
+
+ if (buf[0] == '/') {
+ /*
+ * One of the ancestor path elements in an absolute path
+ * lookup in ovl_lookup_layer() could have been opaque and
+ * that will stop further lookup in lower layers (d->stop=true)
+ * But we have found an absolute redirect in descendant path
+ * element and that should force continue lookup in lower
+ * layers (reset d->stop).
+ */
+ d->stop = false;
+ } else {
+ res = strlen(buf) + 1;
+ memmove(buf + prelen, buf, res);
+ memcpy(buf, d->name.name, prelen);
+ }
+
+ strcat(buf, post);
+ kfree(d->redirect);
+ d->redirect = buf;
+ d->name.name = d->redirect;
+ d->name.len = strlen(d->redirect);
+
+ return 0;
+}
+
+static int ovl_acceptable(void *ctx, struct dentry *dentry)
+{
+ /*
+ * A non-dir origin may be disconnected, which is fine, because
+ * we only need it for its unique inode number.
+ */
+ if (!d_is_dir(dentry))
+ return 1;
+
+ /* Don't decode a deleted empty directory */
+ if (d_unhashed(dentry))
+ return 0;
+
+ /* Check if directory belongs to the layer we are decoding from */
+ return is_subdir(dentry, ((struct vfsmount *)ctx)->mnt_root);
+}
+
+/*
+ * Check validity of an overlay file handle buffer.
+ *
+ * Return 0 for a valid file handle.
+ * Return -ENODATA for "origin unknown".
+ * Return <0 for an invalid file handle.
+ */
+int ovl_check_fb_len(struct ovl_fb *fb, int fb_len)
+{
+ if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len)
+ return -EINVAL;
+
+ if (fb->magic != OVL_FH_MAGIC)
+ return -EINVAL;
+
+ /* Treat larger version and unknown flags as "origin unknown" */
+ if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL)
+ return -ENODATA;
+
+ /* Treat endianness mismatch as "origin unknown" */
+ if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) &&
+ (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN)
+ return -ENODATA;
+
+ return 0;
+}
+
+static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry,
+ enum ovl_xattr ox)
+{
+ int res, err;
+ struct ovl_fh *fh = NULL;
+
+ res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0);
+ if (res < 0) {
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ return NULL;
+ goto fail;
+ }
+ /* Zero size value means "copied up but origin unknown" */
+ if (res == 0)
+ return NULL;
+
+ fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL);
+ if (!fh)
+ return ERR_PTR(-ENOMEM);
+
+ res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res);
+ if (res < 0)
+ goto fail;
+
+ err = ovl_check_fb_len(&fh->fb, res);
+ if (err < 0) {
+ if (err == -ENODATA)
+ goto out;
+ goto invalid;
+ }
+
+ return fh;
+
+out:
+ kfree(fh);
+ return NULL;
+
+fail:
+ pr_warn_ratelimited("failed to get origin (%i)\n", res);
+ goto out;
+invalid:
+ pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh);
+ goto out;
+}
+
+struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
+ struct vfsmount *mnt, bool connected)
+{
+ struct dentry *real;
+ int bytes;
+
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return NULL;
+
+ /*
+ * Make sure that the stored uuid matches the uuid of the lower
+ * layer where file handle will be decoded.
+ * In case of uuid=off option just make sure that stored uuid is null.
+ */
+ if (ofs->config.uuid ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) :
+ !uuid_is_null(&fh->fb.uuid))
+ return NULL;
+
+ bytes = (fh->fb.len - offsetof(struct ovl_fb, fid));
+ real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid,
+ bytes >> 2, (int)fh->fb.type,
+ connected ? ovl_acceptable : NULL, mnt);
+ if (IS_ERR(real)) {
+ /*
+ * Treat stale file handle to lower file as "origin unknown".
+ * upper file handle could become stale when upper file is
+ * unlinked and this information is needed to handle stale
+ * index entries correctly.
+ */
+ if (real == ERR_PTR(-ESTALE) &&
+ !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER))
+ real = NULL;
+ return real;
+ }
+
+ if (ovl_dentry_weird(real)) {
+ dput(real);
+ return NULL;
+ }
+
+ return real;
+}
+
+static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
+{
+ return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
+}
+
+static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
+ const char *name,
+ struct dentry *base, int len,
+ bool drop_negative)
+{
+ struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len);
+
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ if (drop_negative && ret->d_lockref.count == 1) {
+ spin_lock(&ret->d_lock);
+ /* Recheck condition under lock */
+ if (d_is_negative(ret) && ret->d_lockref.count == 1)
+ __d_drop(ret);
+ spin_unlock(&ret->d_lock);
+ }
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
+}
+
+static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
+ const char *name, unsigned int namelen,
+ size_t prelen, const char *post,
+ struct dentry **ret, bool drop_negative)
+{
+ struct dentry *this;
+ struct path path;
+ int err;
+ bool last_element = !post[0];
+
+ this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
+ if (IS_ERR(this)) {
+ err = PTR_ERR(this);
+ this = NULL;
+ if (err == -ENOENT || err == -ENAMETOOLONG)
+ goto out;
+ goto out_err;
+ }
+
+ if (ovl_dentry_weird(this)) {
+ /* Don't support traversing automounts and other weirdness */
+ err = -EREMOTE;
+ goto out_err;
+ }
+ if (ovl_is_whiteout(this)) {
+ d->stop = d->opaque = true;
+ goto put_and_out;
+ }
+ /*
+ * This dentry should be a regular file if previous layer lookup
+ * found a metacopy dentry.
+ */
+ if (last_element && d->metacopy && !d_is_reg(this)) {
+ d->stop = true;
+ goto put_and_out;
+ }
+
+ path.dentry = this;
+ path.mnt = d->mnt;
+ if (!d_can_lookup(this)) {
+ if (d->is_dir || !last_element) {
+ d->stop = true;
+ goto put_and_out;
+ }
+ err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path);
+ if (err < 0)
+ goto out_err;
+
+ d->metacopy = err;
+ d->stop = !d->metacopy;
+ if (!d->metacopy || d->last)
+ goto out;
+ } else {
+ if (ovl_lookup_trap_inode(d->sb, this)) {
+ /* Caught in a trap of overlapping layers */
+ err = -ELOOP;
+ goto out_err;
+ }
+
+ if (last_element)
+ d->is_dir = true;
+ if (d->last)
+ goto out;
+
+ if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
+ d->stop = true;
+ if (last_element)
+ d->opaque = true;
+ goto out;
+ }
+ }
+ err = ovl_check_redirect(&path, d, prelen, post);
+ if (err)
+ goto out_err;
+out:
+ *ret = this;
+ return 0;
+
+put_and_out:
+ dput(this);
+ this = NULL;
+ goto out;
+
+out_err:
+ dput(this);
+ return err;
+}
+
+static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
+ struct dentry **ret, bool drop_negative)
+{
+ /* Counting down from the end, since the prefix can change */
+ size_t rem = d->name.len - 1;
+ struct dentry *dentry = NULL;
+ int err;
+
+ if (d->name.name[0] != '/')
+ return ovl_lookup_single(base, d, d->name.name, d->name.len,
+ 0, "", ret, drop_negative);
+
+ while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) {
+ const char *s = d->name.name + d->name.len - rem;
+ const char *next = strchrnul(s, '/');
+ size_t thislen = next - s;
+ bool end = !next[0];
+
+ /* Verify we did not go off the rails */
+ if (WARN_ON(s[-1] != '/'))
+ return -EIO;
+
+ err = ovl_lookup_single(base, d, s, thislen,
+ d->name.len - rem, next, &base,
+ drop_negative);
+ dput(dentry);
+ if (err)
+ return err;
+ dentry = base;
+ if (end)
+ break;
+
+ rem -= thislen + 1;
+
+ if (WARN_ON(rem >= d->name.len))
+ return -EIO;
+ }
+ *ret = dentry;
+ return 0;
+}
+
+
+int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
+ struct dentry *upperdentry, struct ovl_path **stackp)
+{
+ struct dentry *origin = NULL;
+ int i;
+
+ for (i = 1; i < ofs->numlayer; i++) {
+ /*
+ * If lower fs uuid is not unique among lower fs we cannot match
+ * fh->uuid to layer.
+ */
+ if (ofs->layers[i].fsid &&
+ ofs->layers[i].fs->bad_uuid)
+ continue;
+
+ origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt,
+ connected);
+ if (origin)
+ break;
+ }
+
+ if (!origin)
+ return -ESTALE;
+ else if (IS_ERR(origin))
+ return PTR_ERR(origin);
+
+ if (upperdentry && !ovl_is_whiteout(upperdentry) &&
+ inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode))
+ goto invalid;
+
+ if (!*stackp)
+ *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL);
+ if (!*stackp) {
+ dput(origin);
+ return -ENOMEM;
+ }
+ **stackp = (struct ovl_path){
+ .dentry = origin,
+ .layer = &ofs->layers[i]
+ };
+
+ return 0;
+
+invalid:
+ pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n",
+ upperdentry, d_inode(upperdentry)->i_mode & S_IFMT,
+ d_inode(origin)->i_mode & S_IFMT);
+ dput(origin);
+ return -ESTALE;
+}
+
+static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
+ struct ovl_path **stackp)
+{
+ struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN);
+ int err;
+
+ if (IS_ERR_OR_NULL(fh))
+ return PTR_ERR(fh);
+
+ err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp);
+ kfree(fh);
+
+ if (err) {
+ if (err == -ESTALE)
+ return 0;
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * Verify that @fh matches the file handle stored in xattr @name.
+ * Return 0 on match, -ESTALE on mismatch, < 0 on error.
+ */
+static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const struct ovl_fh *fh)
+{
+ struct ovl_fh *ofh = ovl_get_fh(ofs, dentry, ox);
+ int err = 0;
+
+ if (!ofh)
+ return -ENODATA;
+
+ if (IS_ERR(ofh))
+ return PTR_ERR(ofh);
+
+ if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len))
+ err = -ESTALE;
+
+ kfree(ofh);
+ return err;
+}
+
+/*
+ * Verify that @real dentry matches the file handle stored in xattr @name.
+ *
+ * If @set is true and there is no stored file handle, encode @real and store
+ * file handle in xattr @name.
+ *
+ * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error.
+ */
+int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real, bool is_upper,
+ bool set)
+{
+ struct inode *inode;
+ struct ovl_fh *fh;
+ int err;
+
+ fh = ovl_encode_real_fh(ofs, real, is_upper);
+ err = PTR_ERR(fh);
+ if (IS_ERR(fh)) {
+ fh = NULL;
+ goto fail;
+ }
+
+ err = ovl_verify_fh(ofs, dentry, ox, fh);
+ if (set && err == -ENODATA)
+ err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+ if (err)
+ goto fail;
+
+out:
+ kfree(fh);
+ return err;
+
+fail:
+ inode = d_inode(real);
+ pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n",
+ is_upper ? "upper" : "origin", real,
+ inode ? inode->i_ino : 0, err);
+ goto out;
+}
+
+/* Get upper dentry from index */
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
+{
+ struct ovl_fh *fh;
+ struct dentry *upper;
+
+ if (!d_is_dir(index))
+ return dget(index);
+
+ fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER);
+ if (IS_ERR_OR_NULL(fh))
+ return ERR_CAST(fh);
+
+ upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
+ kfree(fh);
+
+ if (IS_ERR_OR_NULL(upper))
+ return upper ?: ERR_PTR(-ESTALE);
+
+ if (!d_is_dir(upper)) {
+ pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n",
+ index, upper);
+ dput(upper);
+ return ERR_PTR(-EIO);
+ }
+
+ return upper;
+}
+
+/*
+ * Verify that an index entry name matches the origin file handle stored in
+ * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path.
+ * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error.
+ */
+int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
+{
+ struct ovl_fh *fh = NULL;
+ size_t len;
+ struct ovl_path origin = { };
+ struct ovl_path *stack = &origin;
+ struct dentry *upper = NULL;
+ int err;
+
+ if (!d_inode(index))
+ return 0;
+
+ err = -EINVAL;
+ if (index->d_name.len < sizeof(struct ovl_fb)*2)
+ goto fail;
+
+ err = -ENOMEM;
+ len = index->d_name.len / 2;
+ fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL);
+ if (!fh)
+ goto fail;
+
+ err = -EINVAL;
+ if (hex2bin(fh->buf, index->d_name.name, len))
+ goto fail;
+
+ err = ovl_check_fb_len(&fh->fb, len);
+ if (err)
+ goto fail;
+
+ /*
+ * Whiteout index entries are used as an indication that an exported
+ * overlay file handle should be treated as stale (i.e. after unlink
+ * of the overlay inode). These entries contain no origin xattr.
+ */
+ if (ovl_is_whiteout(index))
+ goto out;
+
+ /*
+ * Verifying directory index entries are not stale is expensive, so
+ * only verify stale dir index if NFS export is enabled.
+ */
+ if (d_is_dir(index) && !ofs->config.nfs_export)
+ goto out;
+
+ /*
+ * Directory index entries should have 'upper' xattr pointing to the
+ * real upper dir. Non-dir index entries are hardlinks to the upper
+ * real inode. For non-dir index, we can read the copy up origin xattr
+ * directly from the index dentry, but for dir index we first need to
+ * decode the upper directory.
+ */
+ upper = ovl_index_upper(ofs, index);
+ if (IS_ERR_OR_NULL(upper)) {
+ err = PTR_ERR(upper);
+ /*
+ * Directory index entries with no 'upper' xattr need to be
+ * removed. When dir index entry has a stale 'upper' xattr,
+ * we assume that upper dir was removed and we treat the dir
+ * index as orphan entry that needs to be whited out.
+ */
+ if (err == -ESTALE)
+ goto orphan;
+ else if (!err)
+ err = -ESTALE;
+ goto fail;
+ }
+
+ err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh);
+ dput(upper);
+ if (err)
+ goto fail;
+
+ /* Check if non-dir index is orphan and don't warn before cleaning it */
+ if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) {
+ err = ovl_check_origin_fh(ofs, fh, false, index, &stack);
+ if (err)
+ goto fail;
+
+ if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0)
+ goto orphan;
+ }
+
+out:
+ dput(origin.dentry);
+ kfree(fh);
+ return err;
+
+fail:
+ pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n",
+ index, d_inode(index)->i_mode & S_IFMT, err);
+ goto out;
+
+orphan:
+ pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n",
+ index, d_inode(index)->i_mode & S_IFMT,
+ d_inode(index)->i_nlink);
+ err = -ENOENT;
+ goto out;
+}
+
+static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
+{
+ char *n, *s;
+
+ n = kcalloc(fh->fb.len, 2, GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ s = bin2hex(n, fh->buf, fh->fb.len);
+ *name = (struct qstr) QSTR_INIT(n, s - n);
+
+ return 0;
+
+}
+
+/*
+ * Lookup in indexdir for the index entry of a lower real inode or a copy up
+ * origin inode. The index entry name is the hex representation of the lower
+ * inode file handle.
+ *
+ * If the index dentry in negative, then either no lower aliases have been
+ * copied up yet, or aliases have been copied up in older kernels and are
+ * not indexed.
+ *
+ * If the index dentry for a copy up origin inode is positive, but points
+ * to an inode different than the upper inode, then either the upper inode
+ * has been copied up and not indexed or it was indexed, but since then
+ * index dir was cleared. Either way, that index cannot be used to identify
+ * the overlay inode.
+ */
+int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
+ struct qstr *name)
+{
+ struct ovl_fh *fh;
+ int err;
+
+ fh = ovl_encode_real_fh(ofs, origin, false);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+ err = ovl_get_index_name_fh(fh, name);
+
+ kfree(fh);
+ return err;
+}
+
+/* Lookup index by file handle for NFS export */
+struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
+{
+ struct dentry *index;
+ struct qstr name;
+ int err;
+
+ err = ovl_get_index_name_fh(fh, &name);
+ if (err)
+ return ERR_PTR(err);
+
+ index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
+ kfree(name.name);
+ if (IS_ERR(index)) {
+ if (PTR_ERR(index) == -ENOENT)
+ index = NULL;
+ return index;
+ }
+
+ if (ovl_is_whiteout(index))
+ err = -ESTALE;
+ else if (ovl_dentry_weird(index))
+ err = -EIO;
+ else
+ return index;
+
+ dput(index);
+ return ERR_PTR(err);
+}
+
+struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
+ struct dentry *origin, bool verify)
+{
+ struct dentry *index;
+ struct inode *inode;
+ struct qstr name;
+ bool is_dir = d_is_dir(origin);
+ int err;
+
+ err = ovl_get_index_name(ofs, origin, &name);
+ if (err)
+ return ERR_PTR(err);
+
+ index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name,
+ ofs->indexdir, name.len);
+ if (IS_ERR(index)) {
+ err = PTR_ERR(index);
+ if (err == -ENOENT) {
+ index = NULL;
+ goto out;
+ }
+ pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n"
+ "overlayfs: mount with '-o index=off' to disable inodes index.\n",
+ d_inode(origin)->i_ino, name.len, name.name,
+ err);
+ goto out;
+ }
+
+ inode = d_inode(index);
+ if (ovl_is_whiteout(index) && !verify) {
+ /*
+ * When index lookup is called with !verify for decoding an
+ * overlay file handle, a whiteout index implies that decode
+ * should treat file handle as stale and no need to print a
+ * warning about it.
+ */
+ dput(index);
+ index = ERR_PTR(-ESTALE);
+ goto out;
+ } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) ||
+ inode_wrong_type(inode, d_inode(origin)->i_mode)) {
+ /*
+ * Index should always be of the same file type as origin
+ * except for the case of a whiteout index. A whiteout
+ * index should only exist if all lower aliases have been
+ * unlinked, which means that finding a lower origin on lookup
+ * whose index is a whiteout should be treated as an error.
+ */
+ pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n",
+ index, d_inode(index)->i_mode & S_IFMT,
+ d_inode(origin)->i_mode & S_IFMT);
+ goto fail;
+ } else if (is_dir && verify) {
+ if (!upper) {
+ pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n",
+ origin, index);
+ goto fail;
+ }
+
+ /* Verify that dir index 'upper' xattr points to upper dir */
+ err = ovl_verify_upper(ofs, index, upper, false);
+ if (err) {
+ if (err == -ESTALE) {
+ pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n",
+ upper, origin, index);
+ }
+ goto fail;
+ }
+ } else if (upper && d_inode(upper) != inode) {
+ goto out_dput;
+ }
+out:
+ kfree(name.name);
+ return index;
+
+out_dput:
+ dput(index);
+ index = NULL;
+ goto out;
+
+fail:
+ dput(index);
+ index = ERR_PTR(-EIO);
+ goto out;
+}
+
+/*
+ * Returns next layer in stack starting from top.
+ * Returns -1 if this is the last layer.
+ */
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ BUG_ON(idx < 0);
+ if (idx == 0) {
+ ovl_path_upper(dentry, path);
+ if (path->dentry)
+ return oe->numlower ? 1 : -1;
+ idx++;
+ }
+ BUG_ON(idx > oe->numlower);
+ path->dentry = oe->lowerstack[idx - 1].dentry;
+ path->mnt = oe->lowerstack[idx - 1].layer->mnt;
+
+ return (idx < oe->numlower) ? idx + 1 : -1;
+}
+
+/* Fix missing 'origin' xattr */
+static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
+ struct dentry *lower, struct dentry *upper)
+{
+ int err;
+
+ if (ovl_check_origin_xattr(ofs, upper))
+ return 0;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ return err;
+
+ err = ovl_set_origin(ofs, lower, upper);
+ if (!err)
+ err = ovl_set_impure(dentry->d_parent, upper->d_parent);
+
+ ovl_drop_write(dentry);
+ return err;
+}
+
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct ovl_entry *oe;
+ const struct cred *old_cred;
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+ struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
+ struct ovl_path *stack = NULL, *origin_path = NULL;
+ struct dentry *upperdir, *upperdentry = NULL;
+ struct dentry *origin = NULL;
+ struct dentry *index = NULL;
+ unsigned int ctr = 0;
+ struct inode *inode = NULL;
+ bool upperopaque = false;
+ char *upperredirect = NULL;
+ struct dentry *this;
+ unsigned int i;
+ int err;
+ bool uppermetacopy = false;
+ struct ovl_lookup_data d = {
+ .sb = dentry->d_sb,
+ .name = dentry->d_name,
+ .is_dir = false,
+ .opaque = false,
+ .stop = false,
+ .last = ofs->config.redirect_follow ? false : !poe->numlower,
+ .redirect = NULL,
+ .metacopy = false,
+ };
+
+ if (dentry->d_name.len > ofs->namelen)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ upperdir = ovl_dentry_upper(dentry->d_parent);
+ if (upperdir) {
+ d.mnt = ovl_upper_mnt(ofs);
+ err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
+ if (err)
+ goto out;
+
+ if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) {
+ dput(upperdentry);
+ err = -EREMOTE;
+ goto out;
+ }
+ if (upperdentry && !d.is_dir) {
+ /*
+ * Lookup copy up origin by decoding origin file handle.
+ * We may get a disconnected dentry, which is fine,
+ * because we only need to hold the origin inode in
+ * cache and use its inode number. We may even get a
+ * connected dentry, that is not under any of the lower
+ * layers root. That is also fine for using it's inode
+ * number - it's the same as if we held a reference
+ * to a dentry in lower layer that was moved under us.
+ */
+ err = ovl_check_origin(ofs, upperdentry, &origin_path);
+ if (err)
+ goto out_put_upper;
+
+ if (d.metacopy)
+ uppermetacopy = true;
+ }
+
+ if (d.redirect) {
+ err = -ENOMEM;
+ upperredirect = kstrdup(d.redirect, GFP_KERNEL);
+ if (!upperredirect)
+ goto out_put_upper;
+ if (d.redirect[0] == '/')
+ poe = roe;
+ }
+ upperopaque = d.opaque;
+ }
+
+ if (!d.stop && poe->numlower) {
+ err = -ENOMEM;
+ stack = kcalloc(ofs->numlayer - 1, sizeof(struct ovl_path),
+ GFP_KERNEL);
+ if (!stack)
+ goto out_put_upper;
+ }
+
+ for (i = 0; !d.stop && i < poe->numlower; i++) {
+ struct ovl_path lower = poe->lowerstack[i];
+
+ if (!ofs->config.redirect_follow)
+ d.last = i == poe->numlower - 1;
+ else
+ d.last = lower.layer->idx == roe->numlower;
+
+ d.mnt = lower.layer->mnt;
+ err = ovl_lookup_layer(lower.dentry, &d, &this, false);
+ if (err)
+ goto out_put;
+
+ if (!this)
+ continue;
+
+ if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) {
+ dput(this);
+ err = -EPERM;
+ pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry);
+ goto out_put;
+ }
+
+ /*
+ * If no origin fh is stored in upper of a merge dir, store fh
+ * of lower dir and set upper parent "impure".
+ */
+ if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) {
+ err = ovl_fix_origin(ofs, dentry, this, upperdentry);
+ if (err) {
+ dput(this);
+ goto out_put;
+ }
+ }
+
+ /*
+ * When "verify_lower" feature is enabled, do not merge with a
+ * lower dir that does not match a stored origin xattr. In any
+ * case, only verified origin is used for index lookup.
+ *
+ * For non-dir dentry, if index=on, then ensure origin
+ * matches the dentry found using path based lookup,
+ * otherwise error out.
+ */
+ if (upperdentry && !ctr &&
+ ((d.is_dir && ovl_verify_lower(dentry->d_sb)) ||
+ (!d.is_dir && ofs->config.index && origin_path))) {
+ err = ovl_verify_origin(ofs, upperdentry, this, false);
+ if (err) {
+ dput(this);
+ if (d.is_dir)
+ break;
+ goto out_put;
+ }
+ origin = this;
+ }
+
+ if (d.metacopy && ctr) {
+ /*
+ * Do not store intermediate metacopy dentries in
+ * lower chain, except top most lower metacopy dentry.
+ * Continue the loop so that if there is an absolute
+ * redirect on this dentry, poe can be reset to roe.
+ */
+ dput(this);
+ this = NULL;
+ } else {
+ stack[ctr].dentry = this;
+ stack[ctr].layer = lower.layer;
+ ctr++;
+ }
+
+ /*
+ * Following redirects can have security consequences: it's like
+ * a symlink into the lower layer without the permission checks.
+ * This is only a problem if the upper layer is untrusted (e.g
+ * comes from an USB drive). This can allow a non-readable file
+ * or directory to become readable.
+ *
+ * Only following redirects when redirects are enabled disables
+ * this attack vector when not necessary.
+ */
+ err = -EPERM;
+ if (d.redirect && !ofs->config.redirect_follow) {
+ pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n",
+ dentry);
+ goto out_put;
+ }
+
+ if (d.stop)
+ break;
+
+ if (d.redirect && d.redirect[0] == '/' && poe != roe) {
+ poe = roe;
+ /* Find the current layer on the root dentry */
+ i = lower.layer->idx - 1;
+ }
+ }
+
+ /*
+ * For regular non-metacopy upper dentries, there is no lower
+ * path based lookup, hence ctr will be zero. If a dentry is found
+ * using ORIGIN xattr on upper, install it in stack.
+ *
+ * For metacopy dentry, path based lookup will find lower dentries.
+ * Just make sure a corresponding data dentry has been found.
+ */
+ if (d.metacopy || (uppermetacopy && !ctr)) {
+ pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n",
+ dentry);
+ err = -EIO;
+ goto out_put;
+ } else if (!d.is_dir && upperdentry && !ctr && origin_path) {
+ if (WARN_ON(stack != NULL)) {
+ err = -EIO;
+ goto out_put;
+ }
+ stack = origin_path;
+ ctr = 1;
+ origin = origin_path->dentry;
+ origin_path = NULL;
+ }
+
+ /*
+ * Always lookup index if there is no-upperdentry.
+ *
+ * For the case of upperdentry, we have set origin by now if it
+ * needed to be set. There are basically three cases.
+ *
+ * For directories, lookup index by lower inode and verify it matches
+ * upper inode. We only trust dir index if we verified that lower dir
+ * matches origin, otherwise dir index entries may be inconsistent
+ * and we ignore them.
+ *
+ * For regular upper, we already set origin if upper had ORIGIN
+ * xattr. There is no verification though as there is no path
+ * based dentry lookup in lower in this case.
+ *
+ * For metacopy upper, we set a verified origin already if index
+ * is enabled and if upper had an ORIGIN xattr.
+ *
+ */
+ if (!upperdentry && ctr)
+ origin = stack[0].dentry;
+
+ if (origin && ovl_indexdir(dentry->d_sb) &&
+ (!d.is_dir || ovl_index_all(dentry->d_sb))) {
+ index = ovl_lookup_index(ofs, upperdentry, origin, true);
+ if (IS_ERR(index)) {
+ err = PTR_ERR(index);
+ index = NULL;
+ goto out_put;
+ }
+ }
+
+ oe = ovl_alloc_entry(ctr);
+ err = -ENOMEM;
+ if (!oe)
+ goto out_put;
+
+ memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr);
+ dentry->d_fsdata = oe;
+
+ if (upperopaque)
+ ovl_dentry_set_opaque(dentry);
+
+ if (upperdentry)
+ ovl_dentry_set_upper_alias(dentry);
+ else if (index) {
+ struct path upperpath = {
+ .dentry = upperdentry = dget(index),
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
+ if (IS_ERR(upperredirect)) {
+ err = PTR_ERR(upperredirect);
+ upperredirect = NULL;
+ goto out_free_oe;
+ }
+ err = ovl_check_metacopy_xattr(ofs, &upperpath);
+ if (err < 0)
+ goto out_free_oe;
+ uppermetacopy = err;
+ }
+
+ if (upperdentry || ctr) {
+ struct ovl_inode_params oip = {
+ .upperdentry = upperdentry,
+ .lowerpath = stack,
+ .index = index,
+ .numlower = ctr,
+ .redirect = upperredirect,
+ .lowerdata = (ctr > 1 && !d.is_dir) ?
+ stack[ctr - 1].dentry : NULL,
+ };
+
+ inode = ovl_get_inode(dentry->d_sb, &oip);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_free_oe;
+ if (upperdentry && !uppermetacopy)
+ ovl_set_flag(OVL_UPPERDATA, inode);
+ }
+
+ ovl_dentry_init_reval(dentry, upperdentry);
+
+ revert_creds(old_cred);
+ if (origin_path) {
+ dput(origin_path->dentry);
+ kfree(origin_path);
+ }
+ dput(index);
+ kfree(stack);
+ kfree(d.redirect);
+ return d_splice_alias(inode, dentry);
+
+out_free_oe:
+ dentry->d_fsdata = NULL;
+ kfree(oe);
+out_put:
+ dput(index);
+ for (i = 0; i < ctr; i++)
+ dput(stack[i].dentry);
+ kfree(stack);
+out_put_upper:
+ if (origin_path) {
+ dput(origin_path->dentry);
+ kfree(origin_path);
+ }
+ dput(upperdentry);
+ kfree(upperredirect);
+out:
+ kfree(d.redirect);
+ revert_creds(old_cred);
+ return ERR_PTR(err);
+}
+
+bool ovl_lower_positive(struct dentry *dentry)
+{
+ struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+ const struct qstr *name = &dentry->d_name;
+ const struct cred *old_cred;
+ unsigned int i;
+ bool positive = false;
+ bool done = false;
+
+ /*
+ * If dentry is negative, then lower is positive iff this is a
+ * whiteout.
+ */
+ if (!dentry->d_inode)
+ return ovl_dentry_is_opaque(dentry);
+
+ /* Negative upper -> positive lower */
+ if (!ovl_dentry_upper(dentry))
+ return true;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ /* Positive upper -> have to look up lower to see whether it exists */
+ for (i = 0; !done && !positive && i < poe->numlower; i++) {
+ struct dentry *this;
+ struct dentry *lowerdir = poe->lowerstack[i].dentry;
+
+ this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt),
+ name->name, lowerdir, name->len);
+ if (IS_ERR(this)) {
+ switch (PTR_ERR(this)) {
+ case -ENOENT:
+ case -ENAMETOOLONG:
+ break;
+
+ default:
+ /*
+ * Assume something is there, we just couldn't
+ * access it.
+ */
+ positive = true;
+ break;
+ }
+ } else {
+ positive = !ovl_is_whiteout(this);
+ done = true;
+ dput(this);
+ }
+ }
+ revert_creds(old_cred);
+
+ return positive;
+}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
new file mode 100644
index 000000000..a3c59ac01
--- /dev/null
+++ b/fs/overlayfs/overlayfs.h
@@ -0,0 +1,698 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/uuid.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include "ovl_entry.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "overlayfs: " fmt
+
+enum ovl_path_type {
+ __OVL_PATH_UPPER = (1 << 0),
+ __OVL_PATH_MERGE = (1 << 1),
+ __OVL_PATH_ORIGIN = (1 << 2),
+};
+
+#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
+#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN)
+
+#define OVL_XATTR_NAMESPACE "overlay."
+#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE
+
+enum ovl_xattr {
+ OVL_XATTR_OPAQUE,
+ OVL_XATTR_REDIRECT,
+ OVL_XATTR_ORIGIN,
+ OVL_XATTR_IMPURE,
+ OVL_XATTR_NLINK,
+ OVL_XATTR_UPPER,
+ OVL_XATTR_METACOPY,
+ OVL_XATTR_PROTATTR,
+};
+
+enum ovl_inode_flag {
+ /* Pure upper dir that may contain non pure upper entries */
+ OVL_IMPURE,
+ /* Non-merge dir that may contain whiteout entries */
+ OVL_WHITEOUTS,
+ OVL_INDEX,
+ OVL_UPPERDATA,
+ /* Inode number will remain constant over copy up. */
+ OVL_CONST_INO,
+};
+
+enum ovl_entry_flag {
+ OVL_E_UPPER_ALIAS,
+ OVL_E_OPAQUE,
+ OVL_E_CONNECTED,
+};
+
+enum {
+ OVL_XINO_OFF,
+ OVL_XINO_AUTO,
+ OVL_XINO_ON,
+};
+
+/*
+ * The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
+ * where:
+ * origin.fh - exported file handle of the lower file
+ * origin.uuid - uuid of the lower filesystem
+ */
+#define OVL_FH_VERSION 0
+#define OVL_FH_MAGIC 0xfb
+
+/* CPU byte order required for fid decoding: */
+#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0)
+#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1)
+/* Is the real inode encoded in fid an upper inode? */
+#define OVL_FH_FLAG_PATH_UPPER (1 << 2)
+
+#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \
+ OVL_FH_FLAG_PATH_UPPER)
+
+#if defined(__LITTLE_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN 0
+#elif defined(__BIG_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN
+#else
+#error Endianness not defined
+#endif
+
+/* The type used to be returned by overlay exportfs for misaligned fid */
+#define OVL_FILEID_V0 0xfb
+/* The type returned by overlay exportfs for 32bit aligned fid */
+#define OVL_FILEID_V1 0xf8
+
+/* On-disk format for "origin" file handle */
+struct ovl_fb {
+ u8 version; /* 0 */
+ u8 magic; /* 0xfb */
+ u8 len; /* size of this header + size of fid */
+ u8 flags; /* OVL_FH_FLAG_* */
+ u8 type; /* fid_type of fid */
+ uuid_t uuid; /* uuid of filesystem */
+ u32 fid[]; /* file identifier should be 32bit aligned in-memory */
+} __packed;
+
+/* In-memory and on-wire format for overlay file handle */
+struct ovl_fh {
+ u8 padding[3]; /* make sure fb.fid is 32bit aligned */
+ union {
+ struct ovl_fb fb;
+ DECLARE_FLEX_ARRAY(u8, buf);
+ };
+} __packed;
+
+#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb)
+#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len)
+#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \
+ offsetof(struct ovl_fb, fid))
+
+extern const char *const ovl_xattr_table[][2];
+static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
+{
+ return ovl_xattr_table[ox][ofs->config.userxattr];
+}
+
+/*
+ * When changing ownership of an upper object map the intended ownership
+ * according to the upper layer's idmapping. When an upper mount idmaps files
+ * that are stored on-disk as owned by id 1001 to id 1000 this means stat on
+ * this object will report it as being owned by id 1000 when calling stat via
+ * the upper mount.
+ * In order to change ownership of an object so stat reports id 1000 when
+ * called on an idmapped upper mount the value written to disk - i.e., the
+ * value stored in ia_*id - must 1001. The mount mapping helper will thus take
+ * care to map 1000 to 1001.
+ * The mnt idmapping helpers are nops if the upper layer isn't idmapped.
+ */
+static inline int ovl_do_notify_change(struct ovl_fs *ofs,
+ struct dentry *upperdentry,
+ struct iattr *attr)
+{
+ return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL);
+}
+
+static inline int ovl_do_rmdir(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry);
+
+ pr_debug("rmdir(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *dentry)
+{
+ int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL);
+
+ pr_debug("unlink(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry,
+ struct inode *dir, struct dentry *new_dentry)
+{
+ int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL);
+
+ pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
+ return err;
+}
+
+static inline int ovl_do_create(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
+{
+ int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true);
+
+ pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mkdir(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
+{
+ int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode);
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mknod(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t dev)
+{
+ int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev);
+
+ pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
+ return err;
+}
+
+static inline int ovl_do_symlink(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
+ const char *oldname)
+{
+ int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname);
+
+ pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
+ return err;
+}
+
+static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name,
+ void *value, size_t size)
+{
+ int err, len;
+
+ WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb);
+
+ err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry,
+ name, value, size);
+ len = (value && err > 0) ? err : 0;
+
+ pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
+ path->dentry, name, min(len, 48), value, size, err);
+ return err;
+}
+
+static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs,
+ struct dentry *upperdentry,
+ enum ovl_xattr ox, void *value,
+ size_t size)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size);
+}
+
+static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs,
+ const struct path *path,
+ enum ovl_xattr ox, void *value,
+ size_t size)
+{
+ return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size);
+}
+
+static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name,
+ value, size, flags);
+
+ pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
+ dentry, name, min((int)size, 48), value, size, flags, err);
+ return err;
+}
+
+static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const void *value,
+ size_t size)
+{
+ return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0);
+}
+
+static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *name)
+{
+ int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name);
+ pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
+ return err;
+}
+
+static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox)
+{
+ return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox));
+}
+
+static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
+ struct dentry *olddentry, struct inode *newdir,
+ struct dentry *newdentry, unsigned int flags)
+{
+ int err;
+ struct renamedata rd = {
+ .old_mnt_userns = ovl_upper_mnt_userns(ofs),
+ .old_dir = olddir,
+ .old_dentry = olddentry,
+ .new_mnt_userns = ovl_upper_mnt_userns(ofs),
+ .new_dir = newdir,
+ .new_dentry = newdentry,
+ .flags = flags,
+ };
+
+ pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
+ err = vfs_rename(&rd);
+ if (err) {
+ pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
+ olddentry, newdentry, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_whiteout(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry);
+ pr_debug("whiteout(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
+ struct dentry *dentry, umode_t mode)
+{
+ struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
+ struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode,
+ O_LARGEFILE | O_WRONLY, current_cred());
+ int err = PTR_ERR_OR_ZERO(file);
+
+ pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return file;
+}
+
+static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
+ const char *name,
+ struct dentry *base, int len)
+{
+ return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len);
+}
+
+static inline bool ovl_open_flags_need_copy_up(int flags)
+{
+ if (!flags)
+ return false;
+
+ return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
+}
+
+static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs)
+{
+ /*
+ * To avoid regressions in existing setups with overlay lower offline
+ * changes, we allow lower changes only if none of the new features
+ * are used.
+ */
+ return (!ofs->config.index && !ofs->config.metacopy &&
+ !ofs->config.redirect_dir && ofs->config.xino != OVL_XINO_ON);
+}
+
+
+/* util.c */
+int ovl_want_write(struct dentry *dentry);
+void ovl_drop_write(struct dentry *dentry);
+struct dentry *ovl_workdir(struct dentry *dentry);
+const struct cred *ovl_override_creds(struct super_block *sb);
+int ovl_can_decode_fh(struct super_block *sb);
+struct dentry *ovl_indexdir(struct super_block *sb);
+bool ovl_index_all(struct super_block *sb);
+bool ovl_verify_lower(struct super_block *sb);
+struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
+bool ovl_dentry_remote(struct dentry *dentry);
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry);
+void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry);
+void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask);
+bool ovl_dentry_weird(struct dentry *dentry);
+enum ovl_path_type ovl_path_type(struct dentry *dentry);
+void ovl_path_upper(struct dentry *dentry, struct path *path);
+void ovl_path_lower(struct dentry *dentry, struct path *path);
+void ovl_path_lowerdata(struct dentry *dentry, struct path *path);
+struct inode *ovl_i_path_real(struct inode *inode, struct path *path);
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path);
+struct dentry *ovl_dentry_upper(struct dentry *dentry);
+struct dentry *ovl_dentry_lower(struct dentry *dentry);
+struct dentry *ovl_dentry_lowerdata(struct dentry *dentry);
+const struct ovl_layer *ovl_i_layer_lower(struct inode *inode);
+const struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
+struct dentry *ovl_dentry_real(struct dentry *dentry);
+struct dentry *ovl_i_dentry_upper(struct inode *inode);
+struct inode *ovl_inode_upper(struct inode *inode);
+struct inode *ovl_inode_lower(struct inode *inode);
+struct inode *ovl_inode_lowerdata(struct inode *inode);
+struct inode *ovl_inode_real(struct inode *inode);
+struct inode *ovl_inode_realdata(struct inode *inode);
+struct ovl_dir_cache *ovl_dir_cache(struct inode *inode);
+void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache);
+void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry);
+void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry);
+bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
+bool ovl_dentry_is_opaque(struct dentry *dentry);
+bool ovl_dentry_is_whiteout(struct dentry *dentry);
+void ovl_dentry_set_opaque(struct dentry *dentry);
+bool ovl_dentry_has_upper_alias(struct dentry *dentry);
+void ovl_dentry_set_upper_alias(struct dentry *dentry);
+bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags);
+bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags);
+bool ovl_has_upperdata(struct inode *inode);
+void ovl_set_upperdata(struct inode *inode);
+bool ovl_redirect_dir(struct super_block *sb);
+const char *ovl_dentry_get_redirect(struct dentry *dentry);
+void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
+void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
+void ovl_dir_modified(struct dentry *dentry, bool impurity);
+u64 ovl_dentry_version_get(struct dentry *dentry);
+bool ovl_is_whiteout(struct dentry *dentry);
+struct file *ovl_path_open(const struct path *path, int flags);
+int ovl_copy_up_start(struct dentry *dentry, int flags);
+void ovl_copy_up_end(struct dentry *dentry);
+bool ovl_already_copied_up(struct dentry *dentry, int flags);
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox);
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
+
+static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
+ struct dentry *upperdentry)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+ return ovl_path_check_origin_xattr(ofs, &upperpath);
+}
+
+int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
+ enum ovl_xattr ox, const void *value, size_t size,
+ int xerr);
+int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
+bool ovl_inuse_trylock(struct dentry *dentry);
+void ovl_inuse_unlock(struct dentry *dentry);
+bool ovl_is_inuse(struct dentry *dentry);
+bool ovl_need_index(struct dentry *dentry);
+int ovl_nlink_start(struct dentry *dentry);
+void ovl_nlink_end(struct dentry *dentry);
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_is_metacopy_dentry(struct dentry *dentry);
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
+int ovl_sync_status(struct ovl_fs *ofs);
+
+static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
+{
+ set_bit(flag, &OVL_I(inode)->flags);
+}
+
+static inline void ovl_clear_flag(unsigned long flag, struct inode *inode)
+{
+ clear_bit(flag, &OVL_I(inode)->flags);
+}
+
+static inline bool ovl_test_flag(unsigned long flag, struct inode *inode)
+{
+ return test_bit(flag, &OVL_I(inode)->flags);
+}
+
+static inline bool ovl_is_impuredir(struct super_block *sb,
+ struct dentry *upperdentry)
+{
+ struct ovl_fs *ofs = OVL_FS(sb);
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
+}
+
+/*
+ * With xino=auto, we do best effort to keep all inodes on same st_dev and
+ * d_ino consistent with st_ino.
+ * With xino=on, we do the same effort but we warn if we failed.
+ */
+static inline bool ovl_xino_warn(struct super_block *sb)
+{
+ return OVL_FS(sb)->config.xino == OVL_XINO_ON;
+}
+
+/* All layers on same fs? */
+static inline bool ovl_same_fs(struct super_block *sb)
+{
+ return OVL_FS(sb)->xino_mode == 0;
+}
+
+/* All overlay inodes have same st_dev? */
+static inline bool ovl_same_dev(struct super_block *sb)
+{
+ return OVL_FS(sb)->xino_mode >= 0;
+}
+
+static inline unsigned int ovl_xino_bits(struct super_block *sb)
+{
+ return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0;
+}
+
+static inline void ovl_inode_lock(struct inode *inode)
+{
+ mutex_lock(&OVL_I(inode)->lock);
+}
+
+static inline int ovl_inode_lock_interruptible(struct inode *inode)
+{
+ return mutex_lock_interruptible(&OVL_I(inode)->lock);
+}
+
+static inline void ovl_inode_unlock(struct inode *inode)
+{
+ mutex_unlock(&OVL_I(inode)->lock);
+}
+
+
+/* namei.c */
+int ovl_check_fb_len(struct ovl_fb *fb, int fb_len);
+
+static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len)
+{
+ if (fh_len < sizeof(struct ovl_fh))
+ return -EINVAL;
+
+ return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET);
+}
+
+struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
+ struct vfsmount *mnt, bool connected);
+int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
+ struct dentry *upperdentry, struct ovl_path **stackp);
+int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real, bool is_upper,
+ bool set);
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
+int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
+int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
+ struct qstr *name);
+struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
+struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
+ struct dentry *origin, bool verify);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags);
+bool ovl_lower_positive(struct dentry *dentry);
+
+static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper,
+ struct dentry *origin, bool set)
+{
+ return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin,
+ false, set);
+}
+
+static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
+ struct dentry *upper, bool set)
+{
+ return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set);
+}
+
+/* readdir.c */
+extern const struct file_operations ovl_dir_operations;
+struct file *ovl_dir_real_file(const struct file *file, bool want_upper);
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
+void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
+ struct list_head *list);
+void ovl_cache_free(struct list_head *list);
+void ovl_dir_cache_free(struct inode *inode);
+int ovl_check_d_type_supported(const struct path *realpath);
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+ struct vfsmount *mnt, struct dentry *dentry, int level);
+int ovl_indexdir_cleanup(struct ovl_fs *ofs);
+
+/*
+ * Can we iterate real dir directly?
+ *
+ * Non-merge dir may contain whiteouts from a time it was a merge upper, before
+ * lower dir was removed under it and possibly before it was rotated from upper
+ * to lower layer.
+ */
+static inline bool ovl_dir_is_real(struct dentry *dir)
+{
+ return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
+}
+
+/* inode.c */
+int ovl_set_nlink_upper(struct dentry *dentry);
+int ovl_set_nlink_lower(struct dentry *dentry);
+unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
+ struct dentry *upperdentry,
+ unsigned int fallback);
+int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr);
+int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags);
+int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask);
+int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
+int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
+ void *value, size_t size);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+
+#ifdef CONFIG_FS_POSIX_ACL
+struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu);
+#else
+#define ovl_get_acl NULL
+#endif
+
+int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
+bool ovl_is_private_xattr(struct super_block *sb, const char *name);
+
+struct ovl_inode_params {
+ struct inode *newinode;
+ struct dentry *upperdentry;
+ struct ovl_path *lowerpath;
+ bool index;
+ unsigned int numlower;
+ char *redirect;
+ struct dentry *lowerdata;
+};
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid);
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
+struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
+ bool is_upper);
+bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir);
+struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir);
+struct inode *ovl_get_inode(struct super_block *sb,
+ struct ovl_inode_params *oip);
+void ovl_copyattr(struct inode *to);
+
+/* vfs inode flags copied from real to ovl inode */
+#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE)
+/* vfs inode flags read from overlay.protattr xattr to ovl inode */
+#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE)
+
+/*
+ * fileattr flags copied from lower to upper inode on copy up.
+ * We cannot copy up immutable/append-only flags, because that would prevent
+ * linking temp inode to upper dir, so we store them in xattr instead.
+ */
+#define OVL_COPY_FS_FLAGS_MASK (FS_SYNC_FL | FS_NOATIME_FL)
+#define OVL_COPY_FSX_FLAGS_MASK (FS_XFLAG_SYNC | FS_XFLAG_NOATIME)
+#define OVL_PROT_FS_FLAGS_MASK (FS_APPEND_FL | FS_IMMUTABLE_FL)
+#define OVL_PROT_FSX_FLAGS_MASK (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE)
+
+void ovl_check_protattr(struct inode *inode, struct dentry *upper);
+int ovl_set_protattr(struct inode *inode, struct dentry *upper,
+ struct fileattr *fa);
+
+static inline void ovl_copyflags(struct inode *from, struct inode *to)
+{
+ unsigned int mask = OVL_COPY_I_FLAGS_MASK;
+
+ inode_set_flags(to, from->i_flags & mask, mask);
+}
+
+/* dir.c */
+extern const struct inode_operations ovl_dir_inode_operations;
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *dentry);
+struct ovl_cattr {
+ dev_t rdev;
+ umode_t mode;
+ const char *link;
+ struct dentry *hardlink;
+};
+
+#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
+
+int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry **newdentry, umode_t mode);
+struct dentry *ovl_create_real(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *newdentry,
+ struct ovl_cattr *attr);
+int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
+struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
+ struct ovl_cattr *attr);
+
+/* file.c */
+extern const struct file_operations ovl_file_operations;
+int __init ovl_aio_request_cache_init(void);
+void ovl_aio_request_cache_destroy(void);
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
+int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ovl_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+
+/* copy_up.c */
+int ovl_copy_up(struct dentry *dentry);
+int ovl_copy_up_with_data(struct dentry *dentry);
+int ovl_maybe_copy_up(struct dentry *dentry, int flags);
+int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
+int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
+struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+ bool is_upper);
+int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
+ struct dentry *upper);
+
+/* export.c */
+extern const struct export_operations ovl_export_operations;
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
new file mode 100644
index 000000000..a479680a5
--- /dev/null
+++ b/fs/overlayfs/ovl_entry.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+
+struct ovl_config {
+ char *lowerdir;
+ char *upperdir;
+ char *workdir;
+ bool default_permissions;
+ bool redirect_dir;
+ bool redirect_follow;
+ const char *redirect_mode;
+ bool index;
+ bool uuid;
+ bool nfs_export;
+ int xino;
+ bool metacopy;
+ bool userxattr;
+ bool ovl_volatile;
+};
+
+struct ovl_sb {
+ struct super_block *sb;
+ dev_t pseudo_dev;
+ /* Unusable (conflicting) uuid */
+ bool bad_uuid;
+ /* Used as a lower layer (but maybe also as upper) */
+ bool is_lower;
+};
+
+struct ovl_layer {
+ /* ovl_free_fs() relies on @mnt being the first member! */
+ struct vfsmount *mnt;
+ /* Trap in ovl inode cache */
+ struct inode *trap;
+ struct ovl_sb *fs;
+ /* Index of this layer in fs root (upper idx == 0) */
+ int idx;
+ /* One fsid per unique underlying sb (upper fsid == 0) */
+ int fsid;
+};
+
+/*
+ * ovl_free_fs() relies on @mnt being the first member when unmounting
+ * the private mounts created for each layer. Let's check both the
+ * offset and type.
+ */
+static_assert(offsetof(struct ovl_layer, mnt) == 0);
+static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *));
+
+struct ovl_path {
+ const struct ovl_layer *layer;
+ struct dentry *dentry;
+};
+
+/* private information held for overlayfs's superblock */
+struct ovl_fs {
+ unsigned int numlayer;
+ /* Number of unique fs among layers including upper fs */
+ unsigned int numfs;
+ const struct ovl_layer *layers;
+ struct ovl_sb *fs;
+ /* workbasedir is the path at workdir= mount option */
+ struct dentry *workbasedir;
+ /* workdir is the 'work' directory under workbasedir */
+ struct dentry *workdir;
+ /* index directory listing overlay inodes by origin file handle */
+ struct dentry *indexdir;
+ long namelen;
+ /* pathnames of lower and upper dirs, for show_options */
+ struct ovl_config config;
+ /* creds of process who forced instantiation of super block */
+ const struct cred *creator_cred;
+ bool tmpfile;
+ bool noxattr;
+ /* Did we take the inuse lock? */
+ bool upperdir_locked;
+ bool workdir_locked;
+ bool share_whiteout;
+ /* Traps in ovl inode cache */
+ struct inode *workbasedir_trap;
+ struct inode *workdir_trap;
+ struct inode *indexdir_trap;
+ /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
+ int xino_mode;
+ /* For allocation of non-persistent inode numbers */
+ atomic_long_t last_ino;
+ /* Whiteout dentry cache */
+ struct dentry *whiteout;
+ /* r/o snapshot of upperdir sb's only taken on volatile mounts */
+ errseq_t errseq;
+};
+
+static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)
+{
+ return ofs->layers[0].mnt;
+}
+
+static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs)
+{
+ return mnt_user_ns(ovl_upper_mnt(ofs));
+}
+
+static inline struct ovl_fs *OVL_FS(struct super_block *sb)
+{
+ return (struct ovl_fs *)sb->s_fs_info;
+}
+
+static inline bool ovl_should_sync(struct ovl_fs *ofs)
+{
+ return !ofs->config.ovl_volatile;
+}
+
+/* private information held for every overlayfs dentry */
+struct ovl_entry {
+ union {
+ struct {
+ unsigned long flags;
+ };
+ struct rcu_head rcu;
+ };
+ unsigned numlower;
+ struct ovl_path lowerstack[];
+};
+
+struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
+
+static inline struct ovl_entry *OVL_E(struct dentry *dentry)
+{
+ return (struct ovl_entry *) dentry->d_fsdata;
+}
+
+struct ovl_inode {
+ union {
+ struct ovl_dir_cache *cache; /* directory */
+ struct inode *lowerdata; /* regular file */
+ };
+ const char *redirect;
+ u64 version;
+ unsigned long flags;
+ struct inode vfs_inode;
+ struct dentry *__upperdentry;
+ struct ovl_path lowerpath;
+
+ /* synchronize copy up and more */
+ struct mutex lock;
+};
+
+static inline struct ovl_inode *OVL_I(struct inode *inode)
+{
+ return container_of(inode, struct ovl_inode, vfs_inode);
+}
+
+static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
+{
+ return READ_ONCE(oi->__upperdentry);
+}
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
new file mode 100644
index 000000000..2b2106400
--- /dev/null
+++ b/fs/overlayfs/readdir.c
@@ -0,0 +1,1235 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include <linux/ratelimit.h>
+#include "overlayfs.h"
+
+struct ovl_cache_entry {
+ unsigned int len;
+ unsigned int type;
+ u64 real_ino;
+ u64 ino;
+ struct list_head l_node;
+ struct rb_node node;
+ struct ovl_cache_entry *next_maybe_whiteout;
+ bool is_upper;
+ bool is_whiteout;
+ char name[];
+};
+
+struct ovl_dir_cache {
+ long refcount;
+ u64 version;
+ struct list_head entries;
+ struct rb_root root;
+};
+
+struct ovl_readdir_data {
+ struct dir_context ctx;
+ struct dentry *dentry;
+ bool is_lowest;
+ struct rb_root *root;
+ struct list_head *list;
+ struct list_head middle;
+ struct ovl_cache_entry *first_maybe_whiteout;
+ int count;
+ int err;
+ bool is_upper;
+ bool d_type_supported;
+};
+
+struct ovl_dir_file {
+ bool is_real;
+ bool is_upper;
+ struct ovl_dir_cache *cache;
+ struct list_head *cursor;
+ struct file *realfile;
+ struct file *upperfile;
+};
+
+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
+{
+ return rb_entry(n, struct ovl_cache_entry, node);
+}
+
+static bool ovl_cache_entry_find_link(const char *name, int len,
+ struct rb_node ***link,
+ struct rb_node **parent)
+{
+ bool found = false;
+ struct rb_node **newp = *link;
+
+ while (!found && *newp) {
+ int cmp;
+ struct ovl_cache_entry *tmp;
+
+ *parent = *newp;
+ tmp = ovl_cache_entry_from_node(*newp);
+ cmp = strncmp(name, tmp->name, len);
+ if (cmp > 0)
+ newp = &tmp->node.rb_right;
+ else if (cmp < 0 || len < tmp->len)
+ newp = &tmp->node.rb_left;
+ else
+ found = true;
+ }
+ *link = newp;
+
+ return found;
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
+ const char *name, int len)
+{
+ struct rb_node *node = root->rb_node;
+ int cmp;
+
+ while (node) {
+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
+
+ cmp = strncmp(name, p->name, len);
+ if (cmp > 0)
+ node = p->node.rb_right;
+ else if (cmp < 0 || len < p->len)
+ node = p->node.rb_left;
+ else
+ return p;
+ }
+
+ return NULL;
+}
+
+static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
+ struct ovl_cache_entry *p)
+{
+ /* Don't care if not doing ovl_iter() */
+ if (!rdd->dentry)
+ return false;
+
+ /* Always recalc d_ino when remapping lower inode numbers */
+ if (ovl_xino_bits(rdd->dentry->d_sb))
+ return true;
+
+ /* Always recalc d_ino for parent */
+ if (strcmp(p->name, "..") == 0)
+ return true;
+
+ /* If this is lower, then native d_ino will do */
+ if (!rdd->is_upper)
+ return false;
+
+ /*
+ * Recalc d_ino for '.' and for all entries if dir is impure (contains
+ * copied up entries)
+ */
+ if ((p->name[0] == '.' && p->len == 1) ||
+ ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry)))
+ return true;
+
+ return false;
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
+ const char *name, int len,
+ u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+ size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
+
+ p = kmalloc(size, GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ memcpy(p->name, name, len);
+ p->name[len] = '\0';
+ p->len = len;
+ p->type = d_type;
+ p->real_ino = ino;
+ p->ino = ino;
+ /* Defer setting d_ino for upper entry to ovl_iterate() */
+ if (ovl_calc_d_ino(rdd, p))
+ p->ino = 0;
+ p->is_upper = rdd->is_upper;
+ p->is_whiteout = false;
+
+ if (d_type == DT_CHR) {
+ p->next_maybe_whiteout = rdd->first_maybe_whiteout;
+ rdd->first_maybe_whiteout = p;
+ }
+ return p;
+}
+
+static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+ const char *name, int len, u64 ino,
+ unsigned int d_type)
+{
+ struct rb_node **newp = &rdd->root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ovl_cache_entry *p;
+
+ if (ovl_cache_entry_find_link(name, len, &newp, &parent))
+ return true;
+
+ p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
+ if (p == NULL) {
+ rdd->err = -ENOMEM;
+ return false;
+ }
+
+ list_add_tail(&p->l_node, rdd->list);
+ rb_link_node(&p->node, parent, newp);
+ rb_insert_color(&p->node, rdd->root);
+
+ return true;
+}
+
+static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
+ const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+
+ p = ovl_cache_entry_find(rdd->root, name, namelen);
+ if (p) {
+ list_move_tail(&p->l_node, &rdd->middle);
+ } else {
+ p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+ if (p == NULL)
+ rdd->err = -ENOMEM;
+ else
+ list_add_tail(&p->l_node, &rdd->middle);
+ }
+
+ return rdd->err == 0;
+}
+
+void ovl_cache_free(struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+ struct ovl_cache_entry *n;
+
+ list_for_each_entry_safe(p, n, list, l_node)
+ kfree(p);
+
+ INIT_LIST_HEAD(list);
+}
+
+void ovl_dir_cache_free(struct inode *inode)
+{
+ struct ovl_dir_cache *cache = ovl_dir_cache(inode);
+
+ if (cache) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ }
+}
+
+static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+{
+ struct ovl_dir_cache *cache = od->cache;
+
+ WARN_ON(cache->refcount <= 0);
+ cache->refcount--;
+ if (!cache->refcount) {
+ if (ovl_dir_cache(d_inode(dentry)) == cache)
+ ovl_set_dir_cache(d_inode(dentry), NULL);
+
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ }
+}
+
+static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_readdir_data *rdd =
+ container_of(ctx, struct ovl_readdir_data, ctx);
+
+ rdd->count++;
+ if (!rdd->is_lowest)
+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+ else
+ return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
+}
+
+static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
+{
+ int err;
+ struct ovl_cache_entry *p;
+ struct dentry *dentry, *dir = path->dentry;
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(rdd->dentry->d_sb);
+
+ err = down_write_killable(&dir->d_inode->i_rwsem);
+ if (!err) {
+ while (rdd->first_maybe_whiteout) {
+ p = rdd->first_maybe_whiteout;
+ rdd->first_maybe_whiteout = p->next_maybe_whiteout;
+ dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len);
+ if (!IS_ERR(dentry)) {
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ }
+ }
+ inode_unlock(dir->d_inode);
+ }
+ revert_creds(old_cred);
+
+ return err;
+}
+
+static inline int ovl_dir_read(const struct path *realpath,
+ struct ovl_readdir_data *rdd)
+{
+ struct file *realfile;
+ int err;
+
+ realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
+
+ rdd->first_maybe_whiteout = NULL;
+ rdd->ctx.pos = 0;
+ do {
+ rdd->count = 0;
+ rdd->err = 0;
+ err = iterate_dir(realfile, &rdd->ctx);
+ if (err >= 0)
+ err = rdd->err;
+ } while (!err && rdd->count);
+
+ if (!err && rdd->first_maybe_whiteout && rdd->dentry)
+ err = ovl_check_whiteouts(realpath, rdd);
+
+ fput(realfile);
+
+ return err;
+}
+
+static void ovl_dir_reset(struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct ovl_dir_cache *cache = od->cache;
+ struct dentry *dentry = file->f_path.dentry;
+ bool is_real;
+
+ if (cache && ovl_dentry_version_get(dentry) != cache->version) {
+ ovl_cache_put(od, dentry);
+ od->cache = NULL;
+ od->cursor = NULL;
+ }
+ is_real = ovl_dir_is_real(dentry);
+ if (od->is_real != is_real) {
+ /* is_real can only become false when dir is copied up */
+ if (WARN_ON(is_real))
+ return;
+ od->is_real = false;
+ }
+}
+
+static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
+ struct rb_root *root)
+{
+ int err;
+ struct path realpath;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_merge,
+ .dentry = dentry,
+ .list = list,
+ .root = root,
+ .is_lowest = false,
+ };
+ int idx, next;
+
+ for (idx = 0; idx != -1; idx = next) {
+ next = ovl_path_next(idx, dentry, &realpath);
+ rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
+
+ if (next != -1) {
+ err = ovl_dir_read(&realpath, &rdd);
+ if (err)
+ break;
+ } else {
+ /*
+ * Insert lowest layer entries before upper ones, this
+ * allows offsets to be reasonably constant
+ */
+ list_add(&rdd.middle, rdd.list);
+ rdd.is_lowest = true;
+ err = ovl_dir_read(&realpath, &rdd);
+ list_del(&rdd.middle);
+ }
+ }
+ return err;
+}
+
+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
+{
+ struct list_head *p;
+ loff_t off = 0;
+
+ list_for_each(p, &od->cache->entries) {
+ if (off >= pos)
+ break;
+ off++;
+ }
+ /* Cursor is safe since the cache is stable */
+ od->cursor = p;
+}
+
+static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
+{
+ int res;
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_dir_cache(d_inode(dentry));
+ if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ WARN_ON(!cache->refcount);
+ cache->refcount++;
+ return cache;
+ }
+ ovl_set_dir_cache(d_inode(dentry), NULL);
+
+ cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+ if (!cache)
+ return ERR_PTR(-ENOMEM);
+
+ cache->refcount = 1;
+ INIT_LIST_HEAD(&cache->entries);
+ cache->root = RB_ROOT;
+
+ res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root);
+ if (res) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ return ERR_PTR(res);
+ }
+
+ cache->version = ovl_dentry_version_get(dentry);
+ ovl_set_dir_cache(d_inode(dentry), cache);
+
+ return cache;
+}
+
+/* Map inode number to lower fs unique range */
+static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
+ const char *name, int namelen, bool warn)
+{
+ unsigned int xinoshift = 64 - xinobits;
+
+ if (unlikely(ino >> xinoshift)) {
+ if (warn) {
+ pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
+ namelen, name, ino, xinobits);
+ }
+ return ino;
+ }
+
+ /*
+ * The lowest xinobit is reserved for mapping the non-peresistent inode
+ * numbers range, but this range is only exposed via st_ino, not here.
+ */
+ return ino | ((u64)fsid) << (xinoshift + 1);
+}
+
+/*
+ * Set d_ino for upper entries. Non-upper entries should always report
+ * the uppermost real inode ino and should not call this function.
+ *
+ * When not all layer are on same fs, report real ino also for upper.
+ *
+ * When all layers are on the same fs, and upper has a reference to
+ * copy up origin, call vfs_getattr() on the overlay entry to make
+ * sure that d_ino will be consistent with st_ino from stat(2).
+ */
+static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p)
+
+{
+ struct dentry *dir = path->dentry;
+ struct dentry *this = NULL;
+ enum ovl_path_type type;
+ u64 ino = p->real_ino;
+ int xinobits = ovl_xino_bits(dir->d_sb);
+ int err = 0;
+
+ if (!ovl_same_dev(dir->d_sb))
+ goto out;
+
+ if (p->name[0] == '.') {
+ if (p->len == 1) {
+ this = dget(dir);
+ goto get;
+ }
+ if (p->len == 2 && p->name[1] == '.') {
+ /* we shall not be moved */
+ this = dget(dir->d_parent);
+ goto get;
+ }
+ }
+ this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len);
+ if (IS_ERR_OR_NULL(this) || !this->d_inode) {
+ /* Mark a stale entry */
+ p->is_whiteout = true;
+ if (IS_ERR(this)) {
+ err = PTR_ERR(this);
+ this = NULL;
+ goto fail;
+ }
+ goto out;
+ }
+
+get:
+ type = ovl_path_type(this);
+ if (OVL_TYPE_ORIGIN(type)) {
+ struct kstat stat;
+ struct path statpath = *path;
+
+ statpath.dentry = this;
+ err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
+ if (err)
+ goto fail;
+
+ /*
+ * Directory inode is always on overlay st_dev.
+ * Non-dir with ovl_same_dev() could be on pseudo st_dev in case
+ * of xino bits overflow.
+ */
+ WARN_ON_ONCE(S_ISDIR(stat.mode) &&
+ dir->d_sb->s_dev != stat.dev);
+ ino = stat.ino;
+ } else if (xinobits && !OVL_TYPE_UPPER(type)) {
+ ino = ovl_remap_lower_ino(ino, xinobits,
+ ovl_layer_lower(this)->fsid,
+ p->name, p->len,
+ ovl_xino_warn(dir->d_sb));
+ }
+
+out:
+ p->ino = ino;
+ dput(this);
+ return err;
+
+fail:
+ pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n",
+ p->name, err);
+ goto out;
+}
+
+static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+ struct ovl_readdir_data *rdd =
+ container_of(ctx, struct ovl_readdir_data, ctx);
+
+ rdd->count++;
+ p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+ if (p == NULL) {
+ rdd->err = -ENOMEM;
+ return false;
+ }
+ list_add_tail(&p->l_node, rdd->list);
+
+ return true;
+}
+
+static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
+ struct rb_root *root)
+{
+ int err;
+ struct path realpath;
+ struct ovl_cache_entry *p, *n;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_plain,
+ .list = list,
+ .root = root,
+ };
+
+ INIT_LIST_HEAD(list);
+ *root = RB_ROOT;
+ ovl_path_upper(path->dentry, &realpath);
+
+ err = ovl_dir_read(&realpath, &rdd);
+ if (err)
+ return err;
+
+ list_for_each_entry_safe(p, n, list, l_node) {
+ if (strcmp(p->name, ".") != 0 &&
+ strcmp(p->name, "..") != 0) {
+ err = ovl_cache_update_ino(path, p);
+ if (err)
+ return err;
+ }
+ if (p->ino == p->real_ino) {
+ list_del(&p->l_node);
+ kfree(p);
+ } else {
+ struct rb_node **newp = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len,
+ &newp, &parent)))
+ return -EIO;
+
+ rb_link_node(&p->node, parent, newp);
+ rb_insert_color(&p->node, root);
+ }
+ }
+ return 0;
+}
+
+static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
+{
+ int res;
+ struct dentry *dentry = path->dentry;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_dir_cache(d_inode(dentry));
+ if (cache && ovl_dentry_version_get(dentry) == cache->version)
+ return cache;
+
+ /* Impure cache is not refcounted, free it here */
+ ovl_dir_cache_free(d_inode(dentry));
+ ovl_set_dir_cache(d_inode(dentry), NULL);
+
+ cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+ if (!cache)
+ return ERR_PTR(-ENOMEM);
+
+ res = ovl_dir_read_impure(path, &cache->entries, &cache->root);
+ if (res) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ return ERR_PTR(res);
+ }
+ if (list_empty(&cache->entries)) {
+ /*
+ * A good opportunity to get rid of an unneeded "impure" flag.
+ * Removing the "impure" xattr is best effort.
+ */
+ if (!ovl_want_write(dentry)) {
+ ovl_removexattr(ofs, ovl_dentry_upper(dentry),
+ OVL_XATTR_IMPURE);
+ ovl_drop_write(dentry);
+ }
+ ovl_clear_flag(OVL_IMPURE, d_inode(dentry));
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->version = ovl_dentry_version_get(dentry);
+ ovl_set_dir_cache(d_inode(dentry), cache);
+
+ return cache;
+}
+
+struct ovl_readdir_translate {
+ struct dir_context *orig_ctx;
+ struct ovl_dir_cache *cache;
+ struct dir_context ctx;
+ u64 parent_ino;
+ int fsid;
+ int xinobits;
+ bool xinowarn;
+};
+
+static bool ovl_fill_real(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_readdir_translate *rdt =
+ container_of(ctx, struct ovl_readdir_translate, ctx);
+ struct dir_context *orig_ctx = rdt->orig_ctx;
+
+ if (rdt->parent_ino && strcmp(name, "..") == 0) {
+ ino = rdt->parent_ino;
+ } else if (rdt->cache) {
+ struct ovl_cache_entry *p;
+
+ p = ovl_cache_entry_find(&rdt->cache->root, name, namelen);
+ if (p)
+ ino = p->ino;
+ } else if (rdt->xinobits) {
+ ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
+ name, namelen, rdt->xinowarn);
+ }
+
+ return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+}
+
+static bool ovl_is_impure_dir(struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct inode *dir = d_inode(file->f_path.dentry);
+
+ /*
+ * Only upper dir can be impure, but if we are in the middle of
+ * iterating a lower real dir, dir could be copied up and marked
+ * impure. We only want the impure cache if we started iterating
+ * a real upper dir to begin with.
+ */
+ return od->is_upper && ovl_test_flag(OVL_IMPURE, dir);
+
+}
+
+static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
+{
+ int err;
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dir = file->f_path.dentry;
+ const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
+ struct ovl_readdir_translate rdt = {
+ .ctx.actor = ovl_fill_real,
+ .orig_ctx = ctx,
+ .xinobits = ovl_xino_bits(dir->d_sb),
+ .xinowarn = ovl_xino_warn(dir->d_sb),
+ };
+
+ if (rdt.xinobits && lower_layer)
+ rdt.fsid = lower_layer->fsid;
+
+ if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) {
+ struct kstat stat;
+ struct path statpath = file->f_path;
+
+ statpath.dentry = dir->d_parent;
+ err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
+ if (err)
+ return err;
+
+ WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
+ rdt.parent_ino = stat.ino;
+ }
+
+ if (ovl_is_impure_dir(file)) {
+ rdt.cache = ovl_cache_get_impure(&file->f_path);
+ if (IS_ERR(rdt.cache))
+ return PTR_ERR(rdt.cache);
+ }
+
+ err = iterate_dir(od->realfile, &rdt.ctx);
+ ctx->pos = rdt.ctx.pos;
+
+ return err;
+}
+
+
+static int ovl_iterate(struct file *file, struct dir_context *ctx)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct ovl_cache_entry *p;
+ const struct cred *old_cred;
+ int err;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (!ctx->pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real) {
+ /*
+ * If parent is merge, then need to adjust d_ino for '..', if
+ * dir is impure then need to adjust d_ino for copied up
+ * entries.
+ */
+ if (ovl_xino_bits(dentry->d_sb) ||
+ (ovl_same_fs(dentry->d_sb) &&
+ (ovl_is_impure_dir(file) ||
+ OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
+ err = ovl_iterate_real(file, ctx);
+ } else {
+ err = iterate_dir(od->realfile, ctx);
+ }
+ goto out;
+ }
+
+ if (!od->cache) {
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_cache_get(dentry);
+ err = PTR_ERR(cache);
+ if (IS_ERR(cache))
+ goto out;
+
+ od->cache = cache;
+ ovl_seek_cursor(od, ctx->pos);
+ }
+
+ while (od->cursor != &od->cache->entries) {
+ p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
+ if (!p->is_whiteout) {
+ if (!p->ino) {
+ err = ovl_cache_update_ino(&file->f_path, p);
+ if (err)
+ goto out;
+ }
+ }
+ /* ovl_cache_update_ino() sets is_whiteout on stale entry */
+ if (!p->is_whiteout) {
+ if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+ break;
+ }
+ od->cursor = p->l_node.next;
+ ctx->pos++;
+ }
+ err = 0;
+out:
+ revert_creds(old_cred);
+ return err;
+}
+
+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t res;
+ struct ovl_dir_file *od = file->private_data;
+
+ inode_lock(file_inode(file));
+ if (!file->f_pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real) {
+ res = vfs_llseek(od->realfile, offset, origin);
+ file->f_pos = od->realfile->f_pos;
+ } else {
+ res = -EINVAL;
+
+ switch (origin) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ break;
+ case SEEK_SET:
+ break;
+ default:
+ goto out_unlock;
+ }
+ if (offset < 0)
+ goto out_unlock;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (od->cache)
+ ovl_seek_cursor(od, offset);
+ }
+ res = offset;
+ }
+out_unlock:
+ inode_unlock(file_inode(file));
+
+ return res;
+}
+
+static struct file *ovl_dir_open_realfile(const struct file *file,
+ const struct path *realpath)
+{
+ struct file *res;
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
+ revert_creds(old_cred);
+
+ return res;
+}
+
+/*
+ * Like ovl_real_fdget(), returns upperfile if dir was copied up since open.
+ * Unlike ovl_real_fdget(), this caches upperfile in file->private_data.
+ *
+ * TODO: use same abstract type for file->private_data of dir and file so
+ * upperfile could also be cached for files as well.
+ */
+struct file *ovl_dir_real_file(const struct file *file, bool want_upper)
+{
+
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct file *old, *realfile = od->realfile;
+
+ if (!OVL_TYPE_UPPER(ovl_path_type(dentry)))
+ return want_upper ? NULL : realfile;
+
+ /*
+ * Need to check if we started out being a lower dir, but got copied up
+ */
+ if (!od->is_upper) {
+ realfile = READ_ONCE(od->upperfile);
+ if (!realfile) {
+ struct path upperpath;
+
+ ovl_path_upper(dentry, &upperpath);
+ realfile = ovl_dir_open_realfile(file, &upperpath);
+ if (IS_ERR(realfile))
+ return realfile;
+
+ old = cmpxchg_release(&od->upperfile, NULL, realfile);
+ if (old) {
+ fput(realfile);
+ realfile = old;
+ }
+ }
+ }
+
+ return realfile;
+}
+
+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct file *realfile;
+ int err;
+
+ err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb));
+ if (err <= 0)
+ return err;
+
+ realfile = ovl_dir_real_file(file, true);
+ err = PTR_ERR_OR_ZERO(realfile);
+
+ /* Nothing to sync for lower */
+ if (!realfile || err)
+ return err;
+
+ return vfs_fsync_range(realfile, start, end, datasync);
+}
+
+static int ovl_dir_release(struct inode *inode, struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+
+ if (od->cache) {
+ inode_lock(inode);
+ ovl_cache_put(od, file->f_path.dentry);
+ inode_unlock(inode);
+ }
+ fput(od->realfile);
+ if (od->upperfile)
+ fput(od->upperfile);
+ kfree(od);
+
+ return 0;
+}
+
+static int ovl_dir_open(struct inode *inode, struct file *file)
+{
+ struct path realpath;
+ struct file *realfile;
+ struct ovl_dir_file *od;
+ enum ovl_path_type type;
+
+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
+ if (!od)
+ return -ENOMEM;
+
+ type = ovl_path_real(file->f_path.dentry, &realpath);
+ realfile = ovl_dir_open_realfile(file, &realpath);
+ if (IS_ERR(realfile)) {
+ kfree(od);
+ return PTR_ERR(realfile);
+ }
+ od->realfile = realfile;
+ od->is_real = ovl_dir_is_real(file->f_path.dentry);
+ od->is_upper = OVL_TYPE_UPPER(type);
+ file->private_data = od;
+
+ return 0;
+}
+
+const struct file_operations ovl_dir_operations = {
+ .read = generic_read_dir,
+ .open = ovl_dir_open,
+ .iterate = ovl_iterate,
+ .llseek = ovl_dir_llseek,
+ .fsync = ovl_dir_fsync,
+ .release = ovl_dir_release,
+};
+
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
+{
+ int err;
+ struct ovl_cache_entry *p, *n;
+ struct rb_root root = RB_ROOT;
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = ovl_dir_read_merged(dentry, list, &root);
+ revert_creds(old_cred);
+ if (err)
+ return err;
+
+ err = 0;
+
+ list_for_each_entry_safe(p, n, list, l_node) {
+ /*
+ * Select whiteouts in upperdir, they should
+ * be cleared when deleting this directory.
+ */
+ if (p->is_whiteout) {
+ if (p->is_upper)
+ continue;
+ goto del_entry;
+ }
+
+ if (p->name[0] == '.') {
+ if (p->len == 1)
+ goto del_entry;
+ if (p->len == 2 && p->name[1] == '.')
+ goto del_entry;
+ }
+ err = -ENOTEMPTY;
+ break;
+
+del_entry:
+ list_del(&p->l_node);
+ kfree(p);
+ }
+
+ return err;
+}
+
+void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
+ struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+
+ inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
+ list_for_each_entry(p, list, l_node) {
+ struct dentry *dentry;
+
+ if (WARN_ON(!p->is_whiteout || !p->is_upper))
+ continue;
+
+ dentry = ovl_lookup_upper(ofs, p->name, upper, p->len);
+ if (IS_ERR(dentry)) {
+ pr_err("lookup '%s/%.*s' failed (%i)\n",
+ upper->d_name.name, p->len, p->name,
+ (int) PTR_ERR(dentry));
+ continue;
+ }
+ if (dentry->d_inode)
+ ovl_cleanup(ofs, upper->d_inode, dentry);
+ dput(dentry);
+ }
+ inode_unlock(upper->d_inode);
+}
+
+static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_readdir_data *rdd =
+ container_of(ctx, struct ovl_readdir_data, ctx);
+
+ /* Even if d_type is not supported, DT_DIR is returned for . and .. */
+ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
+ return true;
+
+ if (d_type != DT_UNKNOWN)
+ rdd->d_type_supported = true;
+
+ return true;
+}
+
+/*
+ * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
+ * if error is encountered.
+ */
+int ovl_check_d_type_supported(const struct path *realpath)
+{
+ int err;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_check_d_type,
+ .d_type_supported = false,
+ };
+
+ err = ovl_dir_read(realpath, &rdd);
+ if (err)
+ return err;
+
+ return rdd.d_type_supported;
+}
+
+#define OVL_INCOMPATDIR_NAME "incompat"
+
+static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path,
+ int level)
+{
+ int err;
+ struct inode *dir = path->dentry->d_inode;
+ LIST_HEAD(list);
+ struct rb_root root = RB_ROOT;
+ struct ovl_cache_entry *p;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_merge,
+ .dentry = NULL,
+ .list = &list,
+ .root = &root,
+ .is_lowest = false,
+ };
+ bool incompat = false;
+
+ /*
+ * The "work/incompat" directory is treated specially - if it is not
+ * empty, instead of printing a generic error and mounting read-only,
+ * we will error about incompat features and fail the mount.
+ *
+ * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name
+ * starts with '#'.
+ */
+ if (level == 2 &&
+ !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME))
+ incompat = true;
+
+ err = ovl_dir_read(path, &rdd);
+ if (err)
+ goto out;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ list_for_each_entry(p, &list, l_node) {
+ struct dentry *dentry;
+
+ if (p->name[0] == '.') {
+ if (p->len == 1)
+ continue;
+ if (p->len == 2 && p->name[1] == '.')
+ continue;
+ } else if (incompat) {
+ pr_err("overlay with incompat feature '%s' cannot be mounted\n",
+ p->name);
+ err = -EINVAL;
+ break;
+ }
+ dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len);
+ if (IS_ERR(dentry))
+ continue;
+ if (dentry->d_inode)
+ err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level);
+ dput(dentry);
+ if (err)
+ break;
+ }
+ inode_unlock(dir);
+out:
+ ovl_cache_free(&list);
+ return err;
+}
+
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+ struct vfsmount *mnt, struct dentry *dentry, int level)
+{
+ int err;
+
+ if (!d_is_dir(dentry) || level > 1) {
+ return ovl_cleanup(ofs, dir, dentry);
+ }
+
+ err = ovl_do_rmdir(ofs, dir, dentry);
+ if (err) {
+ struct path path = { .mnt = mnt, .dentry = dentry };
+
+ inode_unlock(dir);
+ err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ if (!err)
+ err = ovl_cleanup(ofs, dir, dentry);
+ }
+
+ return err;
+}
+
+int ovl_indexdir_cleanup(struct ovl_fs *ofs)
+{
+ int err;
+ struct dentry *indexdir = ofs->indexdir;
+ struct dentry *index = NULL;
+ struct inode *dir = indexdir->d_inode;
+ struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
+ LIST_HEAD(list);
+ struct rb_root root = RB_ROOT;
+ struct ovl_cache_entry *p;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_merge,
+ .dentry = NULL,
+ .list = &list,
+ .root = &root,
+ .is_lowest = false,
+ };
+
+ err = ovl_dir_read(&path, &rdd);
+ if (err)
+ goto out;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ list_for_each_entry(p, &list, l_node) {
+ if (p->name[0] == '.') {
+ if (p->len == 1)
+ continue;
+ if (p->len == 2 && p->name[1] == '.')
+ continue;
+ }
+ index = ovl_lookup_upper(ofs, p->name, indexdir, p->len);
+ if (IS_ERR(index)) {
+ err = PTR_ERR(index);
+ index = NULL;
+ break;
+ }
+ /* Cleanup leftover from index create/cleanup attempt */
+ if (index->d_name.name[0] == '#') {
+ err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1);
+ if (err)
+ break;
+ goto next;
+ }
+ err = ovl_verify_index(ofs, index);
+ if (!err) {
+ goto next;
+ } else if (err == -ESTALE) {
+ /* Cleanup stale index entries */
+ err = ovl_cleanup(ofs, dir, index);
+ } else if (err != -ENOENT) {
+ /*
+ * Abort mount to avoid corrupting the index if
+ * an incompatible index entry was found or on out
+ * of memory.
+ */
+ break;
+ } else if (ofs->config.nfs_export) {
+ /*
+ * Whiteout orphan index to block future open by
+ * handle after overlay nlink dropped to zero.
+ */
+ err = ovl_cleanup_and_whiteout(ofs, dir, index);
+ } else {
+ /* Cleanup orphan index entries */
+ err = ovl_cleanup(ofs, dir, index);
+ }
+
+ if (err)
+ break;
+
+next:
+ dput(index);
+ index = NULL;
+ }
+ dput(index);
+ inode_unlock(dir);
+out:
+ ovl_cache_free(&list);
+ if (err)
+ pr_err("failed index dir cleanup (%i)\n", err);
+ return err;
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
new file mode 100644
index 000000000..51eec4a8e
--- /dev/null
+++ b/fs/overlayfs/super.c
@@ -0,0 +1,2244 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ */
+
+#include <uapi/linux/magic.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/exportfs.h>
+#include <linux/file.h>
+#include "overlayfs.h"
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Overlay filesystem");
+MODULE_LICENSE("GPL");
+
+
+struct ovl_dir_cache;
+
+#define OVL_MAX_STACK 500
+
+static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR);
+module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644);
+MODULE_PARM_DESC(redirect_dir,
+ "Default to on or off for the redirect_dir feature");
+
+static bool ovl_redirect_always_follow =
+ IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW);
+module_param_named(redirect_always_follow, ovl_redirect_always_follow,
+ bool, 0644);
+MODULE_PARM_DESC(redirect_always_follow,
+ "Follow redirects even if redirect_dir feature is turned off");
+
+static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX);
+module_param_named(index, ovl_index_def, bool, 0644);
+MODULE_PARM_DESC(index,
+ "Default to on or off for the inodes index feature");
+
+static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT);
+module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644);
+MODULE_PARM_DESC(nfs_export,
+ "Default to on or off for the NFS export feature");
+
+static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO);
+module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644);
+MODULE_PARM_DESC(xino_auto,
+ "Auto enable xino feature");
+
+static void ovl_entry_stack_free(struct ovl_entry *oe)
+{
+ unsigned int i;
+
+ for (i = 0; i < oe->numlower; i++)
+ dput(oe->lowerstack[i].dentry);
+}
+
+static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY);
+module_param_named(metacopy, ovl_metacopy_def, bool, 0644);
+MODULE_PARM_DESC(metacopy,
+ "Default to on or off for the metadata only copy up feature");
+
+static void ovl_dentry_release(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe) {
+ ovl_entry_stack_free(oe);
+ kfree_rcu(oe, rcu);
+ }
+}
+
+static struct dentry *ovl_d_real(struct dentry *dentry,
+ const struct inode *inode)
+{
+ struct dentry *real = NULL, *lower;
+
+ /* It's an overlay file */
+ if (inode && d_inode(dentry) == inode)
+ return dentry;
+
+ if (!d_is_reg(dentry)) {
+ if (!inode || inode == d_inode(dentry))
+ return dentry;
+ goto bug;
+ }
+
+ real = ovl_dentry_upper(dentry);
+ if (real && (inode == d_inode(real)))
+ return real;
+
+ if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
+ return real;
+
+ lower = ovl_dentry_lowerdata(dentry);
+ if (!lower)
+ goto bug;
+ real = lower;
+
+ /* Handle recursion */
+ real = d_real(real, inode);
+
+ if (!inode || inode == d_inode(real))
+ return real;
+bug:
+ WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
+ __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
+ inode ? inode->i_ino : 0, real,
+ real && d_inode(real) ? d_inode(real)->i_ino : 0);
+ return dentry;
+}
+
+static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
+{
+ int ret = 1;
+
+ if (weak) {
+ if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE)
+ ret = d->d_op->d_weak_revalidate(d, flags);
+ } else if (d->d_flags & DCACHE_OP_REVALIDATE) {
+ ret = d->d_op->d_revalidate(d, flags);
+ if (!ret) {
+ if (!(flags & LOOKUP_RCU))
+ d_invalidate(d);
+ ret = -ESTALE;
+ }
+ }
+ return ret;
+}
+
+static int ovl_dentry_revalidate_common(struct dentry *dentry,
+ unsigned int flags, bool weak)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ struct inode *inode = d_inode_rcu(dentry);
+ struct dentry *upper;
+ unsigned int i;
+ int ret = 1;
+
+ /* Careful in RCU mode */
+ if (!inode)
+ return -ECHILD;
+
+ upper = ovl_i_dentry_upper(inode);
+ if (upper)
+ ret = ovl_revalidate_real(upper, flags, weak);
+
+ for (i = 0; ret > 0 && i < oe->numlower; i++) {
+ ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags,
+ weak);
+ }
+ return ret;
+}
+
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, false);
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, true);
+}
+
+static const struct dentry_operations ovl_dentry_operations = {
+ .d_release = ovl_dentry_release,
+ .d_real = ovl_d_real,
+ .d_revalidate = ovl_dentry_revalidate,
+ .d_weak_revalidate = ovl_dentry_weak_revalidate,
+};
+
+static struct kmem_cache *ovl_inode_cachep;
+
+static struct inode *ovl_alloc_inode(struct super_block *sb)
+{
+ struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL);
+
+ if (!oi)
+ return NULL;
+
+ oi->cache = NULL;
+ oi->redirect = NULL;
+ oi->version = 0;
+ oi->flags = 0;
+ oi->__upperdentry = NULL;
+ oi->lowerpath.dentry = NULL;
+ oi->lowerpath.layer = NULL;
+ oi->lowerdata = NULL;
+ mutex_init(&oi->lock);
+
+ return &oi->vfs_inode;
+}
+
+static void ovl_free_inode(struct inode *inode)
+{
+ struct ovl_inode *oi = OVL_I(inode);
+
+ kfree(oi->redirect);
+ mutex_destroy(&oi->lock);
+ kmem_cache_free(ovl_inode_cachep, oi);
+}
+
+static void ovl_destroy_inode(struct inode *inode)
+{
+ struct ovl_inode *oi = OVL_I(inode);
+
+ dput(oi->__upperdentry);
+ dput(oi->lowerpath.dentry);
+ if (S_ISDIR(inode->i_mode))
+ ovl_dir_cache_free(inode);
+ else
+ iput(oi->lowerdata);
+}
+
+static void ovl_free_fs(struct ovl_fs *ofs)
+{
+ struct vfsmount **mounts;
+ unsigned i;
+
+ iput(ofs->workbasedir_trap);
+ iput(ofs->indexdir_trap);
+ iput(ofs->workdir_trap);
+ dput(ofs->whiteout);
+ dput(ofs->indexdir);
+ dput(ofs->workdir);
+ if (ofs->workdir_locked)
+ ovl_inuse_unlock(ofs->workbasedir);
+ dput(ofs->workbasedir);
+ if (ofs->upperdir_locked)
+ ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
+
+ /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */
+ mounts = (struct vfsmount **) ofs->layers;
+ for (i = 0; i < ofs->numlayer; i++) {
+ iput(ofs->layers[i].trap);
+ mounts[i] = ofs->layers[i].mnt;
+ }
+ kern_unmount_array(mounts, ofs->numlayer);
+ kfree(ofs->layers);
+ for (i = 0; i < ofs->numfs; i++)
+ free_anon_bdev(ofs->fs[i].pseudo_dev);
+ kfree(ofs->fs);
+
+ kfree(ofs->config.lowerdir);
+ kfree(ofs->config.upperdir);
+ kfree(ofs->config.workdir);
+ kfree(ofs->config.redirect_mode);
+ if (ofs->creator_cred)
+ put_cred(ofs->creator_cred);
+ kfree(ofs);
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ ovl_free_fs(ofs);
+}
+
+/* Sync real dirty inodes in upper filesystem (if it exists) */
+static int ovl_sync_fs(struct super_block *sb, int wait)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+ struct super_block *upper_sb;
+ int ret;
+
+ ret = ovl_sync_status(ofs);
+ /*
+ * We have to always set the err, because the return value isn't
+ * checked in syncfs, and instead indirectly return an error via
+ * the sb's writeback errseq, which VFS inspects after this call.
+ */
+ if (ret < 0) {
+ errseq_set(&sb->s_wb_err, -EIO);
+ return -EIO;
+ }
+
+ if (!ret)
+ return ret;
+
+ /*
+ * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
+ * All the super blocks will be iterated, including upper_sb.
+ *
+ * If this is a syncfs(2) call, then we do need to call
+ * sync_filesystem() on upper_sb, but enough if we do it when being
+ * called with wait == 1.
+ */
+ if (!wait)
+ return 0;
+
+ upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
+
+ down_read(&upper_sb->s_umount);
+ ret = sync_filesystem(upper_sb);
+ up_read(&upper_sb->s_umount);
+
+ return ret;
+}
+
+/**
+ * ovl_statfs
+ * @dentry: The dentry to query
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics. As writes always target the upper layer
+ * filesystem pass the statfs to the upper filesystem (if it exists)
+ */
+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct dentry *root_dentry = dentry->d_sb->s_root;
+ struct path path;
+ int err;
+
+ ovl_path_real(root_dentry, &path);
+
+ err = vfs_statfs(&path, buf);
+ if (!err) {
+ buf->f_namelen = ofs->namelen;
+ buf->f_type = OVERLAYFS_SUPER_MAGIC;
+ }
+
+ return err;
+}
+
+/* Will this overlay be forced to mount/remount ro? */
+static bool ovl_force_readonly(struct ovl_fs *ofs)
+{
+ return (!ovl_upper_mnt(ofs) || !ofs->workdir);
+}
+
+static const char *ovl_redirect_mode_def(void)
+{
+ return ovl_redirect_dir_def ? "on" : "off";
+}
+
+static const char * const ovl_xino_str[] = {
+ "off",
+ "auto",
+ "on",
+};
+
+static inline int ovl_xino_def(void)
+{
+ return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF;
+}
+
+/**
+ * ovl_show_options
+ * @m: the seq_file handle
+ * @dentry: The dentry to query
+ *
+ * Prints the mount options for a given superblock.
+ * Returns zero; does not fail.
+ */
+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ seq_show_option(m, "lowerdir", ofs->config.lowerdir);
+ if (ofs->config.upperdir) {
+ seq_show_option(m, "upperdir", ofs->config.upperdir);
+ seq_show_option(m, "workdir", ofs->config.workdir);
+ }
+ if (ofs->config.default_permissions)
+ seq_puts(m, ",default_permissions");
+ if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0)
+ seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode);
+ if (ofs->config.index != ovl_index_def)
+ seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off");
+ if (!ofs->config.uuid)
+ seq_puts(m, ",uuid=off");
+ if (ofs->config.nfs_export != ovl_nfs_export_def)
+ seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
+ "on" : "off");
+ if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb))
+ seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]);
+ if (ofs->config.metacopy != ovl_metacopy_def)
+ seq_printf(m, ",metacopy=%s",
+ ofs->config.metacopy ? "on" : "off");
+ if (ofs->config.ovl_volatile)
+ seq_puts(m, ",volatile");
+ if (ofs->config.userxattr)
+ seq_puts(m, ",userxattr");
+ return 0;
+}
+
+static int ovl_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+ struct super_block *upper_sb;
+ int ret = 0;
+
+ if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs))
+ return -EROFS;
+
+ if (*flags & SB_RDONLY && !sb_rdonly(sb)) {
+ upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
+ if (ovl_should_sync(ofs)) {
+ down_read(&upper_sb->s_umount);
+ ret = sync_filesystem(upper_sb);
+ up_read(&upper_sb->s_umount);
+ }
+ }
+
+ return ret;
+}
+
+static const struct super_operations ovl_super_operations = {
+ .alloc_inode = ovl_alloc_inode,
+ .free_inode = ovl_free_inode,
+ .destroy_inode = ovl_destroy_inode,
+ .drop_inode = generic_delete_inode,
+ .put_super = ovl_put_super,
+ .sync_fs = ovl_sync_fs,
+ .statfs = ovl_statfs,
+ .show_options = ovl_show_options,
+ .remount_fs = ovl_remount,
+};
+
+enum {
+ OPT_LOWERDIR,
+ OPT_UPPERDIR,
+ OPT_WORKDIR,
+ OPT_DEFAULT_PERMISSIONS,
+ OPT_REDIRECT_DIR,
+ OPT_INDEX_ON,
+ OPT_INDEX_OFF,
+ OPT_UUID_ON,
+ OPT_UUID_OFF,
+ OPT_NFS_EXPORT_ON,
+ OPT_USERXATTR,
+ OPT_NFS_EXPORT_OFF,
+ OPT_XINO_ON,
+ OPT_XINO_OFF,
+ OPT_XINO_AUTO,
+ OPT_METACOPY_ON,
+ OPT_METACOPY_OFF,
+ OPT_VOLATILE,
+ OPT_ERR,
+};
+
+static const match_table_t ovl_tokens = {
+ {OPT_LOWERDIR, "lowerdir=%s"},
+ {OPT_UPPERDIR, "upperdir=%s"},
+ {OPT_WORKDIR, "workdir=%s"},
+ {OPT_DEFAULT_PERMISSIONS, "default_permissions"},
+ {OPT_REDIRECT_DIR, "redirect_dir=%s"},
+ {OPT_INDEX_ON, "index=on"},
+ {OPT_INDEX_OFF, "index=off"},
+ {OPT_USERXATTR, "userxattr"},
+ {OPT_UUID_ON, "uuid=on"},
+ {OPT_UUID_OFF, "uuid=off"},
+ {OPT_NFS_EXPORT_ON, "nfs_export=on"},
+ {OPT_NFS_EXPORT_OFF, "nfs_export=off"},
+ {OPT_XINO_ON, "xino=on"},
+ {OPT_XINO_OFF, "xino=off"},
+ {OPT_XINO_AUTO, "xino=auto"},
+ {OPT_METACOPY_ON, "metacopy=on"},
+ {OPT_METACOPY_OFF, "metacopy=off"},
+ {OPT_VOLATILE, "volatile"},
+ {OPT_ERR, NULL}
+};
+
+static char *ovl_next_opt(char **s)
+{
+ char *sbegin = *s;
+ char *p;
+
+ if (sbegin == NULL)
+ return NULL;
+
+ for (p = sbegin; *p; p++) {
+ if (*p == '\\') {
+ p++;
+ if (!*p)
+ break;
+ } else if (*p == ',') {
+ *p = '\0';
+ *s = p + 1;
+ return sbegin;
+ }
+ }
+ *s = NULL;
+ return sbegin;
+}
+
+static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode)
+{
+ if (strcmp(mode, "on") == 0) {
+ config->redirect_dir = true;
+ /*
+ * Does not make sense to have redirect creation without
+ * redirect following.
+ */
+ config->redirect_follow = true;
+ } else if (strcmp(mode, "follow") == 0) {
+ config->redirect_follow = true;
+ } else if (strcmp(mode, "off") == 0) {
+ if (ovl_redirect_always_follow)
+ config->redirect_follow = true;
+ } else if (strcmp(mode, "nofollow") != 0) {
+ pr_err("bad mount option \"redirect_dir=%s\"\n",
+ mode);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ovl_parse_opt(char *opt, struct ovl_config *config)
+{
+ char *p;
+ int err;
+ bool metacopy_opt = false, redirect_opt = false;
+ bool nfs_export_opt = false, index_opt = false;
+
+ config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL);
+ if (!config->redirect_mode)
+ return -ENOMEM;
+
+ while ((p = ovl_next_opt(&opt)) != NULL) {
+ int token;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, ovl_tokens, args);
+ switch (token) {
+ case OPT_UPPERDIR:
+ kfree(config->upperdir);
+ config->upperdir = match_strdup(&args[0]);
+ if (!config->upperdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_LOWERDIR:
+ kfree(config->lowerdir);
+ config->lowerdir = match_strdup(&args[0]);
+ if (!config->lowerdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_WORKDIR:
+ kfree(config->workdir);
+ config->workdir = match_strdup(&args[0]);
+ if (!config->workdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_DEFAULT_PERMISSIONS:
+ config->default_permissions = true;
+ break;
+
+ case OPT_REDIRECT_DIR:
+ kfree(config->redirect_mode);
+ config->redirect_mode = match_strdup(&args[0]);
+ if (!config->redirect_mode)
+ return -ENOMEM;
+ redirect_opt = true;
+ break;
+
+ case OPT_INDEX_ON:
+ config->index = true;
+ index_opt = true;
+ break;
+
+ case OPT_INDEX_OFF:
+ config->index = false;
+ index_opt = true;
+ break;
+
+ case OPT_UUID_ON:
+ config->uuid = true;
+ break;
+
+ case OPT_UUID_OFF:
+ config->uuid = false;
+ break;
+
+ case OPT_NFS_EXPORT_ON:
+ config->nfs_export = true;
+ nfs_export_opt = true;
+ break;
+
+ case OPT_NFS_EXPORT_OFF:
+ config->nfs_export = false;
+ nfs_export_opt = true;
+ break;
+
+ case OPT_XINO_ON:
+ config->xino = OVL_XINO_ON;
+ break;
+
+ case OPT_XINO_OFF:
+ config->xino = OVL_XINO_OFF;
+ break;
+
+ case OPT_XINO_AUTO:
+ config->xino = OVL_XINO_AUTO;
+ break;
+
+ case OPT_METACOPY_ON:
+ config->metacopy = true;
+ metacopy_opt = true;
+ break;
+
+ case OPT_METACOPY_OFF:
+ config->metacopy = false;
+ metacopy_opt = true;
+ break;
+
+ case OPT_VOLATILE:
+ config->ovl_volatile = true;
+ break;
+
+ case OPT_USERXATTR:
+ config->userxattr = true;
+ break;
+
+ default:
+ pr_err("unrecognized mount option \"%s\" or missing value\n",
+ p);
+ return -EINVAL;
+ }
+ }
+
+ /* Workdir/index are useless in non-upper mount */
+ if (!config->upperdir) {
+ if (config->workdir) {
+ pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+ config->workdir);
+ kfree(config->workdir);
+ config->workdir = NULL;
+ }
+ if (config->index && index_opt) {
+ pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n");
+ index_opt = false;
+ }
+ config->index = false;
+ }
+
+ if (!config->upperdir && config->ovl_volatile) {
+ pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n");
+ config->ovl_volatile = false;
+ }
+
+ err = ovl_parse_redirect_mode(config, config->redirect_mode);
+ if (err)
+ return err;
+
+ /*
+ * This is to make the logic below simpler. It doesn't make any other
+ * difference, since config->redirect_dir is only used for upper.
+ */
+ if (!config->upperdir && config->redirect_follow)
+ config->redirect_dir = true;
+
+ /* Resolve metacopy -> redirect_dir dependency */
+ if (config->metacopy && !config->redirect_dir) {
+ if (metacopy_opt && redirect_opt) {
+ pr_err("conflicting options: metacopy=on,redirect_dir=%s\n",
+ config->redirect_mode);
+ return -EINVAL;
+ }
+ if (redirect_opt) {
+ /*
+ * There was an explicit redirect_dir=... that resulted
+ * in this conflict.
+ */
+ pr_info("disabling metacopy due to redirect_dir=%s\n",
+ config->redirect_mode);
+ config->metacopy = false;
+ } else {
+ /* Automatically enable redirect otherwise. */
+ config->redirect_follow = config->redirect_dir = true;
+ }
+ }
+
+ /* Resolve nfs_export -> index dependency */
+ if (config->nfs_export && !config->index) {
+ if (!config->upperdir && config->redirect_follow) {
+ pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
+ config->nfs_export = false;
+ } else if (nfs_export_opt && index_opt) {
+ pr_err("conflicting options: nfs_export=on,index=off\n");
+ return -EINVAL;
+ } else if (index_opt) {
+ /*
+ * There was an explicit index=off that resulted
+ * in this conflict.
+ */
+ pr_info("disabling nfs_export due to index=off\n");
+ config->nfs_export = false;
+ } else {
+ /* Automatically enable index otherwise. */
+ config->index = true;
+ }
+ }
+
+ /* Resolve nfs_export -> !metacopy dependency */
+ if (config->nfs_export && config->metacopy) {
+ if (nfs_export_opt && metacopy_opt) {
+ pr_err("conflicting options: nfs_export=on,metacopy=on\n");
+ return -EINVAL;
+ }
+ if (metacopy_opt) {
+ /*
+ * There was an explicit metacopy=on that resulted
+ * in this conflict.
+ */
+ pr_info("disabling nfs_export due to metacopy=on\n");
+ config->nfs_export = false;
+ } else {
+ /*
+ * There was an explicit nfs_export=on that resulted
+ * in this conflict.
+ */
+ pr_info("disabling metacopy due to nfs_export=on\n");
+ config->metacopy = false;
+ }
+ }
+
+
+ /* Resolve userxattr -> !redirect && !metacopy dependency */
+ if (config->userxattr) {
+ if (config->redirect_follow && redirect_opt) {
+ pr_err("conflicting options: userxattr,redirect_dir=%s\n",
+ config->redirect_mode);
+ return -EINVAL;
+ }
+ if (config->metacopy && metacopy_opt) {
+ pr_err("conflicting options: userxattr,metacopy=on\n");
+ return -EINVAL;
+ }
+ /*
+ * Silently disable default setting of redirect and metacopy.
+ * This shall be the default in the future as well: these
+ * options must be explicitly enabled if used together with
+ * userxattr.
+ */
+ config->redirect_dir = config->redirect_follow = false;
+ config->metacopy = false;
+ }
+
+ return 0;
+}
+
+#define OVL_WORKDIR_NAME "work"
+#define OVL_INDEXDIR_NAME "index"
+
+static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
+ const char *name, bool persist)
+{
+ struct inode *dir = ofs->workbasedir->d_inode;
+ struct vfsmount *mnt = ovl_upper_mnt(ofs);
+ struct dentry *work;
+ int err;
+ bool retried = false;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+retry:
+ work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
+
+ if (!IS_ERR(work)) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = S_IFDIR | 0,
+ };
+
+ if (work->d_inode) {
+ err = -EEXIST;
+ if (retried)
+ goto out_dput;
+
+ if (persist)
+ goto out_unlock;
+
+ retried = true;
+ err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0);
+ dput(work);
+ if (err == -EINVAL) {
+ work = ERR_PTR(err);
+ goto out_unlock;
+ }
+ goto retry;
+ }
+
+ err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode);
+ if (err)
+ goto out_dput;
+
+ /* Weird filesystem returning with hashed negative (kernfs)? */
+ err = -EINVAL;
+ if (d_really_is_negative(work))
+ goto out_dput;
+
+ /*
+ * Try to remove POSIX ACL xattrs from workdir. We are good if:
+ *
+ * a) success (there was a POSIX ACL xattr and was removed)
+ * b) -ENODATA (there was no POSIX ACL xattr)
+ * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported)
+ *
+ * There are various other error values that could effectively
+ * mean that the xattr doesn't exist (e.g. -ERANGE is returned
+ * if the xattr name is too long), but the set of filesystems
+ * allowed as upper are limited to "normal" ones, where checking
+ * for the above two errors is sufficient.
+ */
+ err = ovl_do_removexattr(ofs, work,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
+ if (err && err != -ENODATA && err != -EOPNOTSUPP)
+ goto out_dput;
+
+ err = ovl_do_removexattr(ofs, work,
+ XATTR_NAME_POSIX_ACL_ACCESS);
+ if (err && err != -ENODATA && err != -EOPNOTSUPP)
+ goto out_dput;
+
+ /* Clear any inherited mode bits */
+ inode_lock(work->d_inode);
+ err = ovl_do_notify_change(ofs, work, &attr);
+ inode_unlock(work->d_inode);
+ if (err)
+ goto out_dput;
+ } else {
+ err = PTR_ERR(work);
+ goto out_err;
+ }
+out_unlock:
+ inode_unlock(dir);
+ return work;
+
+out_dput:
+ dput(work);
+out_err:
+ pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n",
+ ofs->config.workdir, name, -err);
+ work = NULL;
+ goto out_unlock;
+}
+
+static void ovl_unescape(char *s)
+{
+ char *d = s;
+
+ for (;; s++, d++) {
+ if (*s == '\\')
+ s++;
+ *d = *s;
+ if (!*s)
+ break;
+ }
+}
+
+static int ovl_mount_dir_noesc(const char *name, struct path *path)
+{
+ int err = -EINVAL;
+
+ if (!*name) {
+ pr_err("empty lowerdir\n");
+ goto out;
+ }
+ err = kern_path(name, LOOKUP_FOLLOW, path);
+ if (err) {
+ pr_err("failed to resolve '%s': %i\n", name, err);
+ goto out;
+ }
+ err = -EINVAL;
+ if (ovl_dentry_weird(path->dentry)) {
+ pr_err("filesystem on '%s' not supported\n", name);
+ goto out_put;
+ }
+ if (!d_is_dir(path->dentry)) {
+ pr_err("'%s' not a directory\n", name);
+ goto out_put;
+ }
+ return 0;
+
+out_put:
+ path_put_init(path);
+out:
+ return err;
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+ int err = -ENOMEM;
+ char *tmp = kstrdup(name, GFP_KERNEL);
+
+ if (tmp) {
+ ovl_unescape(tmp);
+ err = ovl_mount_dir_noesc(tmp, path);
+
+ if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
+ pr_err("filesystem on '%s' not supported as upperdir\n",
+ tmp);
+ path_put_init(path);
+ err = -EINVAL;
+ }
+ kfree(tmp);
+ }
+ return err;
+}
+
+static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
+ const char *name)
+{
+ struct kstatfs statfs;
+ int err = vfs_statfs(path, &statfs);
+
+ if (err)
+ pr_err("statfs failed on '%s'\n", name);
+ else
+ ofs->namelen = max(ofs->namelen, statfs.f_namelen);
+
+ return err;
+}
+
+static int ovl_lower_dir(const char *name, struct path *path,
+ struct ovl_fs *ofs, int *stack_depth)
+{
+ int fh_type;
+ int err;
+
+ err = ovl_mount_dir_noesc(name, path);
+ if (err)
+ return err;
+
+ err = ovl_check_namelen(path, ofs, name);
+ if (err)
+ return err;
+
+ *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
+
+ /*
+ * The inodes index feature and NFS export need to encode and decode
+ * file handles, so they require that all layers support them.
+ */
+ fh_type = ovl_can_decode_fh(path->dentry->d_sb);
+ if ((ofs->config.nfs_export ||
+ (ofs->config.index && ofs->config.upperdir)) && !fh_type) {
+ ofs->config.index = false;
+ ofs->config.nfs_export = false;
+ pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
+ name);
+ }
+ /*
+ * Decoding origin file handle is required for persistent st_ino.
+ * Without persistent st_ino, xino=auto falls back to xino=off.
+ */
+ if (ofs->config.xino == OVL_XINO_AUTO &&
+ ofs->config.upperdir && !fh_type) {
+ ofs->config.xino = OVL_XINO_OFF;
+ pr_warn("fs on '%s' does not support file handles, falling back to xino=off.\n",
+ name);
+ }
+
+ /* Check if lower fs has 32bit inode numbers */
+ if (fh_type != FILEID_INO32_GEN)
+ ofs->xino_mode = -1;
+
+ return 0;
+}
+
+/* Workdir should not be subdir of upperdir and vice versa */
+static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
+{
+ bool ok = false;
+
+ if (workdir != upperdir) {
+ ok = (lock_rename(workdir, upperdir) == NULL);
+ unlock_rename(workdir, upperdir);
+ }
+ return ok;
+}
+
+static unsigned int ovl_split_lowerdirs(char *str)
+{
+ unsigned int ctr = 1;
+ char *s, *d;
+
+ for (s = d = str;; s++, d++) {
+ if (*s == '\\') {
+ s++;
+ } else if (*s == ':') {
+ *d = '\0';
+ ctr++;
+ continue;
+ }
+ *d = *s;
+ if (!*s)
+ break;
+ }
+ return ctr;
+}
+
+static int __maybe_unused
+ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
+}
+
+static int __maybe_unused
+ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *realinode = ovl_inode_real(inode);
+ struct posix_acl *acl = NULL;
+ int err;
+
+ /* Check that everything is OK before copy-up */
+ if (value) {
+ /* The above comment can be understood in two ways:
+ *
+ * 1. We just want to check whether the basic POSIX ACL format
+ * is ok. For example, if the header is correct and the size
+ * is sane.
+ * 2. We want to know whether the ACL_{GROUP,USER} entries can
+ * be mapped according to the underlying filesystem.
+ *
+ * Currently, we only check 1. If we wanted to check 2. we
+ * would need to pass the mnt_userns and the fs_userns of the
+ * underlying filesystem. But frankly, I think checking 1. is
+ * enough to start the copy-up.
+ */
+ acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ err = -EOPNOTSUPP;
+ if (!IS_POSIXACL(d_inode(workdir)))
+ goto out_acl_release;
+ if (!realinode->i_op->set_acl)
+ goto out_acl_release;
+ if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
+ err = acl ? -EACCES : 0;
+ goto out_acl_release;
+ }
+ err = -EPERM;
+ if (!inode_owner_or_capable(&init_user_ns, inode))
+ goto out_acl_release;
+
+ posix_acl_release(acl);
+
+ /*
+ * Check if sgid bit needs to be cleared (actual setacl operation will
+ * be done with mounter's capabilities and so that won't do it for us).
+ */
+ if (unlikely(inode->i_mode & S_ISGID) &&
+ handler->flags == ACL_TYPE_ACCESS &&
+ !in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
+ struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
+
+ err = ovl_setattr(&init_user_ns, dentry, &iattr);
+ if (err)
+ return err;
+ }
+
+ err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags);
+ return err;
+
+out_acl_release:
+ posix_acl_release(acl);
+ return err;
+}
+
+static int ovl_own_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static int ovl_own_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static int ovl_other_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ovl_xattr_get(dentry, inode, name, buffer, size);
+}
+
+static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return ovl_xattr_set(dentry, inode, name, value, size, flags);
+}
+
+static const struct xattr_handler __maybe_unused
+ovl_posix_acl_access_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .get = ovl_posix_acl_xattr_get,
+ .set = ovl_posix_acl_xattr_set,
+};
+
+static const struct xattr_handler __maybe_unused
+ovl_posix_acl_default_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .get = ovl_posix_acl_xattr_get,
+ .set = ovl_posix_acl_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_trusted_xattr_handler = {
+ .prefix = OVL_XATTR_TRUSTED_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_user_xattr_handler = {
+ .prefix = OVL_XATTR_USER_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_other_xattr_handler = {
+ .prefix = "", /* catch all */
+ .get = ovl_other_xattr_get,
+ .set = ovl_other_xattr_set,
+};
+
+static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+ &ovl_posix_acl_access_xattr_handler,
+ &ovl_posix_acl_default_xattr_handler,
+#endif
+ &ovl_own_trusted_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler *ovl_user_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+ &ovl_posix_acl_access_xattr_handler,
+ &ovl_posix_acl_default_xattr_handler,
+#endif
+ &ovl_own_user_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
+ struct inode **ptrap, const char *name)
+{
+ struct inode *trap;
+ int err;
+
+ trap = ovl_get_trap_inode(sb, dir);
+ err = PTR_ERR_OR_ZERO(trap);
+ if (err) {
+ if (err == -ELOOP)
+ pr_err("conflicting %s path\n", name);
+ return err;
+ }
+
+ *ptrap = trap;
+ return 0;
+}
+
+/*
+ * Determine how we treat concurrent use of upperdir/workdir based on the
+ * index feature. This is papering over mount leaks of container runtimes,
+ * for example, an old overlay mount is leaked and now its upperdir is
+ * attempted to be used as a lower layer in a new overlay mount.
+ */
+static int ovl_report_in_use(struct ovl_fs *ofs, const char *name)
+{
+ if (ofs->config.index) {
+ pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n",
+ name);
+ return -EBUSY;
+ } else {
+ pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n",
+ name);
+ return 0;
+ }
+}
+
+static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
+ struct ovl_layer *upper_layer, struct path *upperpath)
+{
+ struct vfsmount *upper_mnt;
+ int err;
+
+ err = ovl_mount_dir(ofs->config.upperdir, upperpath);
+ if (err)
+ goto out;
+
+ /* Upperdir path should not be r/o */
+ if (__mnt_is_readonly(upperpath->mnt)) {
+ pr_err("upper fs is r/o, try multi-lower layers mount\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir);
+ if (err)
+ goto out;
+
+ err = ovl_setup_trap(sb, upperpath->dentry, &upper_layer->trap,
+ "upperdir");
+ if (err)
+ goto out;
+
+ upper_mnt = clone_private_mount(upperpath);
+ err = PTR_ERR(upper_mnt);
+ if (IS_ERR(upper_mnt)) {
+ pr_err("failed to clone upperpath\n");
+ goto out;
+ }
+
+ /* Don't inherit atime flags */
+ upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+ upper_layer->mnt = upper_mnt;
+ upper_layer->idx = 0;
+ upper_layer->fsid = 0;
+
+ /*
+ * Inherit SB_NOSEC flag from upperdir.
+ *
+ * This optimization changes behavior when a security related attribute
+ * (suid/sgid/security.*) is changed on an underlying layer. This is
+ * okay because we don't yet have guarantees in that case, but it will
+ * need careful treatment once we want to honour changes to underlying
+ * filesystems.
+ */
+ if (upper_mnt->mnt_sb->s_flags & SB_NOSEC)
+ sb->s_flags |= SB_NOSEC;
+
+ if (ovl_inuse_trylock(ovl_upper_mnt(ofs)->mnt_root)) {
+ ofs->upperdir_locked = true;
+ } else {
+ err = ovl_report_in_use(ofs, "upperdir");
+ if (err)
+ goto out;
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
+ * negative values if error is encountered.
+ */
+static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
+{
+ struct dentry *workdir = ofs->workdir;
+ struct inode *dir = d_inode(workdir);
+ struct dentry *temp;
+ struct dentry *dest;
+ struct dentry *whiteout;
+ struct name_snapshot name;
+ int err;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+
+ temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
+ err = PTR_ERR(temp);
+ if (IS_ERR(temp))
+ goto out_unlock;
+
+ dest = ovl_lookup_temp(ofs, workdir);
+ err = PTR_ERR(dest);
+ if (IS_ERR(dest)) {
+ dput(temp);
+ goto out_unlock;
+ }
+
+ /* Name is inline and stable - using snapshot as a copy helper */
+ take_dentry_name_snapshot(&name, temp);
+ err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT);
+ if (err) {
+ if (err == -EINVAL)
+ err = 0;
+ goto cleanup_temp;
+ }
+
+ whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto cleanup_temp;
+
+ err = ovl_is_whiteout(whiteout);
+
+ /* Best effort cleanup of whiteout and temp file */
+ if (err)
+ ovl_cleanup(ofs, dir, whiteout);
+ dput(whiteout);
+
+cleanup_temp:
+ ovl_cleanup(ofs, dir, temp);
+ release_dentry_name_snapshot(&name);
+ dput(temp);
+ dput(dest);
+
+out_unlock:
+ inode_unlock(dir);
+
+ return err;
+}
+
+static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
+ struct dentry *parent,
+ const char *name, umode_t mode)
+{
+ size_t len = strlen(name);
+ struct dentry *child;
+
+ inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+ child = ovl_lookup_upper(ofs, name, parent, len);
+ if (!IS_ERR(child) && !child->d_inode)
+ child = ovl_create_real(ofs, parent->d_inode, child,
+ OVL_CATTR(mode));
+ inode_unlock(parent->d_inode);
+ dput(parent);
+
+ return child;
+}
+
+/*
+ * Creates $workdir/work/incompat/volatile/dirty file if it is not already
+ * present.
+ */
+static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
+{
+ unsigned int ctr;
+ struct dentry *d = dget(ofs->workbasedir);
+ static const char *const volatile_path[] = {
+ OVL_WORKDIR_NAME, "incompat", "volatile", "dirty"
+ };
+ const char *const *name = volatile_path;
+
+ for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) {
+ d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? S_IFDIR : S_IFREG);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ }
+ dput(d);
+ return 0;
+}
+
+static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
+ const struct path *workpath)
+{
+ struct vfsmount *mnt = ovl_upper_mnt(ofs);
+ struct dentry *workdir;
+ struct file *tmpfile;
+ bool rename_whiteout;
+ bool d_type;
+ int fh_type;
+ int err;
+
+ err = mnt_want_write(mnt);
+ if (err)
+ return err;
+
+ workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false);
+ err = PTR_ERR(workdir);
+ if (IS_ERR_OR_NULL(workdir))
+ goto out;
+
+ ofs->workdir = workdir;
+
+ err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir");
+ if (err)
+ goto out;
+
+ /*
+ * Upper should support d_type, else whiteouts are visible. Given
+ * workdir and upper are on same fs, we can do iterate_dir() on
+ * workdir. This check requires successful creation of workdir in
+ * previous step.
+ */
+ err = ovl_check_d_type_supported(workpath);
+ if (err < 0)
+ goto out;
+
+ d_type = err;
+ if (!d_type)
+ pr_warn("upper fs needs to support d_type.\n");
+
+ /* Check if upper/work fs supports O_TMPFILE */
+ tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
+ ofs->tmpfile = !IS_ERR(tmpfile);
+ if (ofs->tmpfile)
+ fput(tmpfile);
+ else
+ pr_warn("upper fs does not support tmpfile.\n");
+
+
+ /* Check if upper/work fs supports RENAME_WHITEOUT */
+ err = ovl_check_rename_whiteout(ofs);
+ if (err < 0)
+ goto out;
+
+ rename_whiteout = err;
+ if (!rename_whiteout)
+ pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
+
+ /*
+ * Check if upper/work fs supports (trusted|user).overlay.* xattr
+ */
+ err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
+ if (err) {
+ pr_warn("failed to set xattr on upper\n");
+ ofs->noxattr = true;
+ if (ofs->config.index || ofs->config.metacopy) {
+ ofs->config.index = false;
+ ofs->config.metacopy = false;
+ pr_warn("...falling back to index=off,metacopy=off.\n");
+ }
+ /*
+ * xattr support is required for persistent st_ino.
+ * Without persistent st_ino, xino=auto falls back to xino=off.
+ */
+ if (ofs->config.xino == OVL_XINO_AUTO) {
+ ofs->config.xino = OVL_XINO_OFF;
+ pr_warn("...falling back to xino=off.\n");
+ }
+ if (err == -EPERM && !ofs->config.userxattr)
+ pr_info("try mounting with 'userxattr' option\n");
+ err = 0;
+ } else {
+ ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
+ }
+
+ /*
+ * We allowed sub-optimal upper fs configuration and don't want to break
+ * users over kernel upgrade, but we never allowed remote upper fs, so
+ * we can enforce strict requirements for remote upper fs.
+ */
+ if (ovl_dentry_remote(ofs->workdir) &&
+ (!d_type || !rename_whiteout || ofs->noxattr)) {
+ pr_err("upper fs missing required features.\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * For volatile mount, create a incompat/volatile/dirty file to keep
+ * track of it.
+ */
+ if (ofs->config.ovl_volatile) {
+ err = ovl_create_volatile_dirty(ofs);
+ if (err < 0) {
+ pr_err("Failed to create volatile/dirty file.\n");
+ goto out;
+ }
+ }
+
+ /* Check if upper/work fs supports file handles */
+ fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
+ if (ofs->config.index && !fh_type) {
+ ofs->config.index = false;
+ pr_warn("upper fs does not support file handles, falling back to index=off.\n");
+ }
+
+ /* Check if upper fs has 32bit inode numbers */
+ if (fh_type != FILEID_INO32_GEN)
+ ofs->xino_mode = -1;
+
+ /* NFS export of r/w mount depends on index */
+ if (ofs->config.nfs_export && !ofs->config.index) {
+ pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n");
+ ofs->config.nfs_export = false;
+ }
+out:
+ mnt_drop_write(mnt);
+ return err;
+}
+
+static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
+ const struct path *upperpath)
+{
+ int err;
+ struct path workpath = { };
+
+ err = ovl_mount_dir(ofs->config.workdir, &workpath);
+ if (err)
+ goto out;
+
+ err = -EINVAL;
+ if (upperpath->mnt != workpath.mnt) {
+ pr_err("workdir and upperdir must reside under the same mount\n");
+ goto out;
+ }
+ if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) {
+ pr_err("workdir and upperdir must be separate subtrees\n");
+ goto out;
+ }
+
+ ofs->workbasedir = dget(workpath.dentry);
+
+ if (ovl_inuse_trylock(ofs->workbasedir)) {
+ ofs->workdir_locked = true;
+ } else {
+ err = ovl_report_in_use(ofs, "workdir");
+ if (err)
+ goto out;
+ }
+
+ err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap,
+ "workdir");
+ if (err)
+ goto out;
+
+ err = ovl_make_workdir(sb, ofs, &workpath);
+
+out:
+ path_put(&workpath);
+
+ return err;
+}
+
+static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
+ struct ovl_entry *oe, const struct path *upperpath)
+{
+ struct vfsmount *mnt = ovl_upper_mnt(ofs);
+ struct dentry *indexdir;
+ int err;
+
+ err = mnt_want_write(mnt);
+ if (err)
+ return err;
+
+ /* Verify lower root is upper root origin */
+ err = ovl_verify_origin(ofs, upperpath->dentry,
+ oe->lowerstack[0].dentry, true);
+ if (err) {
+ pr_err("failed to verify upper root origin\n");
+ goto out;
+ }
+
+ /* index dir will act also as workdir */
+ iput(ofs->workdir_trap);
+ ofs->workdir_trap = NULL;
+ dput(ofs->workdir);
+ ofs->workdir = NULL;
+ indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
+ if (IS_ERR(indexdir)) {
+ err = PTR_ERR(indexdir);
+ } else if (indexdir) {
+ ofs->indexdir = indexdir;
+ ofs->workdir = dget(indexdir);
+
+ err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap,
+ "indexdir");
+ if (err)
+ goto out;
+
+ /*
+ * Verify upper root is exclusively associated with index dir.
+ * Older kernels stored upper fh in ".overlay.origin"
+ * xattr. If that xattr exists, verify that it is a match to
+ * upper dir file handle. In any case, verify or set xattr
+ * ".overlay.upper" to indicate that index may have
+ * directory entries.
+ */
+ if (ovl_check_origin_xattr(ofs, ofs->indexdir)) {
+ err = ovl_verify_set_fh(ofs, ofs->indexdir,
+ OVL_XATTR_ORIGIN,
+ upperpath->dentry, true, false);
+ if (err)
+ pr_err("failed to verify index dir 'origin' xattr\n");
+ }
+ err = ovl_verify_upper(ofs, ofs->indexdir, upperpath->dentry,
+ true);
+ if (err)
+ pr_err("failed to verify index dir 'upper' xattr\n");
+
+ /* Cleanup bad/stale/orphan index entries */
+ if (!err)
+ err = ovl_indexdir_cleanup(ofs);
+ }
+ if (err || !ofs->indexdir)
+ pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
+
+out:
+ mnt_drop_write(mnt);
+ return err;
+}
+
+static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
+{
+ unsigned int i;
+
+ if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs))
+ return true;
+
+ /*
+ * We allow using single lower with null uuid for index and nfs_export
+ * for example to support those features with single lower squashfs.
+ * To avoid regressions in setups of overlay with re-formatted lower
+ * squashfs, do not allow decoding origin with lower null uuid unless
+ * user opted-in to one of the new features that require following the
+ * lower inode of non-dir upper.
+ */
+ if (ovl_allow_offline_changes(ofs) && uuid_is_null(uuid))
+ return false;
+
+ for (i = 0; i < ofs->numfs; i++) {
+ /*
+ * We use uuid to associate an overlay lower file handle with a
+ * lower layer, so we can accept lower fs with null uuid as long
+ * as all lower layers with null uuid are on the same fs.
+ * if we detect multiple lower fs with the same uuid, we
+ * disable lower file handle decoding on all of them.
+ */
+ if (ofs->fs[i].is_lower &&
+ uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) {
+ ofs->fs[i].bad_uuid = true;
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Get a unique fsid for the layer */
+static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
+{
+ struct super_block *sb = path->mnt->mnt_sb;
+ unsigned int i;
+ dev_t dev;
+ int err;
+ bool bad_uuid = false;
+ bool warn = false;
+
+ for (i = 0; i < ofs->numfs; i++) {
+ if (ofs->fs[i].sb == sb)
+ return i;
+ }
+
+ if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
+ bad_uuid = true;
+ if (ofs->config.xino == OVL_XINO_AUTO) {
+ ofs->config.xino = OVL_XINO_OFF;
+ warn = true;
+ }
+ if (ofs->config.index || ofs->config.nfs_export) {
+ ofs->config.index = false;
+ ofs->config.nfs_export = false;
+ warn = true;
+ }
+ if (warn) {
+ pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n",
+ uuid_is_null(&sb->s_uuid) ? "null" :
+ "conflicting",
+ path->dentry, ovl_xino_str[ofs->config.xino]);
+ }
+ }
+
+ err = get_anon_bdev(&dev);
+ if (err) {
+ pr_err("failed to get anonymous bdev for lowerpath\n");
+ return err;
+ }
+
+ ofs->fs[ofs->numfs].sb = sb;
+ ofs->fs[ofs->numfs].pseudo_dev = dev;
+ ofs->fs[ofs->numfs].bad_uuid = bad_uuid;
+
+ return ofs->numfs++;
+}
+
+static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
+ struct path *stack, unsigned int numlower,
+ struct ovl_layer *layers)
+{
+ int err;
+ unsigned int i;
+
+ err = -ENOMEM;
+ ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL);
+ if (ofs->fs == NULL)
+ goto out;
+
+ /* idx/fsid 0 are reserved for upper fs even with lower only overlay */
+ ofs->numfs++;
+
+ /*
+ * All lower layers that share the same fs as upper layer, use the same
+ * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower
+ * only overlay to simplify ovl_fs_free().
+ * is_lower will be set if upper fs is shared with a lower layer.
+ */
+ err = get_anon_bdev(&ofs->fs[0].pseudo_dev);
+ if (err) {
+ pr_err("failed to get anonymous bdev for upper fs\n");
+ goto out;
+ }
+
+ if (ovl_upper_mnt(ofs)) {
+ ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb;
+ ofs->fs[0].is_lower = false;
+ }
+
+ for (i = 0; i < numlower; i++) {
+ struct vfsmount *mnt;
+ struct inode *trap;
+ int fsid;
+
+ err = fsid = ovl_get_fsid(ofs, &stack[i]);
+ if (err < 0)
+ goto out;
+
+ /*
+ * Check if lower root conflicts with this overlay layers before
+ * checking if it is in-use as upperdir/workdir of "another"
+ * mount, because we do not bother to check in ovl_is_inuse() if
+ * the upperdir/workdir is in fact in-use by our
+ * upperdir/workdir.
+ */
+ err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir");
+ if (err)
+ goto out;
+
+ if (ovl_is_inuse(stack[i].dentry)) {
+ err = ovl_report_in_use(ofs, "lowerdir");
+ if (err) {
+ iput(trap);
+ goto out;
+ }
+ }
+
+ mnt = clone_private_mount(&stack[i]);
+ err = PTR_ERR(mnt);
+ if (IS_ERR(mnt)) {
+ pr_err("failed to clone lowerpath\n");
+ iput(trap);
+ goto out;
+ }
+
+ /*
+ * Make lower layers R/O. That way fchmod/fchown on lower file
+ * will fail instead of modifying lower fs.
+ */
+ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
+
+ layers[ofs->numlayer].trap = trap;
+ layers[ofs->numlayer].mnt = mnt;
+ layers[ofs->numlayer].idx = ofs->numlayer;
+ layers[ofs->numlayer].fsid = fsid;
+ layers[ofs->numlayer].fs = &ofs->fs[fsid];
+ ofs->numlayer++;
+ ofs->fs[fsid].is_lower = true;
+ }
+
+ /*
+ * When all layers on same fs, overlay can use real inode numbers.
+ * With mount option "xino=<on|auto>", mounter declares that there are
+ * enough free high bits in underlying fs to hold the unique fsid.
+ * If overlayfs does encounter underlying inodes using the high xino
+ * bits reserved for fsid, it emits a warning and uses the original
+ * inode number or a non persistent inode number allocated from a
+ * dedicated range.
+ */
+ if (ofs->numfs - !ovl_upper_mnt(ofs) == 1) {
+ if (ofs->config.xino == OVL_XINO_ON)
+ pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n");
+ ofs->xino_mode = 0;
+ } else if (ofs->config.xino == OVL_XINO_OFF) {
+ ofs->xino_mode = -1;
+ } else if (ofs->xino_mode < 0) {
+ /*
+ * This is a roundup of number of bits needed for encoding
+ * fsid, where fsid 0 is reserved for upper fs (even with
+ * lower only overlay) +1 extra bit is reserved for the non
+ * persistent inode number range that is used for resolving
+ * xino lower bits overflow.
+ */
+ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30);
+ ofs->xino_mode = ilog2(ofs->numfs - 1) + 2;
+ }
+
+ if (ofs->xino_mode > 0) {
+ pr_info("\"xino\" feature enabled using %d upper inode bits.\n",
+ ofs->xino_mode);
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
+ const char *lower, unsigned int numlower,
+ struct ovl_fs *ofs, struct ovl_layer *layers)
+{
+ int err;
+ struct path *stack = NULL;
+ unsigned int i;
+ struct ovl_entry *oe;
+
+ if (!ofs->config.upperdir && numlower == 1) {
+ pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL);
+ if (!stack)
+ return ERR_PTR(-ENOMEM);
+
+ err = -EINVAL;
+ for (i = 0; i < numlower; i++) {
+ err = ovl_lower_dir(lower, &stack[i], ofs, &sb->s_stack_depth);
+ if (err)
+ goto out_err;
+
+ lower = strchr(lower, '\0') + 1;
+ }
+
+ err = -EINVAL;
+ sb->s_stack_depth++;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("maximum fs stacking depth exceeded\n");
+ goto out_err;
+ }
+
+ err = ovl_get_layers(sb, ofs, stack, numlower, layers);
+ if (err)
+ goto out_err;
+
+ err = -ENOMEM;
+ oe = ovl_alloc_entry(numlower);
+ if (!oe)
+ goto out_err;
+
+ for (i = 0; i < numlower; i++) {
+ oe->lowerstack[i].dentry = dget(stack[i].dentry);
+ oe->lowerstack[i].layer = &ofs->layers[i+1];
+ }
+
+out:
+ for (i = 0; i < numlower; i++)
+ path_put(&stack[i]);
+ kfree(stack);
+
+ return oe;
+
+out_err:
+ oe = ERR_PTR(err);
+ goto out;
+}
+
+/*
+ * Check if this layer root is a descendant of:
+ * - another layer of this overlayfs instance
+ * - upper/work dir of any overlayfs instance
+ */
+static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
+ struct dentry *dentry, const char *name,
+ bool is_lower)
+{
+ struct dentry *next = dentry, *parent;
+ int err = 0;
+
+ if (!dentry)
+ return 0;
+
+ parent = dget_parent(next);
+
+ /* Walk back ancestors to root (inclusive) looking for traps */
+ while (!err && parent != next) {
+ if (is_lower && ovl_lookup_trap_inode(sb, parent)) {
+ err = -ELOOP;
+ pr_err("overlapping %s path\n", name);
+ } else if (ovl_is_inuse(parent)) {
+ err = ovl_report_in_use(ofs, name);
+ }
+ next = parent;
+ parent = dget_parent(next);
+ dput(next);
+ }
+
+ dput(parent);
+
+ return err;
+}
+
+/*
+ * Check if any of the layers or work dirs overlap.
+ */
+static int ovl_check_overlapping_layers(struct super_block *sb,
+ struct ovl_fs *ofs)
+{
+ int i, err;
+
+ if (ovl_upper_mnt(ofs)) {
+ err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root,
+ "upperdir", false);
+ if (err)
+ return err;
+
+ /*
+ * Checking workbasedir avoids hitting ovl_is_inuse(parent) of
+ * this instance and covers overlapping work and index dirs,
+ * unless work or index dir have been moved since created inside
+ * workbasedir. In that case, we already have their traps in
+ * inode cache and we will catch that case on lookup.
+ */
+ err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir",
+ false);
+ if (err)
+ return err;
+ }
+
+ for (i = 1; i < ofs->numlayer; i++) {
+ err = ovl_check_layer(sb, ofs,
+ ofs->layers[i].mnt->mnt_root,
+ "lowerdir", true);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static struct dentry *ovl_get_root(struct super_block *sb,
+ struct dentry *upperdentry,
+ struct ovl_entry *oe)
+{
+ struct dentry *root;
+ struct ovl_path *lowerpath = &oe->lowerstack[0];
+ unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
+ int fsid = lowerpath->layer->fsid;
+ struct ovl_inode_params oip = {
+ .upperdentry = upperdentry,
+ .lowerpath = lowerpath,
+ };
+
+ root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+ if (!root)
+ return NULL;
+
+ root->d_fsdata = oe;
+
+ if (upperdentry) {
+ /* Root inode uses upper st_ino/i_ino */
+ ino = d_inode(upperdentry)->i_ino;
+ fsid = 0;
+ ovl_dentry_set_upper_alias(root);
+ if (ovl_is_impuredir(sb, upperdentry))
+ ovl_set_flag(OVL_IMPURE, d_inode(root));
+ }
+
+ /* Root is always merge -> can have whiteouts */
+ ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
+ ovl_dentry_set_flag(OVL_E_CONNECTED, root);
+ ovl_set_upperdata(d_inode(root));
+ ovl_inode_init(d_inode(root), &oip, ino, fsid);
+ ovl_dentry_init_flags(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE);
+
+ return root;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct path upperpath = { };
+ struct dentry *root_dentry;
+ struct ovl_entry *oe;
+ struct ovl_fs *ofs;
+ struct ovl_layer *layers;
+ struct cred *cred;
+ char *splitlower = NULL;
+ unsigned int numlower;
+ int err;
+
+ err = -EIO;
+ if (WARN_ON(sb->s_user_ns != current_user_ns()))
+ goto out;
+
+ sb->s_d_op = &ovl_dentry_operations;
+
+ err = -ENOMEM;
+ ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+ if (!ofs)
+ goto out;
+
+ err = -ENOMEM;
+ ofs->creator_cred = cred = prepare_creds();
+ if (!cred)
+ goto out_err;
+
+ /* Is there a reason anyone would want not to share whiteouts? */
+ ofs->share_whiteout = true;
+
+ ofs->config.index = ovl_index_def;
+ ofs->config.uuid = true;
+ ofs->config.nfs_export = ovl_nfs_export_def;
+ ofs->config.xino = ovl_xino_def();
+ ofs->config.metacopy = ovl_metacopy_def;
+ err = ovl_parse_opt((char *) data, &ofs->config);
+ if (err)
+ goto out_err;
+
+ err = -EINVAL;
+ if (!ofs->config.lowerdir) {
+ if (!silent)
+ pr_err("missing 'lowerdir'\n");
+ goto out_err;
+ }
+
+ err = -ENOMEM;
+ splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
+ if (!splitlower)
+ goto out_err;
+
+ err = -EINVAL;
+ numlower = ovl_split_lowerdirs(splitlower);
+ if (numlower > OVL_MAX_STACK) {
+ pr_err("too many lower directories, limit is %d\n",
+ OVL_MAX_STACK);
+ goto out_err;
+ }
+
+ err = -ENOMEM;
+ layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
+ if (!layers)
+ goto out_err;
+
+ ofs->layers = layers;
+ /* Layer 0 is reserved for upper even if there's no upper */
+ ofs->numlayer = 1;
+
+ sb->s_stack_depth = 0;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ atomic_long_set(&ofs->last_ino, 1);
+ /* Assume underlying fs uses 32bit inodes unless proven otherwise */
+ if (ofs->config.xino != OVL_XINO_OFF) {
+ ofs->xino_mode = BITS_PER_LONG - 32;
+ if (!ofs->xino_mode) {
+ pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n");
+ ofs->config.xino = OVL_XINO_OFF;
+ }
+ }
+
+ /* alloc/destroy_inode needed for setting up traps in inode cache */
+ sb->s_op = &ovl_super_operations;
+
+ if (ofs->config.upperdir) {
+ struct super_block *upper_sb;
+
+ err = -EINVAL;
+ if (!ofs->config.workdir) {
+ pr_err("missing 'workdir'\n");
+ goto out_err;
+ }
+
+ err = ovl_get_upper(sb, ofs, &layers[0], &upperpath);
+ if (err)
+ goto out_err;
+
+ upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
+ if (!ovl_should_sync(ofs)) {
+ ofs->errseq = errseq_sample(&upper_sb->s_wb_err);
+ if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
+ err = -EIO;
+ pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
+ goto out_err;
+ }
+ }
+
+ err = ovl_get_workdir(sb, ofs, &upperpath);
+ if (err)
+ goto out_err;
+
+ if (!ofs->workdir)
+ sb->s_flags |= SB_RDONLY;
+
+ sb->s_stack_depth = upper_sb->s_stack_depth;
+ sb->s_time_gran = upper_sb->s_time_gran;
+ }
+ oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
+ err = PTR_ERR(oe);
+ if (IS_ERR(oe))
+ goto out_err;
+
+ /* If the upper fs is nonexistent, we mark overlayfs r/o too */
+ if (!ovl_upper_mnt(ofs))
+ sb->s_flags |= SB_RDONLY;
+
+ if (!ofs->config.uuid && ofs->numfs > 1) {
+ pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n");
+ ofs->config.uuid = true;
+ }
+
+ if (!ovl_force_readonly(ofs) && ofs->config.index) {
+ err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
+ if (err)
+ goto out_free_oe;
+
+ /* Force r/o mount with no index dir */
+ if (!ofs->indexdir)
+ sb->s_flags |= SB_RDONLY;
+ }
+
+ err = ovl_check_overlapping_layers(sb, ofs);
+ if (err)
+ goto out_free_oe;
+
+ /* Show index=off in /proc/mounts for forced r/o mount */
+ if (!ofs->indexdir) {
+ ofs->config.index = false;
+ if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) {
+ pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
+ ofs->config.nfs_export = false;
+ }
+ }
+
+ if (ofs->config.metacopy && ofs->config.nfs_export) {
+ pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n");
+ ofs->config.nfs_export = false;
+ }
+
+ if (ofs->config.nfs_export)
+ sb->s_export_op = &ovl_export_operations;
+
+ /* Never override disk quota limits or use reserved space */
+ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
+
+ sb->s_magic = OVERLAYFS_SUPER_MAGIC;
+ sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
+ ovl_trusted_xattr_handlers;
+ sb->s_fs_info = ofs;
+ sb->s_flags |= SB_POSIXACL;
+ sb->s_iflags |= SB_I_SKIP_SYNC;
+
+ err = -ENOMEM;
+ root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
+ if (!root_dentry)
+ goto out_free_oe;
+
+ mntput(upperpath.mnt);
+ kfree(splitlower);
+
+ sb->s_root = root_dentry;
+
+ return 0;
+
+out_free_oe:
+ ovl_entry_stack_free(oe);
+ kfree(oe);
+out_err:
+ kfree(splitlower);
+ path_put(&upperpath);
+ ovl_free_fs(ofs);
+out:
+ return err;
+}
+
+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *raw_data)
+{
+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
+}
+
+static struct file_system_type ovl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "overlay",
+ .fs_flags = FS_USERNS_MOUNT,
+ .mount = ovl_mount,
+ .kill_sb = kill_anon_super,
+};
+MODULE_ALIAS_FS("overlay");
+
+static void ovl_inode_init_once(void *foo)
+{
+ struct ovl_inode *oi = foo;
+
+ inode_init_once(&oi->vfs_inode);
+}
+
+static int __init ovl_init(void)
+{
+ int err;
+
+ ovl_inode_cachep = kmem_cache_create("ovl_inode",
+ sizeof(struct ovl_inode), 0,
+ (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ ovl_inode_init_once);
+ if (ovl_inode_cachep == NULL)
+ return -ENOMEM;
+
+ err = ovl_aio_request_cache_init();
+ if (!err) {
+ err = register_filesystem(&ovl_fs_type);
+ if (!err)
+ return 0;
+
+ ovl_aio_request_cache_destroy();
+ }
+ kmem_cache_destroy(ovl_inode_cachep);
+
+ return err;
+}
+
+static void __exit ovl_exit(void)
+{
+ unregister_filesystem(&ovl_fs_type);
+
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(ovl_inode_cachep);
+ ovl_aio_request_cache_destroy();
+}
+
+module_init(ovl_init);
+module_exit(ovl_exit);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
new file mode 100644
index 000000000..0d8f96168
--- /dev/null
+++ b/fs/overlayfs/util.c
@@ -0,0 +1,1136 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2011 Novell Inc.
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/xattr.h>
+#include <linux/exportfs.h>
+#include <linux/fileattr.h>
+#include <linux/uuid.h>
+#include <linux/namei.h>
+#include <linux/ratelimit.h>
+#include "overlayfs.h"
+
+int ovl_want_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return mnt_want_write(ovl_upper_mnt(ofs));
+}
+
+void ovl_drop_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ mnt_drop_write(ovl_upper_mnt(ofs));
+}
+
+struct dentry *ovl_workdir(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return ofs->workdir;
+}
+
+const struct cred *ovl_override_creds(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return override_creds(ofs->creator_cred);
+}
+
+/*
+ * Check if underlying fs supports file handles and try to determine encoding
+ * type, in order to deduce maximum inode number used by fs.
+ *
+ * Return 0 if file handles are not supported.
+ * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding.
+ * Return -1 if fs uses a non default encoding with unknown inode size.
+ */
+int ovl_can_decode_fh(struct super_block *sb)
+{
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return 0;
+
+ if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
+ return 0;
+
+ return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
+}
+
+struct dentry *ovl_indexdir(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->indexdir;
+}
+
+/* Index all files on copy up. For now only enabled for NFS export */
+bool ovl_index_all(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->config.nfs_export && ofs->config.index;
+}
+
+/* Verify lower origin on lookup. For now only enabled for NFS export */
+bool ovl_verify_lower(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->config.nfs_export && ofs->config.index;
+}
+
+struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
+{
+ size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
+ struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+
+ if (oe)
+ oe->numlower = numlower;
+
+ return oe;
+}
+
+#define OVL_D_REVALIDATE (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE)
+
+bool ovl_dentry_remote(struct dentry *dentry)
+{
+ return dentry->d_flags & OVL_D_REVALIDATE;
+}
+
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry)
+{
+ if (!ovl_dentry_remote(realdentry))
+ return;
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= realdentry->d_flags & OVL_D_REVALIDATE;
+ spin_unlock(&dentry->d_lock);
+}
+
+void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry)
+{
+ return ovl_dentry_init_flags(dentry, upperdentry, OVL_D_REVALIDATE);
+}
+
+void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask)
+{
+ struct ovl_entry *oe = OVL_E(dentry);
+ unsigned int i, flags = 0;
+
+ if (upperdentry)
+ flags |= upperdentry->d_flags;
+ for (i = 0; i < oe->numlower; i++)
+ flags |= oe->lowerstack[i].dentry->d_flags;
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~mask;
+ dentry->d_flags |= flags & mask;
+ spin_unlock(&dentry->d_lock);
+}
+
+bool ovl_dentry_weird(struct dentry *dentry)
+{
+ return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
+ DCACHE_MANAGE_TRANSIT |
+ DCACHE_OP_HASH |
+ DCACHE_OP_COMPARE);
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ enum ovl_path_type type = 0;
+
+ if (ovl_dentry_upper(dentry)) {
+ type = __OVL_PATH_UPPER;
+
+ /*
+ * Non-dir dentry can hold lower dentry of its copy up origin.
+ */
+ if (oe->numlower) {
+ if (ovl_test_flag(OVL_CONST_INO, d_inode(dentry)))
+ type |= __OVL_PATH_ORIGIN;
+ if (d_is_dir(dentry) ||
+ !ovl_has_upperdata(d_inode(dentry)))
+ type |= __OVL_PATH_MERGE;
+ }
+ } else {
+ if (oe->numlower > 1)
+ type |= __OVL_PATH_MERGE;
+ }
+ return type;
+}
+
+void ovl_path_upper(struct dentry *dentry, struct path *path)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+
+ path->mnt = ovl_upper_mnt(ofs);
+ path->dentry = ovl_dentry_upper(dentry);
+}
+
+void ovl_path_lower(struct dentry *dentry, struct path *path)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe->numlower) {
+ path->mnt = oe->lowerstack[0].layer->mnt;
+ path->dentry = oe->lowerstack[0].dentry;
+ } else {
+ *path = (struct path) { };
+ }
+}
+
+void ovl_path_lowerdata(struct dentry *dentry, struct path *path)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe->numlower) {
+ path->mnt = oe->lowerstack[oe->numlower - 1].layer->mnt;
+ path->dentry = oe->lowerstack[oe->numlower - 1].dentry;
+ } else {
+ *path = (struct path) { };
+ }
+}
+
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
+{
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (!OVL_TYPE_UPPER(type))
+ ovl_path_lower(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
+enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path)
+{
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ WARN_ON_ONCE(d_is_dir(dentry));
+
+ if (!OVL_TYPE_UPPER(type) || OVL_TYPE_MERGE(type))
+ ovl_path_lowerdata(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
+struct dentry *ovl_dentry_upper(struct dentry *dentry)
+{
+ return ovl_upperdentry_dereference(OVL_I(d_inode(dentry)));
+}
+
+struct dentry *ovl_dentry_lower(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->numlower ? oe->lowerstack[0].dentry : NULL;
+}
+
+const struct ovl_layer *ovl_layer_lower(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->numlower ? oe->lowerstack[0].layer : NULL;
+}
+
+/*
+ * ovl_dentry_lower() could return either a data dentry or metacopy dentry
+ * depending on what is stored in lowerstack[0]. At times we need to find
+ * lower dentry which has data (and not metacopy dentry). This helper
+ * returns the lower data dentry.
+ */
+struct dentry *ovl_dentry_lowerdata(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->numlower ? oe->lowerstack[oe->numlower - 1].dentry : NULL;
+}
+
+struct dentry *ovl_dentry_real(struct dentry *dentry)
+{
+ return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry);
+}
+
+struct dentry *ovl_i_dentry_upper(struct inode *inode)
+{
+ return ovl_upperdentry_dereference(OVL_I(inode));
+}
+
+struct inode *ovl_i_path_real(struct inode *inode, struct path *path)
+{
+ path->dentry = ovl_i_dentry_upper(inode);
+ if (!path->dentry) {
+ path->dentry = OVL_I(inode)->lowerpath.dentry;
+ path->mnt = OVL_I(inode)->lowerpath.layer->mnt;
+ } else {
+ path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb));
+ }
+
+ return path->dentry ? d_inode_rcu(path->dentry) : NULL;
+}
+
+struct inode *ovl_inode_upper(struct inode *inode)
+{
+ struct dentry *upperdentry = ovl_i_dentry_upper(inode);
+
+ return upperdentry ? d_inode(upperdentry) : NULL;
+}
+
+struct inode *ovl_inode_lower(struct inode *inode)
+{
+ struct dentry *lowerdentry = OVL_I(inode)->lowerpath.dentry;
+
+ return lowerdentry ? d_inode(lowerdentry) : NULL;
+}
+
+struct inode *ovl_inode_real(struct inode *inode)
+{
+ return ovl_inode_upper(inode) ?: ovl_inode_lower(inode);
+}
+
+/* Return inode which contains lower data. Do not return metacopy */
+struct inode *ovl_inode_lowerdata(struct inode *inode)
+{
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return NULL;
+
+ return OVL_I(inode)->lowerdata ?: ovl_inode_lower(inode);
+}
+
+/* Return real inode which contains data. Does not return metacopy inode */
+struct inode *ovl_inode_realdata(struct inode *inode)
+{
+ struct inode *upperinode;
+
+ upperinode = ovl_inode_upper(inode);
+ if (upperinode && ovl_has_upperdata(inode))
+ return upperinode;
+
+ return ovl_inode_lowerdata(inode);
+}
+
+struct ovl_dir_cache *ovl_dir_cache(struct inode *inode)
+{
+ return OVL_I(inode)->cache;
+}
+
+void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache)
+{
+ OVL_I(inode)->cache = cache;
+}
+
+void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry)
+{
+ set_bit(flag, &OVL_E(dentry)->flags);
+}
+
+void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry)
+{
+ clear_bit(flag, &OVL_E(dentry)->flags);
+}
+
+bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry)
+{
+ return test_bit(flag, &OVL_E(dentry)->flags);
+}
+
+bool ovl_dentry_is_opaque(struct dentry *dentry)
+{
+ return ovl_dentry_test_flag(OVL_E_OPAQUE, dentry);
+}
+
+bool ovl_dentry_is_whiteout(struct dentry *dentry)
+{
+ return !dentry->d_inode && ovl_dentry_is_opaque(dentry);
+}
+
+void ovl_dentry_set_opaque(struct dentry *dentry)
+{
+ ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
+}
+
+/*
+ * For hard links and decoded file handles, it's possible for ovl_dentry_upper()
+ * to return positive, while there's no actual upper alias for the inode.
+ * Copy up code needs to know about the existence of the upper alias, so it
+ * can't use ovl_dentry_upper().
+ */
+bool ovl_dentry_has_upper_alias(struct dentry *dentry)
+{
+ return ovl_dentry_test_flag(OVL_E_UPPER_ALIAS, dentry);
+}
+
+void ovl_dentry_set_upper_alias(struct dentry *dentry)
+{
+ ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry);
+}
+
+static bool ovl_should_check_upperdata(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return false;
+
+ if (!ovl_inode_lower(inode))
+ return false;
+
+ return true;
+}
+
+bool ovl_has_upperdata(struct inode *inode)
+{
+ if (!ovl_should_check_upperdata(inode))
+ return true;
+
+ if (!ovl_test_flag(OVL_UPPERDATA, inode))
+ return false;
+ /*
+ * Pairs with smp_wmb() in ovl_set_upperdata(). Main user of
+ * ovl_has_upperdata() is ovl_copy_up_meta_inode_data(). Make sure
+ * if setting of OVL_UPPERDATA is visible, then effects of writes
+ * before that are visible too.
+ */
+ smp_rmb();
+ return true;
+}
+
+void ovl_set_upperdata(struct inode *inode)
+{
+ /*
+ * Pairs with smp_rmb() in ovl_has_upperdata(). Make sure
+ * if OVL_UPPERDATA flag is visible, then effects of write operations
+ * before it are visible as well.
+ */
+ smp_wmb();
+ ovl_set_flag(OVL_UPPERDATA, inode);
+}
+
+/* Caller should hold ovl_inode->lock */
+bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags)
+{
+ if (!ovl_open_flags_need_copy_up(flags))
+ return false;
+
+ return !ovl_test_flag(OVL_UPPERDATA, d_inode(dentry));
+}
+
+bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags)
+{
+ if (!ovl_open_flags_need_copy_up(flags))
+ return false;
+
+ return !ovl_has_upperdata(d_inode(dentry));
+}
+
+bool ovl_redirect_dir(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->config.redirect_dir && !ofs->noxattr;
+}
+
+const char *ovl_dentry_get_redirect(struct dentry *dentry)
+{
+ return OVL_I(d_inode(dentry))->redirect;
+}
+
+void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
+{
+ struct ovl_inode *oi = OVL_I(d_inode(dentry));
+
+ kfree(oi->redirect);
+ oi->redirect = redirect;
+}
+
+void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
+{
+ struct inode *upperinode = d_inode(upperdentry);
+
+ WARN_ON(OVL_I(inode)->__upperdentry);
+
+ /*
+ * Make sure upperdentry is consistent before making it visible
+ */
+ smp_wmb();
+ OVL_I(inode)->__upperdentry = upperdentry;
+ if (inode_unhashed(inode)) {
+ inode->i_private = upperinode;
+ __insert_inode_hash(inode, (unsigned long) upperinode);
+ }
+}
+
+static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
+{
+ struct inode *inode = d_inode(dentry);
+
+ WARN_ON(!inode_is_locked(inode));
+ WARN_ON(!d_is_dir(dentry));
+ /*
+ * Version is used by readdir code to keep cache consistent.
+ * For merge dirs (or dirs with origin) all changes need to be noted.
+ * For non-merge dirs, cache contains only impure entries (i.e. ones
+ * which have been copied up and have origins), so only need to note
+ * changes to impure entries.
+ */
+ if (!ovl_dir_is_real(dentry) || impurity)
+ OVL_I(inode)->version++;
+}
+
+void ovl_dir_modified(struct dentry *dentry, bool impurity)
+{
+ /* Copy mtime/ctime */
+ ovl_copyattr(d_inode(dentry));
+
+ ovl_dir_version_inc(dentry, impurity);
+}
+
+u64 ovl_dentry_version_get(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ WARN_ON(!inode_is_locked(inode));
+ return OVL_I(inode)->version;
+}
+
+bool ovl_is_whiteout(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ return inode && IS_WHITEOUT(inode);
+}
+
+struct file *ovl_path_open(const struct path *path, int flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt);
+ int err, acc_mode;
+
+ if (flags & ~(O_ACCMODE | O_LARGEFILE))
+ BUG();
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ acc_mode = MAY_READ;
+ break;
+ case O_WRONLY:
+ acc_mode = MAY_WRITE;
+ break;
+ default:
+ BUG();
+ }
+
+ err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN);
+ if (err)
+ return ERR_PTR(err);
+
+ /* O_NOATIME is an optimization, don't fail if not permitted */
+ if (inode_owner_or_capable(real_mnt_userns, inode))
+ flags |= O_NOATIME;
+
+ return dentry_open(path, flags, current_cred());
+}
+
+/* Caller should hold ovl_inode->lock */
+static bool ovl_already_copied_up_locked(struct dentry *dentry, int flags)
+{
+ bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED;
+
+ if (ovl_dentry_upper(dentry) &&
+ (ovl_dentry_has_upper_alias(dentry) || disconnected) &&
+ !ovl_dentry_needs_data_copy_up_locked(dentry, flags))
+ return true;
+
+ return false;
+}
+
+bool ovl_already_copied_up(struct dentry *dentry, int flags)
+{
+ bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED;
+
+ /*
+ * Check if copy-up has happened as well as for upper alias (in
+ * case of hard links) is there.
+ *
+ * Both checks are lockless:
+ * - false negatives: will recheck under oi->lock
+ * - false positives:
+ * + ovl_dentry_upper() uses memory barriers to ensure the
+ * upper dentry is up-to-date
+ * + ovl_dentry_has_upper_alias() relies on locking of
+ * upper parent i_rwsem to prevent reordering copy-up
+ * with rename.
+ */
+ if (ovl_dentry_upper(dentry) &&
+ (ovl_dentry_has_upper_alias(dentry) || disconnected) &&
+ !ovl_dentry_needs_data_copy_up(dentry, flags))
+ return true;
+
+ return false;
+}
+
+int ovl_copy_up_start(struct dentry *dentry, int flags)
+{
+ struct inode *inode = d_inode(dentry);
+ int err;
+
+ err = ovl_inode_lock_interruptible(inode);
+ if (!err && ovl_already_copied_up_locked(dentry, flags)) {
+ err = 1; /* Already copied up */
+ ovl_inode_unlock(inode);
+ }
+
+ return err;
+}
+
+void ovl_copy_up_end(struct dentry *dentry)
+{
+ ovl_inode_unlock(d_inode(dentry));
+}
+
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ int res;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0);
+
+ /* Zero size value means "copied up but origin unknown" */
+ if (res >= 0)
+ return true;
+
+ return false;
+}
+
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox)
+{
+ int res;
+ char val;
+
+ if (!d_is_dir(path->dentry))
+ return false;
+
+ res = ovl_path_getxattr(ofs, path, ox, &val, 1);
+ if (res == 1 && val == 'y')
+ return true;
+
+ return false;
+}
+
+#define OVL_XATTR_OPAQUE_POSTFIX "opaque"
+#define OVL_XATTR_REDIRECT_POSTFIX "redirect"
+#define OVL_XATTR_ORIGIN_POSTFIX "origin"
+#define OVL_XATTR_IMPURE_POSTFIX "impure"
+#define OVL_XATTR_NLINK_POSTFIX "nlink"
+#define OVL_XATTR_UPPER_POSTFIX "upper"
+#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
+#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
+
+#define OVL_XATTR_TAB_ENTRY(x) \
+ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
+ [true] = OVL_XATTR_USER_PREFIX x ## _POSTFIX }
+
+const char *const ovl_xattr_table[][2] = {
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
+};
+
+int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
+ enum ovl_xattr ox, const void *value, size_t size,
+ int xerr)
+{
+ int err;
+
+ if (ofs->noxattr)
+ return xerr;
+
+ err = ovl_setxattr(ofs, upperdentry, ox, value, size);
+
+ if (err == -EOPNOTSUPP) {
+ pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox));
+ ofs->noxattr = true;
+ return xerr;
+ }
+
+ return err;
+}
+
+int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ int err;
+
+ if (ovl_test_flag(OVL_IMPURE, d_inode(dentry)))
+ return 0;
+
+ /*
+ * Do not fail when upper doesn't support xattrs.
+ * Upper inodes won't have origin nor redirect xattr anyway.
+ */
+ err = ovl_check_setxattr(ofs, upperdentry, OVL_XATTR_IMPURE, "y", 1, 0);
+ if (!err)
+ ovl_set_flag(OVL_IMPURE, d_inode(dentry));
+
+ return err;
+}
+
+
+#define OVL_PROTATTR_MAX 32 /* Reserved for future flags */
+
+void ovl_check_protattr(struct inode *inode, struct dentry *upper)
+{
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+ u32 iflags = inode->i_flags & OVL_PROT_I_FLAGS_MASK;
+ char buf[OVL_PROTATTR_MAX+1];
+ int res, n;
+
+ res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf,
+ OVL_PROTATTR_MAX);
+ if (res < 0)
+ return;
+
+ /*
+ * Initialize inode flags from overlay.protattr xattr and upper inode
+ * flags. If upper inode has those fileattr flags set (i.e. from old
+ * kernel), we do not clear them on ovl_get_inode(), but we will clear
+ * them on next fileattr_set().
+ */
+ for (n = 0; n < res; n++) {
+ if (buf[n] == 'a')
+ iflags |= S_APPEND;
+ else if (buf[n] == 'i')
+ iflags |= S_IMMUTABLE;
+ else
+ break;
+ }
+
+ if (!res || n < res) {
+ pr_warn_ratelimited("incompatible overlay.protattr format (%pd2, len=%d)\n",
+ upper, res);
+ } else {
+ inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK);
+ }
+}
+
+int ovl_set_protattr(struct inode *inode, struct dentry *upper,
+ struct fileattr *fa)
+{
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+ char buf[OVL_PROTATTR_MAX];
+ int len = 0, err = 0;
+ u32 iflags = 0;
+
+ BUILD_BUG_ON(HWEIGHT32(OVL_PROT_FS_FLAGS_MASK) > OVL_PROTATTR_MAX);
+
+ if (fa->flags & FS_APPEND_FL) {
+ buf[len++] = 'a';
+ iflags |= S_APPEND;
+ }
+ if (fa->flags & FS_IMMUTABLE_FL) {
+ buf[len++] = 'i';
+ iflags |= S_IMMUTABLE;
+ }
+
+ /*
+ * Do not allow to set protection flags when upper doesn't support
+ * xattrs, because we do not set those fileattr flags on upper inode.
+ * Remove xattr if it exist and all protection flags are cleared.
+ */
+ if (len) {
+ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR,
+ buf, len, -EPERM);
+ } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) {
+ err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR);
+ if (err == -EOPNOTSUPP || err == -ENODATA)
+ err = 0;
+ }
+ if (err)
+ return err;
+
+ inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK);
+
+ /* Mask out the fileattr flags that should not be set in upper inode */
+ fa->flags &= ~OVL_PROT_FS_FLAGS_MASK;
+ fa->fsx_xflags &= ~OVL_PROT_FSX_FLAGS_MASK;
+
+ return 0;
+}
+
+/**
+ * Caller must hold a reference to inode to prevent it from being freed while
+ * it is marked inuse.
+ */
+bool ovl_inuse_trylock(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ bool locked = false;
+
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_OVL_INUSE)) {
+ inode->i_state |= I_OVL_INUSE;
+ locked = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ return locked;
+}
+
+void ovl_inuse_unlock(struct dentry *dentry)
+{
+ if (dentry) {
+ struct inode *inode = d_inode(dentry);
+
+ spin_lock(&inode->i_lock);
+ WARN_ON(!(inode->i_state & I_OVL_INUSE));
+ inode->i_state &= ~I_OVL_INUSE;
+ spin_unlock(&inode->i_lock);
+ }
+}
+
+bool ovl_is_inuse(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ bool inuse;
+
+ spin_lock(&inode->i_lock);
+ inuse = (inode->i_state & I_OVL_INUSE);
+ spin_unlock(&inode->i_lock);
+
+ return inuse;
+}
+
+/*
+ * Does this overlay dentry need to be indexed on copy up?
+ */
+bool ovl_need_index(struct dentry *dentry)
+{
+ struct dentry *lower = ovl_dentry_lower(dentry);
+
+ if (!lower || !ovl_indexdir(dentry->d_sb))
+ return false;
+
+ /* Index all files for NFS export and consistency verification */
+ if (ovl_index_all(dentry->d_sb))
+ return true;
+
+ /* Index only lower hardlinks on copy up */
+ if (!d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
+ return true;
+
+ return false;
+}
+
+/* Caller must hold OVL_I(inode)->lock */
+static void ovl_cleanup_index(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
+ struct inode *dir = indexdir->d_inode;
+ struct dentry *lowerdentry = ovl_dentry_lower(dentry);
+ struct dentry *upperdentry = ovl_dentry_upper(dentry);
+ struct dentry *index = NULL;
+ struct inode *inode;
+ struct qstr name = { };
+ int err;
+
+ err = ovl_get_index_name(ofs, lowerdentry, &name);
+ if (err)
+ goto fail;
+
+ inode = d_inode(upperdentry);
+ if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) {
+ pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
+ upperdentry, inode->i_ino, inode->i_nlink);
+ /*
+ * We either have a bug with persistent union nlink or a lower
+ * hardlink was added while overlay is mounted. Adding a lower
+ * hardlink and then unlinking all overlay hardlinks would drop
+ * overlay nlink to zero before all upper inodes are unlinked.
+ * As a safety measure, when that situation is detected, set
+ * the overlay nlink to the index inode nlink minus one for the
+ * index entry itself.
+ */
+ set_nlink(d_inode(dentry), inode->i_nlink - 1);
+ ovl_set_nlink_upper(dentry);
+ goto out;
+ }
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
+ err = PTR_ERR(index);
+ if (IS_ERR(index)) {
+ index = NULL;
+ } else if (ovl_index_all(dentry->d_sb)) {
+ /* Whiteout orphan index to block future open by handle */
+ err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb),
+ dir, index);
+ } else {
+ /* Cleanup orphan index entries */
+ err = ovl_cleanup(ofs, dir, index);
+ }
+
+ inode_unlock(dir);
+ if (err)
+ goto fail;
+
+out:
+ kfree(name.name);
+ dput(index);
+ return;
+
+fail:
+ pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err);
+ goto out;
+}
+
+/*
+ * Operations that change overlay inode and upper inode nlink need to be
+ * synchronized with copy up for persistent nlink accounting.
+ */
+int ovl_nlink_start(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ const struct cred *old_cred;
+ int err;
+
+ if (WARN_ON(!inode))
+ return -ENOENT;
+
+ /*
+ * With inodes index is enabled, we store the union overlay nlink
+ * in an xattr on the index inode. When whiting out an indexed lower,
+ * we need to decrement the overlay persistent nlink, but before the
+ * first copy up, we have no upper index inode to store the xattr.
+ *
+ * As a workaround, before whiteout/rename over an indexed lower,
+ * copy up to create the upper index. Creating the upper index will
+ * initialize the overlay nlink, so it could be dropped if unlink
+ * or rename succeeds.
+ *
+ * TODO: implement metadata only index copy up when called with
+ * ovl_copy_up_flags(dentry, O_PATH).
+ */
+ if (ovl_need_index(dentry) && !ovl_dentry_has_upper_alias(dentry)) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ return err;
+ }
+
+ err = ovl_inode_lock_interruptible(inode);
+ if (err)
+ return err;
+
+ if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
+ goto out;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ /*
+ * The overlay inode nlink should be incremented/decremented IFF the
+ * upper operation succeeds, along with nlink change of upper inode.
+ * Therefore, before link/unlink/rename, we store the union nlink
+ * value relative to the upper inode nlink in an upper inode xattr.
+ */
+ err = ovl_set_nlink_upper(dentry);
+ revert_creds(old_cred);
+
+out:
+ if (err)
+ ovl_inode_unlock(inode);
+
+ return err;
+}
+
+void ovl_nlink_end(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ ovl_cleanup_index(dentry);
+ revert_creds(old_cred);
+ }
+
+ ovl_inode_unlock(inode);
+}
+
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
+{
+ /* Workdir should not be the same as upperdir */
+ if (workdir == upperdir)
+ goto err;
+
+ /* Workdir should not be subdir of upperdir and vice versa */
+ if (lock_rename(workdir, upperdir) != NULL)
+ goto err_unlock;
+
+ return 0;
+
+err_unlock:
+ unlock_rename(workdir, upperdir);
+err:
+ pr_err("failed to lock workdir+upperdir\n");
+ return -EIO;
+}
+
+/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ int res;
+
+ /* Only regular files can have metacopy xattr */
+ if (!S_ISREG(d_inode(path->dentry)->i_mode))
+ return 0;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0);
+ if (res < 0) {
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ return 0;
+ /*
+ * getxattr on user.* may fail with EACCES in case there's no
+ * read permission on the inode. Not much we can do, other than
+ * tell the caller that this is not a metacopy inode.
+ */
+ if (ofs->config.userxattr && res == -EACCES)
+ return 0;
+ goto out;
+ }
+
+ return 1;
+out:
+ pr_warn_ratelimited("failed to get metacopy (%i)\n", res);
+ return res;
+}
+
+bool ovl_is_metacopy_dentry(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (!d_is_reg(dentry))
+ return false;
+
+ if (ovl_dentry_upper(dentry)) {
+ if (!ovl_has_upperdata(d_inode(dentry)))
+ return true;
+ return false;
+ }
+
+ return (oe->numlower > 1);
+}
+
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding)
+{
+ int res;
+ char *s, *next, *buf = NULL;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0);
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ return NULL;
+ if (res < 0)
+ goto fail;
+ if (res == 0)
+ goto invalid;
+
+ buf = kzalloc(res + padding + 1, GFP_KERNEL);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res);
+ if (res < 0)
+ goto fail;
+ if (res == 0)
+ goto invalid;
+
+ if (buf[0] == '/') {
+ for (s = buf; *s++ == '/'; s = next) {
+ next = strchrnul(s, '/');
+ if (s == next)
+ goto invalid;
+ }
+ } else {
+ if (strchr(buf, '/') != NULL)
+ goto invalid;
+ }
+
+ return buf;
+invalid:
+ pr_warn_ratelimited("invalid redirect (%s)\n", buf);
+ res = -EINVAL;
+ goto err_free;
+fail:
+ pr_warn_ratelimited("failed to get redirect (%i)\n", res);
+err_free:
+ kfree(buf);
+ return ERR_PTR(res);
+}
+
+/*
+ * ovl_sync_status() - Check fs sync status for volatile mounts
+ *
+ * Returns 1 if this is not a volatile mount and a real sync is required.
+ *
+ * Returns 0 if syncing can be skipped because mount is volatile, and no errors
+ * have occurred on the upperdir since the mount.
+ *
+ * Returns -errno if it is a volatile mount, and the error that occurred since
+ * the last mount. If the error code changes, it'll return the latest error
+ * code.
+ */
+
+int ovl_sync_status(struct ovl_fs *ofs)
+{
+ struct vfsmount *mnt;
+
+ if (ovl_should_sync(ofs))
+ return 1;
+
+ mnt = ovl_upper_mnt(ofs);
+ if (!mnt)
+ return 0;
+
+ return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq);
+}
+
+/*
+ * ovl_copyattr() - copy inode attributes from layer to ovl inode
+ *
+ * When overlay copies inode information from an upper or lower layer to the
+ * relevant overlay inode it will apply the idmapping of the upper or lower
+ * layer when doing so ensuring that the ovl inode ownership will correctly
+ * reflect the ownership of the idmapped upper or lower layer. For example, an
+ * idmapped upper or lower layer mapping id 1001 to id 1000 will take care to
+ * map any lower or upper inode owned by id 1001 to id 1000. These mapping
+ * helpers are nops when the relevant layer isn't idmapped.
+ */
+void ovl_copyattr(struct inode *inode)
+{
+ struct path realpath;
+ struct inode *realinode;
+ struct user_namespace *real_mnt_userns;
+
+ realinode = ovl_i_path_real(inode, &realpath);
+ real_mnt_userns = mnt_user_ns(realpath.mnt);
+
+ inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode);
+ inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode);
+ inode->i_mode = realinode->i_mode;
+ inode->i_atime = realinode->i_atime;
+ inode->i_mtime = realinode->i_mtime;
+ inode->i_ctime = realinode->i_ctime;
+ i_size_write(inode, i_size_read(realinode));
+}