diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
commit | 76cb841cb886eef6b3bee341a2266c76578724ad (patch) | |
tree | f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /fs/overlayfs | |
parent | Initial commit. (diff) | |
download | linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip |
Adding upstream version 4.19.249.upstream/4.19.249
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/overlayfs')
-rw-r--r-- | fs/overlayfs/Kconfig | 124 | ||||
-rw-r--r-- | fs/overlayfs/Makefile | 8 | ||||
-rw-r--r-- | fs/overlayfs/copy_up.c | 906 | ||||
-rw-r--r-- | fs/overlayfs/dir.c | 1259 | ||||
-rw-r--r-- | fs/overlayfs/export.c | 850 | ||||
-rw-r--r-- | fs/overlayfs/file.c | 653 | ||||
-rw-r--r-- | fs/overlayfs/inode.c | 982 | ||||
-rw-r--r-- | fs/overlayfs/namei.c | 1164 | ||||
-rw-r--r-- | fs/overlayfs/overlayfs.h | 426 | ||||
-rw-r--r-- | fs/overlayfs/ovl_entry.h | 122 | ||||
-rw-r--r-- | fs/overlayfs/readdir.c | 1161 | ||||
-rw-r--r-- | fs/overlayfs/super.c | 1757 | ||||
-rw-r--r-- | fs/overlayfs/util.c | 945 |
13 files changed, 10357 insertions, 0 deletions
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000..2ef91be2a --- /dev/null +++ b/fs/overlayfs/Kconfig @@ -0,0 +1,124 @@ +config OVERLAY_FS + tristate "Overlay filesystem support" + select EXPORTFS + help + An overlay filesystem combines two filesystems - an 'upper' filesystem + and a 'lower' filesystem. When a name exists in both filesystems, the + object in the 'upper' filesystem is visible while the object in the + 'lower' filesystem is either hidden or, in the case of directories, + merged with the 'upper' object. + + For more information see Documentation/filesystems/overlayfs.txt + +config OVERLAY_FS_REDIRECT_DIR + bool "Overlayfs: turn on redirect directory feature by default" + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + redirects when renaming directories by default. In this case it is + still possible to turn off redirects globally with the + "redirect_dir=off" module option or on a filesystem instance basis + with the "redirect_dir=off" mount option. + + Note, that redirects are not backward compatible. That is, mounting + an overlay which has redirects on a kernel that doesn't support this + feature will have unexpected results. + + If unsure, say N. + +config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW + bool "Overlayfs: follow redirects even if redirects are turned off" + default y + depends on OVERLAY_FS + help + Disable this to get a possibly more secure configuration, but that + might not be backward compatible with previous kernels. + + If backward compatibility is not an issue, then it is safe and + recommended to say N here. + + For more information, see Documentation/filesystems/overlayfs.txt + + If unsure, say Y. + +config OVERLAY_FS_INDEX + bool "Overlayfs: turn on inodes index feature by default" + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + the index directory to map lower inodes to upper inodes by default. + In this case it is still possible to turn off index globally with the + "index=off" module option or on a filesystem instance basis with the + "index=off" mount option. + + The inodes index feature prevents breaking of lower hardlinks on copy + up. + + Note, that the inodes index feature is not backward compatible. + That is, mounting an overlay which has an inodes index on a kernel + that doesn't support this feature will have unexpected results. + + If unsure, say N. + +config OVERLAY_FS_NFS_EXPORT + bool "Overlayfs: turn on NFS export feature by default" + depends on OVERLAY_FS + depends on OVERLAY_FS_INDEX + depends on !OVERLAY_FS_METACOPY + help + If this config option is enabled then overlay filesystems will use + the index directory to decode overlay NFS file handles by default. + In this case, it is still possible to turn off NFS export support + globally with the "nfs_export=off" module option or on a filesystem + instance basis with the "nfs_export=off" mount option. + + The NFS export feature creates an index on copy up of every file and + directory. This full index is used to detect overlay filesystems + inconsistencies on lookup, like redirect from multiple upper dirs to + the same lower dir. The full index may incur some overhead on mount + time, especially when verifying that directory file handles are not + stale. + + Note, that the NFS export feature is not backward compatible. + That is, mounting an overlay which has a full index on a kernel + that doesn't support this feature will have unexpected results. + + Most users should say N here and enable this feature on a case-by- + case basis with the "nfs_export=on" mount option. + + Say N unless you fully understand the consequences. + +config OVERLAY_FS_XINO_AUTO + bool "Overlayfs: auto enable inode number mapping" + default n + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + unused high bits in undelying filesystem inode numbers to map all + inodes to a unified address space. The mapped 64bit inode numbers + might not be compatible with applications that expect 32bit inodes. + + If compatibility with applications that expect 32bit inodes is not an + issue, then it is safe and recommended to say Y here. + + For more information, see Documentation/filesystems/overlayfs.txt + + If unsure, say N. + +config OVERLAY_FS_METACOPY + bool "Overlayfs: turn on metadata only copy up feature by default" + depends on OVERLAY_FS + select OVERLAY_FS_REDIRECT_DIR + help + If this config option is enabled then overlay filesystems will + copy up only metadata where appropriate and data copy up will + happen when a file is opened for WRITE operation. It is still + possible to turn off this feature globally with the "metacopy=off" + module option or on a filesystem instance basis with the + "metacopy=off" mount option. + + Note, that this feature is not backward compatible. That is, + mounting an overlay which has metacopy only inodes on a kernel + that doesn't support this feature will have unexpected results. + + If unsure, say N. diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000..46e1ff8ac --- /dev/null +++ b/fs/overlayfs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAY_FS) += overlay.o + +overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \ + copy_up.o export.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000..30abafcd4 --- /dev/null +++ b/fs/overlayfs/copy_up.c @@ -0,0 +1,906 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/splice.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/uaccess.h> +#include <linux/sched/signal.h> +#include <linux/cred.h> +#include <linux/namei.h> +#include <linux/fdtable.h> +#include <linux/ratelimit.h> +#include <linux/exportfs.h> +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +static int ovl_ccup_set(const char *buf, const struct kernel_param *param) +{ + pr_warn("overlayfs: \"check_copy_up\" module option is obsolete\n"); + return 0; +} + +static int ovl_ccup_get(char *buf, const struct kernel_param *param) +{ + return sprintf(buf, "N\n"); +} + +module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644); +MODULE_PARM_DESC(ovl_check_copy_up, "Obsolete; does nothing"); + +int ovl_copy_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size, value_size = 0; + char *buf, *name, *value = NULL; + int error = 0; + size_t slen; + + if (!(old->d_inode->i_opflags & IOP_XATTR) || + !(new->d_inode->i_opflags & IOP_XATTR)) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out; + } + + for (name = buf; list_size; name += slen) { + slen = strnlen(name, list_size) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > list_size)) { + error = -EIO; + break; + } + list_size -= slen; + + if (ovl_is_private_xattr(name)) + continue; + + error = security_inode_copy_up_xattr(name); + if (error < 0 && error != -EOPNOTSUPP) + break; + if (error == 1) { + error = 0; + continue; /* Discard */ + } +retry: + size = vfs_getxattr(old, name, value, value_size); + if (size == -ERANGE) + size = vfs_getxattr(old, name, NULL, 0); + + if (size < 0) { + error = size; + break; + } + + if (size > value_size) { + void *new; + + new = krealloc(value, size, GFP_KERNEL); + if (!new) { + error = -ENOMEM; + break; + } + value = new; + value_size = size; + goto retry; + } + + error = vfs_setxattr(new, name, value, size, 0); + if (error) + break; + } + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + loff_t old_pos = 0; + loff_t new_pos = 0; + int error = 0; + + if (len == 0) + return 0; + + old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* Try to use clone_file_range to clone up within the same fs */ + error = do_clone_file_range(old_file, 0, new_file, 0, len); + if (!error) + goto out; + /* Couldn't clone, so now we try to copy the data */ + error = 0; + + /* FIXME: copy up sparse files efficiently */ + while (len) { + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + bytes = do_splice_direct(old_file, &old_pos, + new_file, &new_pos, + this_len, SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + WARN_ON(old_pos != new_pos); + + len -= bytes; + } +out: + if (!error) + error = vfs_fsync(new_file, 0); + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = ATTR_SIZE, + .ia_size = stat->size, + }; + + return notify_change(upperdentry, &attr, NULL); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr, NULL); +} + +int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) +{ + int err = 0; + + if (!S_ISLNK(stat->mode)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) { + struct iattr attr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = stat->uid, + .ia_gid = stat->gid, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) + ovl_set_timestamps(upperdentry, stat); + + return err; +} + +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) +{ + struct ovl_fh *fh; + int fh_type, fh_len, dwords; + void *buf; + int buflen = MAX_HANDLE_SZ; + uuid_t *uuid = &real->d_sb->s_uuid; + + buf = kmalloc(buflen, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + /* + * We encode a non-connectable file handle for non-dir, because we + * only need to find the lower inode number and we don't want to pay + * the price or reconnecting the dentry. + */ + dwords = buflen >> 2; + fh_type = exportfs_encode_fh(real, buf, &dwords, 0); + buflen = (dwords << 2); + + fh = ERR_PTR(-EIO); + if (WARN_ON(fh_type < 0) || + WARN_ON(buflen > MAX_HANDLE_SZ) || + WARN_ON(fh_type == FILEID_INVALID)) + goto out; + + BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); + fh_len = offsetof(struct ovl_fh, fid) + buflen; + fh = kmalloc(fh_len, GFP_KERNEL); + if (!fh) { + fh = ERR_PTR(-ENOMEM); + goto out; + } + + fh->version = OVL_FH_VERSION; + fh->magic = OVL_FH_MAGIC; + fh->type = fh_type; + fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + /* + * When we will want to decode an overlay dentry from this handle + * and all layers are on the same fs, if we get a disconncted real + * dentry when we decode fid, the only way to tell if we should assign + * it to upperdentry or to lowerstack is by checking this flag. + */ + if (is_upper) + fh->flags |= OVL_FH_FLAG_PATH_UPPER; + fh->len = fh_len; + fh->uuid = *uuid; + memcpy(fh->fid, buf, buflen); + +out: + kfree(buf); + return fh; +} + +int ovl_set_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper) +{ + const struct ovl_fh *fh = NULL; + int err; + + /* + * When lower layer doesn't support export operations store a 'null' fh, + * so we can use the overlay.origin xattr to distignuish between a copy + * up and a pure upper inode. + */ + if (ovl_can_decode_fh(lower->d_sb)) { + fh = ovl_encode_real_fh(lower, false); + if (IS_ERR(fh)) + return PTR_ERR(fh); + } + + /* + * Do not fail when upper doesn't support xattrs. + */ + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, + fh ? fh->len : 0, 0); + kfree(fh); + + return err; +} + +/* Store file handle of @upper dir in @index dir entry */ +static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) +{ + const struct ovl_fh *fh; + int err; + + fh = ovl_encode_real_fh(upper, true); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0); + + kfree(fh); + return err; +} + +/* + * Create and install index entry. + * + * Caller must hold i_mutex on indexdir. + */ +static int ovl_create_index(struct dentry *dentry, struct dentry *origin, + struct dentry *upper) +{ + struct dentry *indexdir = ovl_indexdir(dentry->d_sb); + struct inode *dir = d_inode(indexdir); + struct dentry *index = NULL; + struct dentry *temp = NULL; + struct qstr name = { }; + int err; + + /* + * For now this is only used for creating index entry for directories, + * because non-dir are copied up directly to index and then hardlinked + * to upper dir. + * + * TODO: implement create index for non-dir, so we can call it when + * encoding file handle for non-dir in case index does not exist. + */ + if (WARN_ON(!d_is_dir(dentry))) + return -EIO; + + /* Directory not expected to be indexed before copy up */ + if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry)))) + return -EIO; + + err = ovl_get_index_name(origin, &name); + if (err) + return err; + + temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0)); + err = PTR_ERR(temp); + if (IS_ERR(temp)) + goto free_name; + + err = ovl_set_upper_fh(upper, temp); + if (err) + goto out; + + index = lookup_one_len(name.name, indexdir, name.len); + if (IS_ERR(index)) { + err = PTR_ERR(index); + } else { + err = ovl_do_rename(dir, temp, dir, index, 0); + dput(index); + } +out: + if (err) + ovl_cleanup(dir, temp); + dput(temp); +free_name: + kfree(name.name); + return err; +} + +struct ovl_copy_up_ctx { + struct dentry *parent; + struct dentry *dentry; + struct path lowerpath; + struct kstat stat; + struct kstat pstat; + const char *link; + struct dentry *destdir; + struct qstr destname; + struct dentry *workdir; + bool tmpfile; + bool origin; + bool indexed; + bool metacopy; +}; + +static int ovl_link_up(struct ovl_copy_up_ctx *c) +{ + int err; + struct dentry *upper; + struct dentry *upperdir = ovl_dentry_upper(c->parent); + struct inode *udir = d_inode(upperdir); + + /* Mark parent "impure" because it may now contain non-pure upper */ + err = ovl_set_impure(c->parent, upperdir); + if (err) + return err; + + err = ovl_set_nlink_lower(c->dentry); + if (err) + return err; + + inode_lock_nested(udir, I_MUTEX_PARENT); + upper = lookup_one_len(c->dentry->d_name.name, upperdir, + c->dentry->d_name.len); + err = PTR_ERR(upper); + if (!IS_ERR(upper)) { + err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper); + dput(upper); + + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &c->pstat); + ovl_dentry_set_upper_alias(c->dentry); + } + } + inode_unlock(udir); + if (err) + return err; + + err = ovl_set_nlink_upper(c->dentry); + + return err; +} + +static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp, + struct dentry **newdentry) +{ + int err; + struct dentry *upper; + struct inode *udir = d_inode(c->destdir); + + upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + if (IS_ERR(upper)) + return PTR_ERR(upper); + + if (c->tmpfile) + err = ovl_do_link(temp, udir, upper); + else + err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0); + + if (!err) + *newdentry = dget(c->tmpfile ? upper : temp); + dput(upper); + + return err; +} + +static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c) +{ + int err; + struct dentry *temp; + const struct cred *old_creds = NULL; + struct cred *new_creds = NULL; + struct ovl_cattr cattr = { + /* Can't properly set mode on creation because of the umask */ + .mode = c->stat.mode & S_IFMT, + .rdev = c->stat.rdev, + .link = c->link + }; + + err = security_inode_copy_up(c->dentry, &new_creds); + temp = ERR_PTR(err); + if (err < 0) + goto out; + + if (new_creds) + old_creds = override_creds(new_creds); + + if (c->tmpfile) + temp = ovl_do_tmpfile(c->workdir, c->stat.mode); + else + temp = ovl_create_temp(c->workdir, &cattr); +out: + if (new_creds) { + revert_creds(old_creds); + put_cred(new_creds); + } + + return temp; +} + +static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) +{ + int err; + + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + if (S_ISREG(c->stat.mode) && !c->metacopy) { + struct path upperpath, datapath; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + upperpath.dentry = temp; + + ovl_path_lowerdata(c->dentry, &datapath); + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + } + + err = ovl_copy_xattr(c->lowerpath.dentry, temp); + if (err) + return err; + + /* + * Store identifier of lower inode in upper inode xattr to + * allow lookup of the copy up origin inode. + * + * Don't set origin when we are breaking the association with a lower + * hard link. + */ + if (c->origin) { + err = ovl_set_origin(c->dentry, c->lowerpath.dentry, temp); + if (err) + return err; + } + + if (c->metacopy) { + err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, + NULL, 0, -EOPNOTSUPP); + if (err) + return err; + } + + inode_lock(temp->d_inode); + if (c->metacopy) + err = ovl_set_size(temp, &c->stat); + if (!err) + err = ovl_set_attr(temp, &c->stat); + inode_unlock(temp->d_inode); + + return err; +} + +static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) +{ + struct inode *udir = c->destdir->d_inode; + struct inode *inode; + struct dentry *newdentry = NULL; + struct dentry *temp; + int err; + + temp = ovl_get_tmpfile(c); + if (IS_ERR(temp)) + return PTR_ERR(temp); + + err = ovl_copy_up_inode(c, temp); + if (err) + goto out; + + if (S_ISDIR(c->stat.mode) && c->indexed) { + err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); + if (err) + goto out; + } + + if (c->tmpfile) { + inode_lock_nested(udir, I_MUTEX_PARENT); + err = ovl_install_temp(c, temp, &newdentry); + inode_unlock(udir); + } else { + err = ovl_install_temp(c, temp, &newdentry); + } + if (err) + goto out; + + if (!c->metacopy) + ovl_set_upperdata(d_inode(c->dentry)); + inode = d_inode(c->dentry); + ovl_inode_update(inode, newdentry); + if (S_ISDIR(inode->i_mode)) + ovl_set_flag(OVL_WHITEOUTS, inode); + +out: + if (err && !c->tmpfile) + ovl_cleanup(d_inode(c->workdir), temp); + dput(temp); + return err; + +} + +/* + * Copy up a single dentry + * + * All renames start with copy up of source if necessary. The actual + * rename will only proceed once the copy up was successful. Copy up uses + * upper parent i_mutex for exclusion. Since rename can change d_parent it + * is possible that the copy up will lock the old parent. At that point + * the file will have already been copied up anyway. + */ +static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) +{ + int err; + struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info; + bool to_index = false; + + /* + * Indexed non-dir is copied up directly to the index entry and then + * hardlinked to upper dir. Indexed dir is copied up to indexdir, + * then index entry is created and then copied up dir installed. + * Copying dir up to indexdir instead of workdir simplifies locking. + */ + if (ovl_need_index(c->dentry)) { + c->indexed = true; + if (S_ISDIR(c->stat.mode)) + c->workdir = ovl_indexdir(c->dentry->d_sb); + else + to_index = true; + } + + if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) + c->origin = true; + + if (to_index) { + c->destdir = ovl_indexdir(c->dentry->d_sb); + err = ovl_get_index_name(c->lowerpath.dentry, &c->destname); + if (err) + return err; + } else if (WARN_ON(!c->parent)) { + /* Disconnected dentry must be copied up to index dir */ + return -EIO; + } else { + /* + * Mark parent "impure" because it may now contain non-pure + * upper + */ + err = ovl_set_impure(c->parent, c->destdir); + if (err) + return err; + } + + /* Should we copyup with O_TMPFILE or with workdir? */ + if (S_ISREG(c->stat.mode) && ofs->tmpfile) { + c->tmpfile = true; + err = ovl_copy_up_locked(c); + } else { + err = ovl_lock_rename_workdir(c->workdir, c->destdir); + if (!err) { + err = ovl_copy_up_locked(c); + unlock_rename(c->workdir, c->destdir); + } + } + + + if (err) + goto out; + + if (c->indexed) + ovl_set_flag(OVL_INDEX, d_inode(c->dentry)); + + if (to_index) { + /* Initialize nlink for copy up of disconnected dentry */ + err = ovl_set_nlink_upper(c->dentry); + } else { + struct inode *udir = d_inode(c->destdir); + + /* Restore timestamps on parent (best effort) */ + inode_lock(udir); + ovl_set_timestamps(c->destdir, &c->pstat); + inode_unlock(udir); + + ovl_dentry_set_upper_alias(c->dentry); + } + +out: + if (to_index) + kfree(c->destname.name); + return err; +} + +static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, + int flags) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (!ofs->config.metacopy) + return false; + + if (!S_ISREG(mode)) + return false; + + if (flags && ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC))) + return false; + + return true; +} + +/* Copy up data of an inode which was copied up metadata only in the past. */ +static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) +{ + struct path upperpath, datapath; + int err; + char *capability = NULL; + ssize_t uninitialized_var(cap_size); + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry == NULL)) + return -EIO; + + ovl_path_lowerdata(c->dentry, &datapath); + if (WARN_ON(datapath.dentry == NULL)) + return -EIO; + + if (c->stat.size) { + err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, + &capability, 0); + if (err < 0 && err != -ENODATA) + goto out; + } + + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + goto out_free; + + /* + * Writing to upper file will clear security.capability xattr. We + * don't want that to happen for normal copy-up operation. + */ + if (capability) { + err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); + if (err) + goto out_free; + } + + + err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); + if (err) + goto out_free; + + ovl_set_upperdata(d_inode(c->dentry)); +out_free: + kfree(capability); +out: + return err; +} + +static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + int flags) +{ + int err; + DEFINE_DELAYED_CALL(done); + struct path parentpath; + struct ovl_copy_up_ctx ctx = { + .parent = parent, + .dentry = dentry, + .workdir = ovl_workdir(dentry), + }; + + if (WARN_ON(!ctx.workdir)) + return -EROFS; + + ovl_path_lower(dentry, &ctx.lowerpath); + err = vfs_getattr(&ctx.lowerpath, &ctx.stat, + STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + if (err) + return err; + + ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); + + if (parent) { + ovl_path_upper(parent, &parentpath); + ctx.destdir = parentpath.dentry; + ctx.destname = dentry->d_name; + + err = vfs_getattr(&parentpath, &ctx.pstat, + STATX_ATIME | STATX_MTIME, + AT_STATX_SYNC_AS_STAT); + if (err) + return err; + } + + /* maybe truncate regular file. this has no effect on dirs */ + if (flags & O_TRUNC) + ctx.stat.size = 0; + + if (S_ISLNK(ctx.stat.mode)) { + ctx.link = vfs_get_link(ctx.lowerpath.dentry, &done); + if (IS_ERR(ctx.link)) + return PTR_ERR(ctx.link); + } + + err = ovl_copy_up_start(dentry, flags); + /* err < 0: interrupted, err > 0: raced with another copy-up */ + if (unlikely(err)) { + if (err > 0) + err = 0; + } else { + if (!ovl_dentry_upper(dentry)) + err = ovl_do_copy_up(&ctx); + if (!err && parent && !ovl_dentry_has_upper_alias(dentry)) + err = ovl_link_up(&ctx); + if (!err && ovl_dentry_needs_data_copy_up_locked(dentry, flags)) + err = ovl_copy_up_meta_inode_data(&ctx); + ovl_copy_up_end(dentry); + } + do_delayed_call(&done); + + return err; +} + +int ovl_copy_up_flags(struct dentry *dentry, int flags) +{ + int err = 0; + const struct cred *old_cred; + bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED); + + /* + * With NFS export, copy up can get called for a disconnected non-dir. + * In this case, we will copy up lower inode to index dir without + * linking it to upper dir. + */ + if (WARN_ON(disconnected && d_is_dir(dentry))) + return -EIO; + + old_cred = ovl_override_creds(dentry->d_sb); + while (!err) { + struct dentry *next; + struct dentry *parent = NULL; + + if (ovl_already_copied_up(dentry, flags)) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (; !disconnected;) { + parent = dget_parent(next); + + if (ovl_dentry_upper(parent)) + break; + + dput(next); + next = parent; + } + + err = ovl_copy_up_one(parent, next, flags); + + dput(parent); + dput(next); + } + revert_creds(old_cred); + + return err; +} + +static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) +{ + /* Copy up of disconnected dentry does not set upper alias */ + if (ovl_already_copied_up(dentry, flags)) + return false; + + if (special_file(d_inode(dentry)->i_mode)) + return false; + + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return true; +} + +int ovl_maybe_copy_up(struct dentry *dentry, int flags) +{ + int err = 0; + + if (ovl_open_need_copy_up(dentry, flags)) { + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up_flags(dentry, flags); + ovl_drop_write(dentry); + } + } + + return err; +} + +int ovl_copy_up_with_data(struct dentry *dentry) +{ + return ovl_copy_up_flags(dentry, O_WRONLY); +} + +int ovl_copy_up(struct dentry *dentry) +{ + return ovl_copy_up_flags(dentry, 0); +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000..06dc64962 --- /dev/null +++ b/fs/overlayfs/dir.c @@ -0,0 +1,1259 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/cred.h> +#include <linux/module.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/atomic.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +static unsigned short ovl_redirect_max = 256; +module_param_named(redirect_max, ovl_redirect_max, ushort, 0644); +MODULE_PARM_DESC(ovl_redirect_max, + "Maximum length of absolute redirect xattr value"); + +static int ovl_set_redirect(struct dentry *dentry, bool samedir); + +int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +{ + int err; + + dget(wdentry); + if (d_is_dir(wdentry)) + err = ovl_do_rmdir(wdir, wdentry); + else + err = ovl_do_unlink(wdir, wdentry); + dput(wdentry); + + if (err) { + pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + wdentry, err); + } + + return err; +} + +static struct dentry *ovl_lookup_temp(struct dentry *workdir) +{ + struct dentry *temp; + char name[20]; + static atomic_t temp_id = ATOMIC_INIT(0); + + /* counter is allowed to wrap, since temp dentries are ephemeral */ + snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); + + temp = lookup_one_len(name, workdir, strlen(name)); + if (!IS_ERR(temp) && temp->d_inode) { + pr_err("overlayfs: workdir/%s already exists\n", name); + dput(temp); + temp = ERR_PTR(-EIO); + } + + return temp; +} + +/* caller holds i_mutex on workdir */ +static struct dentry *ovl_whiteout(struct dentry *workdir) +{ + int err; + struct dentry *whiteout; + struct inode *wdir = workdir->d_inode; + + whiteout = ovl_lookup_temp(workdir); + if (IS_ERR(whiteout)) + return whiteout; + + err = ovl_do_whiteout(wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } + + return whiteout; +} + +/* Caller must hold i_mutex on both workdir and dir */ +int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, + struct dentry *dentry) +{ + struct inode *wdir = workdir->d_inode; + struct dentry *whiteout; + int err; + int flags = 0; + + whiteout = ovl_whiteout(workdir); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + return err; + + if (d_is_dir(dentry)) + flags = RENAME_EXCHANGE; + + err = ovl_do_rename(wdir, whiteout, dir, dentry, flags); + if (err) + goto kill_whiteout; + if (flags) + ovl_cleanup(wdir, dentry); + +out: + dput(whiteout); + return err; + +kill_whiteout: + ovl_cleanup(wdir, whiteout); + goto out; +} + +int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) +{ + int err; + struct dentry *d, *dentry = *newdentry; + + err = ovl_do_mkdir(dir, dentry, mode); + if (err) + return err; + + if (likely(!d_unhashed(dentry))) + return 0; + + /* + * vfs_mkdir() may succeed and leave the dentry passed + * to it unhashed and negative. If that happens, try to + * lookup a new hashed and positive dentry. + */ + d = lookup_one_len(dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); + if (IS_ERR(d)) { + pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n", + dentry, err); + return PTR_ERR(d); + } + dput(dentry); + *newdentry = d; + + return 0; +} + +struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr) +{ + int err; + + if (IS_ERR(newdentry)) + return newdentry; + + err = -ESTALE; + if (newdentry->d_inode) + goto out; + + if (attr->hardlink) { + err = ovl_do_link(attr->hardlink, dir, newdentry); + } else { + switch (attr->mode & S_IFMT) { + case S_IFREG: + err = ovl_do_create(dir, newdentry, attr->mode); + break; + + case S_IFDIR: + /* mkdir is special... */ + err = ovl_mkdir_real(dir, &newdentry, attr->mode); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ovl_do_mknod(dir, newdentry, attr->mode, + attr->rdev); + break; + + case S_IFLNK: + err = ovl_do_symlink(dir, newdentry, attr->link); + break; + + default: + err = -EPERM; + } + } + if (!err && WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + err = -EIO; + } +out: + if (err) { + dput(newdentry); + return ERR_PTR(err); + } + return newdentry; +} + +struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr) +{ + return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir), + attr); +} + +static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, + int xerr) +{ + int err; + + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); + if (!err) + ovl_dentry_set_opaque(dentry); + + return err; +} + +static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) +{ + /* + * Fail with -EIO when trying to create opaque dir and upper doesn't + * support xattrs. ovl_rename() calls ovl_set_opaque_xerr(-EXDEV) to + * return a specific error for noxattr case. + */ + return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); +} + +/* + * Common operations required to be done after creation of file on upper. + * If @hardlink is false, then @inode is a pre-allocated inode, we may or + * may not use to instantiate the new dentry. + */ +static int ovl_instantiate(struct dentry *dentry, struct inode *inode, + struct dentry *newdentry, bool hardlink) +{ + struct ovl_inode_params oip = { + .upperdentry = newdentry, + .newinode = inode, + }; + + ovl_dir_modified(dentry->d_parent, false); + ovl_dentry_set_upper_alias(dentry); + if (!hardlink) { + /* + * ovl_obtain_alias() can be called after ovl_create_real() + * and before we get here, so we may get an inode from cache + * with the same real upperdentry that is not the inode we + * pre-allocated. In this case we will use the cached inode + * to instantiate the new dentry. + * + * XXX: if we ever use ovl_obtain_alias() to decode directory + * file handles, need to use ovl_get_inode_locked() and + * d_instantiate_new() here to prevent from creating two + * hashed directory inode aliases. + */ + inode = ovl_get_inode(dentry->d_sb, &oip); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } else { + WARN_ON(ovl_inode_real(inode) != d_inode(newdentry)); + dput(newdentry); + inc_nlink(inode); + } + + d_instantiate(dentry, inode); + if (inode != oip.newinode) { + pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n", + dentry); + } + + /* Force lookup of new upper hardlink to find its lower */ + if (hardlink) + d_drop(dentry); + + return 0; +} + +static bool ovl_type_merge(struct dentry *dentry) +{ + return OVL_TYPE_MERGE(ovl_path_type(dentry)); +} + +static bool ovl_type_origin(struct dentry *dentry) +{ + return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); +} + +static int ovl_create_upper(struct dentry *dentry, struct inode *inode, + struct ovl_cattr *attr) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry; + int err; + + if (!attr->hardlink && !IS_POSIXACL(udir)) + attr->mode &= ~current_umask(); + + inode_lock_nested(udir, I_MUTEX_PARENT); + newdentry = ovl_create_real(udir, + lookup_one_len(dentry->d_name.name, + upperdir, + dentry->d_name.len), + attr); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { + /* Setting opaque here is just an optimization, allow to fail */ + ovl_set_opaque(dentry, newdentry); + } + + err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink); + if (err) + goto out_cleanup; +out_unlock: + inode_unlock(udir); + return err; + +out_cleanup: + ovl_cleanup(udir, newdentry); + dput(newdentry); + goto out_unlock; +} + +static struct dentry *ovl_clear_empty(struct dentry *dentry, + struct list_head *list) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct path upperpath; + struct dentry *upper; + struct dentry *opaquedir; + struct kstat stat; + int err; + + if (WARN_ON(!workdir)) + return ERR_PTR(-EROFS); + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + ovl_path_upper(dentry, &upperpath); + err = vfs_getattr(&upperpath, &stat, + STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + if (err) + goto out_unlock; + + err = -ESTALE; + if (!S_ISDIR(stat.mode)) + goto out_unlock; + upper = upperpath.dentry; + if (upper->d_parent->d_inode != udir) + goto out_unlock; + + opaquedir = ovl_create_temp(workdir, OVL_CATTR(stat.mode)); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out_unlock; + + err = ovl_copy_xattr(upper, opaquedir); + if (err) + goto out_cleanup; + + err = ovl_set_opaque(dentry, opaquedir); + if (err) + goto out_cleanup; + + inode_lock(opaquedir->d_inode); + err = ovl_set_attr(opaquedir, &stat); + inode_unlock(opaquedir->d_inode); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup_whiteouts(upper, list); + ovl_cleanup(wdir, upper); + unlock_rename(workdir, upperdir); + + /* dentry's upper doesn't match now, get rid of it */ + d_drop(dentry); + + return opaquedir; + +out_cleanup: + ovl_cleanup(wdir, opaquedir); + dput(opaquedir); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return ERR_PTR(err); +} + +static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, + const struct posix_acl *acl) +{ + void *buffer; + size_t size; + int err; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) + return 0; + + size = posix_acl_to_xattr(NULL, acl, NULL, 0); + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + err = size; + if (err < 0) + goto out_free; + + err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE); +out_free: + kfree(buffer); + return err; +} + +static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, + struct ovl_cattr *cattr) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *upper; + struct dentry *newdentry; + int err; + struct posix_acl *acl, *default_acl; + bool hardlink = !!cattr->hardlink; + + if (WARN_ON(!workdir)) + return -EROFS; + + if (!hardlink) { + err = posix_acl_create(dentry->d_parent->d_inode, + &cattr->mode, &default_acl, &acl); + if (err) + return err; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_unlock; + + err = -ESTALE; + if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) + goto out_dput; + + newdentry = ovl_create_temp(workdir, cattr); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_dput; + + /* + * mode could have been mutilated due to umask (e.g. sgid directory) + */ + if (!hardlink && + !S_ISLNK(cattr->mode) && + newdentry->d_inode->i_mode != cattr->mode) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = cattr->mode, + }; + inode_lock(newdentry->d_inode); + err = notify_change(newdentry, &attr, NULL); + inode_unlock(newdentry->d_inode); + if (err) + goto out_cleanup; + } + if (!hardlink) { + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS, + acl); + if (err) + goto out_cleanup; + + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT, + default_acl); + if (err) + goto out_cleanup; + } + + if (!hardlink && S_ISDIR(cattr->mode)) { + err = ovl_set_opaque(dentry, newdentry); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, + RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup(wdir, upper); + } else { + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + } + err = ovl_instantiate(dentry, inode, newdentry, hardlink); + if (err) { + ovl_cleanup(udir, newdentry); + dput(newdentry); + } +out_dput: + dput(upper); +out_unlock: + unlock_rename(workdir, upperdir); +out: + if (!hardlink) { + posix_acl_release(acl); + posix_acl_release(default_acl); + } + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + dput(newdentry); + goto out_dput; +} + +static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, + struct ovl_cattr *attr, bool origin) +{ + int err; + const struct cred *old_cred; + struct cred *override_cred; + struct dentry *parent = dentry->d_parent; + + err = ovl_copy_up(parent); + if (err) + return err; + + old_cred = ovl_override_creds(dentry->d_sb); + + /* + * When linking a file with copy up origin into a new parent, mark the + * new parent dir "impure". + */ + if (origin) { + err = ovl_set_impure(parent, ovl_dentry_upper(parent)); + if (err) + goto out_revert_creds; + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (override_cred) { + override_cred->fsuid = inode->i_uid; + override_cred->fsgid = inode->i_gid; + if (!attr->hardlink) { + err = security_dentry_create_files_as(dentry, + attr->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; + } + } + put_cred(override_creds(override_cred)); + put_cred(override_cred); + + if (!ovl_dentry_is_whiteout(dentry)) + err = ovl_create_upper(dentry, inode, attr); + else + err = ovl_create_over_whiteout(dentry, inode, attr); + } +out_revert_creds: + revert_creds(old_cred); + return err; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + struct inode *inode; + struct ovl_cattr attr = { + .rdev = rdev, + .link = link, + }; + + err = ovl_want_write(dentry); + if (err) + goto out; + + /* Preallocate inode to be used by ovl_get_inode() */ + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, rdev); + if (!inode) + goto out_drop_write; + + spin_lock(&inode->i_lock); + inode->i_state |= I_CREATING; + spin_unlock(&inode->i_lock); + + inode_init_owner(inode, dentry->d_parent->d_inode, mode); + attr.mode = inode->i_mode; + + err = ovl_create_or_link(dentry, inode, &attr, false); + /* Did we end up using the preallocated inode? */ + if (inode != d_inode(dentry)) + iput(inode); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + /* Don't allow creation of "whiteout" on overlay */ + if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) + return -EPERM; + + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_set_link_redirect(struct dentry *dentry) +{ + const struct cred *old_cred; + int err; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_set_redirect(dentry, false); + revert_creds(old_cred); + + return err; +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + bool locked = false; + struct inode *inode; + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + + if (ovl_is_metacopy_dentry(old)) { + err = ovl_set_link_redirect(old); + if (err) + goto out_drop_write; + } + + err = ovl_nlink_start(old, &locked); + if (err) + goto out_drop_write; + + inode = d_inode(old); + ihold(inode); + + err = ovl_create_or_link(new, inode, + &(struct ovl_cattr) {.hardlink = ovl_dentry_upper(old)}, + ovl_type_origin(old)); + if (err) + iput(inode); + + ovl_nlink_end(old, locked); +out_drop_write: + ovl_drop_write(old); +out: + return err; +} + +static bool ovl_matches_upper(struct dentry *dentry, struct dentry *upper) +{ + return d_inode(ovl_dentry_upper(dentry)) == d_inode(upper); +} + +static int ovl_remove_and_whiteout(struct dentry *dentry, + struct list_head *list) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (WARN_ON(!workdir)) + return -EROFS; + + if (!list_empty(list)) { + opaquedir = ovl_clear_empty(dentry, list); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out_dput; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_unlock; + + err = -ESTALE; + if ((opaquedir && upper != opaquedir) || + (!opaquedir && ovl_dentry_upper(dentry) && + !ovl_matches_upper(dentry, upper))) { + goto out_dput_upper; + } + + err = ovl_cleanup_and_whiteout(workdir, d_inode(upperdir), upper); + if (err) + goto out_d_drop; + + ovl_dir_modified(dentry->d_parent, true); +out_d_drop: + d_drop(dentry); +out_dput_upper: + dput(upper); +out_unlock: + unlock_rename(workdir, upperdir); +out_dput: + dput(opaquedir); +out: + return err; +} + +static int ovl_remove_upper(struct dentry *dentry, bool is_dir, + struct list_head *list) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *dir = upperdir->d_inode; + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (!list_empty(list)) { + opaquedir = ovl_clear_empty(dentry, list); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + inode_lock_nested(dir, I_MUTEX_PARENT); + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_unlock; + + err = -ESTALE; + if ((opaquedir && upper != opaquedir) || + (!opaquedir && !ovl_matches_upper(dentry, upper))) + goto out_dput_upper; + + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry)); + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ + if (!err) + d_drop(dentry); +out_dput_upper: + dput(upper); +out_unlock: + inode_unlock(dir); + dput(opaquedir); +out: + return err; +} + +static bool ovl_pure_upper(struct dentry *dentry) +{ + return !ovl_dentry_lower(dentry) && + !ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + int err; + bool locked = false; + const struct cred *old_cred; + struct dentry *upperdentry; + bool lower_positive = ovl_lower_positive(dentry); + LIST_HEAD(list); + + /* No need to clean pure upper removed by vfs_rmdir() */ + if (is_dir && (lower_positive || !ovl_pure_upper(dentry))) { + err = ovl_check_empty_dir(dentry, &list); + if (err) + goto out; + } + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_drop_write; + + err = ovl_nlink_start(dentry, &locked); + if (err) + goto out_drop_write; + + old_cred = ovl_override_creds(dentry->d_sb); + if (!lower_positive) + err = ovl_remove_upper(dentry, is_dir, &list); + else + err = ovl_remove_and_whiteout(dentry, &list); + revert_creds(old_cred); + if (!err) { + if (is_dir) + clear_nlink(dentry->d_inode); + else + drop_nlink(dentry->d_inode); + } + ovl_nlink_end(dentry, locked); + + /* + * Copy ctime + * + * Note: we fail to update ctime if there was no copy-up, only a + * whiteout + */ + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) + ovl_copyattr(d_inode(upperdentry), d_inode(dentry)); + +out_drop_write: + ovl_drop_write(dentry); +out: + ovl_cache_free(&list); + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, true); +} + +static bool ovl_type_merge_or_lower(struct dentry *dentry) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + return OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type); +} + +static bool ovl_can_move(struct dentry *dentry) +{ + return ovl_redirect_dir(dentry->d_sb) || + !d_is_dir(dentry) || !ovl_type_merge_or_lower(dentry); +} + +static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect) +{ + char *buf, *ret; + struct dentry *d, *tmp; + int buflen = ovl_redirect_max + 1; + + if (!abs_redirect) { + ret = kstrndup(dentry->d_name.name, dentry->d_name.len, + GFP_KERNEL); + goto out; + } + + buf = ret = kmalloc(buflen, GFP_KERNEL); + if (!buf) + goto out; + + buflen--; + buf[buflen] = '\0'; + for (d = dget(dentry); !IS_ROOT(d);) { + const char *name; + int thislen; + + spin_lock(&d->d_lock); + name = ovl_dentry_get_redirect(d); + if (name) { + thislen = strlen(name); + } else { + name = d->d_name.name; + thislen = d->d_name.len; + } + + /* If path is too long, fall back to userspace move */ + if (thislen + (name[0] != '/') > buflen) { + ret = ERR_PTR(-EXDEV); + spin_unlock(&d->d_lock); + goto out_put; + } + + buflen -= thislen; + memcpy(&buf[buflen], name, thislen); + spin_unlock(&d->d_lock); + tmp = dget_parent(d); + + dput(d); + d = tmp; + + /* Absolute redirect: finished */ + if (buf[buflen] == '/') + break; + buflen--; + buf[buflen] = '/'; + } + ret = kstrdup(&buf[buflen], GFP_KERNEL); +out_put: + dput(d); + kfree(buf); +out: + return ret ? ret : ERR_PTR(-ENOMEM); +} + +static bool ovl_need_absolute_redirect(struct dentry *dentry, bool samedir) +{ + struct dentry *lowerdentry; + + if (!samedir) + return true; + + if (d_is_dir(dentry)) + return false; + + /* + * For non-dir hardlinked files, we need absolute redirects + * in general as two upper hardlinks could be in different + * dirs. We could put a relative redirect now and convert + * it to absolute redirect later. But when nlink > 1 and + * indexing is on, that means relative redirect needs to be + * converted to absolute during copy up of another lower + * hardllink as well. + * + * So without optimizing too much, just check if lower is + * a hard link or not. If lower is hard link, put absolute + * redirect. + */ + lowerdentry = ovl_dentry_lower(dentry); + return (d_inode(lowerdentry)->i_nlink > 1); +} + +static int ovl_set_redirect(struct dentry *dentry, bool samedir) +{ + int err; + const char *redirect = ovl_dentry_get_redirect(dentry); + bool absolute_redirect = ovl_need_absolute_redirect(dentry, samedir); + + if (redirect && (!absolute_redirect || redirect[0] == '/')) + return 0; + + redirect = ovl_get_redirect(dentry, absolute_redirect); + if (IS_ERR(redirect)) + return PTR_ERR(redirect); + + err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry), + OVL_XATTR_REDIRECT, + redirect, strlen(redirect), -EXDEV); + if (!err) { + spin_lock(&dentry->d_lock); + ovl_dentry_set_redirect(dentry, redirect); + spin_unlock(&dentry->d_lock); + } else { + kfree(redirect); + pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + err); + /* Fall back to userspace copy-up */ + err = -EXDEV; + } + return err; +} + +static int ovl_rename(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) +{ + int err; + bool locked = false; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool cleanup_whiteout = false; + bool overwrite = !(flags & RENAME_EXCHANGE); + bool is_dir = d_is_dir(old); + bool new_is_dir = d_is_dir(new); + bool samedir = olddir == newdir; + struct dentry *opaquedir = NULL; + const struct cred *old_cred = NULL; + LIST_HEAD(list); + + err = -EINVAL; + if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + goto out; + + flags &= ~RENAME_NOREPLACE; + + /* Don't copy up directory trees */ + err = -EXDEV; + if (!ovl_can_move(old)) + goto out; + if (!overwrite && !ovl_can_move(new)) + goto out; + + if (overwrite && new_is_dir && !ovl_pure_upper(new)) { + err = ovl_check_empty_dir(new, &list); + if (err) + goto out; + } + + if (overwrite) { + if (ovl_lower_positive(old)) { + if (!ovl_dentry_is_whiteout(new)) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && ovl_dentry_is_whiteout(new)) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (!overwrite) { + err = ovl_copy_up(new); + if (err) + goto out_drop_write; + } else { + err = ovl_nlink_start(new, &locked); + if (err) + goto out_drop_write; + } + + old_cred = ovl_override_creds(old->d_sb); + + if (!list_empty(&list)) { + opaquedir = ovl_clear_empty(new, &list); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) { + opaquedir = NULL; + goto out_revert_creds; + } + } + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + if (!samedir) { + /* + * When moving a merge dir or non-dir with copy up origin into + * a new parent, we are marking the new parent dir "impure". + * When ovl_iterate() iterates an "impure" upper dir, it will + * lookup the origin inodes of the entries to fill d_ino. + */ + if (ovl_type_origin(old)) { + err = ovl_set_impure(new->d_parent, new_upperdir); + if (err) + goto out_revert_creds; + } + if (!overwrite && ovl_type_origin(new)) { + err = ovl_set_impure(old->d_parent, old_upperdir); + if (err) + goto out_revert_creds; + } + } + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = lookup_one_len(old->d_name.name, old_upperdir, + old->d_name.len); + err = PTR_ERR(olddentry); + if (IS_ERR(olddentry)) + goto out_unlock; + + err = -ESTALE; + if (!ovl_matches_upper(old, olddentry)) + goto out_dput_old; + + newdentry = lookup_one_len(new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_dput_old; + + old_opaque = ovl_dentry_is_opaque(old); + new_opaque = ovl_dentry_is_opaque(new); + + err = -ESTALE; + if (d_inode(new) && ovl_dentry_upper(new)) { + if (opaquedir) { + if (newdentry != opaquedir) + goto out_dput; + } else { + if (!ovl_matches_upper(new, newdentry)) + goto out_dput; + } + } else { + if (!d_is_negative(newdentry)) { + if (!new_opaque || !ovl_is_whiteout(newdentry)) + goto out_dput; + } else { + if (flags & RENAME_EXCHANGE) + goto out_dput; + } + } + + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + if (olddentry->d_inode == newdentry->d_inode) + goto out_dput; + + err = 0; + if (ovl_type_merge_or_lower(old)) + err = ovl_set_redirect(old, samedir); + else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent)) + err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); + if (err) + goto out_dput; + + if (!overwrite && ovl_type_merge_or_lower(new)) + err = ovl_set_redirect(new, samedir); + else if (!overwrite && new_is_dir && !new_opaque && + ovl_type_merge(old->d_parent)) + err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); + if (err) + goto out_dput; + + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, flags); + if (err) + goto out_dput; + + if (cleanup_whiteout) + ovl_cleanup(old_upperdir->d_inode, newdentry); + + if (overwrite && d_inode(new)) { + if (new_is_dir) + clear_nlink(d_inode(new)); + else + drop_nlink(d_inode(new)); + } + + ovl_dir_modified(old->d_parent, ovl_type_origin(old) || + (!overwrite && ovl_type_origin(new))); + ovl_dir_modified(new->d_parent, ovl_type_origin(old) || + (d_inode(new) && ovl_type_origin(new))); + + /* copy ctime: */ + ovl_copyattr(d_inode(olddentry), d_inode(old)); + if (d_inode(new) && ovl_dentry_upper(new)) + ovl_copyattr(d_inode(newdentry), d_inode(new)); + +out_dput: + dput(newdentry); +out_dput_old: + dput(olddentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); +out_revert_creds: + revert_creds(old_cred); + ovl_nlink_end(new, locked); +out_drop_write: + ovl_drop_write(old); +out: + dput(opaquedir); + ovl_cache_free(&list); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename = ovl_rename, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, +}; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c new file mode 100644 index 000000000..ba6c7c592 --- /dev/null +++ b/fs/overlayfs/export.c @@ -0,0 +1,850 @@ +/* + * Overlayfs NFS export support. + * + * Amir Goldstein <amir73il@gmail.com> + * + * Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/cred.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +static int ovl_encode_maybe_copy_up(struct dentry *dentry) +{ + int err; + + if (ovl_dentry_upper(dentry)) + return 0; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up(dentry); + ovl_drop_write(dentry); + } + + if (err) { + pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n", + dentry, err); + } + + return err; +} + +/* + * Before encoding a non-upper directory file handle from real layer N, we need + * to check if it will be possible to reconnect an overlay dentry from the real + * lower decoded dentry. This is done by following the overlay ancestry up to a + * "layer N connected" ancestor and verifying that all parents along the way are + * "layer N connectable". If an ancestor that is NOT "layer N connectable" is + * found, we need to copy up an ancestor, which is "layer N connectable", thus + * making that ancestor "layer N connected". For example: + * + * layer 1: /a + * layer 2: /a/b/c + * + * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is + * copied up and renamed, upper dir /a will be indexed by lower dir /a from + * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*) + * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay + * dentry from the connected lower dentry /a/b/c. + * + * To avoid this problem on decode time, we need to copy up an ancestor of + * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is + * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected" + * and when the time comes to decode the file handle from lower dentry /a/b/c, + * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding + * a connected overlay dentry will be accomplished. + * + * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an + * entry /a in the lower layers above layer N and find the indexed dir /a from + * layer 1. If that improvement is made, then the check for "layer N connected" + * will need to verify there are no redirects in lower layers above N. In the + * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a + * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable": + * + * layer 1: /A (redirect = /a) + * layer 2: /a/b/c + */ + +/* Return the lowest layer for encoding a connectable file handle */ +static int ovl_connectable_layer(struct dentry *dentry) +{ + struct ovl_entry *oe = OVL_E(dentry); + + /* We can get overlay root from root of any layer */ + if (dentry == dentry->d_sb->s_root) + return oe->numlower; + + /* + * If it's an unindexed merge dir, then it's not connectable with any + * lower layer + */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return 0; + + /* We can get upper/overlay path from indexed/lower dentry */ + return oe->lowerstack[0].layer->idx; +} + +/* + * @dentry is "connected" if all ancestors up to root or a "connected" ancestor + * have the same uppermost lower layer as the origin's layer. We may need to + * copy up a "connectable" ancestor to make it "connected". A "connected" dentry + * cannot become non "connected", so cache positive result in dentry flags. + * + * Return the connected origin layer or < 0 on error. + */ +static int ovl_connect_layer(struct dentry *dentry) +{ + struct dentry *next, *parent = NULL; + int origin_layer; + int err = 0; + + if (WARN_ON(dentry == dentry->d_sb->s_root) || + WARN_ON(!ovl_dentry_lower(dentry))) + return -EIO; + + origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx; + if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) + return origin_layer; + + /* Find the topmost origin layer connectable ancestor of @dentry */ + next = dget(dentry); + for (;;) { + parent = dget_parent(next); + if (WARN_ON(parent == next)) { + err = -EIO; + break; + } + + /* + * If @parent is not origin layer connectable, then copy up + * @next which is origin layer connectable and we are done. + */ + if (ovl_connectable_layer(parent) < origin_layer) { + err = ovl_encode_maybe_copy_up(next); + break; + } + + /* If @parent is connected or indexed we are done */ + if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) || + ovl_test_flag(OVL_INDEX, d_inode(parent))) + break; + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + if (!err) + ovl_dentry_set_flag(OVL_E_CONNECTED, dentry); + + return err ?: origin_layer; +} + +/* + * We only need to encode origin if there is a chance that the same object was + * encoded pre copy up and then we need to stay consistent with the same + * encoding also after copy up. If non-pure upper is not indexed, then it was + * copied up before NFS export was enabled. In that case we don't need to worry + * about staying consistent with pre copy up encoding and we encode an upper + * file handle. Overlay root dentry is a private case of non-indexed upper. + * + * The following table summarizes the different file handle encodings used for + * different overlay object types: + * + * Object type | Encoding + * -------------------------------- + * Pure upper | U + * Non-indexed upper | U + * Indexed upper | L (*) + * Non-upper | L (*) + * + * U = upper file handle + * L = lower file handle + * + * (*) Connecting an overlay dir from real lower dentry is not always + * possible when there are redirects in lower layers and non-indexed merge dirs. + * To mitigate those case, we may copy up the lower dir ancestor before encode + * a lower dir file handle. + * + * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. + */ +static int ovl_check_encode_origin(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + /* Upper file handle for pure upper */ + if (!ovl_dentry_lower(dentry)) + return 0; + + /* + * Upper file handle for non-indexed upper. + * + * Root is never indexed, so if there's an upper layer, encode upper for + * root. + */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return 0; + + /* + * Decoding a merge dir, whose origin's ancestor is under a redirected + * lower dir or under a non-indexed upper is not always possible. + * ovl_connect_layer() will try to make origin's layer "connected" by + * copying up a "connectable" ancestor. + */ + if (d_is_dir(dentry) && ofs->upper_mnt) + return ovl_connect_layer(dentry); + + /* Lower file handle for indexed and non-upper dir/non-dir */ + return 1; +} + +static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) +{ + struct ovl_fh *fh = NULL; + int err, enc_lower; + + /* + * Check if we should encode a lower or upper file handle and maybe + * copy up an ancestor to make lower file handle connectable. + */ + err = enc_lower = ovl_check_encode_origin(dentry); + if (enc_lower < 0) + goto fail; + + /* Encode an upper or lower file handle */ + fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + err = -EOVERFLOW; + if (fh->len > buflen) + goto fail; + + memcpy(buf, (char *)fh, fh->len); + err = fh->len; + +out: + kfree(fh); + return err; + +fail: + pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", + dentry, err, buflen, fh ? (int)fh->len : 0, + fh ? fh->type : 0); + goto out; +} + +static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) +{ + int res, len = *max_len << 2; + + res = ovl_d_to_fh(dentry, (char *)fid, len); + if (res <= 0) + return FILEID_INVALID; + + len = res; + + /* Round up to dwords */ + *max_len = (len + 3) >> 2; + return OVL_FILEID; +} + +static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, + struct inode *parent) +{ + struct dentry *dentry; + int type; + + /* TODO: encode connectable file handles */ + if (parent) + return FILEID_INVALID; + + dentry = d_find_any_alias(inode); + if (WARN_ON(!dentry)) + return FILEID_INVALID; + + type = ovl_dentry_to_fh(dentry, fid, max_len); + + dput(dentry); + return type; +} + +/* + * Find or instantiate an overlay dentry from real dentries and index. + */ +static struct dentry *ovl_obtain_alias(struct super_block *sb, + struct dentry *upper_alias, + struct ovl_path *lowerpath, + struct dentry *index) +{ + struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; + struct dentry *upper = upper_alias ?: index; + struct dentry *dentry; + struct inode *inode; + struct ovl_entry *oe; + struct ovl_inode_params oip = { + .lowerpath = lowerpath, + .index = index, + .numlower = !!lower + }; + + /* We get overlay directory dentries with ovl_lookup_real() */ + if (d_is_dir(upper ?: lower)) + return ERR_PTR(-EIO); + + oip.upperdentry = dget(upper); + inode = ovl_get_inode(sb, &oip); + if (IS_ERR(inode)) { + dput(upper); + return ERR_CAST(inode); + } + + if (upper) + ovl_set_flag(OVL_UPPERDATA, inode); + + dentry = d_find_any_alias(inode); + if (!dentry) { + dentry = d_alloc_anon(inode->i_sb); + if (!dentry) + goto nomem; + oe = ovl_alloc_entry(lower ? 1 : 0); + if (!oe) + goto nomem; + + if (lower) { + oe->lowerstack->dentry = dget(lower); + oe->lowerstack->layer = lowerpath->layer; + } + dentry->d_fsdata = oe; + if (upper_alias) + ovl_dentry_set_upper_alias(dentry); + } + + return d_instantiate_anon(dentry, inode); + +nomem: + iput(inode); + dput(dentry); + return ERR_PTR(-ENOMEM); +} + +/* Get the upper or lower dentry in stach whose on layer @idx */ +static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) +{ + struct ovl_entry *oe = dentry->d_fsdata; + int i; + + if (!idx) + return ovl_dentry_upper(dentry); + + for (i = 0; i < oe->numlower; i++) { + if (oe->lowerstack[i].layer->idx == idx) + return oe->lowerstack[i].dentry; + } + + return NULL; +} + +/* + * Lookup a child overlay dentry to get a connected overlay dentry whose real + * dentry is @real. If @real is on upper layer, we lookup a child overlay + * dentry with the same name as the real dentry. Otherwise, we need to consult + * index for lookup. + */ +static struct dentry *ovl_lookup_real_one(struct dentry *connected, + struct dentry *real, + struct ovl_layer *layer) +{ + struct inode *dir = d_inode(connected); + struct dentry *this, *parent = NULL; + struct name_snapshot name; + int err; + + /* + * Lookup child overlay dentry by real name. The dir mutex protects us + * from racing with overlay rename. If the overlay dentry that is above + * real has already been moved to a parent that is not under the + * connected overlay dir, we return -ECHILD and restart the lookup of + * connected real path from the top. + */ + inode_lock_nested(dir, I_MUTEX_PARENT); + err = -ECHILD; + parent = dget_parent(real); + if (ovl_dentry_real_at(connected, layer->idx) != parent) + goto fail; + + /* + * We also need to take a snapshot of real dentry name to protect us + * from racing with underlying layer rename. In this case, we don't + * care about returning ESTALE, only from dereferencing a free name + * pointer because we hold no lock on the real dentry. + */ + take_dentry_name_snapshot(&name, real); + this = lookup_one_len(name.name, connected, strlen(name.name)); + err = PTR_ERR(this); + if (IS_ERR(this)) { + goto fail; + } else if (!this || !this->d_inode) { + dput(this); + err = -ENOENT; + goto fail; + } else if (ovl_dentry_real_at(this, layer->idx) != real) { + dput(this); + err = -ESTALE; + goto fail; + } + +out: + release_dentry_name_snapshot(&name); + dput(parent); + inode_unlock(dir); + return this; + +fail: + pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + real, layer->idx, connected, err); + this = ERR_PTR(err); + goto out; +} + +static struct dentry *ovl_lookup_real(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer); + +/* + * Lookup an indexed or hashed overlay dentry by real inode. + */ +static struct dentry *ovl_lookup_real_inode(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; + struct dentry *index = NULL; + struct dentry *this = NULL; + struct inode *inode; + + /* + * Decoding upper dir from index is expensive, so first try to lookup + * overlay dentry in inode/dcache. + */ + inode = ovl_lookup_inode(sb, real, !layer->idx); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (inode) { + this = d_find_any_alias(inode); + iput(inode); + } + + /* + * For decoded lower dir file handle, lookup index by origin to check + * if lower dir was copied up and and/or removed. + */ + if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) { + index = ovl_lookup_index(ofs, NULL, real, false); + if (IS_ERR(index)) + return index; + } + + /* Get connected upper overlay dir from index */ + if (index) { + struct dentry *upper = ovl_index_upper(ofs, index); + + dput(index); + if (IS_ERR_OR_NULL(upper)) + return upper; + + /* + * ovl_lookup_real() in lower layer may call recursively once to + * ovl_lookup_real() in upper layer. The first level call walks + * back lower parents to the topmost indexed parent. The second + * recursive call walks back from indexed upper to the topmost + * connected/hashed upper parent (or up to root). + */ + this = ovl_lookup_real(sb, upper, &upper_layer); + dput(upper); + } + + if (IS_ERR_OR_NULL(this)) + return this; + + if (ovl_dentry_real_at(this, layer->idx) != real) { + dput(this); + this = ERR_PTR(-EIO); + } + + return this; +} + +/* + * Lookup an indexed or hashed overlay dentry, whose real dentry is an + * ancestor of @real. + */ +static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer) +{ + struct dentry *next, *parent = NULL; + struct dentry *ancestor = ERR_PTR(-EIO); + + if (real == layer->mnt->mnt_root) + return dget(sb->s_root); + + /* Find the topmost indexed or hashed ancestor */ + next = dget(real); + for (;;) { + parent = dget_parent(next); + + /* + * Lookup a matching overlay dentry in inode/dentry + * cache or in index by real inode. + */ + ancestor = ovl_lookup_real_inode(sb, next, layer); + if (ancestor) + break; + + if (parent == layer->mnt->mnt_root) { + ancestor = dget(sb->s_root); + break; + } + + /* + * If @real has been moved out of the layer root directory, + * we will eventully hit the real fs root. This cannot happen + * by legit overlay rename, so we return error in that case. + */ + if (parent == next) { + ancestor = ERR_PTR(-EXDEV); + break; + } + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + return ancestor; +} + +/* + * Lookup a connected overlay dentry whose real dentry is @real. + * If @real is on upper layer, we lookup a child overlay dentry with the same + * path the real dentry. Otherwise, we need to consult index for lookup. + */ +static struct dentry *ovl_lookup_real(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer) +{ + struct dentry *connected; + int err = 0; + + connected = ovl_lookup_real_ancestor(sb, real, layer); + if (IS_ERR(connected)) + return connected; + + while (!err) { + struct dentry *next, *this; + struct dentry *parent = NULL; + struct dentry *real_connected = ovl_dentry_real_at(connected, + layer->idx); + + if (real_connected == real) + break; + + /* Find the topmost dentry not yet connected */ + next = dget(real); + for (;;) { + parent = dget_parent(next); + + if (parent == real_connected) + break; + + /* + * If real has been moved out of 'real_connected', + * we will not find 'real_connected' and hit the layer + * root. In that case, we need to restart connecting. + * This game can go on forever in the worst case. We + * may want to consider taking s_vfs_rename_mutex if + * this happens more than once. + */ + if (parent == layer->mnt->mnt_root) { + dput(connected); + connected = dget(sb->s_root); + break; + } + + /* + * If real file has been moved out of the layer root + * directory, we will eventully hit the real fs root. + * This cannot happen by legit overlay rename, so we + * return error in that case. + */ + if (parent == next) { + err = -EXDEV; + break; + } + + dput(next); + next = parent; + } + + if (!err) { + this = ovl_lookup_real_one(connected, next, layer); + if (IS_ERR(this)) + err = PTR_ERR(this); + + /* + * Lookup of child in overlay can fail when racing with + * overlay rename of child away from 'connected' parent. + * In this case, we need to restart the lookup from the + * top, because we cannot trust that 'real_connected' is + * still an ancestor of 'real'. There is a good chance + * that the renamed overlay ancestor is now in cache, so + * ovl_lookup_real_ancestor() will find it and we can + * continue to connect exactly from where lookup failed. + */ + if (err == -ECHILD) { + this = ovl_lookup_real_ancestor(sb, real, + layer); + err = PTR_ERR_OR_ZERO(this); + } + if (!err) { + dput(connected); + connected = this; + } + } + + dput(parent); + dput(next); + } + + if (err) + goto fail; + + return connected; + +fail: + pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + real, layer->idx, connected, err); + dput(connected); + return ERR_PTR(err); +} + +/* + * Get an overlay dentry from upper/lower real dentries and index. + */ +static struct dentry *ovl_get_dentry(struct super_block *sb, + struct dentry *upper, + struct ovl_path *lowerpath, + struct dentry *index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; + struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer; + struct dentry *real = upper ?: (index ?: lowerpath->dentry); + + /* + * Obtain a disconnected overlay dentry from a non-dir real dentry + * and index. + */ + if (!d_is_dir(real)) + return ovl_obtain_alias(sb, upper, lowerpath, index); + + /* Removed empty directory? */ + if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real)) + return ERR_PTR(-ENOENT); + + /* + * If real dentry is connected and hashed, get a connected overlay + * dentry whose real dentry is @real. + */ + return ovl_lookup_real(sb, real, layer); +} + +static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, + struct ovl_fh *fh) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct dentry *dentry; + struct dentry *upper; + + if (!ofs->upper_mnt) + return ERR_PTR(-EACCES); + + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); + if (IS_ERR_OR_NULL(upper)) + return upper; + + dentry = ovl_get_dentry(sb, upper, NULL, NULL); + dput(upper); + + return dentry; +} + +static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, + struct ovl_fh *fh) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; + struct dentry *dentry = NULL; + struct dentry *index = NULL; + struct inode *inode; + int err; + + /* First lookup overlay inode in inode cache by origin fh */ + err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack); + if (err) + return ERR_PTR(err); + + if (!d_is_dir(origin.dentry) || + !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + inode = ovl_lookup_inode(sb, origin.dentry, false); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_err; + if (inode) { + dentry = d_find_any_alias(inode); + iput(inode); + if (dentry) + goto out; + } + } + + /* Then lookup indexed upper/whiteout by origin fh */ + if (ofs->indexdir) { + index = ovl_get_index_fh(ofs, fh); + err = PTR_ERR(index); + if (IS_ERR(index)) { + index = NULL; + goto out_err; + } + } + + /* Then try to get a connected upper dir by index */ + if (index && d_is_dir(index)) { + struct dentry *upper = ovl_index_upper(ofs, index); + + err = PTR_ERR(upper); + if (IS_ERR_OR_NULL(upper)) + goto out_err; + + dentry = ovl_get_dentry(sb, upper, NULL, NULL); + dput(upper); + goto out; + } + + /* Find origin.dentry again with ovl_acceptable() layer check */ + if (d_is_dir(origin.dentry)) { + dput(origin.dentry); + origin.dentry = NULL; + err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); + if (err) + goto out_err; + } + if (index) { + err = ovl_verify_origin(index, origin.dentry, false); + if (err) + goto out_err; + } + + /* Get a connected non-upper dir or disconnected non-dir */ + dentry = ovl_get_dentry(sb, NULL, &origin, index); + +out: + dput(origin.dentry); + dput(index); + return dentry; + +out_err: + dentry = ERR_PTR(err); + goto out; +} + +static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct dentry *dentry = NULL; + struct ovl_fh *fh = (struct ovl_fh *) fid; + int len = fh_len << 2; + unsigned int flags = 0; + int err; + + err = -EINVAL; + if (fh_type != OVL_FILEID) + goto out_err; + + err = ovl_check_fh_len(fh, len); + if (err) + goto out_err; + + flags = fh->flags; + dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? + ovl_upper_fh_to_d(sb, fh) : + ovl_lower_fh_to_d(sb, fh); + err = PTR_ERR(dentry); + if (IS_ERR(dentry) && err != -ESTALE) + goto out_err; + + return dentry; + +out_err: + pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", + len, fh_type, flags, err); + return ERR_PTR(err); +} + +static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); + return ERR_PTR(-EACCES); +} + +static int ovl_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + /* + * ovl_fh_to_dentry() returns connected dir overlay dentries and + * ovl_fh_to_parent() is not implemented, so we should not get here. + */ + WARN_ON_ONCE(1); + return -EIO; +} + +static struct dentry *ovl_get_parent(struct dentry *dentry) +{ + /* + * ovl_fh_to_dentry() returns connected dir overlay dentries, so we + * should not get here. + */ + WARN_ON_ONCE(1); + return ERR_PTR(-EIO); +} + +const struct export_operations ovl_export_operations = { + .encode_fh = ovl_encode_fh, + .fh_to_dentry = ovl_fh_to_dentry, + .fh_to_parent = ovl_fh_to_parent, + .get_name = ovl_get_name, + .get_parent = ovl_get_parent, +}; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c new file mode 100644 index 000000000..73c3e2c21 --- /dev/null +++ b/fs/overlayfs/file.c @@ -0,0 +1,653 @@ +/* + * Copyright (C) 2017 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/xattr.h> +#include <linux/uio.h> +#include <linux/uaccess.h> +#include "overlayfs.h" + +static char ovl_whatisit(struct inode *inode, struct inode *realinode) +{ + if (realinode != ovl_inode_upper(inode)) + return 'l'; + if (ovl_has_upperdata(inode)) + return 'u'; + else + return 'm'; +} + +/* No atime modificaton nor notify on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) + +static struct file *ovl_open_realfile(const struct file *file, + struct inode *realinode) +{ + struct inode *inode = file_inode(file); + struct file *realfile; + const struct cred *old_cred; + int flags = file->f_flags | OVL_OPEN_FLAGS; + + old_cred = ovl_override_creds(inode->i_sb); + realfile = open_with_fake_path(&file->f_path, flags, realinode, + current_cred()); + revert_creds(old_cred); + + pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", + file, file, ovl_whatisit(inode, realinode), file->f_flags, + realfile, IS_ERR(realfile) ? 0 : realfile->f_flags); + + return realfile; +} + +#define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT) + +static int ovl_change_flags(struct file *file, unsigned int flags) +{ + struct inode *inode = file_inode(file); + int err; + + flags |= OVL_OPEN_FLAGS; + + /* If some flag changed that cannot be changed then something's amiss */ + if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK)) + return -EIO; + + flags &= OVL_SETFL_MASK; + + if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) + return -EPERM; + + if (flags & O_DIRECT) { + if (!file->f_mapping->a_ops || + !file->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + + if (file->f_op->check_flags) { + err = file->f_op->check_flags(flags); + if (err) + return err; + } + + spin_lock(&file->f_lock); + file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; + spin_unlock(&file->f_lock); + + return 0; +} + +static int ovl_real_fdget_meta(const struct file *file, struct fd *real, + bool allow_meta) +{ + struct inode *inode = file_inode(file); + struct inode *realinode; + + real->flags = 0; + real->file = file->private_data; + + if (allow_meta) + realinode = ovl_inode_real(inode); + else + realinode = ovl_inode_realdata(inode); + + /* Has it been copied up since we'd opened it? */ + if (unlikely(file_inode(real->file) != realinode)) { + real->flags = FDPUT_FPUT; + real->file = ovl_open_realfile(file, realinode); + + return PTR_ERR_OR_ZERO(real->file); + } + + /* Did the flags change since open? */ + if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS)) + return ovl_change_flags(real->file, file->f_flags); + + return 0; +} + +static int ovl_real_fdget(const struct file *file, struct fd *real) +{ + return ovl_real_fdget_meta(file, real, false); +} + +static int ovl_open(struct inode *inode, struct file *file) +{ + struct file *realfile; + int err; + + err = ovl_maybe_copy_up(file_dentry(file), file->f_flags); + if (err) + return err; + + /* No longer need these flags, so don't pass them on to underlying fs */ + file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + + realfile = ovl_open_realfile(file, ovl_inode_realdata(inode)); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + file->private_data = realfile; + + return 0; +} + +static int ovl_release(struct inode *inode, struct file *file) +{ + fput(file->private_data); + + return 0; +} + +static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + loff_t ret; + + /* + * The two special cases below do not need to involve real fs, + * so we can optimizing concurrent callers. + */ + if (offset == 0) { + if (whence == SEEK_CUR) + return file->f_pos; + + if (whence == SEEK_SET) + return vfs_setpos(file, 0, 0); + } + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + /* + * Overlay file f_pos is the master copy that is preserved + * through copy up and modified on read/write, but only real + * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose + * limitations that are more strict than ->s_maxbytes for specific + * files, so we use the real file to perform seeks. + */ + inode_lock(inode); + real.file->f_pos = file->f_pos; + + old_cred = ovl_override_creds(inode->i_sb); + ret = vfs_llseek(real.file, offset, whence); + revert_creds(old_cred); + + file->f_pos = real.file->f_pos; + inode_unlock(inode); + + fdput(real); + + return ret; +} + +static void ovl_file_accessed(struct file *file) +{ + struct inode *inode, *upperinode; + + if (file->f_flags & O_NOATIME) + return; + + inode = file_inode(file); + upperinode = ovl_inode_upper(inode); + + if (!upperinode) + return; + + if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) || + !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) { + inode->i_mtime = upperinode->i_mtime; + inode->i_ctime = upperinode->i_ctime; + } + + touch_atime(&file->f_path); +} + +static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) +{ + int ifl = iocb->ki_flags; + rwf_t flags = 0; + + if (ifl & IOCB_NOWAIT) + flags |= RWF_NOWAIT; + if (ifl & IOCB_HIPRI) + flags |= RWF_HIPRI; + if (ifl & IOCB_DSYNC) + flags |= RWF_DSYNC; + if (ifl & IOCB_SYNC) + flags |= RWF_SYNC; + + return flags; +} + +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + if (!iov_iter_count(iter)) + return 0; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + revert_creds(old_cred); + + ovl_file_accessed(file); + + fdput(real); + + return ret; +} + +static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + if (!iov_iter_count(iter)) + return 0; + + inode_lock(inode); + /* Update mode */ + ovl_copyattr(ovl_inode_real(inode), inode); + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(file, &real); + if (ret) + goto out_unlock; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + file_start_write(real.file); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + + fdput(real); + +out_unlock: + inode_unlock(inode); + + return ret; +} + +static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget_meta(file, &real, !datasync); + if (ret) + return ret; + + /* Don't sync lower file for fear of receiving EROFS error */ + if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) { + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fsync_range(real.file, start, end, datasync); + revert_creds(old_cred); + } + + fdput(real); + + return ret; +} + +static int ovl_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *realfile = file->private_data; + const struct cred *old_cred; + int ret; + + if (!realfile->f_op->mmap) + return -ENODEV; + + if (WARN_ON(file != vma->vm_file)) + return -EIO; + + vma->vm_file = get_file(realfile); + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ret) { + /* Drop reference count from new vm_file value */ + fput(realfile); + } else { + /* Drop reference count from previous vm_file value */ + fput(file); + } + + ovl_file_accessed(file); + + return ret; +} + +static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fallocate(real.file, mode, offset, len); + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + + fdput(real); + + return ret; +} + +static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fadvise(real.file, offset, len, advice); + revert_creds(old_cred); + + fdput(real); + + return ret; +} + +static long ovl_real_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fd real; + const struct cred *old_cred; + long ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_ioctl(real.file, cmd, arg); + revert_creds(old_cred); + + fdput(real); + + return ret; +} + +static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int iflags) +{ + long ret; + struct inode *inode = file_inode(file); + unsigned int old_iflags; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(inode); + + /* Check the capability before cred override */ + ret = -EPERM; + old_iflags = READ_ONCE(inode->i_flags); + if (((iflags ^ old_iflags) & (S_APPEND | S_IMMUTABLE)) && + !capable(CAP_LINUX_IMMUTABLE)) + goto unlock; + + ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY); + if (ret) + goto unlock; + + ret = ovl_real_ioctl(file, cmd, arg); + + ovl_copyflags(ovl_inode_real(inode), inode); +unlock: + inode_unlock(inode); + + mnt_drop_write_file(file); + + return ret; + +} + +static unsigned int ovl_fsflags_to_iflags(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & FS_SYNC_FL) + iflags |= S_SYNC; + if (flags & FS_APPEND_FL) + iflags |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + iflags |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + iflags |= S_NOATIME; + + return iflags; +} + +static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd, + unsigned long arg) +{ + unsigned int flags; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + return ovl_ioctl_set_flags(file, cmd, arg, + ovl_fsflags_to_iflags(flags)); +} + +static unsigned int ovl_fsxflags_to_iflags(unsigned int xflags) +{ + unsigned int iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= S_SYNC; + if (xflags & FS_XFLAG_APPEND) + iflags |= S_APPEND; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= S_IMMUTABLE; + if (xflags & FS_XFLAG_NOATIME) + iflags |= S_NOATIME; + + return iflags; +} + +static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fsxattr fa; + + memset(&fa, 0, sizeof(fa)); + if (copy_from_user(&fa, (void __user *) arg, sizeof(fa))) + return -EFAULT; + + return ovl_ioctl_set_flags(file, cmd, arg, + ovl_fsxflags_to_iflags(fa.fsx_xflags)); +} + +static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret; + + switch (cmd) { + case FS_IOC_GETFLAGS: + case FS_IOC_FSGETXATTR: + ret = ovl_real_ioctl(file, cmd, arg); + break; + + case FS_IOC_SETFLAGS: + ret = ovl_ioctl_set_fsflags(file, cmd, arg); + break; + + case FS_IOC_FSSETXATTR: + ret = ovl_ioctl_set_fsxflags(file, cmd, arg); + break; + + default: + ret = -ENOTTY; + } + + return ret; +} + +static long ovl_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + + default: + return -ENOIOCTLCMD; + } + + return ovl_ioctl(file, cmd, arg); +} + +enum ovl_copyop { + OVL_COPY, + OVL_CLONE, + OVL_DEDUPE, +}; + +static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int flags, enum ovl_copyop op) +{ + struct inode *inode_out = file_inode(file_out); + struct fd real_in, real_out; + const struct cred *old_cred; + ssize_t ret; + + ret = ovl_real_fdget(file_out, &real_out); + if (ret) + return ret; + + ret = ovl_real_fdget(file_in, &real_in); + if (ret) { + fdput(real_out); + return ret; + } + + old_cred = ovl_override_creds(file_inode(file_out)->i_sb); + switch (op) { + case OVL_COPY: + ret = vfs_copy_file_range(real_in.file, pos_in, + real_out.file, pos_out, len, flags); + break; + + case OVL_CLONE: + ret = vfs_clone_file_range(real_in.file, pos_in, + real_out.file, pos_out, len); + break; + + case OVL_DEDUPE: + ret = vfs_dedupe_file_range_one(real_in.file, pos_in, + real_out.file, pos_out, len); + break; + } + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(ovl_inode_real(inode_out), inode_out); + + fdput(real_in); + fdput(real_out); + + return ret; +} + +static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags, + OVL_COPY); +} + +static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, + OVL_CLONE); +} + +static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + /* + * Don't copy up because of a dedupe request, this wouldn't make sense + * most of the time (data would be duplicated instead of deduplicated). + */ + if (!ovl_inode_upper(file_inode(file_in)) || + !ovl_inode_upper(file_inode(file_out))) + return -EPERM; + + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, + OVL_DEDUPE); +} + +const struct file_operations ovl_file_operations = { + .open = ovl_open, + .release = ovl_release, + .llseek = ovl_llseek, + .read_iter = ovl_read_iter, + .write_iter = ovl_write_iter, + .fsync = ovl_fsync, + .mmap = ovl_mmap, + .fallocate = ovl_fallocate, + .fadvise = ovl_fadvise, + .unlocked_ioctl = ovl_ioctl, + .compat_ioctl = ovl_compat_ioctl, + + .copy_file_range = ovl_copy_file_range, + .clone_file_range = ovl_clone_file_range, + .dedupe_file_range = ovl_dedupe_file_range, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000..08e60a6df --- /dev/null +++ b/fs/overlayfs/inode.c @@ -0,0 +1,982 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + + +int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + bool full_copy_up = false; + struct dentry *upperdentry; + const struct cred *old_cred; + + err = setattr_prepare(dentry, attr); + if (err) + return err; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (attr->ia_valid & ATTR_SIZE) { + struct inode *realinode = d_inode(ovl_dentry_real(dentry)); + + err = -ETXTBSY; + if (atomic_read(&realinode->i_writecount) < 0) + goto out_drop_write; + + /* Truncate should trigger data copy up as well */ + full_copy_up = true; + } + + if (!full_copy_up) + err = ovl_copy_up(dentry); + else + err = ovl_copy_up_with_data(dentry); + if (!err) { + struct inode *winode = NULL; + + upperdentry = ovl_dentry_upper(dentry); + + if (attr->ia_valid & ATTR_SIZE) { + winode = d_inode(upperdentry); + err = get_write_access(winode); + if (err) + goto out_drop_write; + } + + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + + inode_lock(upperdentry->d_inode); + old_cred = ovl_override_creds(dentry->d_sb); + err = notify_change(upperdentry, attr, NULL); + revert_creds(old_cred); + if (!err) + ovl_copyattr(upperdentry->d_inode, dentry->d_inode); + inode_unlock(upperdentry->d_inode); + + if (winode) + put_write_access(winode); + } +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, + struct ovl_layer *lower_layer) +{ + bool samefs = ovl_same_sb(dentry->d_sb); + unsigned int xinobits = ovl_xino_bits(dentry->d_sb); + + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + return 0; + } else if (xinobits) { + unsigned int shift = 64 - xinobits; + /* + * All inode numbers of underlying fs should not be using the + * high xinobits, so we use high xinobits to partition the + * overlay st_ino address space. The high bits holds the fsid + * (upper fsid is 0). This way overlay inode numbers are unique + * and all inodes use overlay st_dev. Inode numbers are also + * persistent for a given layer configuration. + */ + if (stat->ino >> shift) { + pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + dentry, stat->ino, xinobits); + } else { + if (lower_layer) + stat->ino |= ((u64)lower_layer->fsid) << shift; + + stat->dev = dentry->d_sb->s_dev; + return 0; + } + } + + /* The inode could not be mapped to a unified st_ino address space */ + if (S_ISDIR(dentry->d_inode->i_mode)) { + /* + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. + */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } else if (lower_layer && lower_layer->fsid) { + /* + * For non-samefs setup, if we cannot map all layers st_ino + * to a unified address space, we need to make sure that st_dev + * is unique per lower fs. Upper layer uses real st_dev and + * lower layers use the unique anonymous bdev assigned to the + * lower fs. + */ + stat->dev = lower_layer->fs->pseudo_dev; + } + + return 0; +} + +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct dentry *dentry = path->dentry; + enum ovl_path_type type; + struct path realpath; + const struct cred *old_cred; + bool is_dir = S_ISDIR(dentry->d_inode->i_mode); + bool samefs = ovl_same_sb(dentry->d_sb); + struct ovl_layer *lower_layer = NULL; + int err; + bool metacopy_blocks = false; + + metacopy_blocks = ovl_is_metacopy_dentry(dentry); + + type = ovl_path_real(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&realpath, stat, request_mask, flags); + if (err) + goto out; + + /* + * For non-dir or same fs, we use st_ino of the copy up origin. + * This guaranties constant st_dev/st_ino across copy up. + * With xino feature and non-samefs, we use st_ino of the copy up + * origin masked with high bits that represent the layer id. + * + * If lower filesystem supports NFS file handles, this also guaranties + * persistent st_ino across mount cycle. + */ + if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) { + if (!OVL_TYPE_UPPER(type)) { + lower_layer = ovl_layer_lower(dentry); + } else if (OVL_TYPE_ORIGIN(type)) { + struct kstat lowerstat; + u32 lowermask = STATX_INO | STATX_BLOCKS | + (!is_dir ? STATX_NLINK : 0); + + ovl_path_lower(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerstat, + lowermask, flags); + if (err) + goto out; + + /* + * Lower hardlinks may be broken on copy up to different + * upper files, so we cannot use the lower origin st_ino + * for those different files, even for the same fs case. + * + * Similarly, several redirected dirs can point to the + * same dir on a lower layer. With the "verify_lower" + * feature, we do not use the lower origin st_ino, if + * we haven't verified that this redirect is unique. + * + * With inodes index enabled, it is safe to use st_ino + * of an indexed origin. The index validates that the + * upper hardlink is not broken and that a redirected + * dir is the only redirect to that origin. + */ + if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || + (!ovl_verify_lower(dentry->d_sb) && + (is_dir || lowerstat.nlink == 1))) { + lower_layer = ovl_layer_lower(dentry); + /* + * Cannot use origin st_dev;st_ino because + * origin inode content may differ from overlay + * inode content. + */ + if (samefs || lower_layer->fsid) + stat->ino = lowerstat.ino; + } + + /* + * If we are querying a metacopy dentry and lower + * dentry is data dentry, then use the blocks we + * queried just now. We don't have to do additional + * vfs_getattr(). If lower itself is metacopy, then + * additional vfs_getattr() is unavoidable. + */ + if (metacopy_blocks && + realpath.dentry == ovl_dentry_lowerdata(dentry)) { + stat->blocks = lowerstat.blocks; + metacopy_blocks = false; + } + } + + if (metacopy_blocks) { + /* + * If lower is not same as lowerdata or if there was + * no origin on upper, we can end up here. + */ + struct kstat lowerdatastat; + u32 lowermask = STATX_BLOCKS; + + ovl_path_lowerdata(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerdatastat, + lowermask, flags); + if (err) + goto out; + stat->blocks = lowerdatastat.blocks; + } + } + + err = ovl_map_dev_ino(dentry, stat, lower_layer); + if (err) + goto out; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (is_dir && OVL_TYPE_MERGE(type)) + stat->nlink = 1; + + /* + * Return the overlay inode nlinks for indexed upper inodes. + * Overlay inode nlink counts the union of the upper hardlinks + * and non-covered lower hardlinks. It does not include the upper + * index hardlink. + */ + if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry))) + stat->nlink = dentry->d_inode->i_nlink; + +out: + revert_creds(old_cred); + + return err; +} + +int ovl_permission(struct inode *inode, int mask) +{ + struct inode *upperinode = ovl_inode_upper(inode); + struct inode *realinode = upperinode ?: ovl_inode_lower(inode); + const struct cred *old_cred; + int err; + + /* Careful in RCU walk mode */ + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + return -ECHILD; + } + + /* + * Check overlay inode with the creds of task and underlying inode + * with creds of mounter + */ + err = generic_permission(inode, mask); + if (err) + return err; + + old_cred = ovl_override_creds(inode->i_sb); + if (!upperinode && + !special_file(realinode->i_mode) && mask & MAY_WRITE) { + mask &= ~(MAY_WRITE | MAY_APPEND); + /* Make sure mounter can read file for copy up later */ + mask |= MAY_READ; + } + err = inode_permission(realinode, mask); + revert_creds(old_cred); + + return err; +} + +static const char *ovl_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + const struct cred *old_cred; + const char *p; + + if (!dentry) + return ERR_PTR(-ECHILD); + + old_cred = ovl_override_creds(dentry->d_sb); + p = vfs_get_link(ovl_dentry_real(dentry), done); + revert_creds(old_cred); + return p; +} + +bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, OVL_XATTR_PREFIX, + sizeof(OVL_XATTR_PREFIX) - 1) == 0; +} + +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + const struct cred *old_cred; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (!value && !upperdentry) { + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getxattr(realdentry, name, NULL, 0); + revert_creds(old_cred); + if (err < 0) + goto out_drop_write; + } + + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + realdentry = ovl_dentry_upper(dentry); + } + + old_cred = ovl_override_creds(dentry->d_sb); + if (value) + err = vfs_setxattr(realdentry, name, value, size, flags); + else { + WARN_ON(flags != XATTR_REPLACE); + err = vfs_removexattr(realdentry, name); + } + revert_creds(old_cred); + + /* copy c/mtime */ + ovl_copyattr(d_inode(realdentry), inode); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, + void *value, size_t size) +{ + ssize_t res; + const struct cred *old_cred; + struct dentry *realdentry = + ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); + + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_getxattr(realdentry, name, value, size); + revert_creds(old_cred); + return res; +} + +static bool ovl_can_list(const char *s) +{ + /* List all non-trusted xatts */ + if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + return true; + + /* Never list trusted.overlay, list other trusted for superuser only */ + return !ovl_is_private_xattr(s) && + ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + struct dentry *realdentry = ovl_dentry_real(dentry); + ssize_t res; + size_t len; + char *s; + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_listxattr(realdentry, list, size); + revert_creds(old_cred); + if (res <= 0 || size == 0) + return res; + + /* filter out private xattrs */ + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + + len -= slen; + if (!ovl_can_list(s)) { + res -= slen; + memmove(s, s + slen, len); + } else { + s += slen; + } + } + + return res; +} + +struct posix_acl *ovl_get_acl(struct inode *inode, int type) +{ + struct inode *realinode = ovl_inode_real(inode); + const struct cred *old_cred; + struct posix_acl *acl; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) + return NULL; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + + return acl; +} + +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) +{ + if (flags & S_ATIME) { + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct path upperpath = { + .mnt = ofs->upper_mnt, + .dentry = ovl_upperdentry_dereference(OVL_I(inode)), + }; + + if (upperpath.dentry) { + touch_atime(&upperpath); + inode->i_atime = d_inode(upperpath.dentry)->i_atime; + } + } + return 0; +} + +static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + int err; + struct inode *realinode = ovl_inode_real(inode); + const struct cred *old_cred; + + if (!realinode->i_op->fiemap) + return -EOPNOTSUPP; + + old_cred = ovl_override_creds(inode->i_sb); + + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(realinode->i_mapping); + + err = realinode->i_op->fiemap(realinode, fieinfo, start, len); + revert_creds(old_cred); + + return err; +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, + .fiemap = ovl_fiemap, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .get_link = ovl_get_link, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .update_time = ovl_update_time, +}; + +static const struct inode_operations ovl_special_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, +}; + +static const struct address_space_operations ovl_aops = { + /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ + .direct_IO = noop_direct_IO, +}; + +/* + * It is possible to stack overlayfs instance on top of another + * overlayfs instance as lower layer. We need to annonate the + * stackable i_mutex locks according to stack level of the super + * block instance. An overlayfs instance can never be in stack + * depth 0 (there is always a real fs below it). An overlayfs + * inode lock will use the lockdep annotaion ovl_i_mutex_key[depth]. + * + * For example, here is a snip from /proc/lockdep_chains after + * dir_iterate of nested overlayfs: + * + * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) + * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) + * [...] &type->i_mutex_dir_key (stack_depth=0) + */ +#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH + +static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) +{ +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; + + int depth = inode->i_sb->s_stack_depth - 1; + + if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) + depth = 0; + + if (S_ISDIR(inode->i_mode)) + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); + else + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); + + lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); +#endif +} + +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, + unsigned long ino, int fsid) +{ + int xinobits = ovl_xino_bits(inode->i_sb); + + /* + * When d_ino is consistent with st_ino (samefs or i_ino has enough + * bits to encode layer), set the same value used for st_ino to i_ino, + * so inode number exposed via /proc/locks and a like will be + * consistent with d_ino and st_ino values. An i_ino value inconsistent + * with d_ino also causes nfsd readdirplus to fail. When called from + * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real + * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). + */ + if (ovl_same_sb(inode->i_sb) || xinobits) { + inode->i_ino = ino; + if (xinobits && fsid && !(ino >> (64 - xinobits))) + inode->i_ino |= (unsigned long)fsid << (64 - xinobits); + } else { + inode->i_ino = get_next_ino(); + } + inode->i_mode = mode; + inode->i_flags |= S_NOCMTIME; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; +#endif + + ovl_lockdep_annotate_inode_mutex_key(inode); + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_op = &ovl_file_inode_operations; + inode->i_fop = &ovl_file_operations; + inode->i_mapping->a_ops = &ovl_aops; + break; + + case S_IFDIR: + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + default: + inode->i_op = &ovl_special_inode_operations; + init_special_inode(inode, mode, rdev); + break; + } +} + +/* + * With inodes index enabled, an overlay inode nlink counts the union of upper + * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure + * upper inode, the following nlink modifying operations can happen: + * + * 1. Lower hardlink copy up + * 2. Upper hardlink created, unlinked or renamed over + * 3. Lower hardlink whiteout or renamed over + * + * For the first, copy up case, the union nlink does not change, whether the + * operation succeeds or fails, but the upper inode nlink may change. + * Therefore, before copy up, we store the union nlink value relative to the + * lower inode nlink in the index inode xattr trusted.overlay.nlink. + * + * For the second, upper hardlink case, the union nlink should be incremented + * or decremented IFF the operation succeeds, aligned with nlink change of the + * upper inode. Therefore, before link/unlink/rename, we store the union nlink + * value relative to the upper inode nlink in the index inode. + * + * For the last, lower cover up case, we simplify things by preceding the + * whiteout or cover up with copy up. This makes sure that there is an index + * upper inode where the nlink xattr can be stored before the copied up upper + * entry is unlink. + */ +#define OVL_NLINK_ADD_UPPER (1 << 0) + +/* + * On-disk format for indexed nlink: + * + * nlink relative to the upper inode - "U[+-]NUM" + * nlink relative to the lower inode - "L[+-]NUM" + */ + +static int ovl_set_nlink_common(struct dentry *dentry, + struct dentry *realdentry, const char *format) +{ + struct inode *inode = d_inode(dentry); + struct inode *realinode = d_inode(realdentry); + char buf[13]; + int len; + + len = snprintf(buf, sizeof(buf), format, + (int) (inode->i_nlink - realinode->i_nlink)); + + if (WARN_ON(len >= sizeof(buf))) + return -EIO; + + return ovl_do_setxattr(ovl_dentry_upper(dentry), + OVL_XATTR_NLINK, buf, len, 0); +} + +int ovl_set_nlink_upper(struct dentry *dentry) +{ + return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i"); +} + +int ovl_set_nlink_lower(struct dentry *dentry) +{ + return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i"); +} + +unsigned int ovl_get_nlink(struct dentry *lowerdentry, + struct dentry *upperdentry, + unsigned int fallback) +{ + int nlink_diff; + int nlink; + char buf[13]; + int err; + + if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) + return fallback; + + err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1); + if (err < 0) + goto fail; + + buf[err] = '\0'; + if ((buf[0] != 'L' && buf[0] != 'U') || + (buf[1] != '+' && buf[1] != '-')) + goto fail; + + err = kstrtoint(buf + 1, 10, &nlink_diff); + if (err < 0) + goto fail; + + nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink; + nlink += nlink_diff; + + if (nlink <= 0) + goto fail; + + return nlink; + +fail: + pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n", + upperdentry, err); + return fallback; +} + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) + ovl_fill_inode(inode, mode, rdev, 0, 0); + + return inode; +} + +static int ovl_inode_test(struct inode *inode, void *data) +{ + return inode->i_private == data; +} + +static int ovl_inode_set(struct inode *inode, void *data) +{ + inode->i_private = data; + return 0; +} + +static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, + struct dentry *upperdentry, bool strict) +{ + /* + * For directories, @strict verify from lookup path performs consistency + * checks, so NULL lower/upper in dentry must match NULL lower/upper in + * inode. Non @strict verify from NFS handle decode path passes NULL for + * 'unknown' lower/upper. + */ + if (S_ISDIR(inode->i_mode) && strict) { + /* Real lower dir moved to upper layer under us? */ + if (!lowerdentry && ovl_inode_lower(inode)) + return false; + + /* Lookup of an uncovered redirect origin? */ + if (!upperdentry && ovl_inode_upper(inode)) + return false; + } + + /* + * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. + * This happens when finding a copied up overlay inode for a renamed + * or hardlinked overlay dentry and lower dentry cannot be followed + * by origin because lower fs does not support file handles. + */ + if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) + return false; + + /* + * Allow non-NULL __upperdentry in inode even if upperdentry is NULL. + * This happens when finding a lower alias for a copied up hard link. + */ + if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry)) + return false; + + return true; +} + +struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, + bool is_upper) +{ + struct inode *inode, *key = d_inode(real); + + inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); + if (!inode) + return NULL; + + if (!ovl_verify_inode(inode, is_upper ? NULL : real, + is_upper ? real : NULL, false)) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir) +{ + struct inode *key = d_inode(dir); + struct inode *trap; + bool res; + + trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); + if (!trap) + return false; + + res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) && + !ovl_inode_lower(trap); + + iput(trap); + return res; +} + +/* + * Create an inode cache entry for layer root dir, that will intentionally + * fail ovl_verify_inode(), so any lookup that will find some layer root + * will fail. + */ +struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) +{ + struct inode *key = d_inode(dir); + struct inode *trap; + + if (!d_is_dir(dir)) + return ERR_PTR(-ENOTDIR); + + trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test, + ovl_inode_set, key); + if (!trap) + return ERR_PTR(-ENOMEM); + + if (!(trap->i_state & I_NEW)) { + /* Conflicting layer roots? */ + iput(trap); + return ERR_PTR(-ELOOP); + } + + trap->i_mode = S_IFDIR; + trap->i_flags = S_DEAD; + unlock_new_inode(trap); + + return trap; +} + +/* + * Does overlay inode need to be hashed by lower inode? + */ +static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, + struct dentry *lower, struct dentry *index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + /* No, if pure upper */ + if (!lower) + return false; + + /* Yes, if already indexed */ + if (index) + return true; + + /* Yes, if won't be copied up */ + if (!ofs->upper_mnt) + return true; + + /* No, if lower hardlink is or will be broken on copy up */ + if ((upper || !ovl_indexdir(sb)) && + !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return false; + + /* No, if non-indexed upper with NFS export */ + if (sb->s_export_op && upper) + return false; + + /* Otherwise, hash by lower inode for fsnotify */ + return true; +} + +static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, + struct inode *key) +{ + return newinode ? inode_insert5(newinode, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key) : + iget5_locked(sb, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key); +} + +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip) +{ + struct dentry *upperdentry = oip->upperdentry; + struct ovl_path *lowerpath = oip->lowerpath; + struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; + struct inode *inode; + struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; + bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, + oip->index); + int fsid = bylower ? lowerpath->layer->fsid : 0; + bool is_dir, metacopy = false; + unsigned long ino = 0; + int err = oip->newinode ? -EEXIST : -ENOMEM; + + if (!realinode) + realinode = d_inode(lowerdentry); + + /* + * Copy up origin (lower) may exist for non-indexed upper, but we must + * not use lower as hash key if this is a broken hardlink. + */ + is_dir = S_ISDIR(realinode->i_mode); + if (upperdentry || bylower) { + struct inode *key = d_inode(bylower ? lowerdentry : + upperdentry); + unsigned int nlink = is_dir ? 1 : realinode->i_nlink; + + inode = ovl_iget5(sb, oip->newinode, key); + if (!inode) + goto out_err; + if (!(inode->i_state & I_NEW)) { + /* + * Verify that the underlying files stored in the inode + * match those in the dentry. + */ + if (!ovl_verify_inode(inode, lowerdentry, upperdentry, + true)) { + iput(inode); + err = -ESTALE; + goto out_err; + } + + dput(upperdentry); + kfree(oip->redirect); + goto out; + } + + /* Recalculate nlink for non-dir due to indexing */ + if (!is_dir) + nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); + set_nlink(inode, nlink); + ino = key->i_ino; + } else { + /* Lower hardlink that will be broken on copy up */ + inode = new_inode(sb); + if (!inode) { + err = -ENOMEM; + goto out_err; + } + ino = realinode->i_ino; + fsid = lowerpath->layer->fsid; + } + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid); + ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata); + + if (upperdentry && ovl_is_impuredir(upperdentry)) + ovl_set_flag(OVL_IMPURE, inode); + + if (oip->index) + ovl_set_flag(OVL_INDEX, inode); + + if (upperdentry) { + err = ovl_check_metacopy_xattr(upperdentry); + if (err < 0) + goto out_err; + metacopy = err; + if (!metacopy) + ovl_set_flag(OVL_UPPERDATA, inode); + } + + OVL_I(inode)->redirect = oip->redirect; + + if (bylower) + ovl_set_flag(OVL_CONST_INO, inode); + + /* Check for non-merge dir that may have whiteouts */ + if (is_dir) { + if (((upperdentry && lowerdentry) || oip->numlower > 1) || + ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { + ovl_set_flag(OVL_WHITEOUTS, inode); + } + } + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); +out: + return inode; + +out_err: + pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err); + inode = ERR_PTR(err); + goto out; +} diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c new file mode 100644 index 000000000..badf03926 --- /dev/null +++ b/fs/overlayfs/namei.c @@ -0,0 +1,1164 @@ +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/cred.h> +#include <linux/ctype.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/ratelimit.h> +#include <linux/mount.h> +#include <linux/exportfs.h> +#include "overlayfs.h" + +struct ovl_lookup_data { + struct super_block *sb; + struct qstr name; + bool is_dir; + bool opaque; + bool stop; + bool last; + char *redirect; + bool metacopy; +}; + +static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, + size_t prelen, const char *post) +{ + int res; + char *buf; + + buf = ovl_get_redirect_xattr(dentry, prelen + strlen(post)); + if (IS_ERR_OR_NULL(buf)) + return PTR_ERR(buf); + + if (buf[0] == '/') { + /* + * One of the ancestor path elements in an absolute path + * lookup in ovl_lookup_layer() could have been opaque and + * that will stop further lookup in lower layers (d->stop=true) + * But we have found an absolute redirect in decendant path + * element and that should force continue lookup in lower + * layers (reset d->stop). + */ + d->stop = false; + } else { + res = strlen(buf) + 1; + memmove(buf + prelen, buf, res); + memcpy(buf, d->name.name, prelen); + } + + strcat(buf, post); + kfree(d->redirect); + d->redirect = buf; + d->name.name = d->redirect; + d->name.len = strlen(d->redirect); + + return 0; +} + +static int ovl_acceptable(void *ctx, struct dentry *dentry) +{ + /* + * A non-dir origin may be disconnected, which is fine, because + * we only need it for its unique inode number. + */ + if (!d_is_dir(dentry)) + return 1; + + /* Don't decode a deleted empty directory */ + if (d_unhashed(dentry)) + return 0; + + /* Check if directory belongs to the layer we are decoding from */ + return is_subdir(dentry, ((struct vfsmount *)ctx)->mnt_root); +} + +/* + * Check validity of an overlay file handle buffer. + * + * Return 0 for a valid file handle. + * Return -ENODATA for "origin unknown". + * Return <0 for an invalid file handle. + */ +int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +{ + if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len) + return -EINVAL; + + if (fh->magic != OVL_FH_MAGIC) + return -EINVAL; + + /* Treat larger version and unknown flags as "origin unknown" */ + if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + return -ENODATA; + + /* Treat endianness mismatch as "origin unknown" */ + if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + return -ENODATA; + + return 0; +} + +static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) +{ + int res, err; + struct ovl_fh *fh = NULL; + + res = vfs_getxattr(dentry, name, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return NULL; + goto fail; + } + /* Zero size value means "copied up but origin unknown" */ + if (res == 0) + return NULL; + + fh = kzalloc(res, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + res = vfs_getxattr(dentry, name, fh, res); + if (res < 0) + goto fail; + + err = ovl_check_fh_len(fh, res); + if (err < 0) { + if (err == -ENODATA) + goto out; + goto invalid; + } + + return fh; + +out: + kfree(fh); + return NULL; + +fail: + pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res); + goto out; +invalid: + pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh); + goto out; +} + +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected) +{ + struct dentry *real; + int bytes; + + /* + * Make sure that the stored uuid matches the uuid of the lower + * layer where file handle will be decoded. + */ + if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + return NULL; + + bytes = (fh->len - offsetof(struct ovl_fh, fid)); + real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, + bytes >> 2, (int)fh->type, + connected ? ovl_acceptable : NULL, mnt); + if (IS_ERR(real)) { + /* + * Treat stale file handle to lower file as "origin unknown". + * upper file handle could become stale when upper file is + * unlinked and this information is needed to handle stale + * index entries correctly. + */ + if (real == ERR_PTR(-ESTALE) && + !(fh->flags & OVL_FH_FLAG_PATH_UPPER)) + real = NULL; + return real; + } + + if (ovl_dentry_weird(real)) { + dput(real); + return NULL; + } + + return real; +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE); +} + +static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, + const char *name, unsigned int namelen, + size_t prelen, const char *post, + struct dentry **ret) +{ + struct dentry *this; + int err; + bool last_element = !post[0]; + + this = lookup_one_len_unlocked(name, base, namelen); + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + if (err == -ENOENT || err == -ENAMETOOLONG) + goto out; + goto out_err; + } + if (!this->d_inode) + goto put_and_out; + + if (ovl_dentry_weird(this)) { + /* Don't support traversing automounts and other weirdness */ + err = -EREMOTE; + goto out_err; + } + if (ovl_is_whiteout(this)) { + d->stop = d->opaque = true; + goto put_and_out; + } + /* + * This dentry should be a regular file if previous layer lookup + * found a metacopy dentry. + */ + if (last_element && d->metacopy && !d_is_reg(this)) { + d->stop = true; + goto put_and_out; + } + if (!d_can_lookup(this)) { + if (d->is_dir || !last_element) { + d->stop = true; + goto put_and_out; + } + err = ovl_check_metacopy_xattr(this); + if (err < 0) + goto out_err; + + d->metacopy = err; + d->stop = !d->metacopy; + if (!d->metacopy || d->last) + goto out; + } else { + if (ovl_lookup_trap_inode(d->sb, this)) { + /* Caught in a trap of overlapping layers */ + err = -ELOOP; + goto out_err; + } + + if (last_element) + d->is_dir = true; + if (d->last) + goto out; + + if (ovl_is_opaquedir(this)) { + d->stop = true; + if (last_element) + d->opaque = true; + goto out; + } + } + err = ovl_check_redirect(this, d, prelen, post); + if (err) + goto out_err; +out: + *ret = this; + return 0; + +put_and_out: + dput(this); + this = NULL; + goto out; + +out_err: + dput(this); + return err; +} + +static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, + struct dentry **ret) +{ + /* Counting down from the end, since the prefix can change */ + size_t rem = d->name.len - 1; + struct dentry *dentry = NULL; + int err; + + if (d->name.name[0] != '/') + return ovl_lookup_single(base, d, d->name.name, d->name.len, + 0, "", ret); + + while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { + const char *s = d->name.name + d->name.len - rem; + const char *next = strchrnul(s, '/'); + size_t thislen = next - s; + bool end = !next[0]; + + /* Verify we did not go off the rails */ + if (WARN_ON(s[-1] != '/')) + return -EIO; + + err = ovl_lookup_single(base, d, s, thislen, + d->name.len - rem, next, &base); + dput(dentry); + if (err) + return err; + dentry = base; + if (end) + break; + + rem -= thislen + 1; + + if (WARN_ON(rem >= d->name.len)) + return -EIO; + } + *ret = dentry; + return 0; +} + + +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, + struct dentry *upperdentry, struct ovl_path **stackp) +{ + struct dentry *origin = NULL; + int i; + + for (i = 0; i < ofs->numlower; i++) { + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, + connected); + if (origin) + break; + } + + if (!origin) + return -ESTALE; + else if (IS_ERR(origin)) + return PTR_ERR(origin); + + if (upperdentry && !ovl_is_whiteout(upperdentry) && + ((d_inode(origin)->i_mode ^ d_inode(upperdentry)->i_mode) & S_IFMT)) + goto invalid; + + if (!*stackp) + *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); + if (!*stackp) { + dput(origin); + return -ENOMEM; + } + **stackp = (struct ovl_path){ + .dentry = origin, + .layer = &ofs->lower_layers[i] + }; + + return 0; + +invalid: + pr_warn_ratelimited("overlayfs: invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", + upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, + d_inode(origin)->i_mode & S_IFMT); + dput(origin); + return -EIO; +} + +static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, + struct ovl_path **stackp, unsigned int *ctrp) +{ + struct ovl_fh *fh = ovl_get_fh(upperdentry, OVL_XATTR_ORIGIN); + int err; + + if (IS_ERR_OR_NULL(fh)) + return PTR_ERR(fh); + + err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); + kfree(fh); + + if (err) { + if (err == -ESTALE) + return 0; + return err; + } + + if (WARN_ON(*ctrp)) + return -EIO; + + *ctrp = 1; + return 0; +} + +/* + * Verify that @fh matches the file handle stored in xattr @name. + * Return 0 on match, -ESTALE on mismatch, < 0 on error. + */ +static int ovl_verify_fh(struct dentry *dentry, const char *name, + const struct ovl_fh *fh) +{ + struct ovl_fh *ofh = ovl_get_fh(dentry, name); + int err = 0; + + if (!ofh) + return -ENODATA; + + if (IS_ERR(ofh)) + return PTR_ERR(ofh); + + if (fh->len != ofh->len || memcmp(fh, ofh, fh->len)) + err = -ESTALE; + + kfree(ofh); + return err; +} + +/* + * Verify that @real dentry matches the file handle stored in xattr @name. + * + * If @set is true and there is no stored file handle, encode @real and store + * file handle in xattr @name. + * + * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. + */ +int ovl_verify_set_fh(struct dentry *dentry, const char *name, + struct dentry *real, bool is_upper, bool set) +{ + struct inode *inode; + struct ovl_fh *fh; + int err; + + fh = ovl_encode_real_fh(real, is_upper); + err = PTR_ERR(fh); + if (IS_ERR(fh)) { + fh = NULL; + goto fail; + } + + err = ovl_verify_fh(dentry, name, fh); + if (set && err == -ENODATA) + err = ovl_do_setxattr(dentry, name, fh, fh->len, 0); + if (err) + goto fail; + +out: + kfree(fh); + return err; + +fail: + inode = d_inode(real); + pr_warn_ratelimited("overlayfs: failed to verify %s (%pd2, ino=%lu, err=%i)\n", + is_upper ? "upper" : "origin", real, + inode ? inode->i_ino : 0, err); + goto out; +} + +/* Get upper dentry from index */ +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) +{ + struct ovl_fh *fh; + struct dentry *upper; + + if (!d_is_dir(index)) + return dget(index); + + fh = ovl_get_fh(index, OVL_XATTR_UPPER); + if (IS_ERR_OR_NULL(fh)) + return ERR_CAST(fh); + + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); + kfree(fh); + + if (IS_ERR_OR_NULL(upper)) + return upper ?: ERR_PTR(-ESTALE); + + if (!d_is_dir(upper)) { + pr_warn_ratelimited("overlayfs: invalid index upper (%pd2, upper=%pd2).\n", + index, upper); + dput(upper); + return ERR_PTR(-EIO); + } + + return upper; +} + +/* Is this a leftover from create/whiteout of directory index entry? */ +static bool ovl_is_temp_index(struct dentry *index) +{ + return index->d_name.name[0] == '#'; +} + +/* + * Verify that an index entry name matches the origin file handle stored in + * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. + * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. + */ +int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) +{ + struct ovl_fh *fh = NULL; + size_t len; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; + struct dentry *upper = NULL; + int err; + + if (!d_inode(index)) + return 0; + + /* Cleanup leftover from index create/cleanup attempt */ + err = -ESTALE; + if (ovl_is_temp_index(index)) + goto fail; + + err = -EINVAL; + if (index->d_name.len < sizeof(struct ovl_fh)*2) + goto fail; + + err = -ENOMEM; + len = index->d_name.len / 2; + fh = kzalloc(len, GFP_KERNEL); + if (!fh) + goto fail; + + err = -EINVAL; + if (hex2bin((u8 *)fh, index->d_name.name, len)) + goto fail; + + err = ovl_check_fh_len(fh, len); + if (err) + goto fail; + + /* + * Whiteout index entries are used as an indication that an exported + * overlay file handle should be treated as stale (i.e. after unlink + * of the overlay inode). These entries contain no origin xattr. + */ + if (ovl_is_whiteout(index)) + goto out; + + /* + * Verifying directory index entries are not stale is expensive, so + * only verify stale dir index if NFS export is enabled. + */ + if (d_is_dir(index) && !ofs->config.nfs_export) + goto out; + + /* + * Directory index entries should have 'upper' xattr pointing to the + * real upper dir. Non-dir index entries are hardlinks to the upper + * real inode. For non-dir index, we can read the copy up origin xattr + * directly from the index dentry, but for dir index we first need to + * decode the upper directory. + */ + upper = ovl_index_upper(ofs, index); + if (IS_ERR_OR_NULL(upper)) { + err = PTR_ERR(upper); + /* + * Directory index entries with no 'upper' xattr need to be + * removed. When dir index entry has a stale 'upper' xattr, + * we assume that upper dir was removed and we treat the dir + * index as orphan entry that needs to be whited out. + */ + if (err == -ESTALE) + goto orphan; + else if (!err) + err = -ESTALE; + goto fail; + } + + err = ovl_verify_fh(upper, OVL_XATTR_ORIGIN, fh); + dput(upper); + if (err) + goto fail; + + /* Check if non-dir index is orphan and don't warn before cleaning it */ + if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { + err = ovl_check_origin_fh(ofs, fh, false, index, &stack); + if (err) + goto fail; + + if (ovl_get_nlink(origin.dentry, index, 0) == 0) + goto orphan; + } + +out: + dput(origin.dentry); + kfree(fh); + return err; + +fail: + pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n", + index, d_inode(index)->i_mode & S_IFMT, err); + goto out; + +orphan: + pr_warn_ratelimited("overlayfs: orphan index entry (%pd2, ftype=%x, nlink=%u)\n", + index, d_inode(index)->i_mode & S_IFMT, + d_inode(index)->i_nlink); + err = -ENOENT; + goto out; +} + +static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) +{ + char *n, *s; + + n = kcalloc(fh->len, 2, GFP_KERNEL); + if (!n) + return -ENOMEM; + + s = bin2hex(n, fh, fh->len); + *name = (struct qstr) QSTR_INIT(n, s - n); + + return 0; + +} + +/* + * Lookup in indexdir for the index entry of a lower real inode or a copy up + * origin inode. The index entry name is the hex representation of the lower + * inode file handle. + * + * If the index dentry in negative, then either no lower aliases have been + * copied up yet, or aliases have been copied up in older kernels and are + * not indexed. + * + * If the index dentry for a copy up origin inode is positive, but points + * to an inode different than the upper inode, then either the upper inode + * has been copied up and not indexed or it was indexed, but since then + * index dir was cleared. Either way, that index cannot be used to indentify + * the overlay inode. + */ +int ovl_get_index_name(struct dentry *origin, struct qstr *name) +{ + struct ovl_fh *fh; + int err; + + fh = ovl_encode_real_fh(origin, false); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + err = ovl_get_index_name_fh(fh, name); + + kfree(fh); + return err; +} + +/* Lookup index by file handle for NFS export */ +struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) +{ + struct dentry *index; + struct qstr name; + int err; + + err = ovl_get_index_name_fh(fh, &name); + if (err) + return ERR_PTR(err); + + index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + kfree(name.name); + if (IS_ERR(index)) { + if (PTR_ERR(index) == -ENOENT) + index = NULL; + return index; + } + + if (d_is_negative(index)) + err = 0; + else if (ovl_is_whiteout(index)) + err = -ESTALE; + else if (ovl_dentry_weird(index)) + err = -EIO; + else + return index; + + dput(index); + return ERR_PTR(err); +} + +struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *origin, bool verify) +{ + struct dentry *index; + struct inode *inode; + struct qstr name; + bool is_dir = d_is_dir(origin); + int err; + + err = ovl_get_index_name(origin, &name); + if (err) + return ERR_PTR(err); + + index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + if (IS_ERR(index)) { + err = PTR_ERR(index); + if (err == -ENOENT) { + index = NULL; + goto out; + } + pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" + "overlayfs: mount with '-o index=off' to disable inodes index.\n", + d_inode(origin)->i_ino, name.len, name.name, + err); + goto out; + } + + inode = d_inode(index); + if (d_is_negative(index)) { + goto out_dput; + } else if (ovl_is_whiteout(index) && !verify) { + /* + * When index lookup is called with !verify for decoding an + * overlay file handle, a whiteout index implies that decode + * should treat file handle as stale and no need to print a + * warning about it. + */ + dput(index); + index = ERR_PTR(-ESTALE); + goto out; + } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || + ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) { + /* + * Index should always be of the same file type as origin + * except for the case of a whiteout index. A whiteout + * index should only exist if all lower aliases have been + * unlinked, which means that finding a lower origin on lookup + * whose index is a whiteout should be treated as an error. + */ + pr_warn_ratelimited("overlayfs: bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", + index, d_inode(index)->i_mode & S_IFMT, + d_inode(origin)->i_mode & S_IFMT); + goto fail; + } else if (is_dir && verify) { + if (!upper) { + pr_warn_ratelimited("overlayfs: suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", + origin, index); + goto fail; + } + + /* Verify that dir index 'upper' xattr points to upper dir */ + err = ovl_verify_upper(index, upper, false); + if (err) { + if (err == -ESTALE) { + pr_warn_ratelimited("overlayfs: suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", + upper, origin, index); + } + goto fail; + } + } else if (upper && d_inode(upper) != inode) { + goto out_dput; + } +out: + kfree(name.name); + return index; + +out_dput: + dput(index); + index = NULL; + goto out; + +fail: + dput(index); + index = ERR_PTR(-EIO); + goto out; +} + +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. + */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + path->dentry = oe->lowerstack[idx - 1].dentry; + path->mnt = oe->lowerstack[idx - 1].layer->mnt; + + return (idx < oe->numlower) ? idx + 1 : -1; +} + +/* Fix missing 'origin' xattr */ +static int ovl_fix_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper) +{ + int err; + + if (ovl_check_origin_xattr(upper)) + return 0; + + err = ovl_want_write(dentry); + if (err) + return err; + + err = ovl_set_origin(dentry, lower, upper); + if (!err) + err = ovl_set_impure(dentry->d_parent, upper->d_parent); + + ovl_drop_write(dentry); + return err; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + const struct cred *old_cred; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; + struct ovl_path *stack = NULL, *origin_path = NULL; + struct dentry *upperdir, *upperdentry = NULL; + struct dentry *origin = NULL; + struct dentry *index = NULL; + unsigned int ctr = 0; + struct inode *inode = NULL; + bool upperopaque = false; + char *upperredirect = NULL; + struct dentry *this; + unsigned int i; + int err; + bool metacopy = false; + struct ovl_lookup_data d = { + .sb = dentry->d_sb, + .name = dentry->d_name, + .is_dir = false, + .opaque = false, + .stop = false, + .last = ofs->config.redirect_follow ? false : !poe->numlower, + .redirect = NULL, + .metacopy = false, + }; + + if (dentry->d_name.len > ofs->namelen) + return ERR_PTR(-ENAMETOOLONG); + + old_cred = ovl_override_creds(dentry->d_sb); + upperdir = ovl_dentry_upper(dentry->d_parent); + if (upperdir) { + err = ovl_lookup_layer(upperdir, &d, &upperdentry); + if (err) + goto out; + + if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) { + dput(upperdentry); + err = -EREMOTE; + goto out; + } + if (upperdentry && !d.is_dir) { + unsigned int origin_ctr = 0; + + /* + * Lookup copy up origin by decoding origin file handle. + * We may get a disconnected dentry, which is fine, + * because we only need to hold the origin inode in + * cache and use its inode number. We may even get a + * connected dentry, that is not under any of the lower + * layers root. That is also fine for using it's inode + * number - it's the same as if we held a reference + * to a dentry in lower layer that was moved under us. + */ + err = ovl_check_origin(ofs, upperdentry, &origin_path, + &origin_ctr); + if (err) + goto out_put_upper; + + if (d.metacopy) + metacopy = true; + } + + if (d.redirect) { + err = -ENOMEM; + upperredirect = kstrdup(d.redirect, GFP_KERNEL); + if (!upperredirect) + goto out_put_upper; + if (d.redirect[0] == '/') + poe = roe; + } + upperopaque = d.opaque; + } + + if (!d.stop && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(ofs->numlower, sizeof(struct ovl_path), + GFP_KERNEL); + if (!stack) + goto out_put_upper; + } + + for (i = 0; !d.stop && i < poe->numlower; i++) { + struct ovl_path lower = poe->lowerstack[i]; + + if (!ofs->config.redirect_follow) + d.last = i == poe->numlower - 1; + else + d.last = lower.layer->idx == roe->numlower; + + err = ovl_lookup_layer(lower.dentry, &d, &this); + if (err) + goto out_put; + + if (!this) + continue; + + /* + * If no origin fh is stored in upper of a merge dir, store fh + * of lower dir and set upper parent "impure". + */ + if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { + err = ovl_fix_origin(dentry, this, upperdentry); + if (err) { + dput(this); + goto out_put; + } + } + + /* + * When "verify_lower" feature is enabled, do not merge with a + * lower dir that does not match a stored origin xattr. In any + * case, only verified origin is used for index lookup. + * + * For non-dir dentry, if index=on, then ensure origin + * matches the dentry found using path based lookup, + * otherwise error out. + */ + if (upperdentry && !ctr && + ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || + (!d.is_dir && ofs->config.index && origin_path))) { + err = ovl_verify_origin(upperdentry, this, false); + if (err) { + dput(this); + if (d.is_dir) + break; + goto out_put; + } + origin = this; + } + + if (d.metacopy) + metacopy = true; + /* + * Do not store intermediate metacopy dentries in chain, + * except top most lower metacopy dentry + */ + if (d.metacopy && ctr) { + dput(this); + continue; + } + + stack[ctr].dentry = this; + stack[ctr].layer = lower.layer; + ctr++; + + /* + * Following redirects can have security consequences: it's like + * a symlink into the lower layer without the permission checks. + * This is only a problem if the upper layer is untrusted (e.g + * comes from an USB drive). This can allow a non-readable file + * or directory to become readable. + * + * Only following redirects when redirects are enabled disables + * this attack vector when not necessary. + */ + err = -EPERM; + if (d.redirect && !ofs->config.redirect_follow) { + pr_warn_ratelimited("overlayfs: refusing to follow redirect for (%pd2)\n", + dentry); + goto out_put; + } + + if (d.stop) + break; + + if (d.redirect && d.redirect[0] == '/' && poe != roe) { + poe = roe; + /* Find the current layer on the root dentry */ + i = lower.layer->idx - 1; + } + } + + if (metacopy) { + /* + * Found a metacopy dentry but did not find corresponding + * data dentry + */ + if (d.metacopy) { + err = -EIO; + goto out_put; + } + + err = -EPERM; + if (!ofs->config.metacopy) { + pr_warn_ratelimited("overlay: refusing to follow metacopy origin for (%pd2)\n", + dentry); + goto out_put; + } + } else if (!d.is_dir && upperdentry && !ctr && origin_path) { + if (WARN_ON(stack != NULL)) { + err = -EIO; + goto out_put; + } + stack = origin_path; + ctr = 1; + origin_path = NULL; + } + + /* + * Lookup index by lower inode and verify it matches upper inode. + * We only trust dir index if we verified that lower dir matches + * origin, otherwise dir index entries may be inconsistent and we + * ignore them. + * + * For non-dir upper metacopy dentry, we already set "origin" if we + * verified that lower matched upper origin. If upper origin was + * not present (because lower layer did not support fh encode/decode), + * or indexing is not enabled, do not set "origin" and skip looking up + * index. This case should be handled in same way as a non-dir upper + * without ORIGIN is handled. + * + * Always lookup index of non-dir non-metacopy and non-upper. + */ + if (ctr && (!upperdentry || (!d.is_dir && !metacopy))) + origin = stack[0].dentry; + + if (origin && ovl_indexdir(dentry->d_sb) && + (!d.is_dir || ovl_index_all(dentry->d_sb))) { + index = ovl_lookup_index(ofs, upperdentry, origin, true); + if (IS_ERR(index)) { + err = PTR_ERR(index); + index = NULL; + goto out_put; + } + } + + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr); + dentry->d_fsdata = oe; + + if (upperopaque) + ovl_dentry_set_opaque(dentry); + + if (upperdentry) + ovl_dentry_set_upper_alias(dentry); + else if (index) { + upperdentry = dget(index); + upperredirect = ovl_get_redirect_xattr(upperdentry, 0); + if (IS_ERR(upperredirect)) { + err = PTR_ERR(upperredirect); + upperredirect = NULL; + goto out_free_oe; + } + } + + if (upperdentry || ctr) { + struct ovl_inode_params oip = { + .upperdentry = upperdentry, + .lowerpath = stack, + .index = index, + .numlower = ctr, + .redirect = upperredirect, + .lowerdata = (ctr > 1 && !d.is_dir) ? + stack[ctr - 1].dentry : NULL, + }; + + inode = ovl_get_inode(dentry->d_sb, &oip); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_free_oe; + } + + revert_creds(old_cred); + if (origin_path) { + dput(origin_path->dentry); + kfree(origin_path); + } + dput(index); + kfree(stack); + kfree(d.redirect); + return d_splice_alias(inode, dentry); + +out_free_oe: + dentry->d_fsdata = NULL; + kfree(oe); +out_put: + dput(index); + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + if (origin_path) { + dput(origin_path->dentry); + kfree(origin_path); + } + dput(upperdentry); + kfree(upperredirect); +out: + kfree(d.redirect); + revert_creds(old_cred); + return ERR_PTR(err); +} + +bool ovl_lower_positive(struct dentry *dentry) +{ + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + const struct qstr *name = &dentry->d_name; + const struct cred *old_cred; + unsigned int i; + bool positive = false; + bool done = false; + + /* + * If dentry is negative, then lower is positive iff this is a + * whiteout. + */ + if (!dentry->d_inode) + return ovl_dentry_is_opaque(dentry); + + /* Negative upper -> positive lower */ + if (!ovl_dentry_upper(dentry)) + return true; + + old_cred = ovl_override_creds(dentry->d_sb); + /* Positive upper -> have to look up lower to see whether it exists */ + for (i = 0; !done && !positive && i < poe->numlower; i++) { + struct dentry *this; + struct dentry *lowerdir = poe->lowerstack[i].dentry; + + this = lookup_one_len_unlocked(name->name, lowerdir, + name->len); + if (IS_ERR(this)) { + switch (PTR_ERR(this)) { + case -ENOENT: + case -ENAMETOOLONG: + break; + + default: + /* + * Assume something is there, we just couldn't + * access it. + */ + positive = true; + break; + } + } else { + if (this->d_inode) { + positive = !ovl_is_whiteout(this); + done = true; + } + dput(this); + } + } + revert_creds(old_cred); + + return positive; +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000..8dcede7df --- /dev/null +++ b/fs/overlayfs/overlayfs.h @@ -0,0 +1,426 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/uuid.h> +#include <linux/fs.h> +#include "ovl_entry.h" + +enum ovl_path_type { + __OVL_PATH_UPPER = (1 << 0), + __OVL_PATH_MERGE = (1 << 1), + __OVL_PATH_ORIGIN = (1 << 2), +}; + +#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) +#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) + +#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." +#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" +#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" +#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin" +#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure" +#define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink" +#define OVL_XATTR_UPPER OVL_XATTR_PREFIX "upper" +#define OVL_XATTR_METACOPY OVL_XATTR_PREFIX "metacopy" + +enum ovl_inode_flag { + /* Pure upper dir that may contain non pure upper entries */ + OVL_IMPURE, + /* Non-merge dir that may contain whiteout entries */ + OVL_WHITEOUTS, + OVL_INDEX, + OVL_UPPERDATA, + /* Inode number will remain constant over copy up. */ + OVL_CONST_INO, +}; + +enum ovl_entry_flag { + OVL_E_UPPER_ALIAS, + OVL_E_OPAQUE, + OVL_E_CONNECTED, +}; + +/* + * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, + * where: + * origin.fh - exported file handle of the lower file + * origin.uuid - uuid of the lower filesystem + */ +#define OVL_FH_VERSION 0 +#define OVL_FH_MAGIC 0xfb + +/* CPU byte order required for fid decoding: */ +#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0) +#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1) +/* Is the real inode encoded in fid an upper inode? */ +#define OVL_FH_FLAG_PATH_UPPER (1 << 2) + +#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \ + OVL_FH_FLAG_PATH_UPPER) + +#if defined(__LITTLE_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN 0 +#elif defined(__BIG_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN +#else +#error Endianness not defined +#endif + +/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */ +#define OVL_FILEID 0xfb + +/* On-disk and in-memeory format for redirect by file handle */ +struct ovl_fh { + u8 version; /* 0 */ + u8 magic; /* 0xfb */ + u8 len; /* size of this header + size of fid */ + u8 flags; /* OVL_FH_FLAG_* */ + u8 type; /* fid_type of fid */ + uuid_t uuid; /* uuid of filesystem */ + u8 fid[0]; /* file identifier */ +} __packed; + +static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_rmdir(dir, dentry); + + pr_debug("rmdir(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_unlink(dir, dentry, NULL); + + pr_debug("unlink(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + int err = vfs_link(old_dentry, dir, new_dentry, NULL); + + pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); + return err; +} + +static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + int err = vfs_create(dir, dentry, mode, true); + + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + int err = vfs_mkdir(dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev) +{ + int err = vfs_mknod(dir, dentry, mode, dev); + + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); + return err; +} + +static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname) +{ + int err = vfs_symlink(dir, dentry, oldname); + + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + return err; +} + +static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err = vfs_setxattr(dentry, name, value, size, flags); + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n", + dentry, name, min((int)size, 48), value, size, flags, err); + return err; +} + +static inline int ovl_do_removexattr(struct dentry *dentry, const char *name) +{ + int err = vfs_removexattr(dentry, name); + pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); + return err; +} + +static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, + struct inode *newdir, struct dentry *newdentry, + unsigned int flags) +{ + int err; + + pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); + err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + if (err) { + pr_debug("...rename(%pd2, %pd2, ...) = %i\n", + olddentry, newdentry, err); + } + return err; +} + +static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_whiteout(dir, dentry); + pr_debug("whiteout(%pd2) = %i\n", dentry, err); + return err; +} + +static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) +{ + struct dentry *ret = vfs_tmpfile(dentry, mode, 0); + int err = PTR_ERR_OR_ZERO(ret); + + pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); + return ret; +} + +static inline bool ovl_open_flags_need_copy_up(int flags) +{ + if (!flags) + return false; + + return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); +} + +/* util.c */ +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +struct dentry *ovl_workdir(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); +struct super_block *ovl_same_sb(struct super_block *sb); +int ovl_can_decode_fh(struct super_block *sb); +struct dentry *ovl_indexdir(struct super_block *sb); +bool ovl_index_all(struct super_block *sb); +bool ovl_verify_lower(struct super_block *sb); +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); +bool ovl_dentry_remote(struct dentry *dentry); +bool ovl_dentry_weird(struct dentry *dentry); +enum ovl_path_type ovl_path_type(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +void ovl_path_lowerdata(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); +struct ovl_layer *ovl_layer_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_i_dentry_upper(struct inode *inode); +struct inode *ovl_inode_upper(struct inode *inode); +struct inode *ovl_inode_lower(struct inode *inode); +struct inode *ovl_inode_lowerdata(struct inode *inode); +struct inode *ovl_inode_real(struct inode *inode); +struct inode *ovl_inode_realdata(struct inode *inode); +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); +void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry); +void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry); +bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); +bool ovl_dentry_is_opaque(struct dentry *dentry); +bool ovl_dentry_is_whiteout(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_dentry_has_upper_alias(struct dentry *dentry); +void ovl_dentry_set_upper_alias(struct dentry *dentry); +bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); +bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags); +bool ovl_has_upperdata(struct inode *inode); +void ovl_set_upperdata(struct inode *inode); +bool ovl_redirect_dir(struct super_block *sb); +const char *ovl_dentry_get_redirect(struct dentry *dentry); +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); +void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, + struct dentry *lowerdentry, struct dentry *lowerdata); +void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); +void ovl_dir_modified(struct dentry *dentry, bool impurity); +u64 ovl_dentry_version_get(struct dentry *dentry); +bool ovl_is_whiteout(struct dentry *dentry); +struct file *ovl_path_open(struct path *path, int flags); +int ovl_copy_up_start(struct dentry *dentry, int flags); +void ovl_copy_up_end(struct dentry *dentry); +bool ovl_already_copied_up(struct dentry *dentry, int flags); +bool ovl_check_origin_xattr(struct dentry *dentry); +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr); +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); +void ovl_set_flag(unsigned long flag, struct inode *inode); +void ovl_clear_flag(unsigned long flag, struct inode *inode); +bool ovl_test_flag(unsigned long flag, struct inode *inode); +bool ovl_inuse_trylock(struct dentry *dentry); +void ovl_inuse_unlock(struct dentry *dentry); +bool ovl_is_inuse(struct dentry *dentry); +bool ovl_need_index(struct dentry *dentry); +int ovl_nlink_start(struct dentry *dentry, bool *locked); +void ovl_nlink_end(struct dentry *dentry, bool locked); +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); +int ovl_check_metacopy_xattr(struct dentry *dentry); +bool ovl_is_metacopy_dentry(struct dentry *dentry); +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding); + +static inline bool ovl_is_impuredir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); +} + +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->xino_bits; +} + + +/* namei.c */ +int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected); +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, + struct dentry *upperdentry, struct ovl_path **stackp); +int ovl_verify_set_fh(struct dentry *dentry, const char *name, + struct dentry *real, bool is_upper, bool set); +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); +int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); +int ovl_get_index_name(struct dentry *origin, struct qstr *name); +struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); +struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *origin, bool verify); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +bool ovl_lower_positive(struct dentry *dentry); + +static inline int ovl_verify_origin(struct dentry *upper, + struct dentry *origin, bool set) +{ + return ovl_verify_set_fh(upper, OVL_XATTR_ORIGIN, origin, false, set); +} + +static inline int ovl_verify_upper(struct dentry *index, + struct dentry *upper, bool set) +{ + return ovl_verify_set_fh(index, OVL_XATTR_UPPER, upper, true, set); +} + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cache_free(struct list_head *list); +void ovl_dir_cache_free(struct inode *inode); +int ovl_check_d_type_supported(struct path *realpath); +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level); +int ovl_indexdir_cleanup(struct ovl_fs *ofs); + +/* inode.c */ +int ovl_set_nlink_upper(struct dentry *dentry); +int ovl_set_nlink_lower(struct dentry *dentry); +unsigned int ovl_get_nlink(struct dentry *lowerdentry, + struct dentry *upperdentry, + unsigned int fallback); +int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); +int ovl_permission(struct inode *inode, int mask); +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); +struct posix_acl *ovl_get_acl(struct inode *inode, int type); +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); +bool ovl_is_private_xattr(const char *name); + +struct ovl_inode_params { + struct inode *newinode; + struct dentry *upperdentry; + struct ovl_path *lowerpath; + struct dentry *index; + unsigned int numlower; + char *redirect; + struct dentry *lowerdata; +}; +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); +struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, + bool is_upper); +bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir); +struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir); +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip); +static inline void ovl_copyattr(struct inode *from, struct inode *to) +{ + to->i_uid = from->i_uid; + to->i_gid = from->i_gid; + to->i_mode = from->i_mode; + to->i_atime = from->i_atime; + to->i_mtime = from->i_mtime; + to->i_ctime = from->i_ctime; + i_size_write(to, i_size_read(from)); +} + +static inline void ovl_copyflags(struct inode *from, struct inode *to) +{ + unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME; + + inode_set_flags(to, from->i_flags & mask, mask); +} + +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; +int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, + struct dentry *dentry); +struct ovl_cattr { + dev_t rdev; + umode_t mode; + const char *link; + struct dentry *hardlink; +}; + +#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) + +int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode); +struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr); +int ovl_cleanup(struct inode *dir, struct dentry *dentry); +struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); + +/* file.c */ +extern const struct file_operations ovl_file_operations; + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_with_data(struct dentry *dentry); +int ovl_copy_up_flags(struct dentry *dentry, int flags); +int ovl_maybe_copy_up(struct dentry *dentry, int flags); +int ovl_copy_xattr(struct dentry *old, struct dentry *new); +int ovl_set_attr(struct dentry *upper, struct kstat *stat); +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); +int ovl_set_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper); + +/* export.c */ +extern const struct export_operations ovl_export_operations; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h new file mode 100644 index 000000000..1a1adc697 --- /dev/null +++ b/fs/overlayfs/ovl_entry.h @@ -0,0 +1,122 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; + bool default_permissions; + bool redirect_dir; + bool redirect_follow; + const char *redirect_mode; + bool index; + bool nfs_export; + int xino; + bool metacopy; +}; + +struct ovl_sb { + struct super_block *sb; + dev_t pseudo_dev; +}; + +struct ovl_layer { + struct vfsmount *mnt; + /* Trap in ovl inode cache */ + struct inode *trap; + struct ovl_sb *fs; + /* Index of this layer in fs root (upper idx == 0) */ + int idx; + /* One fsid per unique underlying sb (upper fsid == 0) */ + int fsid; +}; + +struct ovl_path { + struct ovl_layer *layer; + struct dentry *dentry; +}; + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + unsigned int numlower; + /* Number of unique lower sb that differ from upper sb */ + unsigned int numlowerfs; + struct ovl_layer *lower_layers; + struct ovl_sb *lower_fs; + /* workbasedir is the path at workdir= mount option */ + struct dentry *workbasedir; + /* workdir is the 'work' directory under workbasedir */ + struct dentry *workdir; + /* index directory listing overlay inodes by origin file handle */ + struct dentry *indexdir; + long namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; + bool tmpfile; + bool noxattr; + /* Did we take the inuse lock? */ + bool upperdir_locked; + bool workdir_locked; + /* Traps in ovl inode cache */ + struct inode *upperdir_trap; + struct inode *workbasedir_trap; + struct inode *workdir_trap; + struct inode *indexdir_trap; + /* Inode numbers in all layers do not use the high xino_bits */ + unsigned int xino_bits; +}; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + union { + struct { + unsigned long flags; + }; + struct rcu_head rcu; + }; + unsigned numlower; + struct ovl_path lowerstack[]; +}; + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); + +static inline struct ovl_entry *OVL_E(struct dentry *dentry) +{ + return (struct ovl_entry *) dentry->d_fsdata; +} + +struct ovl_inode { + union { + struct ovl_dir_cache *cache; /* directory */ + struct inode *lowerdata; /* regular file */ + }; + const char *redirect; + u64 version; + unsigned long flags; + struct inode vfs_inode; + struct dentry *__upperdentry; + struct inode *lower; + + /* synchronize copy up and more */ + struct mutex lock; +}; + +static inline struct ovl_inode *OVL_I(struct inode *inode) +{ + return container_of(inode, struct ovl_inode, vfs_inode); +} + +static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) +{ + return READ_ONCE(oi->__upperdentry); +} diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000..11b7941c5 --- /dev/null +++ b/fs/overlayfs/readdir.c @@ -0,0 +1,1161 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/xattr.h> +#include <linux/rbtree.h> +#include <linux/security.h> +#include <linux/cred.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +struct ovl_cache_entry { + unsigned int len; + unsigned int type; + u64 real_ino; + u64 ino; + struct list_head l_node; + struct rb_node node; + struct ovl_cache_entry *next_maybe_whiteout; + bool is_upper; + bool is_whiteout; + char name[]; +}; + +struct ovl_dir_cache { + long refcount; + u64 version; + struct list_head entries; + struct rb_root root; +}; + +struct ovl_readdir_data { + struct dir_context ctx; + struct dentry *dentry; + bool is_lowest; + struct rb_root *root; + struct list_head *list; + struct list_head middle; + struct ovl_cache_entry *first_maybe_whiteout; + int count; + int err; + bool is_upper; + bool d_type_supported; +}; + +struct ovl_dir_file { + bool is_real; + bool is_upper; + struct ovl_dir_cache *cache; + struct list_head *cursor; + struct file *realfile; + struct file *upperfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return rb_entry(n, struct ovl_cache_entry, node); +} + +static bool ovl_cache_entry_find_link(const char *name, int len, + struct rb_node ***link, + struct rb_node **parent) +{ + bool found = false; + struct rb_node **newp = *link; + + while (!found && *newp) { + int cmp; + struct ovl_cache_entry *tmp; + + *parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + found = true; + } + *link = newp; + + return found; +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, + struct ovl_cache_entry *p) +{ + /* Don't care if not doing ovl_iter() */ + if (!rdd->dentry) + return false; + + /* Always recalc d_ino when remapping lower inode numbers */ + if (ovl_xino_bits(rdd->dentry->d_sb)) + return true; + + /* Always recalc d_ino for parent */ + if (strcmp(p->name, "..") == 0) + return true; + + /* If this is lower, then native d_ino will do */ + if (!rdd->is_upper) + return false; + + /* + * Recalc d_ino for '.' and for all entries if dir is impure (contains + * copied up entries) + */ + if ((p->name[0] == '.' && p->len == 1) || + ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) + return true; + + return false; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, + const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); + + p = kmalloc(size, GFP_KERNEL); + if (!p) + return NULL; + + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->real_ino = ino; + p->ino = ino; + /* Defer setting d_ino for upper entry to ovl_iterate() */ + if (ovl_calc_d_ino(rdd, p)) + p->ino = 0; + p->is_upper = rdd->is_upper; + p->is_whiteout = false; + + if (d_type == DT_CHR) { + p->next_maybe_whiteout = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p; + } + return p; +} + +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root->rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + if (ovl_cache_entry_find_link(name, len, &newp, &parent)) + return 0; + + p = ovl_cache_entry_new(rdd, name, len, ino, d_type); + if (p == NULL) { + rdd->err = -ENOMEM; + return -ENOMEM; + } + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, rdd->root); + + return 0; +} + +static int ovl_fill_lowest(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, &rdd->middle); + } else { + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, &rdd->middle); + } + + return rdd->err; +} + +void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +void ovl_dir_cache_free(struct inode *inode) +{ + struct ovl_dir_cache *cache = ovl_dir_cache(inode); + + if (cache) { + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +{ + struct ovl_dir_cache *cache = od->cache; + + WARN_ON(cache->refcount <= 0); + cache->refcount--; + if (!cache->refcount) { + if (ovl_dir_cache(d_inode(dentry)) == cache) + ovl_set_dir_cache(d_inode(dentry), NULL); + + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static int ovl_fill_merge(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + if (!rdd->is_lowest) + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + else + return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); +} + +static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) +{ + int err; + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + + old_cred = ovl_override_creds(rdd->dentry->d_sb); + + err = down_write_killable(&dir->d_inode->i_rwsem); + if (!err) { + while (rdd->first_maybe_whiteout) { + p = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_len(p->name, dir, p->len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + } + inode_unlock(dir->d_inode); + } + revert_creds(old_cred); + + return err; +} + +static inline int ovl_dir_read(struct path *realpath, + struct ovl_readdir_data *rdd) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + rdd->first_maybe_whiteout = NULL; + rdd->ctx.pos = 0; + do { + rdd->count = 0; + rdd->err = 0; + err = iterate_dir(realfile, &rdd->ctx); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + + if (!err && rdd->first_maybe_whiteout && rdd->dentry) + err = ovl_check_whiteouts(realpath->dentry, rdd); + + fput(realfile); + + return err; +} + +/* + * Can we iterate real dir directly? + * + * Non-merge dir may contain whiteouts from a time it was a merge upper, before + * lower dir was removed under it and possibly before it was rotated from upper + * to lower layer. + */ +static bool ovl_dir_is_real(struct dentry *dir) +{ + return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_dir_cache *cache = od->cache; + struct dentry *dentry = file->f_path.dentry; + bool is_real; + + if (cache && ovl_dentry_version_get(dentry) != cache->version) { + ovl_cache_put(od, dentry); + od->cache = NULL; + od->cursor = NULL; + } + is_real = ovl_dir_is_real(dentry); + if (od->is_real != is_real) { + /* is_real can only become false when dir is copied up */ + if (WARN_ON(is_real)) + return; + od->is_real = false; + } +} + +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, + struct rb_root *root) +{ + int err; + struct path realpath; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = dentry, + .list = list, + .root = root, + .is_lowest = false, + }; + int idx, next; + + for (idx = 0; idx != -1; idx = next) { + next = ovl_path_next(idx, dentry, &realpath); + rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; + + if (next != -1) { + err = ovl_dir_read(&realpath, &rdd); + if (err) + break; + } else { + /* + * Insert lowest layer entries before upper ones, this + * allows offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_lowest = true; + err = ovl_dir_read(&realpath, &rdd); + list_del(&rdd.middle); + } + } + return err; +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct list_head *p; + loff_t off = 0; + + list_for_each(p, &od->cache->entries) { + if (off >= pos) + break; + off++; + } + /* Cursor is safe since the cache is stable */ + od->cursor = p; +} + +static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) +{ + int res; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(d_inode(dentry)); + if (cache && ovl_dentry_version_get(dentry) == cache->version) { + WARN_ON(!cache->refcount); + cache->refcount++; + return cache; + } + ovl_set_dir_cache(d_inode(dentry), NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + cache->refcount = 1; + INIT_LIST_HEAD(&cache->entries); + cache->root = RB_ROOT; + + res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(d_inode(dentry), cache); + + return cache; +} + +/* Map inode number to lower fs unique range */ +static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, + const char *name, int namelen) +{ + if (ino >> (64 - xinobits)) { + pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + namelen, name, ino, xinobits); + return ino; + } + + return ino | ((u64)fsid) << (64 - xinobits); +} + +/* + * Set d_ino for upper entries. Non-upper entries should always report + * the uppermost real inode ino and should not call this function. + * + * When not all layer are on same fs, report real ino also for upper. + * + * When all layers are on the same fs, and upper has a reference to + * copy up origin, call vfs_getattr() on the overlay entry to make + * sure that d_ino will be consistent with st_ino from stat(2). + */ +static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) + +{ + struct dentry *dir = path->dentry; + struct dentry *this = NULL; + enum ovl_path_type type; + u64 ino = p->real_ino; + int xinobits = ovl_xino_bits(dir->d_sb); + int err = 0; + + if (!ovl_same_sb(dir->d_sb) && !xinobits) + goto out; + + if (p->name[0] == '.') { + if (p->len == 1) { + this = dget(dir); + goto get; + } + if (p->len == 2 && p->name[1] == '.') { + /* we shall not be moved */ + this = dget(dir->d_parent); + goto get; + } + } + this = lookup_one_len(p->name, dir, p->len); + if (IS_ERR_OR_NULL(this) || !this->d_inode) { + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + goto fail; + } + goto out; + } + +get: + type = ovl_path_type(this); + if (OVL_TYPE_ORIGIN(type)) { + struct kstat stat; + struct path statpath = *path; + + statpath.dentry = this; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + goto fail; + + /* + * Directory inode is always on overlay st_dev. + * Non-dir with ovl_same_dev() could be on pseudo st_dev in case + * of xino bits overflow. + */ + WARN_ON_ONCE(S_ISDIR(stat.mode) && + dir->d_sb->s_dev != stat.dev); + ino = stat.ino; + } else if (xinobits && !OVL_TYPE_UPPER(type)) { + ino = ovl_remap_lower_ino(ino, xinobits, + ovl_layer_lower(this)->fsid, + p->name, p->len); + } + +out: + p->ino = ino; + dput(this); + return err; + +fail: + pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", + p->name, err); + goto out; +} + +static int ovl_fill_plain(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_cache_entry *p; + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + if (p == NULL) { + rdd->err = -ENOMEM; + return -ENOMEM; + } + list_add_tail(&p->l_node, rdd->list); + + return 0; +} + +static int ovl_dir_read_impure(struct path *path, struct list_head *list, + struct rb_root *root) +{ + int err; + struct path realpath; + struct ovl_cache_entry *p, *n; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_plain, + .list = list, + .root = root, + }; + + INIT_LIST_HEAD(list); + *root = RB_ROOT; + ovl_path_upper(path->dentry, &realpath); + + err = ovl_dir_read(&realpath, &rdd); + if (err) + return err; + + list_for_each_entry_safe(p, n, list, l_node) { + if (strcmp(p->name, ".") != 0 && + strcmp(p->name, "..") != 0) { + err = ovl_cache_update_ino(path, p); + if (err) + return err; + } + if (p->ino == p->real_ino) { + list_del(&p->l_node); + kfree(p); + } else { + struct rb_node **newp = &root->rb_node; + struct rb_node *parent = NULL; + + if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, + &newp, &parent))) + return -EIO; + + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, root); + } + } + return 0; +} + +static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) +{ + int res; + struct dentry *dentry = path->dentry; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(d_inode(dentry)); + if (cache && ovl_dentry_version_get(dentry) == cache->version) + return cache; + + /* Impure cache is not refcounted, free it here */ + ovl_dir_cache_free(d_inode(dentry)); + ovl_set_dir_cache(d_inode(dentry), NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + res = ovl_dir_read_impure(path, &cache->entries, &cache->root); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + if (list_empty(&cache->entries)) { + /* + * A good opportunity to get rid of an unneeded "impure" flag. + * Removing the "impure" xattr is best effort. + */ + if (!ovl_want_write(dentry)) { + ovl_do_removexattr(ovl_dentry_upper(dentry), + OVL_XATTR_IMPURE); + ovl_drop_write(dentry); + } + ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + kfree(cache); + return NULL; + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(d_inode(dentry), cache); + + return cache; +} + +struct ovl_readdir_translate { + struct dir_context *orig_ctx; + struct ovl_dir_cache *cache; + struct dir_context ctx; + u64 parent_ino; + int fsid; + int xinobits; +}; + +static int ovl_fill_real(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_translate *rdt = + container_of(ctx, struct ovl_readdir_translate, ctx); + struct dir_context *orig_ctx = rdt->orig_ctx; + + if (rdt->parent_ino && strcmp(name, "..") == 0) { + ino = rdt->parent_ino; + } else if (rdt->cache) { + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); + if (p) + ino = p->ino; + } else if (rdt->xinobits) { + ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, + name, namelen); + } + + return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); +} + +static bool ovl_is_impure_dir(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct inode *dir = d_inode(file->f_path.dentry); + + /* + * Only upper dir can be impure, but if we are in the middle of + * iterating a lower real dir, dir could be copied up and marked + * impure. We only want the impure cache if we started iterating + * a real upper dir to begin with. + */ + return od->is_upper && ovl_test_flag(OVL_IMPURE, dir); + +} + +static int ovl_iterate_real(struct file *file, struct dir_context *ctx) +{ + int err; + struct ovl_dir_file *od = file->private_data; + struct dentry *dir = file->f_path.dentry; + struct ovl_layer *lower_layer = ovl_layer_lower(dir); + struct ovl_readdir_translate rdt = { + .ctx.actor = ovl_fill_real, + .orig_ctx = ctx, + .xinobits = ovl_xino_bits(dir->d_sb), + }; + + if (rdt.xinobits && lower_layer) + rdt.fsid = lower_layer->fsid; + + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { + struct kstat stat; + struct path statpath = file->f_path; + + statpath.dentry = dir->d_parent; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + return err; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + rdt.parent_ino = stat.ino; + } + + if (ovl_is_impure_dir(file)) { + rdt.cache = ovl_cache_get_impure(&file->f_path); + if (IS_ERR(rdt.cache)) + return PTR_ERR(rdt.cache); + } + + err = iterate_dir(od->realfile, &rdt.ctx); + ctx->pos = rdt.ctx.pos; + + return err; +} + + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct ovl_cache_entry *p; + int err; + + if (!ctx->pos) + ovl_dir_reset(file); + + if (od->is_real) { + /* + * If parent is merge, then need to adjust d_ino for '..', if + * dir is impure then need to adjust d_ino for copied up + * entries. + */ + if (ovl_xino_bits(dentry->d_sb) || + (ovl_same_sb(dentry->d_sb) && + (ovl_is_impure_dir(file) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { + return ovl_iterate_real(file, ctx); + } + return iterate_dir(od->realfile, ctx); + } + + if (!od->cache) { + struct ovl_dir_cache *cache; + + cache = ovl_cache_get(dentry); + if (IS_ERR(cache)) + return PTR_ERR(cache); + + od->cache = cache; + ovl_seek_cursor(od, ctx->pos); + } + + while (od->cursor != &od->cache->entries) { + p = list_entry(od->cursor, struct ovl_cache_entry, l_node); + if (!p->is_whiteout) { + if (!p->ino) { + err = ovl_cache_update_ino(&file->f_path, p); + if (err) + return err; + } + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + } + od->cursor = p->l_node.next; + ctx->pos++; + } + return 0; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + inode_lock(file_inode(file)); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->cache) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + inode_unlock(file_inode(file)); + + return res; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct file *realfile = od->realfile; + + /* Nothing to sync for lower */ + if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) + return 0; + + /* + * Need to check if we started out being a lower dir, but got copied up + */ + if (!od->is_upper) { + struct inode *inode = file_inode(file); + + realfile = READ_ONCE(od->upperfile); + if (!realfile) { + struct path upperpath; + + ovl_path_upper(dentry, &upperpath); + realfile = ovl_path_open(&upperpath, O_RDONLY); + + inode_lock(inode); + if (!od->upperfile) { + if (IS_ERR(realfile)) { + inode_unlock(inode); + return PTR_ERR(realfile); + } + smp_store_release(&od->upperfile, realfile); + } else { + /* somebody has beaten us to it */ + if (!IS_ERR(realfile)) + fput(realfile); + realfile = od->upperfile; + } + inode_unlock(inode); + } + } + + return vfs_fsync_range(realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + if (od->cache) { + inode_lock(inode); + ovl_cache_put(od, file->f_path.dentry); + inode_unlock(inode); + } + fput(od->realfile); + if (od->upperfile) + fput(od->upperfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + od->realfile = realfile; + od->is_real = ovl_dir_is_real(file->f_path.dentry); + od->is_upper = OVL_TYPE_UPPER(type); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .iterate = ovl_iterate, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct ovl_cache_entry *p, *n; + struct rb_root root = RB_ROOT; + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_dir_read_merged(dentry, list, &root); + revert_creds(old_cred); + if (err) + return err; + + err = 0; + + list_for_each_entry_safe(p, n, list, l_node) { + /* + * Select whiteouts in upperdir, they should + * be cleared when deleting this directory. + */ + if (p->is_whiteout) { + if (p->is_upper) + continue; + goto del_entry; + } + + if (p->name[0] == '.') { + if (p->len == 1) + goto del_entry; + if (p->len == 2 && p->name[1] == '.') + goto del_entry; + } + err = -ENOTEMPTY; + break; + +del_entry: + list_del(&p->l_node); + kfree(p); + } + + return err; +} + +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +{ + struct ovl_cache_entry *p; + + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + + if (WARN_ON(!p->is_whiteout || !p->is_upper)) + continue; + + dentry = lookup_one_len(p->name, upper, p->len); + if (IS_ERR(dentry)) { + pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + upper->d_name.name, p->len, p->name, + (int) PTR_ERR(dentry)); + continue; + } + if (dentry->d_inode) + ovl_cleanup(upper->d_inode, dentry); + dput(dentry); + } + inode_unlock(upper->d_inode); +} + +static int ovl_check_d_type(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + /* Even if d_type is not supported, DT_DIR is returned for . and .. */ + if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) + return 0; + + if (d_type != DT_UNKNOWN) + rdd->d_type_supported = true; + + return 0; +} + +/* + * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values + * if error is encountered. + */ +int ovl_check_d_type_supported(struct path *realpath) +{ + int err; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_check_d_type, + .d_type_supported = false, + }; + + err = ovl_dir_read(realpath, &rdd); + if (err) + return err; + + return rdd.d_type_supported; +} + +static void ovl_workdir_cleanup_recurse(struct path *path, int level) +{ + int err; + struct inode *dir = path->dentry->d_inode; + LIST_HEAD(list); + struct rb_root root = RB_ROOT; + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = &root, + .is_lowest = false, + }; + + err = ovl_dir_read(path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + struct dentry *dentry; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + dentry = lookup_one_len(p->name, path->dentry, p->len); + if (IS_ERR(dentry)) + continue; + if (dentry->d_inode) + ovl_workdir_cleanup(dir, path->mnt, dentry, level); + dput(dentry); + } + inode_unlock(dir); +out: + ovl_cache_free(&list); +} + +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level) +{ + int err; + + if (!d_is_dir(dentry) || level > 1) { + ovl_cleanup(dir, dentry); + return; + } + + err = ovl_do_rmdir(dir, dentry); + if (err) { + struct path path = { .mnt = mnt, .dentry = dentry }; + + inode_unlock(dir); + ovl_workdir_cleanup_recurse(&path, level + 1); + inode_lock_nested(dir, I_MUTEX_PARENT); + ovl_cleanup(dir, dentry); + } +} + +int ovl_indexdir_cleanup(struct ovl_fs *ofs) +{ + int err; + struct dentry *indexdir = ofs->indexdir; + struct dentry *index = NULL; + struct inode *dir = indexdir->d_inode; + struct path path = { .mnt = ofs->upper_mnt, .dentry = indexdir }; + LIST_HEAD(list); + struct rb_root root = RB_ROOT; + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = &root, + .is_lowest = false, + }; + + err = ovl_dir_read(&path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + index = lookup_one_len(p->name, indexdir, p->len); + if (IS_ERR(index)) { + err = PTR_ERR(index); + index = NULL; + break; + } + err = ovl_verify_index(ofs, index); + if (!err) { + goto next; + } else if (err == -ESTALE) { + /* Cleanup stale index entries */ + err = ovl_cleanup(dir, index); + } else if (err != -ENOENT) { + /* + * Abort mount to avoid corrupting the index if + * an incompatible index entry was found or on out + * of memory. + */ + break; + } else if (ofs->config.nfs_export) { + /* + * Whiteout orphan index to block future open by + * handle after overlay nlink dropped to zero. + */ + err = ovl_cleanup_and_whiteout(indexdir, dir, index); + } else { + /* Cleanup orphan index entries */ + err = ovl_cleanup(dir, index); + } + + if (err) + break; + +next: + dput(index); + index = NULL; + } + dput(index); + inode_unlock(dir); +out: + ovl_cache_free(&list); + if (err) + pr_err("overlayfs: failed index dir cleanup (%i)\n", err); + return err; +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000..1a7a1e298 --- /dev/null +++ b/fs/overlayfs/super.c @@ -0,0 +1,1757 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <uapi/linux/magic.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/mount.h> +#include <linux/parser.h> +#include <linux/module.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include <linux/posix_acl_xattr.h> +#include <linux/exportfs.h> +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + + +struct ovl_dir_cache; + +#define OVL_MAX_STACK 500 + +static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR); +module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); +MODULE_PARM_DESC(ovl_redirect_dir_def, + "Default to on or off for the redirect_dir feature"); + +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(ovl_redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + +static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); +module_param_named(index, ovl_index_def, bool, 0644); +MODULE_PARM_DESC(ovl_index_def, + "Default to on or off for the inodes index feature"); + +static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT); +module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); +MODULE_PARM_DESC(ovl_nfs_export_def, + "Default to on or off for the NFS export feature"); + +static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); +module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); +MODULE_PARM_DESC(ovl_xino_auto_def, + "Auto enable xino feature"); + +static void ovl_entry_stack_free(struct ovl_entry *oe) +{ + unsigned int i; + + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); +} + +static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY); +module_param_named(metacopy, ovl_metacopy_def, bool, 0644); +MODULE_PARM_DESC(ovl_metacopy_def, + "Default to on or off for the metadata only copy up feature"); + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + ovl_entry_stack_free(oe); + kfree_rcu(oe, rcu); + } +} + +static struct dentry *ovl_d_real(struct dentry *dentry, + const struct inode *inode) +{ + struct dentry *real = NULL, *lower; + + /* It's an overlay file */ + if (inode && d_inode(dentry) == inode) + return dentry; + + if (!d_is_reg(dentry)) { + if (!inode || inode == d_inode(dentry)) + return dentry; + goto bug; + } + + real = ovl_dentry_upper(dentry); + if (real && (inode == d_inode(real))) + return real; + + if (real && !inode && ovl_has_upperdata(d_inode(dentry))) + return real; + + lower = ovl_dentry_lowerdata(dentry); + if (!lower) + goto bug; + real = lower; + + /* Handle recursion */ + real = d_real(real, inode); + + if (!inode || inode == d_inode(real)) + return real; +bug: + WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", + __func__, dentry, inode ? inode->i_sb->s_id : "NULL", + inode ? inode->i_ino : 0, real, + real && d_inode(real) ? d_inode(real)->i_ino : 0); + return dentry; +} + +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (ret < 0) + return ret; + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + return -ESTALE; + } + } + } + return 1; +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) { + ret = d->d_op->d_weak_revalidate(d, flags); + if (ret <= 0) + break; + } + } + return ret; +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, + .d_real = ovl_d_real, +}; + +static const struct dentry_operations ovl_reval_dentry_operations = { + .d_release = ovl_dentry_release, + .d_real = ovl_d_real, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, +}; + +static struct kmem_cache *ovl_inode_cachep; + +static struct inode *ovl_alloc_inode(struct super_block *sb) +{ + struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL); + + if (!oi) + return NULL; + + oi->cache = NULL; + oi->redirect = NULL; + oi->version = 0; + oi->flags = 0; + oi->__upperdentry = NULL; + oi->lower = NULL; + oi->lowerdata = NULL; + mutex_init(&oi->lock); + + return &oi->vfs_inode; +} + +static void ovl_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(ovl_inode_cachep, OVL_I(inode)); +} + +static void ovl_destroy_inode(struct inode *inode) +{ + struct ovl_inode *oi = OVL_I(inode); + + dput(oi->__upperdentry); + iput(oi->lower); + if (S_ISDIR(inode->i_mode)) + ovl_dir_cache_free(inode); + else + iput(oi->lowerdata); + kfree(oi->redirect); + mutex_destroy(&oi->lock); + + call_rcu(&inode->i_rcu, ovl_i_callback); +} + +static void ovl_free_fs(struct ovl_fs *ofs) +{ + unsigned i; + + iput(ofs->workbasedir_trap); + iput(ofs->indexdir_trap); + iput(ofs->workdir_trap); + iput(ofs->upperdir_trap); + dput(ofs->indexdir); + dput(ofs->workdir); + if (ofs->workdir_locked) + ovl_inuse_unlock(ofs->workbasedir); + dput(ofs->workbasedir); + if (ofs->upperdir_locked) + ovl_inuse_unlock(ofs->upper_mnt->mnt_root); + mntput(ofs->upper_mnt); + for (i = 0; i < ofs->numlower; i++) { + iput(ofs->lower_layers[i].trap); + mntput(ofs->lower_layers[i].mnt); + } + for (i = 0; i < ofs->numlowerfs; i++) + free_anon_bdev(ofs->lower_fs[i].pseudo_dev); + kfree(ofs->lower_layers); + kfree(ofs->lower_fs); + + kfree(ofs->config.lowerdir); + kfree(ofs->config.upperdir); + kfree(ofs->config.workdir); + kfree(ofs->config.redirect_mode); + if (ofs->creator_cred) + put_cred(ofs->creator_cred); + kfree(ofs); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + ovl_free_fs(ofs); +} + +/* Sync real dirty inodes in upper filesystem (if it exists) */ +static int ovl_sync_fs(struct super_block *sb, int wait) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct super_block *upper_sb; + int ret; + + if (!ofs->upper_mnt) + return 0; + + /* + * If this is a sync(2) call or an emergency sync, all the super blocks + * will be iterated, including upper_sb, so no need to do anything. + * + * If this is a syncfs(2) call, then we do need to call + * sync_filesystem() on upper_sb, but enough if we do it when being + * called with wait == 1. + */ + if (!wait) + return 0; + + upper_sb = ofs->upper_mnt->mnt_sb; + + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + + return ret; +} + +/** + * ovl_statfs + * @sb: The overlayfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the upper filesystem (if it exists) + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + int err; + + ovl_path_real(root_dentry, &path); + + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = ofs->namelen; + buf->f_type = OVERLAYFS_SUPER_MAGIC; + } + + return err; +} + +/* Will this overlay be forced to mount/remount ro? */ +static bool ovl_force_readonly(struct ovl_fs *ofs) +{ + return (!ofs->upper_mnt || !ofs->workdir); +} + +static const char *ovl_redirect_mode_def(void) +{ + return ovl_redirect_dir_def ? "on" : "off"; +} + +enum { + OVL_XINO_OFF, + OVL_XINO_AUTO, + OVL_XINO_ON, +}; + +static const char * const ovl_xino_str[] = { + "off", + "auto", + "on", +}; + +static inline int ovl_xino_def(void) +{ + return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF; +} + +/** + * ovl_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ofs = sb->s_fs_info; + + seq_show_option(m, "lowerdir", ofs->config.lowerdir); + if (ofs->config.upperdir) { + seq_show_option(m, "upperdir", ofs->config.upperdir); + seq_show_option(m, "workdir", ofs->config.workdir); + } + if (ofs->config.default_permissions) + seq_puts(m, ",default_permissions"); + if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) + seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); + if (ofs->config.index != ovl_index_def) + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); + if (ofs->config.nfs_export != ovl_nfs_export_def) + seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? + "on" : "off"); + if (ofs->config.xino != ovl_xino_def()) + seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); + if (ofs->config.metacopy != ovl_metacopy_def) + seq_printf(m, ",metacopy=%s", + ofs->config.metacopy ? "on" : "off"); + return 0; +} + +static int ovl_remount(struct super_block *sb, int *flags, char *data) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) + return -EROFS; + + return 0; +} + +static const struct super_operations ovl_super_operations = { + .alloc_inode = ovl_alloc_inode, + .destroy_inode = ovl_destroy_inode, + .drop_inode = generic_delete_inode, + .put_super = ovl_put_super, + .sync_fs = ovl_sync_fs, + .statfs = ovl_statfs, + .show_options = ovl_show_options, + .remount_fs = ovl_remount, +}; + +enum { + OPT_LOWERDIR, + OPT_UPPERDIR, + OPT_WORKDIR, + OPT_DEFAULT_PERMISSIONS, + OPT_REDIRECT_DIR, + OPT_INDEX_ON, + OPT_INDEX_OFF, + OPT_NFS_EXPORT_ON, + OPT_NFS_EXPORT_OFF, + OPT_XINO_ON, + OPT_XINO_OFF, + OPT_XINO_AUTO, + OPT_METACOPY_ON, + OPT_METACOPY_OFF, + OPT_ERR, +}; + +static const match_table_t ovl_tokens = { + {OPT_LOWERDIR, "lowerdir=%s"}, + {OPT_UPPERDIR, "upperdir=%s"}, + {OPT_WORKDIR, "workdir=%s"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_REDIRECT_DIR, "redirect_dir=%s"}, + {OPT_INDEX_ON, "index=on"}, + {OPT_INDEX_OFF, "index=off"}, + {OPT_NFS_EXPORT_ON, "nfs_export=on"}, + {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, + {OPT_XINO_ON, "xino=on"}, + {OPT_XINO_OFF, "xino=off"}, + {OPT_XINO_AUTO, "xino=auto"}, + {OPT_METACOPY_ON, "metacopy=on"}, + {OPT_METACOPY_OFF, "metacopy=off"}, + {OPT_ERR, NULL} +}; + +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + +static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) +{ + if (strcmp(mode, "on") == 0) { + config->redirect_dir = true; + /* + * Does not make sense to have redirect creation without + * redirect following. + */ + config->redirect_follow = true; + } else if (strcmp(mode, "follow") == 0) { + config->redirect_follow = true; + } else if (strcmp(mode, "off") == 0) { + if (ovl_redirect_always_follow) + config->redirect_follow = true; + } else if (strcmp(mode, "nofollow") != 0) { + pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + mode); + return -EINVAL; + } + + return 0; +} + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + int err; + bool metacopy_opt = false, redirect_opt = false; + + config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); + if (!config->redirect_mode) + return -ENOMEM; + + while ((p = ovl_next_opt(&opt)) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case OPT_UPPERDIR: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case OPT_LOWERDIR: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + case OPT_WORKDIR: + kfree(config->workdir); + config->workdir = match_strdup(&args[0]); + if (!config->workdir) + return -ENOMEM; + break; + + case OPT_DEFAULT_PERMISSIONS: + config->default_permissions = true; + break; + + case OPT_REDIRECT_DIR: + kfree(config->redirect_mode); + config->redirect_mode = match_strdup(&args[0]); + if (!config->redirect_mode) + return -ENOMEM; + redirect_opt = true; + break; + + case OPT_INDEX_ON: + config->index = true; + break; + + case OPT_INDEX_OFF: + config->index = false; + break; + + case OPT_NFS_EXPORT_ON: + config->nfs_export = true; + break; + + case OPT_NFS_EXPORT_OFF: + config->nfs_export = false; + break; + + case OPT_XINO_ON: + config->xino = OVL_XINO_ON; + break; + + case OPT_XINO_OFF: + config->xino = OVL_XINO_OFF; + break; + + case OPT_XINO_AUTO: + config->xino = OVL_XINO_AUTO; + break; + + case OPT_METACOPY_ON: + config->metacopy = true; + metacopy_opt = true; + break; + + case OPT_METACOPY_OFF: + config->metacopy = false; + break; + + default: + pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); + return -EINVAL; + } + } + + /* Workdir is useless in non-upper mount */ + if (!config->upperdir && config->workdir) { + pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + + err = ovl_parse_redirect_mode(config, config->redirect_mode); + if (err) + return err; + + /* + * This is to make the logic below simpler. It doesn't make any other + * difference, since config->redirect_dir is only used for upper. + */ + if (!config->upperdir && config->redirect_follow) + config->redirect_dir = true; + + /* Resolve metacopy -> redirect_dir dependency */ + if (config->metacopy && !config->redirect_dir) { + if (metacopy_opt && redirect_opt) { + pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n", + config->redirect_mode); + return -EINVAL; + } + if (redirect_opt) { + /* + * There was an explicit redirect_dir=... that resulted + * in this conflict. + */ + pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n", + config->redirect_mode); + config->metacopy = false; + } else { + /* Automatically enable redirect otherwise. */ + config->redirect_follow = config->redirect_dir = true; + } + } + + return 0; +} + +#define OVL_WORKDIR_NAME "work" +#define OVL_INDEXDIR_NAME "index" + +static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, + const char *name, bool persist) +{ + struct inode *dir = ofs->workbasedir->d_inode; + struct vfsmount *mnt = ofs->upper_mnt; + struct dentry *work; + int err; + bool retried = false; + bool locked = false; + + inode_lock_nested(dir, I_MUTEX_PARENT); + locked = true; + +retry: + work = lookup_one_len(name, ofs->workbasedir, strlen(name)); + + if (!IS_ERR(work)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = S_IFDIR | 0, + }; + + if (work->d_inode) { + err = -EEXIST; + if (retried) + goto out_dput; + + if (persist) + goto out_unlock; + + retried = true; + ovl_workdir_cleanup(dir, mnt, work, 0); + dput(work); + goto retry; + } + + err = ovl_mkdir_real(dir, &work, attr.ia_mode); + if (err) + goto out_dput; + + /* Weird filesystem returning with hashed negative (kernfs)? */ + err = -EINVAL; + if (d_really_is_negative(work)) + goto out_dput; + + /* + * Try to remove POSIX ACL xattrs from workdir. We are good if: + * + * a) success (there was a POSIX ACL xattr and was removed) + * b) -ENODATA (there was no POSIX ACL xattr) + * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported) + * + * There are various other error values that could effectively + * mean that the xattr doesn't exist (e.g. -ERANGE is returned + * if the xattr name is too long), but the set of filesystems + * allowed as upper are limited to "normal" ones, where checking + * for the above two errors is sufficient. + */ + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + /* Clear any inherited mode bits */ + inode_lock(work->d_inode); + err = notify_change(work, &attr, NULL); + inode_unlock(work->d_inode); + if (err) + goto out_dput; + } else { + err = PTR_ERR(work); + goto out_err; + } +out_unlock: + if (locked) + inode_unlock(dir); + + return work; + +out_dput: + dput(work); +out_err: + pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + ofs->config.workdir, name, -err); + work = NULL; + goto out_unlock; +} + +static void ovl_unescape(char *s) +{ + char *d = s; + + for (;; s++, d++) { + if (*s == '\\') + s++; + *d = *s; + if (!*s) + break; + } +} + +static int ovl_mount_dir_noesc(const char *name, struct path *path) +{ + int err = -EINVAL; + + if (!*name) { + pr_err("overlayfs: empty lowerdir\n"); + goto out; + } + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + goto out; + } + err = -EINVAL; + if (ovl_dentry_weird(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported\n", name); + goto out_put; + } + if (!d_is_dir(path->dentry)) { + pr_err("overlayfs: '%s' not a directory\n", name); + goto out_put; + } + return 0; + +out_put: + path_put_init(path); +out: + return err; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err = -ENOMEM; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (tmp) { + ovl_unescape(tmp); + err = ovl_mount_dir_noesc(tmp, path); + + if (!err) + if (ovl_dentry_remote(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + tmp); + path_put_init(path); + err = -EINVAL; + } + kfree(tmp); + } + return err; +} + +static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, + const char *name) +{ + struct kstatfs statfs; + int err = vfs_statfs(path, &statfs); + + if (err) + pr_err("overlayfs: statfs failed on '%s'\n", name); + else + ofs->namelen = max(ofs->namelen, statfs.f_namelen); + + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, + struct ovl_fs *ofs, int *stack_depth, bool *remote) +{ + int fh_type; + int err; + + err = ovl_mount_dir_noesc(name, path); + if (err) + goto out; + + err = ovl_check_namelen(path, ofs, name); + if (err) + goto out_put; + + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + + if (ovl_dentry_remote(path->dentry)) + *remote = true; + + /* + * The inodes index feature and NFS export need to encode and decode + * file handles, so they require that all layers support them. + */ + fh_type = ovl_can_decode_fh(path->dentry->d_sb); + if ((ofs->config.nfs_export || + (ofs->config.index && ofs->config.upperdir)) && !fh_type) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", + name); + } + + /* Check if lower fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + + return 0; + +out_put: + path_put_init(path); +out: + return err; +} + +/* Workdir should not be subdir of upperdir and vice versa */ +static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) +{ + bool ok = false; + + if (workdir != upperdir) { + ok = (lock_rename(workdir, upperdir) == NULL); + unlock_rename(workdir, upperdir); + } + return ok; +} + +static unsigned int ovl_split_lowerdirs(char *str) +{ + unsigned int ctr = 1; + char *s, *d; + + for (s = d = str;; s++, d++) { + if (*s == '\\') { + s++; + } else if (*s == ':') { + *d = '\0'; + ctr++; + continue; + } + *d = *s; + if (!*s) + break; + } + return ctr; +} + +static int __maybe_unused +ovl_posix_acl_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, inode, handler->name, buffer, size); +} + +static int __maybe_unused +ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *realinode = ovl_inode_real(inode); + struct posix_acl *acl = NULL; + int err; + + /* Check that everything is OK before copy-up */ + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + err = -EOPNOTSUPP; + if (!IS_POSIXACL(d_inode(workdir))) + goto out_acl_release; + if (!realinode->i_op->set_acl) + goto out_acl_release; + if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) { + err = acl ? -EACCES : 0; + goto out_acl_release; + } + err = -EPERM; + if (!inode_owner_or_capable(inode)) + goto out_acl_release; + + posix_acl_release(acl); + + /* + * Check if sgid bit needs to be cleared (actual setacl operation will + * be done with mounter's capabilities and so that won't do it for us). + */ + if (unlikely(inode->i_mode & S_ISGID) && + handler->flags == ACL_TYPE_ACCESS && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) { + struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; + + err = ovl_setattr(dentry, &iattr); + if (err) + return err; + } + + err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); + if (!err) + ovl_copyattr(ovl_inode_real(inode), inode); + + return err; + +out_acl_release: + posix_acl_release(acl); + return err; +} + +static int ovl_own_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return -EOPNOTSUPP; +} + +static int ovl_own_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static int ovl_other_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, inode, name, buffer, size); +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_xattr_set(dentry, inode, name, value, size, flags); +} + +static const struct xattr_handler __maybe_unused +ovl_posix_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = ovl_posix_acl_xattr_get, + .set = ovl_posix_acl_xattr_set, +}; + +static const struct xattr_handler __maybe_unused +ovl_posix_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = ovl_posix_acl_xattr_get, + .set = ovl_posix_acl_xattr_set, +}; + +static const struct xattr_handler ovl_own_xattr_handler = { + .prefix = OVL_XATTR_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_other_xattr_handler = { + .prefix = "", /* catch all */ + .get = ovl_other_xattr_get, + .set = ovl_other_xattr_set, +}; + +static const struct xattr_handler *ovl_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &ovl_posix_acl_access_xattr_handler, + &ovl_posix_acl_default_xattr_handler, +#endif + &ovl_own_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, + struct inode **ptrap, const char *name) +{ + struct inode *trap; + int err; + + trap = ovl_get_trap_inode(sb, dir); + err = PTR_ERR_OR_ZERO(trap); + if (err) { + if (err == -ELOOP) + pr_err("overlayfs: conflicting %s path\n", name); + return err; + } + + *ptrap = trap; + return 0; +} + +/* + * Determine how we treat concurrent use of upperdir/workdir based on the + * index feature. This is papering over mount leaks of container runtimes, + * for example, an old overlay mount is leaked and now its upperdir is + * attempted to be used as a lower layer in a new overlay mount. + */ +static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) +{ + if (ofs->config.index) { + pr_err("overlayfs: %s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", + name); + return -EBUSY; + } else { + pr_warn("overlayfs: %s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", + name); + return 0; + } +} + +static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, + struct path *upperpath) +{ + struct vfsmount *upper_mnt; + int err; + + err = ovl_mount_dir(ofs->config.upperdir, upperpath); + if (err) + goto out; + + /* Upper fs should not be r/o */ + if (sb_rdonly(upperpath->mnt->mnt_sb)) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out; + } + + err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir); + if (err) + goto out; + + err = ovl_setup_trap(sb, upperpath->dentry, &ofs->upperdir_trap, + "upperdir"); + if (err) + goto out; + + upper_mnt = clone_private_mount(upperpath); + err = PTR_ERR(upper_mnt); + if (IS_ERR(upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out; + } + + /* Don't inherit atime flags */ + upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + ofs->upper_mnt = upper_mnt; + + if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) { + ofs->upperdir_locked = true; + } else { + err = ovl_report_in_use(ofs, "upperdir"); + if (err) + goto out; + } + + err = 0; +out: + return err; +} + +static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, + struct path *workpath) +{ + struct vfsmount *mnt = ofs->upper_mnt; + struct dentry *temp; + int fh_type; + int err; + + err = mnt_want_write(mnt); + if (err) + return err; + + ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); + if (!ofs->workdir) + goto out; + + err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir"); + if (err) + goto out; + + /* + * Upper should support d_type, else whiteouts are visible. Given + * workdir and upper are on same fs, we can do iterate_dir() on + * workdir. This check requires successful creation of workdir in + * previous step. + */ + err = ovl_check_d_type_supported(workpath); + if (err < 0) + goto out; + + /* + * We allowed this configuration and don't want to break users over + * kernel upgrade. So warn instead of erroring out. + */ + if (!err) + pr_warn("overlayfs: upper fs needs to support d_type.\n"); + + /* Check if upper/work fs supports O_TMPFILE */ + temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(temp); + if (ofs->tmpfile) + dput(temp); + else + pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + + /* + * Check if upper/work fs supports trusted.overlay.* xattr + */ + err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0); + if (err) { + ofs->noxattr = true; + ofs->config.index = false; + ofs->config.metacopy = false; + pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); + err = 0; + } else { + vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); + } + + /* Check if upper/work fs supports file handles */ + fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); + if (ofs->config.index && !fh_type) { + ofs->config.index = false; + pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); + } + + /* Check if upper fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + + /* NFS export of r/w mount depends on index */ + if (ofs->config.nfs_export && !ofs->config.index) { + pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } +out: + mnt_drop_write(mnt); + return err; +} + +static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, + struct path *upperpath) +{ + int err; + struct path workpath = { }; + + err = ovl_mount_dir(ofs->config.workdir, &workpath); + if (err) + goto out; + + err = -EINVAL; + if (upperpath->mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out; + } + + ofs->workbasedir = dget(workpath.dentry); + + if (ovl_inuse_trylock(ofs->workbasedir)) { + ofs->workdir_locked = true; + } else { + err = ovl_report_in_use(ofs, "workdir"); + if (err) + goto out; + } + + err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap, + "workdir"); + if (err) + goto out; + + err = ovl_make_workdir(sb, ofs, &workpath); + +out: + path_put(&workpath); + + return err; +} + +static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, + struct ovl_entry *oe, struct path *upperpath) +{ + struct vfsmount *mnt = ofs->upper_mnt; + int err; + + err = mnt_want_write(mnt); + if (err) + return err; + + /* Verify lower root is upper root origin */ + err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, + true); + if (err) { + pr_err("overlayfs: failed to verify upper root origin\n"); + goto out; + } + + ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); + if (ofs->indexdir) { + err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap, + "indexdir"); + if (err) + goto out; + + /* + * Verify upper root is exclusively associated with index dir. + * Older kernels stored upper fh in "trusted.overlay.origin" + * xattr. If that xattr exists, verify that it is a match to + * upper dir file handle. In any case, verify or set xattr + * "trusted.overlay.upper" to indicate that index may have + * directory entries. + */ + if (ovl_check_origin_xattr(ofs->indexdir)) { + err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN, + upperpath->dentry, true, false); + if (err) + pr_err("overlayfs: failed to verify index dir 'origin' xattr\n"); + } + err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true); + if (err) + pr_err("overlayfs: failed to verify index dir 'upper' xattr\n"); + + /* Cleanup bad/stale/orphan index entries */ + if (!err) + err = ovl_indexdir_cleanup(ofs); + } + if (err || !ofs->indexdir) + pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + +out: + mnt_drop_write(mnt); + return err; +} + +/* Get a unique fsid for the layer */ +static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) +{ + unsigned int i; + dev_t dev; + int err; + + /* fsid 0 is reserved for upper fs even with non upper overlay */ + if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) + return 0; + + for (i = 0; i < ofs->numlowerfs; i++) { + if (ofs->lower_fs[i].sb == sb) + return i + 1; + } + + err = get_anon_bdev(&dev); + if (err) { + pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + return err; + } + + ofs->lower_fs[ofs->numlowerfs].sb = sb; + ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->numlowerfs++; + + return ofs->numlowerfs; +} + +static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, + struct path *stack, unsigned int numlower) +{ + int err; + unsigned int i; + + err = -ENOMEM; + ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer), + GFP_KERNEL); + if (ofs->lower_layers == NULL) + goto out; + + ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb), + GFP_KERNEL); + if (ofs->lower_fs == NULL) + goto out; + + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt; + struct inode *trap; + int fsid; + + err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb); + if (err < 0) + goto out; + + /* + * Check if lower root conflicts with this overlay layers before + * checking if it is in-use as upperdir/workdir of "another" + * mount, because we do not bother to check in ovl_is_inuse() if + * the upperdir/workdir is in fact in-use by our + * upperdir/workdir. + */ + err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir"); + if (err) + goto out; + + if (ovl_is_inuse(stack[i].dentry)) { + err = ovl_report_in_use(ofs, "lowerdir"); + if (err) { + iput(trap); + goto out; + } + } + + mnt = clone_private_mount(&stack[i]); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + iput(trap); + goto out; + } + + /* + * Make lower layers R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; + + ofs->lower_layers[ofs->numlower].trap = trap; + ofs->lower_layers[ofs->numlower].mnt = mnt; + ofs->lower_layers[ofs->numlower].idx = i + 1; + ofs->lower_layers[ofs->numlower].fsid = fsid; + if (fsid) { + ofs->lower_layers[ofs->numlower].fs = + &ofs->lower_fs[fsid - 1]; + } + ofs->numlower++; + } + + /* + * When all layers on same fs, overlay can use real inode numbers. + * With mount option "xino=on", mounter declares that there are enough + * free high bits in underlying fs to hold the unique fsid. + * If overlayfs does encounter underlying inodes using the high xino + * bits reserved for fsid, it emits a warning and uses the original + * inode number. + */ + if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) { + ofs->xino_bits = 0; + ofs->config.xino = OVL_XINO_OFF; + } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) { + /* + * This is a roundup of number of bits needed for numlowerfs+1 + * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for + * upper fs even with non upper overlay. + */ + BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); + ofs->xino_bits = ilog2(ofs->numlowerfs) + 1; + } + + if (ofs->xino_bits) { + pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_bits); + } + + err = 0; +out: + return err; +} + +static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, + struct ovl_fs *ofs) +{ + int err; + char *lowertmp, *lower; + struct path *stack = NULL; + unsigned int stacklen, numlower = 0, i; + bool remote = false; + struct ovl_entry *oe; + + err = -ENOMEM; + lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL); + if (!lowertmp) + goto out_err; + + err = -EINVAL; + stacklen = ovl_split_lowerdirs(lowertmp); + if (stacklen > OVL_MAX_STACK) { + pr_err("overlayfs: too many lower directories, limit is %d\n", + OVL_MAX_STACK); + goto out_err; + } else if (!ofs->config.upperdir && stacklen == 1) { + pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + goto out_err; + } else if (!ofs->config.upperdir && ofs->config.nfs_export && + ofs->config.redirect_follow) { + pr_warn("overlayfs: NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + + err = -ENOMEM; + stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_err; + + err = -EINVAL; + lower = lowertmp; + for (numlower = 0; numlower < stacklen; numlower++) { + err = ovl_lower_dir(lower, &stack[numlower], ofs, + &sb->s_stack_depth, &remote); + if (err) + goto out_err; + + lower = strchr(lower, '\0') + 1; + } + + err = -EINVAL; + sb->s_stack_depth++; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_err; + } + + err = ovl_get_lower_layers(sb, ofs, stack, numlower); + if (err) + goto out_err; + + err = -ENOMEM; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_err; + + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = dget(stack[i].dentry); + oe->lowerstack[i].layer = &ofs->lower_layers[i]; + } + + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; + +out: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); + kfree(lowertmp); + + return oe; + +out_err: + oe = ERR_PTR(err); + goto out; +} + +/* + * Check if this layer root is a descendant of: + * - another layer of this overlayfs instance + * - upper/work dir of any overlayfs instance + */ +static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, + struct dentry *dentry, const char *name, + bool is_lower) +{ + struct dentry *next = dentry, *parent; + int err = 0; + + if (!dentry) + return 0; + + parent = dget_parent(next); + + /* Walk back ancestors to root (inclusive) looking for traps */ + while (!err && parent != next) { + if (is_lower && ovl_lookup_trap_inode(sb, parent)) { + err = -ELOOP; + pr_err("overlayfs: overlapping %s path\n", name); + } else if (ovl_is_inuse(parent)) { + err = ovl_report_in_use(ofs, name); + } + next = parent; + parent = dget_parent(next); + dput(next); + } + + dput(parent); + + return err; +} + +/* + * Check if any of the layers or work dirs overlap. + */ +static int ovl_check_overlapping_layers(struct super_block *sb, + struct ovl_fs *ofs) +{ + int i, err; + + if (ofs->upper_mnt) { + err = ovl_check_layer(sb, ofs, ofs->upper_mnt->mnt_root, + "upperdir", false); + if (err) + return err; + + /* + * Checking workbasedir avoids hitting ovl_is_inuse(parent) of + * this instance and covers overlapping work and index dirs, + * unless work or index dir have been moved since created inside + * workbasedir. In that case, we already have their traps in + * inode cache and we will catch that case on lookup. + */ + err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir", + false); + if (err) + return err; + } + + for (i = 0; i < ofs->numlower; i++) { + err = ovl_check_layer(sb, ofs, + ofs->lower_layers[i].mnt->mnt_root, + "lowerdir", true); + if (err) + return err; + } + + return 0; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path upperpath = { }; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ofs; + struct cred *cred; + int err; + + err = -ENOMEM; + ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ofs) + goto out; + + ofs->creator_cred = cred = prepare_creds(); + if (!cred) + goto out_err; + + ofs->config.index = ovl_index_def; + ofs->config.nfs_export = ovl_nfs_export_def; + ofs->config.xino = ovl_xino_def(); + ofs->config.metacopy = ovl_metacopy_def; + err = ovl_parse_opt((char *) data, &ofs->config); + if (err) + goto out_err; + + err = -EINVAL; + if (!ofs->config.lowerdir) { + if (!silent) + pr_err("overlayfs: missing 'lowerdir'\n"); + goto out_err; + } + + sb->s_stack_depth = 0; + sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ + if (ofs->config.xino != OVL_XINO_OFF) + ofs->xino_bits = BITS_PER_LONG - 32; + + /* alloc/destroy_inode needed for setting up traps in inode cache */ + sb->s_op = &ovl_super_operations; + + if (ofs->config.upperdir) { + if (!ofs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_err; + } + + err = ovl_get_upper(sb, ofs, &upperpath); + if (err) + goto out_err; + + err = ovl_get_workdir(sb, ofs, &upperpath); + if (err) + goto out_err; + + if (!ofs->workdir) + sb->s_flags |= SB_RDONLY; + + sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth; + sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran; + + } + oe = ovl_get_lowerstack(sb, ofs); + err = PTR_ERR(oe); + if (IS_ERR(oe)) + goto out_err; + + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ofs->upper_mnt) + sb->s_flags |= SB_RDONLY; + + if (!(ovl_force_readonly(ofs)) && ofs->config.index) { + err = ovl_get_indexdir(sb, ofs, oe, &upperpath); + if (err) + goto out_free_oe; + + /* Force r/o mount with no index dir */ + if (!ofs->indexdir) { + dput(ofs->workdir); + ofs->workdir = NULL; + sb->s_flags |= SB_RDONLY; + } + + } + + err = ovl_check_overlapping_layers(sb, ofs); + if (err) + goto out_free_oe; + + /* Show index=off in /proc/mounts for forced r/o mount */ + if (!ofs->indexdir) { + ofs->config.index = false; + if (ofs->upper_mnt && ofs->config.nfs_export) { + pr_warn("overlayfs: NFS export requires an index dir, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + } + + if (ofs->config.metacopy && ofs->config.nfs_export) { + pr_warn("overlayfs: NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + + if (ofs->config.nfs_export) + sb->s_export_op = &ovl_export_operations; + + /* Never override disk quota limits or use reserved space */ + cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); + + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_xattr = ovl_xattr_handlers; + sb->s_fs_info = ofs; + sb->s_flags |= SB_POSIXACL; + + err = -ENOMEM; + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); + if (!root_dentry) + goto out_free_oe; + + root_dentry->d_fsdata = oe; + + mntput(upperpath.mnt); + if (upperpath.dentry) { + ovl_dentry_set_upper_alias(root_dentry); + if (ovl_is_impuredir(upperpath.dentry)) + ovl_set_flag(OVL_IMPURE, d_inode(root_dentry)); + } + + /* Root is always merge -> can have whiteouts */ + ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); + ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry); + ovl_set_upperdata(d_inode(root_dentry)); + ovl_inode_init(d_inode(root_dentry), upperpath.dentry, + ovl_dentry_lower(root_dentry), NULL); + + sb->s_root = root_dentry; + + return 0; + +out_free_oe: + ovl_entry_stack_free(oe); + kfree(oe); +out_err: + path_put(&upperpath); + ovl_free_fs(ofs); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlay", + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("overlay"); + +static void ovl_inode_init_once(void *foo) +{ + struct ovl_inode *oi = foo; + + inode_init_once(&oi->vfs_inode); +} + +static int __init ovl_init(void) +{ + int err; + + ovl_inode_cachep = kmem_cache_create("ovl_inode", + sizeof(struct ovl_inode), 0, + (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + ovl_inode_init_once); + if (ovl_inode_cachep == NULL) + return -ENOMEM; + + err = register_filesystem(&ovl_fs_type); + if (err) + kmem_cache_destroy(ovl_inode_cachep); + + return err; +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ovl_inode_cachep); + +} + +module_init(ovl_init); +module_exit(ovl_exit); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c new file mode 100644 index 000000000..db8bdb29b --- /dev/null +++ b/fs/overlayfs/util.c @@ -0,0 +1,945 @@ +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/uuid.h> +#include <linux/namei.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + +struct super_block *ovl_same_sb(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + if (!ofs->numlowerfs) + return ofs->upper_mnt->mnt_sb; + else if (ofs->numlowerfs == 1 && !ofs->upper_mnt) + return ofs->lower_fs[0].sb; + else + return NULL; +} + +/* + * Check if underlying fs supports file handles and try to determine encoding + * type, in order to deduce maximum inode number used by fs. + * + * Return 0 if file handles are not supported. + * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding. + * Return -1 if fs uses a non default encoding with unknown inode size. + */ +int ovl_can_decode_fh(struct super_block *sb) +{ + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry || + uuid_is_null(&sb->s_uuid)) + return 0; + + return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; +} + +struct dentry *ovl_indexdir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->indexdir; +} + +/* Index all files on copy up. For now only enabled for NFS export */ +bool ovl_index_all(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.nfs_export && ofs->config.index; +} + +/* Verify lower origin on lookup. For now only enabled for NFS export */ +bool ovl_verify_lower(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.nfs_export && ofs->config.index; +} + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower) +{ + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; +} + +bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_REAL); +} + +bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; + + if (ovl_dentry_upper(dentry)) { + type = __OVL_PATH_UPPER; + + /* + * Non-dir dentry can hold lower dentry of its copy up origin. + */ + if (oe->numlower) { + if (ovl_test_flag(OVL_CONST_INO, d_inode(dentry))) + type |= __OVL_PATH_ORIGIN; + if (d_is_dir(dentry) || + !ovl_has_upperdata(d_inode(dentry))) + type |= __OVL_PATH_MERGE; + } + } else { + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; + } + return type; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_dentry_upper(dentry); +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->numlower) { + path->mnt = oe->lowerstack[0].layer->mnt; + path->dentry = oe->lowerstack[0].dentry; + } else { + *path = (struct path) { }; + } +} + +void ovl_path_lowerdata(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->numlower) { + path->mnt = oe->lowerstack[oe->numlower - 1].layer->mnt; + path->dentry = oe->lowerstack[oe->numlower - 1].dentry; + } else { + *path = (struct path) { }; + } +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + if (!OVL_TYPE_UPPER(type)) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + return ovl_upperdentry_dereference(OVL_I(d_inode(dentry))); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[0].dentry : NULL; +} + +struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[0].layer : NULL; +} + +/* + * ovl_dentry_lower() could return either a data dentry or metacopy dentry + * dependig on what is stored in lowerstack[0]. At times we need to find + * lower dentry which has data (and not metacopy dentry). This helper + * returns the lower data dentry. + */ +struct dentry *ovl_dentry_lowerdata(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[oe->numlower - 1].dentry : NULL; +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); +} + +struct dentry *ovl_i_dentry_upper(struct inode *inode) +{ + return ovl_upperdentry_dereference(OVL_I(inode)); +} + +struct inode *ovl_inode_upper(struct inode *inode) +{ + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + + return upperdentry ? d_inode(upperdentry) : NULL; +} + +struct inode *ovl_inode_lower(struct inode *inode) +{ + return OVL_I(inode)->lower; +} + +struct inode *ovl_inode_real(struct inode *inode) +{ + return ovl_inode_upper(inode) ?: ovl_inode_lower(inode); +} + +/* Return inode which contains lower data. Do not return metacopy */ +struct inode *ovl_inode_lowerdata(struct inode *inode) +{ + if (WARN_ON(!S_ISREG(inode->i_mode))) + return NULL; + + return OVL_I(inode)->lowerdata ?: ovl_inode_lower(inode); +} + +/* Return real inode which contains data. Does not return metacopy inode */ +struct inode *ovl_inode_realdata(struct inode *inode) +{ + struct inode *upperinode; + + upperinode = ovl_inode_upper(inode); + if (upperinode && ovl_has_upperdata(inode)) + return upperinode; + + return ovl_inode_lowerdata(inode); +} + +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) +{ + return OVL_I(inode)->cache; +} + +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) +{ + OVL_I(inode)->cache = cache; +} + +void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry) +{ + set_bit(flag, &OVL_E(dentry)->flags); +} + +void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry) +{ + clear_bit(flag, &OVL_E(dentry)->flags); +} + +bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry) +{ + return test_bit(flag, &OVL_E(dentry)->flags); +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_OPAQUE, dentry); +} + +bool ovl_dentry_is_whiteout(struct dentry *dentry) +{ + return !dentry->d_inode && ovl_dentry_is_opaque(dentry); +} + +void ovl_dentry_set_opaque(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); +} + +/* + * For hard links and decoded file handles, it's possible for ovl_dentry_upper() + * to return positive, while there's no actual upper alias for the inode. + * Copy up code needs to know about the existence of the upper alias, so it + * can't use ovl_dentry_upper(). + */ +bool ovl_dentry_has_upper_alias(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_UPPER_ALIAS, dentry); +} + +void ovl_dentry_set_upper_alias(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry); +} + +static bool ovl_should_check_upperdata(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return false; + + if (!ovl_inode_lower(inode)) + return false; + + return true; +} + +bool ovl_has_upperdata(struct inode *inode) +{ + if (!ovl_should_check_upperdata(inode)) + return true; + + if (!ovl_test_flag(OVL_UPPERDATA, inode)) + return false; + /* + * Pairs with smp_wmb() in ovl_set_upperdata(). Main user of + * ovl_has_upperdata() is ovl_copy_up_meta_inode_data(). Make sure + * if setting of OVL_UPPERDATA is visible, then effects of writes + * before that are visible too. + */ + smp_rmb(); + return true; +} + +void ovl_set_upperdata(struct inode *inode) +{ + /* + * Pairs with smp_rmb() in ovl_has_upperdata(). Make sure + * if OVL_UPPERDATA flag is visible, then effects of write operations + * before it are visible as well. + */ + smp_wmb(); + ovl_set_flag(OVL_UPPERDATA, inode); +} + +/* Caller should hold ovl_inode->lock */ +bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags) +{ + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return !ovl_test_flag(OVL_UPPERDATA, d_inode(dentry)); +} + +bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags) +{ + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return !ovl_has_upperdata(d_inode(dentry)); +} + +bool ovl_redirect_dir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.redirect_dir && !ofs->noxattr; +} + +const char *ovl_dentry_get_redirect(struct dentry *dentry) +{ + return OVL_I(d_inode(dentry))->redirect; +} + +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) +{ + struct ovl_inode *oi = OVL_I(d_inode(dentry)); + + kfree(oi->redirect); + oi->redirect = redirect; +} + +void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, + struct dentry *lowerdentry, struct dentry *lowerdata) +{ + struct inode *realinode = d_inode(upperdentry ?: lowerdentry); + + if (upperdentry) + OVL_I(inode)->__upperdentry = upperdentry; + if (lowerdentry) + OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); + if (lowerdata) + OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata)); + + ovl_copyattr(realinode, inode); + ovl_copyflags(realinode, inode); + if (!inode->i_ino) + inode->i_ino = realinode->i_ino; +} + +void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) +{ + struct inode *upperinode = d_inode(upperdentry); + + WARN_ON(OVL_I(inode)->__upperdentry); + + /* + * Make sure upperdentry is consistent before making it visible + */ + smp_wmb(); + OVL_I(inode)->__upperdentry = upperdentry; + if (inode_unhashed(inode)) { + if (!inode->i_ino) + inode->i_ino = upperinode->i_ino; + inode->i_private = upperinode; + __insert_inode_hash(inode, (unsigned long) upperinode); + } +} + +static void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) +{ + struct inode *inode = d_inode(dentry); + + WARN_ON(!inode_is_locked(inode)); + /* + * Version is used by readdir code to keep cache consistent. For merge + * dirs all changes need to be noted. For non-merge dirs, cache only + * contains impure (ones which have been copied up and have origins) + * entries, so only need to note changes to impure entries. + */ + if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity) + OVL_I(inode)->version++; +} + +void ovl_dir_modified(struct dentry *dentry, bool impurity) +{ + /* Copy mtime/ctime */ + ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry)); + + ovl_dentry_version_inc(dentry, impurity); +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + WARN_ON(!inode_is_locked(inode)); + return OVL_I(inode)->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags | O_NOATIME, current_cred()); +} + +/* Caller should hold ovl_inode->lock */ +static bool ovl_already_copied_up_locked(struct dentry *dentry, int flags) +{ + bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; + + if (ovl_dentry_upper(dentry) && + (ovl_dentry_has_upper_alias(dentry) || disconnected) && + !ovl_dentry_needs_data_copy_up_locked(dentry, flags)) + return true; + + return false; +} + +bool ovl_already_copied_up(struct dentry *dentry, int flags) +{ + bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; + + /* + * Check if copy-up has happened as well as for upper alias (in + * case of hard links) is there. + * + * Both checks are lockless: + * - false negatives: will recheck under oi->lock + * - false positives: + * + ovl_dentry_upper() uses memory barriers to ensure the + * upper dentry is up-to-date + * + ovl_dentry_has_upper_alias() relies on locking of + * upper parent i_rwsem to prevent reordering copy-up + * with rename. + */ + if (ovl_dentry_upper(dentry) && + (ovl_dentry_has_upper_alias(dentry) || disconnected) && + !ovl_dentry_needs_data_copy_up(dentry, flags)) + return true; + + return false; +} + +int ovl_copy_up_start(struct dentry *dentry, int flags) +{ + struct ovl_inode *oi = OVL_I(d_inode(dentry)); + int err; + + err = mutex_lock_interruptible(&oi->lock); + if (!err && ovl_already_copied_up_locked(dentry, flags)) { + err = 1; /* Already copied up */ + mutex_unlock(&oi->lock); + } + + return err; +} + +void ovl_copy_up_end(struct dentry *dentry) +{ + mutex_unlock(&OVL_I(d_inode(dentry))->lock); +} + +bool ovl_check_origin_xattr(struct dentry *dentry) +{ + int res; + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + + /* Zero size value means "copied up but origin unknown" */ + if (res >= 0) + return true; + + return false; +} + +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) +{ + int res; + char val; + + if (!d_is_dir(dentry)) + return false; + + res = vfs_getxattr(dentry, name, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr) +{ + int err; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (ofs->noxattr) + return xerr; + + err = ovl_do_setxattr(upperdentry, name, value, size, 0); + + if (err == -EOPNOTSUPP) { + pr_warn("overlayfs: cannot set %s xattr on upper\n", name); + ofs->noxattr = true; + return xerr; + } + + return err; +} + +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) +{ + int err; + + if (ovl_test_flag(OVL_IMPURE, d_inode(dentry))) + return 0; + + /* + * Do not fail when upper doesn't support xattrs. + * Upper inodes won't have origin nor redirect xattr anyway. + */ + err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, + "y", 1, 0); + if (!err) + ovl_set_flag(OVL_IMPURE, d_inode(dentry)); + + return err; +} + +void ovl_set_flag(unsigned long flag, struct inode *inode) +{ + set_bit(flag, &OVL_I(inode)->flags); +} + +void ovl_clear_flag(unsigned long flag, struct inode *inode) +{ + clear_bit(flag, &OVL_I(inode)->flags); +} + +bool ovl_test_flag(unsigned long flag, struct inode *inode) +{ + return test_bit(flag, &OVL_I(inode)->flags); +} + +/** + * Caller must hold a reference to inode to prevent it from being freed while + * it is marked inuse. + */ +bool ovl_inuse_trylock(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + bool locked = false; + + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_OVL_INUSE)) { + inode->i_state |= I_OVL_INUSE; + locked = true; + } + spin_unlock(&inode->i_lock); + + return locked; +} + +void ovl_inuse_unlock(struct dentry *dentry) +{ + if (dentry) { + struct inode *inode = d_inode(dentry); + + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_OVL_INUSE)); + inode->i_state &= ~I_OVL_INUSE; + spin_unlock(&inode->i_lock); + } +} + +bool ovl_is_inuse(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + bool inuse; + + spin_lock(&inode->i_lock); + inuse = (inode->i_state & I_OVL_INUSE); + spin_unlock(&inode->i_lock); + + return inuse; +} + +/* + * Does this overlay dentry need to be indexed on copy up? + */ +bool ovl_need_index(struct dentry *dentry) +{ + struct dentry *lower = ovl_dentry_lower(dentry); + + if (!lower || !ovl_indexdir(dentry->d_sb)) + return false; + + /* Index all files for NFS export and consistency verification */ + if (ovl_index_all(dentry->d_sb)) + return true; + + /* Index only lower hardlinks on copy up */ + if (!d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return true; + + return false; +} + +/* Caller must hold OVL_I(inode)->lock */ +static void ovl_cleanup_index(struct dentry *dentry) +{ + struct dentry *indexdir = ovl_indexdir(dentry->d_sb); + struct inode *dir = indexdir->d_inode; + struct dentry *lowerdentry = ovl_dentry_lower(dentry); + struct dentry *upperdentry = ovl_dentry_upper(dentry); + struct dentry *index = NULL; + struct inode *inode; + struct qstr name = { }; + int err; + + err = ovl_get_index_name(lowerdentry, &name); + if (err) + goto fail; + + inode = d_inode(upperdentry); + if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { + pr_warn_ratelimited("overlayfs: cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", + upperdentry, inode->i_ino, inode->i_nlink); + /* + * We either have a bug with persistent union nlink or a lower + * hardlink was added while overlay is mounted. Adding a lower + * hardlink and then unlinking all overlay hardlinks would drop + * overlay nlink to zero before all upper inodes are unlinked. + * As a safety measure, when that situation is detected, set + * the overlay nlink to the index inode nlink minus one for the + * index entry itself. + */ + set_nlink(d_inode(dentry), inode->i_nlink - 1); + ovl_set_nlink_upper(dentry); + goto out; + } + + inode_lock_nested(dir, I_MUTEX_PARENT); + index = lookup_one_len(name.name, indexdir, name.len); + err = PTR_ERR(index); + if (IS_ERR(index)) { + index = NULL; + } else if (ovl_index_all(dentry->d_sb)) { + /* Whiteout orphan index to block future open by handle */ + err = ovl_cleanup_and_whiteout(indexdir, dir, index); + } else { + /* Cleanup orphan index entries */ + err = ovl_cleanup(dir, index); + } + + inode_unlock(dir); + if (err) + goto fail; + +out: + kfree(name.name); + dput(index); + return; + +fail: + pr_err("overlayfs: cleanup index of '%pd2' failed (%i)\n", dentry, err); + goto out; +} + +/* + * Operations that change overlay inode and upper inode nlink need to be + * synchronized with copy up for persistent nlink accounting. + */ +int ovl_nlink_start(struct dentry *dentry, bool *locked) +{ + struct ovl_inode *oi = OVL_I(d_inode(dentry)); + const struct cred *old_cred; + int err; + + if (!d_inode(dentry)) + return 0; + + /* + * With inodes index is enabled, we store the union overlay nlink + * in an xattr on the index inode. When whiting out an indexed lower, + * we need to decrement the overlay persistent nlink, but before the + * first copy up, we have no upper index inode to store the xattr. + * + * As a workaround, before whiteout/rename over an indexed lower, + * copy up to create the upper index. Creating the upper index will + * initialize the overlay nlink, so it could be dropped if unlink + * or rename succeeds. + * + * TODO: implement metadata only index copy up when called with + * ovl_copy_up_flags(dentry, O_PATH). + */ + if (ovl_need_index(dentry) && !ovl_dentry_has_upper_alias(dentry)) { + err = ovl_copy_up(dentry); + if (err) + return err; + } + + err = mutex_lock_interruptible(&oi->lock); + if (err) + return err; + + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + goto out; + + old_cred = ovl_override_creds(dentry->d_sb); + /* + * The overlay inode nlink should be incremented/decremented IFF the + * upper operation succeeds, along with nlink change of upper inode. + * Therefore, before link/unlink/rename, we store the union nlink + * value relative to the upper inode nlink in an upper inode xattr. + */ + err = ovl_set_nlink_upper(dentry); + revert_creds(old_cred); + +out: + if (err) + mutex_unlock(&oi->lock); + else + *locked = true; + + return err; +} + +void ovl_nlink_end(struct dentry *dentry, bool locked) +{ + if (locked) { + if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) && + d_inode(dentry)->i_nlink == 0) { + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + ovl_cleanup_index(dentry); + revert_creds(old_cred); + } + + mutex_unlock(&OVL_I(d_inode(dentry))->lock); + } +} + +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} + +/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ +int ovl_check_metacopy_xattr(struct dentry *dentry) +{ + int res; + + /* Only regular files can have metacopy xattr */ + if (!S_ISREG(d_inode(dentry)->i_mode)) + return 0; + + res = vfs_getxattr(dentry, OVL_XATTR_METACOPY, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return 0; + goto out; + } + + return 1; +out: + pr_warn_ratelimited("overlayfs: failed to get metacopy (%i)\n", res); + return res; +} + +bool ovl_is_metacopy_dentry(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (!d_is_reg(dentry)) + return false; + + if (ovl_dentry_upper(dentry)) { + if (!ovl_has_upperdata(d_inode(dentry))) + return true; + return false; + } + + return (oe->numlower > 1); +} + +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding) +{ + ssize_t res; + char *buf = NULL; + + res = vfs_getxattr(dentry, name, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return -ENODATA; + goto fail; + } + + if (res != 0) { + buf = kzalloc(res + padding, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = vfs_getxattr(dentry, name, buf, res); + if (res < 0) + goto fail; + } + *value = buf; + + return res; + +fail: + pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + name, res); + kfree(buf); + return res; +} + +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1); + if (res == -ENODATA) + return NULL; + if (res < 0) + return ERR_PTR(res); + if (res == 0) + goto invalid; + + if (buf[0] == '/') { + for (s = buf; *s++ == '/'; s = next) { + next = strchrnul(s, '/'); + if (s == next) + goto invalid; + } + } else { + if (strchr(buf, '/') != NULL) + goto invalid; + } + + return buf; +invalid: + pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); + res = -EINVAL; + kfree(buf); + return ERR_PTR(res); +} |