summary | refs | log | tree | commit | diff | stats
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r-- fs/userfaultfd.c 180
1 files changed, 120 insertions, 60 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9da..13ef4e1fc 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -45,11 +45,10 @@ static struct ctl_table vm_userfaultfd_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- { }
};
#endif
-static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
@@ -123,6 +122,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+/*
+ * Whether @ctx has UFFD_FEATURE_WP_ASYNC set in its feature mask.
+ * A NULL @ctx is reported as "not enabled".
+ */
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+	if (!ctx)
+		return false;
+
+	return (ctx->features & UFFD_FEATURE_WP_ASYNC) != 0;
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -921,21 +925,20 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
continue;
}
+ /* Reset ptes for the whole vma range if wr-protected */
+ if (userfaultfd_wp(vma))
+ uffd_wp_range(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- } else {
- prev = vma;
- }
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+ vma->vm_end, new_flags,
+ NULL_VM_UFFD_CTX);
vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ prev = vma;
}
mmap_write_unlock(mm);
mmput(mm);
@@ -1033,7 +1036,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
{
int fd;
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -1325,7 +1328,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool basic_ioctls;
unsigned long start, end, vma_end;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1399,7 +1402,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur, vm_flags))
+ if (!vma_can_userfault(cur, vm_flags, wp_async))
goto out_unlock;
/*
@@ -1460,7 +1463,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1478,28 +1481,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }),
- anon_vma_name(vma));
- if (prev) {
- /* vma_merge() invalidated the mas */
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx});
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -1561,7 +1550,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1615,7 +1604,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur, cur->vm_flags))
+ if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
goto out_unlock;
found = true;
@@ -1631,7 +1620,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
/*
* Nothing to do: this vma is already registered into this
@@ -1664,26 +1653,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags, NULL_VM_UFFD_CTX);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -2018,6 +1994,11 @@ out:
return ret;
}
+/*
+ * Whether the uffd context attached to @vma (if any) has
+ * UFFD_FEATURE_WP_ASYNC enabled.
+ */
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+	struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
+
+	return userfaultfd_wp_async_ctx(uffd_ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2027,6 +2008,75 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}
+/*
+ * Handle the UFFDIO_MOVE ioctl.
+ *
+ * Copies a struct uffdio_move from userspace (@arg), validates the dst and
+ * src ranges and the mode bits, and then asks move_pages() to move the
+ * range within the caller's own mm.  The move_pages() result (the moved
+ * length, or a negative errno) is written back to the "move" field of the
+ * user structure.
+ *
+ * Returns 0 if the whole requested length was moved; -EAGAIN if only part
+ * of it was moved or if ctx->mmap_changing is set (before or after taking
+ * mmap_lock); -EFAULT if the user structure cannot be read or written;
+ * -EINVAL for a cross-mm request or unknown mode bits; -ESRCH if the mm
+ * can no longer be pinned; or whatever validate_range()/move_pages()
+ * report.
+ */
+static int userfaultfd_move(struct userfaultfd_ctx *ctx,
+			    unsigned long arg)
+{
+	__s64 ret;
+	struct uffdio_move uffdio_move;
+	struct uffdio_move __user *user_uffdio_move;
+	struct userfaultfd_wake_range range;
+	struct mm_struct *mm = ctx->mm;
+
+	user_uffdio_move = (struct uffdio_move __user *) arg;
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	if (copy_from_user(&uffdio_move, user_uffdio_move,
+			   /* don't copy "move" last field */
+			   sizeof(uffdio_move)-sizeof(__s64)))
+		return -EFAULT;
+
+	/* Do not allow cross-mm moves. */
+	if (mm != current->mm)
+		return -EINVAL;
+
+	ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
+	if (ret)
+		return ret;
+
+	ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
+	if (ret)
+		return ret;
+
+	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
+				  UFFDIO_MOVE_MODE_DONTWAKE))
+		return -EINVAL;
+
+	if (!mmget_not_zero(mm))
+		return -ESRCH;
+
+	mmap_read_lock(mm);
+
+	/*
+	 * Re-check after taking mmap_lock.  This is the same transient
+	 * condition as the unlocked check above, so report it the same
+	 * way (-EAGAIN) to let userspace retry instead of treating its
+	 * arguments as invalid.
+	 */
+	if (likely(!atomic_read(&ctx->mmap_changing)))
+		ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
+				 uffdio_move.len, uffdio_move.mode);
+	else
+		ret = -EAGAIN;
+
+	mmap_read_unlock(mm);
+	mmput(mm);
+
+	/* Always report the raw result, including errors, to userspace. */
+	if (unlikely(put_user(ret, &user_uffdio_move->move)))
+		return -EFAULT;
+	if (ret < 0)
+		return ret;
+
+	/* len == 0 would wake all */
+	VM_WARN_ON(!ret);
+	range.len = ret;
+	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
+		range.start = uffdio_move.dst;
+		wake_userfault(ctx, &range);
+	}
+
+	/* A short move is not an error, but the caller has to retry. */
+	return range.len == uffdio_move.len ? 0 : -EAGAIN;
+}
+
/*
* userland asks for a certain API version and we return which bits
* and ioctl commands are implemented in this kernel for such API
@@ -2051,6 +2101,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
+
+ /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ if (features & UFFD_FEATURE_WP_ASYNC)
+ features |= UFFD_FEATURE_WP_UNPOPULATED;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2063,6 +2118,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
@@ -2113,6 +2169,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_ZEROPAGE:
ret = userfaultfd_zeropage(ctx, arg);
break;
+ case UFFDIO_MOVE:
+ ret = userfaultfd_move(ctx, arg);
+ break;
case UFFDIO_WRITEPROTECT:
ret = userfaultfd_writeprotect(ctx, arg);
break;
@@ -2205,7 +2264,8 @@ static int new_userfaultfd(int flags)
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);