summaryrefslogtreecommitdiffstats
path: root/io_uring
diff options
context:
space:
mode:
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/Makefile4
-rw-r--r--io_uring/alloc_cache.h5
-rw-r--r--io_uring/cancel.c10
-rw-r--r--io_uring/cancel.h4
-rw-r--r--io_uring/filetable.c11
-rw-r--r--io_uring/futex.c387
-rw-r--r--io_uring/futex.h36
-rw-r--r--io_uring/io_uring.c980
-rw-r--r--io_uring/io_uring.h28
-rw-r--r--io_uring/kbuf.c213
-rw-r--r--io_uring/kbuf.h22
-rw-r--r--io_uring/net.c338
-rw-r--r--io_uring/opdef.c74
-rw-r--r--io_uring/opdef.h2
-rw-r--r--io_uring/openclose.c59
-rw-r--r--io_uring/openclose.h3
-rw-r--r--io_uring/poll.c64
-rw-r--r--io_uring/poll.h9
-rw-r--r--io_uring/register.c607
-rw-r--r--io_uring/register.h8
-rw-r--r--io_uring/rsrc.c206
-rw-r--r--io_uring/rsrc.h29
-rw-r--r--io_uring/rw.c229
-rw-r--r--io_uring/rw.h4
-rw-r--r--io_uring/splice.c4
-rw-r--r--io_uring/uring_cmd.c106
-rw-r--r--io_uring/waitid.c367
-rw-r--r--io_uring/waitid.h15
28 files changed, 2490 insertions, 1334 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 8cc8e5387..2cdc51825 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
- cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+ cancel.o kbuf.o rsrc.o rw.o opdef.o \
+ notif.o waitid.o register.o
obj-$(CONFIG_IO_WQ) += io-wq.o
+obj-$(CONFIG_FUTEX) += futex.o
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 241245cb5..bf2fb26a6 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -16,8 +16,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
if (cache->nr_cached < cache->max_cached) {
cache->nr_cached++;
wq_stack_add_head(&entry->node, &cache->list);
- /* KASAN poisons object */
- kasan_slab_free_mempool(entry);
+ kasan_mempool_poison_object(entry);
return true;
}
return false;
@@ -34,7 +33,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
struct io_cache_entry *entry;
entry = container_of(cache->list.next, struct io_cache_entry, node);
- kasan_unpoison_range(entry, cache->elem_size);
+ kasan_mempool_unpoison_object(entry, cache->elem_size);
cache->list.next = cache->list.next->next;
cache->nr_cached--;
return entry;
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index a5d51471f..8a8b07dfc 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -15,6 +15,8 @@
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
+#include "waitid.h"
+#include "futex.h"
#include "cancel.h"
struct io_cancel {
@@ -119,6 +121,14 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
if (ret != -ENOENT)
return ret;
+ ret = io_waitid_cancel(ctx, cd, issue_flags);
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = io_futex_cancel(ctx, cd, issue_flags);
+ if (ret != -ENOENT)
+ return ret;
+
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index fc98622e6..c0a8e7c52 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#ifndef IORING_CANCEL_H
+#define IORING_CANCEL_H
#include <linux/io_uring_types.h>
@@ -22,3 +24,5 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
+
+#endif
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index e7d749991..6e86e6188 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
io_file_bitmap_clear(&ctx->file_table, slot_index);
}
- ret = io_scm_file_account(ctx, file);
- if (!ret) {
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
- io_file_bitmap_set(&ctx->file_table, slot_index);
- }
- return ret;
+ *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, slot_index);
+ return 0;
}
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
diff --git a/io_uring/futex.c b/io_uring/futex.c
new file mode 100644
index 000000000..792a03df5
--- /dev/null
+++ b/io_uring/futex.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "../kernel/futex/futex.h"
+#include "io_uring.h"
+#include "rsrc.h"
+#include "futex.h"
+
+struct io_futex {
+ struct file *file;
+ union {
+ u32 __user *uaddr;
+ struct futex_waitv __user *uwaitv;
+ };
+ unsigned long futex_val;
+ unsigned long futex_mask;
+ unsigned long futexv_owned;
+ u32 futex_flags;
+ unsigned int futex_nr;
+ bool futexv_unqueued;
+};
+
+struct io_futex_data {
+ union {
+ struct futex_q q;
+ struct io_cache_entry cache;
+ };
+ struct io_kiocb *req;
+};
+
+void io_futex_cache_init(struct io_ring_ctx *ctx)
+{
+ io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
+ sizeof(struct io_futex_data));
+}
+
+static void io_futex_cache_entry_free(struct io_cache_entry *entry)
+{
+ kfree(container_of(entry, struct io_futex_data, cache));
+}
+
+void io_futex_cache_free(struct io_ring_ctx *ctx)
+{
+ io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
+}
+
+static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
+{
+ req->async_data = NULL;
+ hlist_del_init(&req->hash_node);
+ io_req_task_complete(req, ts);
+}
+
+static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
+{
+ struct io_futex_data *ifd = req->async_data;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_tw_lock(ctx, ts);
+ if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
+ kfree(ifd);
+ __io_futex_complete(req, ts);
+}
+
+static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ struct futex_vector *futexv = req->async_data;
+
+ io_tw_lock(req->ctx, ts);
+
+ if (!iof->futexv_unqueued) {
+ int res;
+
+ res = futex_unqueue_multiple(futexv, iof->futex_nr);
+ if (res != -1)
+ io_req_set_res(req, res, 0);
+ }
+
+ kfree(req->async_data);
+ req->flags &= ~REQ_F_ASYNC_DATA;
+ __io_futex_complete(req, ts);
+}
+
+static bool io_futexv_claim(struct io_futex *iof)
+{
+ if (test_bit(0, &iof->futexv_owned) ||
+ test_and_set_bit_lock(0, &iof->futexv_owned))
+ return false;
+ return true;
+}
+
+static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ /* futex wake already done or in progress */
+ if (req->opcode == IORING_OP_FUTEX_WAIT) {
+ struct io_futex_data *ifd = req->async_data;
+
+ if (!futex_unqueue(&ifd->q))
+ return false;
+ req->io_task_work.func = io_futex_complete;
+ } else {
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+
+ if (!io_futexv_claim(iof))
+ return false;
+ req->io_task_work.func = io_futexv_complete;
+ }
+
+ hlist_del_init(&req->hash_node);
+ io_req_set_res(req, -ECANCELED, 0);
+ io_req_task_work_add(req);
+ return true;
+}
+
+int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ int nr = 0;
+
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
+ return -ENOENT;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
+ if (req->cqe.user_data != cd->data &&
+ !(cd->flags & IORING_ASYNC_CANCEL_ANY))
+ continue;
+ if (__io_futex_cancel(ctx, req))
+ nr++;
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
+ break;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (nr)
+ return nr;
+
+ return -ENOENT;
+}
+
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool found = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
+ if (!io_match_task_safe(req, task, cancel_all))
+ continue;
+ hlist_del_init(&req->hash_node);
+ __io_futex_cancel(ctx, req);
+ found = true;
+ }
+
+ return found;
+}
+
+int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ u32 flags;
+
+ if (unlikely(sqe->len || sqe->futex_flags || sqe->buf_index ||
+ sqe->file_index))
+ return -EINVAL;
+
+ iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ iof->futex_val = READ_ONCE(sqe->addr2);
+ iof->futex_mask = READ_ONCE(sqe->addr3);
+ flags = READ_ONCE(sqe->fd);
+
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ iof->futex_flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(iof->futex_flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(iof->futex_flags, iof->futex_val) ||
+ !futex_validate_input(iof->futex_flags, iof->futex_mask))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct io_kiocb *req = q->wake_data;
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+
+ if (!io_futexv_claim(iof))
+ return;
+ if (unlikely(!__futex_wake_mark(q)))
+ return;
+
+ io_req_set_res(req, 0, 0);
+ req->io_task_work.func = io_futexv_complete;
+ io_req_task_work_add(req);
+}
+
+int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ struct futex_vector *futexv;
+ int ret;
+
+ /* No flags or mask supported for waitv */
+ if (unlikely(sqe->fd || sqe->buf_index || sqe->file_index ||
+ sqe->addr2 || sqe->futex_flags || sqe->addr3))
+ return -EINVAL;
+
+ iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ iof->futex_nr = READ_ONCE(sqe->len);
+ if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX)
+ return -EINVAL;
+
+ futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv)
+ return -ENOMEM;
+
+ ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr,
+ io_futex_wakev_fn, req);
+ if (ret) {
+ kfree(futexv);
+ return ret;
+ }
+
+ iof->futexv_owned = 0;
+ iof->futexv_unqueued = 0;
+ req->flags |= REQ_F_ASYNC_DATA;
+ req->async_data = futexv;
+ return 0;
+}
+
+static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct io_futex_data *ifd = container_of(q, struct io_futex_data, q);
+ struct io_kiocb *req = ifd->req;
+
+ if (unlikely(!__futex_wake_mark(q)))
+ return;
+
+ io_req_set_res(req, 0, 0);
+ req->io_task_work.func = io_futex_complete;
+ io_req_task_work_add(req);
+}
+
+static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
+{
+ struct io_cache_entry *entry;
+
+ entry = io_alloc_cache_get(&ctx->futex_cache);
+ if (entry)
+ return container_of(entry, struct io_futex_data, cache);
+
+ return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
+}
+
+int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ struct futex_vector *futexv = req->async_data;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, woken = -1;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken);
+
+ /*
+ * Error case, ret is < 0. Mark the request as failed.
+ */
+ if (unlikely(ret < 0)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ kfree(futexv);
+ req->async_data = NULL;
+ req->flags &= ~REQ_F_ASYNC_DATA;
+ return IOU_OK;
+ }
+
+ /*
+ * 0 return means that we successfully setup the waiters, and that
+ * nobody triggered a wakeup while we were doing so. If the wakeup
+ * happened post setup, the task_work will be run post this issue and
+ * under the submission lock. 1 means We got woken while setting up,
+ * let that side do the completion. Note that
+ * futex_wait_multiple_setup() will have unqueued all the futexes in
+ * this case. Mark us as having done that already, since this is
+ * different from normal wakeup.
+ */
+ if (!ret) {
+ /*
+ * If futex_wait_multiple_setup() returns 0 for a
+ * successful setup, then the task state will not be
+ * runnable. This is fine for the sync syscall, as
+ * it'll be blocking unless we already got one of the
+ * futexes woken, but it obviously won't work for an
+ * async invocation. Mark us runnable again.
+ */
+ __set_current_state(TASK_RUNNING);
+ hlist_add_head(&req->hash_node, &ctx->futex_list);
+ } else {
+ iof->futexv_unqueued = 1;
+ if (woken != -1)
+ io_req_set_res(req, woken, 0);
+ }
+
+ io_ring_submit_unlock(ctx, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+}
+
+int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_futex_data *ifd = NULL;
+ struct futex_hash_bucket *hb;
+ int ret;
+
+ if (!iof->futex_mask) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ io_ring_submit_lock(ctx, issue_flags);
+ ifd = io_alloc_ifd(ctx);
+ if (!ifd) {
+ ret = -ENOMEM;
+ goto done_unlock;
+ }
+
+ req->async_data = ifd;
+ ifd->q = futex_q_init;
+ ifd->q.bitset = iof->futex_mask;
+ ifd->q.wake = io_futex_wake_fn;
+ ifd->req = req;
+
+ ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags,
+ &ifd->q, &hb);
+ if (!ret) {
+ hlist_add_head(&req->hash_node, &ctx->futex_list);
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ futex_queue(&ifd->q, hb);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+done_unlock:
+ io_ring_submit_unlock(ctx, issue_flags);
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ kfree(ifd);
+ return IOU_OK;
+}
+
+int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ int ret;
+
+ /*
+ * Strict flags - ensure that waking 0 futexes yields a 0 result.
+ * See commit 43adf8449510 ("futex: FLAGS_STRICT") for details.
+ */
+ ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags,
+ iof->futex_val, iof->futex_mask);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/futex.h b/io_uring/futex.h
new file mode 100644
index 000000000..0847e9e8a
--- /dev/null
+++ b/io_uring/futex.h
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "cancel.h"
+
+int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags);
+int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags);
+int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags);
+
+#if defined(CONFIG_FUTEX)
+int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags);
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all);
+void io_futex_cache_init(struct io_ring_ctx *ctx);
+void io_futex_cache_free(struct io_ring_ctx *ctx);
+#else
+static inline int io_futex_cancel(struct io_ring_ctx *ctx,
+ struct io_cancel_data *cd,
+ unsigned int issue_flags)
+{
+ return 0;
+}
+static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all)
+{
+ return false;
+}
+static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
+{
+}
+#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ea772a02c..dc0235ff4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -60,7 +60,6 @@
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
-#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -70,6 +69,7 @@
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>
@@ -85,6 +85,7 @@
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
+#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
@@ -92,6 +93,8 @@
#include "cancel.h"
#include "net.h"
#include "notif.h"
+#include "waitid.h"
+#include "futex.h"
#include "timeout.h"
#include "poll.h"
@@ -101,9 +104,6 @@
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
- IORING_REGISTER_LAST + IORING_OP_LAST)
-
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -127,11 +127,6 @@ enum {
IO_CHECK_CQ_DROPPED_BIT,
};
-enum {
- IO_EVENTFD_OP_SIGNAL_BIT,
- IO_EVENTFD_OP_FREE_BIT,
-};
-
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -142,6 +137,14 @@ struct io_defer_entry {
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
+/*
+ * No waiters. It's larger than any valid value of the tw counter
+ * so that tests against ->cq_wait_nr would fail and skip wake_up().
+ */
+#define IO_CQ_WAKE_INIT (-1U)
+/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */
+#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1)
+
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
@@ -149,6 +152,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_queue_sqe(struct io_kiocb *req);
struct kmem_cache *req_cachep;
+static struct workqueue_struct *iou_wq __ro_after_init;
static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;
@@ -175,19 +179,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
};
#endif
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
- if (io_is_uring_fops(file)) {
- struct io_ring_ctx *ctx = file->private_data;
-
- return ctx->ring_sock->sk;
- }
-#endif
- return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
@@ -321,6 +312,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
+ atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
@@ -332,6 +324,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
sizeof(struct async_poll));
io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_msghdr));
+ io_futex_cache_init(ctx);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@@ -341,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
- INIT_LIST_HEAD(&ctx->io_buffers_pages);
INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -351,13 +343,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
+ INIT_HLIST_HEAD(&ctx->waitid_list);
+#ifdef CONFIG_FUTEX
+ INIT_HLIST_HEAD(&ctx->futex_list);
+#endif
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+ INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
return ctx;
err:
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
@@ -547,14 +543,13 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-
-static void io_eventfd_ops(struct rcu_head *rcu)
+void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
@@ -590,7 +585,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
goto out;
if (likely(eventfd_signal_allowed())) {
- eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
@@ -954,6 +949,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
u64 user_data = req->cqe.user_data;
struct io_uring_cqe *cqe;
+ lockdep_assert(!io_wq_current_is_worker());
+
if (!defer)
return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
@@ -1181,12 +1178,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
static unsigned int handle_tw_list(struct llist_node *node,
struct io_ring_ctx **ctx,
- struct io_tw_state *ts,
- struct llist_node *last)
+ struct io_tw_state *ts)
{
unsigned int count = 0;
- while (node && node != last) {
+ do {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1210,7 +1206,7 @@ static unsigned int handle_tw_list(struct llist_node *node,
*ctx = NULL;
cond_resched();
}
- }
+ } while (node);
return count;
}
@@ -1229,22 +1225,6 @@ static inline struct llist_node *io_llist_xchg(struct llist_head *head,
return xchg(&head->first, new);
}
-/**
- * io_llist_cmpxchg - possibly swap all entries in a lock-less list
- * @head: the head of lock-less list to delete all entries
- * @old: expected old value of the first entry of the list
- * @new: new entry as the head of the list
- *
- * perform a cmpxchg on the first entry of the list.
- */
-
-static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
- struct llist_node *old,
- struct llist_node *new)
-{
- return cmpxchg(&head->first, old, new);
-}
-
static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{
struct llist_node *node = llist_del_all(&tctx->task_list);
@@ -1279,9 +1259,7 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
- struct llist_node fake = {};
struct llist_node *node;
- unsigned int loops = 0;
unsigned int count = 0;
if (unlikely(current->flags & PF_EXITING)) {
@@ -1289,21 +1267,9 @@ void tctx_task_work(struct callback_head *cb)
return;
}
- do {
- loops++;
- node = io_llist_xchg(&tctx->task_list, &fake);
- count += handle_tw_list(node, &ctx, &ts, &fake);
-
- /* skip expensive cmpxchg if there are items in the list */
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
- io_submit_flush_completions(ctx);
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- }
- node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
- } while (node != &fake);
+ node = llist_del_all(&tctx->task_list);
+ if (node)
+ count = handle_tw_list(node, &ctx, &ts);
ctx_flush_and_put(ctx, &ts);
@@ -1311,23 +1277,30 @@ void tctx_task_work(struct callback_head *cb)
if (unlikely(atomic_read(&tctx->in_cancel)))
io_uring_drop_tctx_refs(current);
- trace_io_uring_task_work_run(tctx, count, loops);
+ trace_io_uring_task_work_run(tctx, count, 1);
}
static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev;
- struct llist_node *first;
+ struct llist_node *head;
+
+ /* See comment above IO_CQ_WAKE_INIT */
+ BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);
+ /*
+ * We don't know how many reuqests is there in the link and whether
+ * they can even be queued lazily, fall back to non-lazy.
+ */
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
flags &= ~IOU_F_TWQ_LAZY_WAKE;
- first = READ_ONCE(ctx->work_llist.first);
+ head = READ_ONCE(ctx->work_llist.first);
do {
nr_tw_prev = 0;
- if (first) {
- struct io_kiocb *first_req = container_of(first,
+ if (head) {
+ struct io_kiocb *first_req = container_of(head,
struct io_kiocb,
io_task_work.node);
/*
@@ -1336,17 +1309,29 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
*/
nr_tw_prev = READ_ONCE(first_req->nr_tw);
}
+
+ /*
+ * Theoretically, it can overflow, but that's fine as one of
+ * previous adds should've tried to wake the task.
+ */
nr_tw = nr_tw_prev + 1;
- /* Large enough to fail the nr_wait comparison below */
if (!(flags & IOU_F_TWQ_LAZY_WAKE))
- nr_tw = INT_MAX;
+ nr_tw = IO_CQ_WAKE_FORCE;
req->nr_tw = nr_tw;
- req->io_task_work.node.next = first;
- } while (!try_cmpxchg(&ctx->work_llist.first, &first,
+ req->io_task_work.node.next = head;
+ } while (!try_cmpxchg(&ctx->work_llist.first, &head,
&req->io_task_work.node));
- if (!first) {
+ /*
+ * cmpxchg implies a full barrier, which pairs with the barrier
+ * in set_current_state() on the io_cqring_wait() side. It's used
+ * to ensure that either we see updated ->cq_wait_nr, or waiters
+ * going to sleep will observe the work added to the list, which
+ * is similar to the wait/wawke task state sync.
+ */
+
+ if (!head) {
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
@@ -1354,14 +1339,12 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
}
nr_wait = atomic_read(&ctx->cq_wait_nr);
- /* no one is waiting */
- if (!nr_wait)
+ /* not enough or no one is waiting */
+ if (nr_tw < nr_wait)
return;
- /* either not enough or the previous add has already woken it up */
- if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
+ /* the previous add has already woken it up */
+ if (nr_tw_prev >= nr_wait)
return;
- /* pairs with set_current_state() in io_cqring_wait() */
- smp_mb__after_atomic();
wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
}
@@ -1408,7 +1391,20 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
}
}
-static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
+static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
+ int min_events)
+{
+ if (llist_empty(&ctx->work_llist))
+ return false;
+ if (events < min_events)
+ return true;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+ return false;
+}
+
+static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
+ int min_events)
{
struct llist_node *node;
unsigned int loops = 0;
@@ -1437,18 +1433,20 @@ again:
}
loops++;
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
if (ts->locked) {
io_submit_flush_completions(ctx);
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
}
+
trace_io_uring_local_work_run(ctx, ret, loops);
return ret;
}
-static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
+static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
+ int min_events)
{
struct io_tw_state ts = { .locked = true, };
int ret;
@@ -1456,20 +1454,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
if (llist_empty(&ctx->work_llist))
return 0;
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
/* shouldn't happen! */
if (WARN_ON_ONCE(!ts.locked))
mutex_lock(&ctx->uring_lock);
return ret;
}
-static int io_run_local_work(struct io_ring_ctx *ctx)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
{
struct io_tw_state ts = {};
int ret;
ts.locked = mutex_trylock(&ctx->uring_lock);
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
if (ts.locked)
mutex_unlock(&ctx->uring_lock);
@@ -1665,7 +1663,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
- (void) io_run_local_work_locked(ctx);
+ (void) io_run_local_work_locked(ctx, min);
if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
@@ -1895,14 +1893,15 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
- if (ret != IOU_ISSUE_SKIP_COMPLETE)
- return ret;
-
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
- io_iopoll_req_issued(req, issue_flags);
+ if (ret == IOU_ISSUE_SKIP_COMPLETE) {
+ ret = 0;
+ io_arm_ltimeout(req);
- return 0;
+ /* If the op doesn't have a file, we're not polling for it */
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+ io_iopoll_req_issued(req, issue_flags);
+ }
+ return ret;
}
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
@@ -1953,6 +1952,29 @@ fail:
goto fail;
}
+ /*
+ * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the
+ * submitter task context. Final request completions are handed to the
+ * right context, however this is not the case of auxiliary CQEs,
+ * which is the main mean of operation for multishot requests.
+ * Don't allow any multishot execution from io-wq. It's more restrictive
+ * than necessary and also cleaner.
+ */
+ if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ err = -EBADFD;
+ if (!file_can_poll(req->file))
+ goto fail;
+ if (req->file->f_flags & O_NONBLOCK ||
+ req->file->f_mode & FMODE_NOWAIT) {
+ err = -ECANCELED;
+ if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
+ goto fail;
+ return;
+ } else {
+ req->flags &= ~REQ_F_APOLL_MULTISHOT;
+ }
+ }
+
if (req->flags & REQ_F_FORCE_ASYNC) {
bool opcode_poll = def->pollin || def->pollout;
@@ -2013,9 +2035,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
slot = io_fixed_file_slot(&ctx->file_table, fd);
- file = io_slot_file(slot);
+ if (!req->rsrc_node)
+ __io_req_set_rsrc_node(req, ctx);
req->flags |= io_slot_flags(slot);
- io_req_set_rsrc_node(req, ctx, 0);
+ file = io_slot_file(slot);
out:
io_ring_submit_unlock(ctx, issue_flags);
return file;
@@ -2073,9 +2096,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret))
- io_arm_ltimeout(req);
- else
+ if (unlikely(ret))
io_queue_async(req, ret);
}
@@ -2147,6 +2168,13 @@ static void io_init_req_drain(struct io_kiocb *req)
}
}
+static __cold int io_init_fail_req(struct io_kiocb *req, int err)
+{
+ /* ensure per-opcode data is cleared if we fail before prep */
+ memset(&req->cmd.data, 0, sizeof(req->cmd.data));
+ return err;
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
@@ -2167,29 +2195,29 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
}
def = &io_issue_defs[opcode];
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (sqe_flags & IOSQE_BUFFER_SELECT) {
if (!def->buffer_select)
- return -EOPNOTSUPP;
+ return io_init_fail_req(req, -EOPNOTSUPP);
req->buf_index = READ_ONCE(sqe->buf_group);
}
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
if (sqe_flags & IOSQE_IO_DRAIN) {
if (ctx->drain_disabled)
- return -EOPNOTSUPP;
+ return io_init_fail_req(req, -EOPNOTSUPP);
io_init_req_drain(req);
}
}
if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ return io_init_fail_req(req, -EACCES);
/* knock it to the slow queue path, will be drained there */
if (ctx->drain_active)
req->flags |= REQ_F_FORCE_ASYNC;
@@ -2202,9 +2230,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
if (!def->ioprio && sqe->ioprio)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (def->needs_file) {
struct io_submit_state *state = &ctx->submit_state;
@@ -2228,12 +2256,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
get_cred(req->creds);
ret = security_uring_override_creds(req->creds);
if (ret) {
put_cred(req->creds);
- return ret;
+ return io_init_fail_req(req, ret);
}
req->flags |= REQ_F_CREDS;
}
@@ -2508,7 +2536,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
if (!llist_empty(&ctx->work_llist)) {
__set_current_state(TASK_RUNNING);
- if (io_run_local_work(ctx) > 0)
+ if (io_run_local_work(ctx, INT_MAX) > 0)
return 0;
}
if (io_run_task_work() > 0)
@@ -2531,7 +2559,7 @@ static bool current_pending_io(void)
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
- int io_wait, ret;
+ int ret;
if (unlikely(READ_ONCE(ctx->check_cq)))
return 1;
@@ -2549,7 +2577,6 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
- io_wait = current->in_iowait;
if (current_pending_io())
current->in_iowait = 1;
ret = 0;
@@ -2557,7 +2584,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
ret = -ETIME;
- current->in_iowait = io_wait;
+ current->in_iowait = 0;
return ret;
}
@@ -2576,26 +2603,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, min_events);
io_run_task_work();
io_cqring_overflow_flush(ctx);
/* if user messes with these they will just get an early return */
if (__io_cqring_events_user(ctx) >= min_events)
return 0;
- if (sig) {
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- sigsz);
- else
-#endif
- ret = set_user_sigmask(sig, sigsz);
-
- if (ret)
- return ret;
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -2612,13 +2626,25 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
}
+ if (sig) {
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+ sigsz);
+ else
+#endif
+ ret = set_user_sigmask(sig, sigsz);
+
+ if (ret)
+ return ret;
+ }
+
trace_io_uring_cqring_wait(ctx, min_events);
do {
+ int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
unsigned long check_cq;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
- int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
-
atomic_set(&ctx->cq_wait_nr, nr_wait);
set_current_state(TASK_INTERRUPTIBLE);
} else {
@@ -2628,7 +2654,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
ret = io_cqring_wait_schedule(ctx, &iowq);
__set_current_state(TASK_RUNNING);
- atomic_set(&ctx->cq_wait_nr, 0);
+ atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
/*
* Run task_work after scheduling and before io_should_wake().
@@ -2637,7 +2663,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
*/
io_run_task_work();
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, nr_wait);
/*
* Non-local task_work will be run on exit to userspace, but
@@ -2708,7 +2734,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
- int ret, i;
+ int ret, i, pinned;
*npages = 0;
@@ -2722,12 +2748,12 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
if (!page_array)
return ERR_PTR(-ENOMEM);
- ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- page_array);
- if (ret != nr_pages) {
-err:
- io_pages_free(&page_array, ret > 0 ? ret : 0);
- return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+
+ pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ page_array);
+ if (pinned != nr_pages) {
+ ret = (pinned < 0) ? pinned : -EFAULT;
+ goto free_pages;
}
page_addr = page_address(page_array[0]);
@@ -2741,7 +2767,7 @@ err:
* didn't support this feature.
*/
if (PageHighMem(page_array[i]))
- goto err;
+ goto free_pages;
/*
* No support for discontig pages for now, should either be a
@@ -2750,13 +2776,17 @@ err:
* just fail them with EINVAL.
*/
if (page_address(page_array[i]) != page_addr)
- goto err;
+ goto free_pages;
page_addr += PAGE_SIZE;
}
*pages = page_array;
*npages = nr_pages;
return page_to_virt(page_array[0]);
+
+free_pages:
+ io_pages_free(&page_array, pinned > 0 ? pinned : 0);
+ return ERR_PTR(ret);
}
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
@@ -2778,14 +2808,15 @@ static void io_rings_free(struct io_ring_ctx *ctx)
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
- ctx->rings = NULL;
- ctx->sq_sqes = NULL;
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
ctx->n_sqe_pages = 0;
}
+
+ ctx->rings = NULL;
+ ctx->sq_sqes = NULL;
}
void *io_mem_alloc(size_t size)
@@ -2838,61 +2869,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return off;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
-
- spin_lock(&ctx->completion_lock);
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
-
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- atomic_set(&ev_fd->refs, 1);
- atomic_set(&ev_fd->ops, 0);
- return 0;
-}
-
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
- call_rcu(&ev_fd->rcu, io_eventfd_ops);
- return 0;
- }
-
- return -ENXIO;
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
@@ -2932,6 +2908,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+ io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
@@ -2944,13 +2921,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_rsrc_node_destroy(ctx, ctx->rsrc_node);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
-
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- ctx->ring_sock->file = NULL; /* so that iput() is called */
- sock_release(ctx->ring_sock);
- }
-#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
@@ -2968,7 +2938,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
@@ -2990,7 +2959,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
percpu_ref_put(&ctx->refs);
}
-static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
+__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
@@ -3049,19 +3018,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
return mask;
}
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- const struct cred *creds;
-
- creds = xa_erase(&ctx->personalities, id);
- if (creds) {
- put_cred(creds);
- return 0;
- }
-
- return -EINVAL;
-}
-
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
@@ -3216,7 +3172,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
* noise and overhead, there's no discernable change in runtime
* over using system_wq.
*/
- queue_work(system_unbound_wq, &ctx->exit_work);
+ queue_work(iou_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
@@ -3292,6 +3248,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
+static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool ret = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+ hash_node) {
+ struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+ struct io_uring_cmd);
+ struct file *file = req->file;
+
+ if (!cancel_all && req->task != task)
+ continue;
+
+ if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+ /* ->sqe isn't available if no async data */
+ if (!req_has_async_data(req))
+ cmd->sqe = NULL;
+ file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+ ret = true;
+ }
+ }
+ io_submit_flush_completions(ctx);
+
+ return ret;
+}
+
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
@@ -3335,10 +3322,13 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
- ret |= io_run_local_work(ctx) > 0;
+ ret |= io_run_local_work(ctx, INT_MAX) > 0;
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
+ ret |= io_waitid_remove_all(ctx, task, cancel_all);
+ ret |= io_futex_remove_all(ctx, task, cancel_all);
+ ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
@@ -3464,14 +3454,15 @@ static void *io_uring_validate_mmap_request(struct file *file,
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
+ struct io_buffer_list *bl;
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- rcu_read_lock();
- ptr = io_pbuf_get_address(ctx, bgid);
- rcu_read_unlock();
- if (!ptr)
- return ERR_PTR(-EINVAL);
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return bl;
+ ptr = bl->buf_ring;
+ io_put_bl(ctx, bl);
break;
}
default:
@@ -3694,7 +3685,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
* it should handle ownership problems if any.
*/
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
- (void)io_run_local_work_locked(ctx);
+ (void)io_run_local_work_locked(ctx, min_complete);
}
mutex_unlock(&ctx->uring_lock);
}
@@ -3838,32 +3829,13 @@ static int io_uring_install_fd(struct file *file)
/*
* Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this
- * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
- * we have to tie this fd to a socket for file garbage collection purposes.
+ * fd to gain access to the SQ/CQ ring details.
*/
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
- struct file *file;
-#if defined(CONFIG_UNIX)
- int ret;
-
- ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
- &ctx->ring_sock);
- if (ret)
- return ERR_PTR(ret);
-#endif
-
- file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL);
-#if defined(CONFIG_UNIX)
- if (IS_ERR(file)) {
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
- } else {
- ctx->ring_sock->file = file;
- }
-#endif
- return file;
}
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
@@ -4130,506 +4102,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_probe *p;
- size_t size;
- int i, ret;
-
- size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
- ret = -EINVAL;
- if (memchr_inv(p, 0, size))
- goto out;
-
- p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
-
- for (i = 0; i < nr_args; i++) {
- p->ops[i].op = i;
- if (!io_issue_defs[i].not_supported)
- p->ops[i].flags = IO_URING_OP_SUPPORTED;
- }
- p->ops_len = i;
-
- ret = 0;
- if (copy_to_user(arg, p, size))
- ret = -EFAULT;
-out:
- kfree(p);
- return ret;
-}
-
-static int io_register_personality(struct io_ring_ctx *ctx)
-{
- const struct cred *creds;
- u32 id;
- int ret;
-
- creds = get_current_cred();
-
- ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
- XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (ret < 0) {
- put_cred(creds);
- return ret;
- }
- return id;
-}
-
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
-{
- struct io_uring_restriction *res;
- size_t size;
- int i, ret;
-
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
- if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
- return -EINVAL;
-
- size = array_size(nr_args, sizeof(*res));
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- res = memdup_user(arg, size);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- ret = 0;
-
- for (i = 0; i < nr_args; i++) {
- switch (res[i].opcode) {
- case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
- break;
- case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
- break;
- case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
- break;
- case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
-
-out:
- /* Reset all restrictions if an error happened */
- if (ret != 0)
- memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
- else
- ctx->restrictions.registered = true;
-
- kfree(res);
- return ret;
-}
-
-static int io_register_enable_rings(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
- WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
- /*
- * Lazy activation attempts would fail if it was polled before
- * submitter_task is set.
- */
- if (wq_has_sleeper(&ctx->poll_wq))
- io_activate_pollwq(ctx);
- }
-
- if (ctx->restrictions.registered)
- ctx->restricted = 1;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- return 0;
-}
-
-static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
- cpumask_var_t new_mask)
-{
- int ret;
-
- if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_wq_cpu_affinity(current->io_uring, new_mask);
- } else {
- mutex_unlock(&ctx->uring_lock);
- ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
- mutex_lock(&ctx->uring_lock);
- }
-
- return ret;
-}
-
-static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
- void __user *arg, unsigned len)
-{
- cpumask_var_t new_mask;
- int ret;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_clear(new_mask);
- if (len > cpumask_size())
- len = cpumask_size();
-
- if (in_compat_syscall()) {
- ret = compat_get_bitmap(cpumask_bits(new_mask),
- (const compat_ulong_t __user *)arg,
- len * 8 /* CHAR_BIT */);
- } else {
- ret = copy_from_user(new_mask, arg, len);
- }
-
- if (ret) {
- free_cpumask_var(new_mask);
- return -EFAULT;
- }
-
- ret = __io_register_iowq_aff(ctx, new_mask);
- free_cpumask_var(new_mask);
- return ret;
-}
-
-static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
-{
- return __io_register_iowq_aff(ctx, NULL);
-}
-
-static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
- __must_hold(&ctx->uring_lock)
-{
- struct io_tctx_node *node;
- struct io_uring_task *tctx = NULL;
- struct io_sq_data *sqd = NULL;
- __u32 new_count[2];
- int i, ret;
-
- if (copy_from_user(new_count, arg, sizeof(new_count)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i] > INT_MAX)
- return -EINVAL;
-
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- sqd = ctx->sq_data;
- if (sqd) {
- /*
- * Observe the correct sqd->lock -> ctx->uring_lock
- * ordering. Fine to drop uring_lock here, we hold
- * a ref to the ctx.
- */
- refcount_inc(&sqd->refs);
- mutex_unlock(&ctx->uring_lock);
- mutex_lock(&sqd->lock);
- mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
- }
- } else {
- tctx = current->io_uring;
- }
-
- BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i])
- ctx->iowq_limits[i] = new_count[i];
- ctx->iowq_limits_set = true;
-
- if (tctx && tctx->io_wq) {
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
- } else {
- memset(new_count, 0, sizeof(new_count));
- }
-
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
-
- if (copy_to_user(arg, new_count, sizeof(new_count)))
- return -EFAULT;
-
- /* that's it for SQPOLL, only the SQPOLL task creates requests */
- if (sqd)
- return 0;
-
- /* now propagate the restriction to all registered users */
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- if (WARN_ON_ONCE(!tctx->io_wq))
- continue;
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- new_count[i] = ctx->iowq_limits[i];
- /* ignore errors, it always returns zero anyway */
- (void)io_wq_max_workers(tctx->io_wq, new_count);
- }
- return 0;
-err:
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
- return ret;
-}
-
-static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
- void __user *arg, unsigned nr_args)
- __releases(ctx->uring_lock)
- __acquires(ctx->uring_lock)
-{
- int ret;
-
- /*
- * We don't quiesce the refs for register anymore and so it can't be
- * dying as we're holding a file ref here.
- */
- if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
- return -ENXIO;
-
- if (ctx->submitter_task && ctx->submitter_task != current)
- return -EEXIST;
-
- if (ctx->restricted) {
- opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
- if (!test_bit(opcode, ctx->restrictions.register_op))
- return -EACCES;
- }
-
- switch (opcode) {
- case IORING_REGISTER_BUFFERS:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_BUFFERS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_buffers_unregister(ctx);
- break;
- case IORING_REGISTER_FILES:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_FILES:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_files_unregister(ctx);
- break;
- case IORING_REGISTER_FILES_UPDATE:
- ret = io_register_files_update(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_EVENTFD:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 0);
- break;
- case IORING_REGISTER_EVENTFD_ASYNC:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 1);
- break;
- case IORING_UNREGISTER_EVENTFD:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_eventfd_unregister(ctx);
- break;
- case IORING_REGISTER_PROBE:
- ret = -EINVAL;
- if (!arg || nr_args > 256)
- break;
- ret = io_probe(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_personality(ctx);
- break;
- case IORING_UNREGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg)
- break;
- ret = io_unregister_personality(ctx, nr_args);
- break;
- case IORING_REGISTER_ENABLE_RINGS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_enable_rings(ctx);
- break;
- case IORING_REGISTER_RESTRICTIONS:
- ret = io_register_restrictions(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_FILES2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_FILES_UPDATE2:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_BUFFERS2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_BUFFERS_UPDATE:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (!arg || !nr_args)
- break;
- ret = io_register_iowq_aff(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_unregister_iowq_aff(ctx);
- break;
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- ret = -EINVAL;
- if (!arg || nr_args != 2)
- break;
- ret = io_register_iowq_max_workers(ctx, arg);
- break;
- case IORING_REGISTER_RING_FDS:
- ret = io_ringfd_register(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_RING_FDS:
- ret = io_ringfd_unregister(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_register_pbuf_ring(ctx, arg);
- break;
- case IORING_UNREGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_unregister_pbuf_ring(ctx, arg);
- break;
- case IORING_REGISTER_SYNC_CANCEL:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_sync_cancel(ctx, arg);
- break;
- case IORING_REGISTER_FILE_ALLOC_RANGE:
- ret = -EINVAL;
- if (!arg || nr_args)
- break;
- ret = io_register_file_alloc_range(ctx, arg);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
- void __user *, arg, unsigned int, nr_args)
-{
- struct io_ring_ctx *ctx;
- long ret = -EBADF;
- struct file *file;
- bool use_registered_ring;
-
- use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
- opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
-
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
-
- if (use_registered_ring) {
- /*
- * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
- * need only dereference our task private array to find it.
- */
- struct io_uring_task *tctx = current->io_uring;
-
- if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
- return -EINVAL;
- fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- file = tctx->registered_rings[fd];
- if (unlikely(!file))
- return -EBADF;
- } else {
- file = fget(fd);
- if (unlikely(!file))
- return -EBADF;
- ret = -EOPNOTSUPP;
- if (!io_is_uring_fops(file))
- goto out_fput;
- }
-
- ctx = file->private_data;
-
- mutex_lock(&ctx->uring_lock);
- ret = __io_uring_register(ctx, opcode, arg, nr_args);
- mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
-out_fput:
- if (!use_registered_ring)
- fput(file);
- return ret;
-}
-
static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
@@ -4704,6 +4176,9 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+ /* top 8bits are for internal use */
+ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
io_uring_optable_init();
/*
@@ -4719,6 +4194,11 @@ static int __init io_uring_init(void)
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
offsetof(struct io_kiocb, cmd.data),
sizeof_field(struct io_kiocb, cmd.data), NULL);
+ io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
+
+ iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d2bad1df3..d5495710c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -16,21 +16,17 @@
#endif
enum {
- /*
- * A hint to not wake right away but delay until there are enough of
- * tw's queued to match the number of CQEs the task is waiting for.
- *
- * Must not be used wirh requests generating more than one CQE.
- * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
- */
- IOU_F_TWQ_LAZY_WAKE = 1,
-};
-
-enum {
IOU_OK = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
/*
+ * Requeue the task_work to restart operations on this request. The
+ * actual value isn't important, should just be not an otherwise
+ * valid error code, yet less than -MAX_ERRNO and valid internally.
+ */
+ IOU_REQUEUE = -3072,
+
+ /*
* Intended only when both IO_URING_F_MULTISHOT is passed
* to indicate to the poll runner that multishot should be
* removed and the result is set on req->cqe.res.
@@ -54,7 +50,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
-bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
@@ -89,6 +84,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr);
+enum {
+ IO_EVENTFD_OP_SIGNAL_BIT,
+ IO_EVENTFD_OP_FREE_BIT,
+};
+
+void io_eventfd_ops(struct rcu_head *rcu);
+void io_activate_pollwq(struct io_ring_ctx *ctx);
+
#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
@@ -346,6 +349,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
}
extern struct kmem_cache *req_cachep;
+extern struct kmem_cache *io_buf_cachep;
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index e8516f3bb..b2c4f8293 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -17,11 +17,11 @@
#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
-#define BGID_ARRAY 64
-
/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)
+struct kmem_cache *io_buf_cachep;
+
struct io_provide_buf {
struct file *file;
__u64 addr;
@@ -31,16 +31,6 @@ struct io_provide_buf {
__u16 bid;
};
-static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl,
- unsigned int bgid)
-{
- if (bl && bgid < BGID_ARRAY)
- return &bl[bgid];
-
- return xa_load(&ctx->io_bl_xa, bgid);
-}
-
struct io_buf_free {
struct hlist_node list;
void *mem;
@@ -48,12 +38,18 @@ struct io_buf_free {
int inuse;
};
+static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
+{
+ return xa_load(&ctx->io_bl_xa, bgid);
+}
+
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
lockdep_assert_held(&ctx->uring_lock);
- return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
+ return __io_buffer_get_list(ctx, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -65,15 +61,11 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
* always under the ->uring_lock, but the RCU lookup from mmap does.
*/
bl->bgid = bgid;
- smp_store_release(&bl->is_ready, 1);
-
- if (bgid < BGID_ARRAY)
- return 0;
-
+ atomic_set(&bl->refs, 1);
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
-void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
+bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
@@ -86,7 +78,7 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
* multiple use.
*/
if (req->flags & REQ_F_PARTIAL_IO)
- return;
+ return false;
io_ring_submit_lock(ctx, issue_flags);
@@ -97,7 +89,7 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
- return;
+ return true;
}
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
@@ -215,24 +207,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
return ret;
}
-static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
-{
- struct io_buffer_list *bl;
- int i;
-
- bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
- if (!bl)
- return -ENOMEM;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- INIT_LIST_HEAD(&bl[i].buf_list);
- bl[i].bgid = i;
- }
-
- smp_store_release(&ctx->io_bl, bl);
- return 0;
-}
-
/*
* Mark the given mapped range as free for reuse
*/
@@ -301,30 +275,37 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
return i;
}
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+ if (atomic_dec_and_test(&bl->refs)) {
+ __io_remove_buffers(ctx, bl, -1U);
+ kfree_rcu(bl, rcu);
+ }
+}
+
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
+ struct list_head *item, *tmp;
+ struct io_buffer *buf;
unsigned long index;
- int i;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- if (!ctx->io_bl)
- break;
- __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
- }
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
- __io_remove_buffers(ctx, bl, -1U);
- kfree_rcu(bl, rcu);
+ io_put_bl(ctx, bl);
}
- while (!list_empty(&ctx->io_buffers_pages)) {
- struct page *page;
-
- page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
- list_del_init(&page->lru);
- __free_page(page);
+ /*
+ * Move deferred locked entries to cache before pruning
+ */
+ spin_lock(&ctx->completion_lock);
+ if (!list_empty(&ctx->io_buffers_comp))
+ list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
+ spin_unlock(&ctx->completion_lock);
+
+ list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+ buf = list_entry(item, struct io_buffer, list);
+ kmem_cache_free(io_buf_cachep, buf);
}
}
@@ -407,11 +388,12 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return 0;
}
+#define IO_BUFFER_ALLOC_BATCH 64
+
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- struct page *page;
- int bufs_in_page;
+ struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+ int allocated;
/*
* Completions that don't happen inline (eg not under uring_lock) will
@@ -431,22 +413,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
/*
* No free buffers and no completion entries either. Allocate a new
- * page worth of buffer entries and add those to our freelist.
+ * batch of buffer entries and add those to our freelist.
*/
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return -ENOMEM;
-
- list_add(&page->lru, &ctx->io_buffers_pages);
-
- buf = page_address(page);
- bufs_in_page = PAGE_SIZE / sizeof(*buf);
- while (bufs_in_page) {
- list_add_tail(&buf->list, &ctx->io_buffers_cache);
- buf++;
- bufs_in_page--;
+
+ allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+ ARRAY_SIZE(bufs), (void **) bufs);
+ if (unlikely(!allocated)) {
+ /*
+ * Bulk alloc is all-or-nothing. If we fail to get a batch,
+ * retry single alloc to be on the safe side.
+ */
+ bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+ if (!bufs[0])
+ return -ENOMEM;
+ allocated = 1;
}
+ while (allocated)
+ list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
+
return 0;
}
@@ -485,12 +470,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
io_ring_submit_lock(ctx, issue_flags);
- if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
- ret = io_init_bl_list(ctx);
- if (ret)
- goto err;
- }
-
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
@@ -503,14 +482,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (ret) {
/*
* Doesn't need rcu free as it was never visible, but
- * let's keep it consistent throughout. Also can't
- * be a lower indexed array group, as adding one
- * where lookup failed cannot happen.
+ * let's keep it consistent throughout.
*/
- if (p->bgid >= BGID_ARRAY)
- kfree_rcu(bl, rcu);
- else
- WARN_ON_ONCE(1);
+ kfree_rcu(bl, rcu);
goto err;
}
}
@@ -675,12 +649,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.ring_entries >= 65536)
return -EINVAL;
- if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
- int ret = io_init_bl_list(ctx);
- if (ret)
- return ret;
- }
-
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
@@ -729,31 +697,66 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!bl->is_mapped)
return -EINVAL;
- __io_remove_buffers(ctx, bl, -1U);
- if (bl->bgid >= BGID_ARRAY) {
- xa_erase(&ctx->io_bl_xa, bl->bgid);
- kfree_rcu(bl, rcu);
- }
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ io_put_bl(ctx, bl);
return 0;
}
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
+ struct io_uring_buf_status buf_status;
struct io_buffer_list *bl;
+ int i;
- bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
+ if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
+ return -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
+ if (buf_status.resv[i])
+ return -EINVAL;
+
+ bl = io_buffer_get_list(ctx, buf_status.buf_group);
+ if (!bl)
+ return -ENOENT;
+ if (!bl->is_mapped)
+ return -EINVAL;
+
+ buf_status.head = bl->head;
+ if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
+ return -EFAULT;
+
+ return 0;
+}
+
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+ unsigned long bgid)
+{
+ struct io_buffer_list *bl;
+ bool ret;
- if (!bl || !bl->is_mmap)
- return NULL;
/*
- * Ensure the list is fully setup. Only strictly needed for RCU lookup
- * via mmap, and in that case only for the array indexed groups. For
- * the xarray lookups, it's either visible and ready, or not at all.
+ * We have to be a bit careful here - we're inside mmap and cannot grab
+ * the uring_lock. This means the buffer_list could be simultaneously
+ * going away, if someone is trying to be sneaky. Look it up under rcu
+ * so we know it's not going away, and attempt to grab a reference to
+ * it. If the ref is already zero, then fail the mapping. If successful,
+ * the caller will call io_put_bl() to drop the the reference at at the
+ * end. This may then safely free the buffer_list (and drop the pages)
+ * at that point, vm_insert_pages() would've already grabbed the
+ * necessary vma references.
*/
- if (!smp_load_acquire(&bl->is_ready))
- return NULL;
-
- return bl->buf_ring;
+ rcu_read_lock();
+ bl = xa_load(&ctx->io_bl_xa, bgid);
+ /* must be a mmap'able buffer ring and have pages */
+ ret = false;
+ if (bl && bl->is_mmap)
+ ret = atomic_inc_not_zero(&bl->refs);
+ rcu_read_unlock();
+
+ if (ret)
+ return bl;
+
+ return ERR_PTR(-EINVAL);
}
/*
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 3d0cb6b8c..038091a00 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -25,12 +25,12 @@ struct io_buffer_list {
__u16 head;
__u16 mask;
+ atomic_t refs;
+
/* ring mapped provided buffers */
__u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
- /* bl is visible from an RCU point of view for lookup */
- __u8 is_ready;
};
struct io_buffer {
@@ -53,16 +53,19 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
-void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
+bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+ unsigned long bgid);
-static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
+static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{
/*
* We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
@@ -85,8 +88,10 @@ static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
} else {
req->buf_index = req->buf_list->bgid;
req->flags &= ~REQ_F_BUFFER_RING;
+ return true;
}
}
+ return false;
}
static inline bool io_do_buffer_select(struct io_kiocb *req)
@@ -96,12 +101,13 @@ static inline bool io_do_buffer_select(struct io_kiocb *req)
return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
}
-static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
+static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
if (req->flags & REQ_F_BUFFER_SELECTED)
- io_kbuf_recycle_legacy(req, issue_flags);
+ return io_kbuf_recycle_legacy(req, issue_flags);
if (req->flags & REQ_F_BUFFER_RING)
- io_kbuf_recycle_ring(req);
+ return io_kbuf_recycle_ring(req);
+ return false;
}
static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
diff --git a/io_uring/net.c b/io_uring/net.c
index 75d494dad..46ea09e1e 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -60,6 +60,7 @@ struct io_sr_msg {
unsigned len;
unsigned done_io;
unsigned msg_flags;
+ unsigned nr_multishot_loops;
u16 flags;
/* initialised and used only by !msg send variants */
u16 addr_len;
@@ -70,18 +71,12 @@ struct io_sr_msg {
struct io_kiocb *notif;
};
-static inline bool io_check_multishot(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- /*
- * When ->locked_cq is set we only allow to post CQEs from the original
- * task context. Usual request completions will be handled in other
- * generic paths but multipoll may decide to post extra cqes.
- */
- return !(issue_flags & IO_URING_F_IOWQ) ||
- !(issue_flags & IO_URING_F_MULTISHOT) ||
- !req->ctx->task_complete;
-}
+/*
+ * Number of times we'll try and do receives if there's more data. If we
+ * exceed this limit, then add us to the back of the queue and retry from
+ * there. This helps fairness between flooding clients.
+ */
+#define MULTISHOT_MAX_RETRY 32
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -196,16 +191,115 @@ static int io_setup_async_msg(struct io_kiocb *req,
return -EAGAIN;
}
+#ifdef CONFIG_COMPAT
+static int io_compat_msg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg,
+ struct compat_msghdr *msg, int ddir)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct compat_iovec __user *uiov;
+ int ret;
+
+ if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
+ return -EFAULT;
+
+ uiov = compat_ptr(msg->msg_iov);
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ compat_ssize_t clen;
+
+ iomsg->free_iov = NULL;
+ if (msg->msg_iovlen == 0) {
+ sr->len = 0;
+ } else if (msg->msg_iovlen > 1) {
+ return -EINVAL;
+ } else {
+ if (!access_ok(uiov, sizeof(*uiov)))
+ return -EFAULT;
+ if (__get_user(clen, &uiov->iov_len))
+ return -EFAULT;
+ if (clen < 0)
+ return -EINVAL;
+ sr->len = clen;
+ }
+
+ return 0;
+ }
+
+ iomsg->free_iov = iomsg->fast_iov;
+ ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
+ UIO_FASTIOV, &iomsg->free_iov,
+ &iomsg->msg.msg_iter, true);
+ if (unlikely(ret < 0))
+ return ret;
+
+ return 0;
+}
+#endif
+
+static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
+ struct user_msghdr *msg, int ddir)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ int ret;
+
+ if (copy_from_user(msg, sr->umsg, sizeof(*sr->umsg)))
+ return -EFAULT;
+
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ if (msg->msg_iovlen == 0) {
+ sr->len = iomsg->fast_iov[0].iov_len = 0;
+ iomsg->fast_iov[0].iov_base = NULL;
+ iomsg->free_iov = NULL;
+ } else if (msg->msg_iovlen > 1) {
+ return -EINVAL;
+ } else {
+ if (copy_from_user(iomsg->fast_iov, msg->msg_iov,
+ sizeof(*msg->msg_iov)))
+ return -EFAULT;
+ sr->len = iomsg->fast_iov[0].iov_len;
+ iomsg->free_iov = NULL;
+ }
+
+ return 0;
+ }
+
+ iomsg->free_iov = iomsg->fast_iov;
+ ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV,
+ &iomsg->free_iov, &iomsg->msg.msg_iter, false);
+ if (unlikely(ret < 0))
+ return ret;
+
+ return 0;
+}
+
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct user_msghdr msg;
int ret;
iomsg->msg.msg_name = &iomsg->addr;
- iomsg->free_iov = iomsg->fast_iov;
- ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags,
- &iomsg->free_iov);
+ iomsg->msg.msg_iter.nr_segs = 0;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(req->ctx->compat)) {
+ struct compat_msghdr cmsg;
+
+ ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
+ if (unlikely(ret))
+ return ret;
+
+ return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
+ }
+#endif
+
+ ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
+ if (unlikely(ret))
+ return ret;
+
+ ret = __copy_msghdr(&iomsg->msg, &msg, NULL);
+
/* save msg_control as sys_sendmsg() overwrites it */
sr->msg_control = iomsg->msg.msg_control_user;
return ret;
@@ -427,142 +521,77 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
-static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
+static int io_recvmsg_mshot_prep(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg,
+ int namelen, size_t controllen)
{
- int hdr;
-
- if (iomsg->namelen < 0)
- return true;
- if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
- iomsg->namelen, &hdr))
- return true;
- if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
- return true;
+ if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
+ (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
+ int hdr;
+
+ if (unlikely(namelen < 0))
+ return -EOVERFLOW;
+ if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
+ namelen, &hdr))
+ return -EOVERFLOW;
+ if (check_add_overflow(hdr, controllen, &hdr))
+ return -EOVERFLOW;
+
+ iomsg->namelen = namelen;
+ iomsg->controllen = controllen;
+ return 0;
+ }
- return false;
+ return 0;
}
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
+static int io_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct user_msghdr msg;
int ret;
- if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
- return -EFAULT;
-
- ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
- if (ret)
- return ret;
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- if (msg.msg_iovlen == 0) {
- sr->len = iomsg->fast_iov[0].iov_len = 0;
- iomsg->fast_iov[0].iov_base = NULL;
- iomsg->free_iov = NULL;
- } else if (msg.msg_iovlen > 1) {
- return -EINVAL;
- } else {
- if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov)))
- return -EFAULT;
- sr->len = iomsg->fast_iov[0].iov_len;
- iomsg->free_iov = NULL;
- }
-
- if (req->flags & REQ_F_APOLL_MULTISHOT) {
- iomsg->namelen = msg.msg_namelen;
- iomsg->controllen = msg.msg_controllen;
- if (io_recvmsg_multishot_overflow(iomsg))
- return -EOVERFLOW;
- }
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
- &iomsg->free_iov, &iomsg->msg.msg_iter,
- false);
- if (ret > 0)
- ret = 0;
- }
-
- return ret;
-}
+ iomsg->msg.msg_name = &iomsg->addr;
+ iomsg->msg.msg_iter.nr_segs = 0;
#ifdef CONFIG_COMPAT
-static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct compat_msghdr msg;
- struct compat_iovec __user *uiov;
- int ret;
-
- if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
- return -EFAULT;
-
- ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
- if (ret)
- return ret;
-
- uiov = compat_ptr(msg.msg_iov);
- if (req->flags & REQ_F_BUFFER_SELECT) {
- compat_ssize_t clen;
+ if (unlikely(req->ctx->compat)) {
+ struct compat_msghdr cmsg;
- iomsg->free_iov = NULL;
- if (msg.msg_iovlen == 0) {
- sr->len = 0;
- } else if (msg.msg_iovlen > 1) {
- return -EINVAL;
- } else {
- if (!access_ok(uiov, sizeof(*uiov)))
- return -EFAULT;
- if (__get_user(clen, &uiov->iov_len))
- return -EFAULT;
- if (clen < 0)
- return -EINVAL;
- sr->len = clen;
- }
+ ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
+ if (unlikely(ret))
+ return ret;
- if (req->flags & REQ_F_APOLL_MULTISHOT) {
- iomsg->namelen = msg.msg_namelen;
- iomsg->controllen = msg.msg_controllen;
- if (io_recvmsg_multishot_overflow(iomsg))
- return -EOVERFLOW;
- }
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen,
- UIO_FASTIOV, &iomsg->free_iov,
- &iomsg->msg.msg_iter, true);
- if (ret < 0)
+ ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
+ if (unlikely(ret))
return ret;
- }
- return 0;
-}
+ return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
+ cmsg.msg_controllen);
+ }
#endif
-static int io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->msg.msg_iter.nr_segs = 0;
+ ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
+ if (unlikely(ret))
+ return ret;
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return __io_compat_recvmsg_copy_hdr(req, iomsg);
-#endif
+ ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
+ if (unlikely(ret))
+ return ret;
- return __io_recvmsg_copy_hdr(req, iomsg);
+ return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
+ msg.msg_controllen);
}
int io_recvmsg_prep_async(struct io_kiocb *req)
{
+ struct io_async_msghdr *iomsg;
int ret;
if (!io_msg_alloc_async_prep(req))
return -ENOMEM;
- ret = io_recvmsg_copy_hdr(req, req->async_data);
+ iomsg = req->async_data;
+ ret = io_recvmsg_copy_hdr(req, iomsg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
@@ -611,6 +640,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
sr->done_io = 0;
+ sr->nr_multishot_loops = 0;
return 0;
}
@@ -645,23 +675,35 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
return true;
}
- if (!mshot_finished) {
- if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
- *ret, cflags | IORING_CQE_F_MORE)) {
- io_recv_prep_retry(req);
- /* Known not-empty or unknown state, retry */
- if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
- msg->msg_inq == -1)
+ if (mshot_finished)
+ goto finish;
+
+ /*
+ * Fill CQE for this receive and see if we should keep trying to
+ * receive from this socket.
+ */
+ if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
+ *ret, cflags | IORING_CQE_F_MORE)) {
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
+
+ io_recv_prep_retry(req);
+ /* Known not-empty or unknown state, retry */
+ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
+ if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
return false;
- if (issue_flags & IO_URING_F_MULTISHOT)
- *ret = IOU_ISSUE_SKIP_COMPLETE;
- else
- *ret = -EAGAIN;
- return true;
+ /* mshot retries exceeded, force a requeue */
+ sr->nr_multishot_loops = 0;
+ mshot_retry_ret = IOU_REQUEUE;
}
- /* Otherwise stop multishot but use the current result. */
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ *ret = mshot_retry_ret;
+ else
+ *ret = -EAGAIN;
+ return true;
}
-
+ /* Otherwise stop multishot but use the current result. */
+finish:
io_req_set_res(req, *ret, cflags);
if (issue_flags & IO_URING_F_MULTISHOT)
@@ -782,9 +824,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return io_setup_async_msg(req, kmsg, issue_flags);
- if (!io_check_multishot(req, issue_flags))
- return io_setup_async_msg(req, kmsg, issue_flags);
-
retry_multishot:
if (io_do_buffer_select(req)) {
void __user *buf;
@@ -860,7 +899,8 @@ retry_multishot:
kfree(kmsg->free_iov);
io_netmsg_recycle(req, issue_flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
- }
+ } else if (ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg, issue_flags);
return ret;
}
@@ -879,9 +919,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
- if (!io_check_multishot(req, issue_flags))
- return -EAGAIN;
-
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -902,6 +939,7 @@ retry_multishot:
if (!buf)
return -ENOBUFS;
sr->buf = buf;
+ sr->len = len;
}
ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
@@ -1217,6 +1255,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
if (req_has_async_data(req)) {
kmsg = req->async_data;
+ kmsg->msg.msg_control_user = sr->msg_control;
} else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
@@ -1329,8 +1368,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
- if (!io_check_multishot(req, issue_flags))
- return -EAGAIN;
retry:
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
@@ -1350,7 +1387,7 @@ retry:
* has already been done
*/
if (issue_flags & IO_URING_F_MULTISHOT)
- ret = IOU_ISSUE_SKIP_COMPLETE;
+ return IOU_ISSUE_SKIP_COMPLETE;
return ret;
}
if (ret == -ERESTARTSYS)
@@ -1375,7 +1412,8 @@ retry:
ret, IORING_CQE_F_MORE))
goto retry;
- return -ECANCELED;
+ io_req_set_res(req, ret, 0);
+ return IOU_STOP_MULTISHOT;
}
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3b9c6489b..b1ee3a9c3 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,8 @@
#include "poll.h"
#include "cancel.h"
#include "rw.h"
+#include "waitid.h"
+#include "futex.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -63,7 +65,8 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw,
+ .vectored = 1,
+ .prep = io_prep_rwv,
.issue = io_read,
},
[IORING_OP_WRITEV] = {
@@ -76,7 +79,8 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw,
+ .vectored = 1,
+ .prep = io_prep_rwv,
.issue = io_write,
},
[IORING_OP_FSYNC] = {
@@ -94,7 +98,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw,
+ .prep = io_prep_rw_fixed,
.issue = io_read,
},
[IORING_OP_WRITE_FIXED] = {
@@ -107,7 +111,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw,
+ .prep = io_prep_rw_fixed,
.issue = io_write,
},
[IORING_OP_POLL_ADD] = {
@@ -428,9 +432,50 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_READ_MULTISHOT] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
+ .audit_skip = 1,
+ .prep = io_read_mshot_prep,
+ .issue = io_read_mshot,
+ },
+ [IORING_OP_WAITID] = {
+ .prep = io_waitid_prep,
+ .issue = io_waitid,
+ },
+ [IORING_OP_FUTEX_WAIT] = {
+#if defined(CONFIG_FUTEX)
+ .prep = io_futex_prep,
+ .issue = io_futex_wait,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_FUTEX_WAKE] = {
+#if defined(CONFIG_FUTEX)
+ .prep = io_futex_prep,
+ .issue = io_futex_wake,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_FUTEX_WAITV] = {
+#if defined(CONFIG_FUTEX)
+ .prep = io_futexv_prep,
+ .issue = io_futexv_wait,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_FIXED_FD_INSTALL] = {
+ .needs_file = 1,
+ .prep = io_install_fixed_fd_prep,
+ .issue = io_install_fixed_fd,
+ },
};
-
const struct io_cold_def io_cold_defs[] = {
[IORING_OP_NOP] = {
.name = "NOP",
@@ -648,6 +693,25 @@ const struct io_cold_def io_cold_defs[] = {
.fail = io_sendrecv_fail,
#endif
},
+ [IORING_OP_READ_MULTISHOT] = {
+ .name = "READ_MULTISHOT",
+ },
+ [IORING_OP_WAITID] = {
+ .name = "WAITID",
+ .async_size = sizeof(struct io_waitid_async),
+ },
+ [IORING_OP_FUTEX_WAIT] = {
+ .name = "FUTEX_WAIT",
+ },
+ [IORING_OP_FUTEX_WAKE] = {
+ .name = "FUTEX_WAKE",
+ },
+ [IORING_OP_FUTEX_WAITV] = {
+ .name = "FUTEX_WAITV",
+ },
+ [IORING_OP_FIXED_FD_INSTALL] = {
+ .name = "FIXED_FD_INSTALL",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index c22c8696e..9e5435ec2 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -29,6 +29,8 @@ struct io_issue_def {
unsigned iopoll_queue : 1;
/* opcode specific path will handle ->async_data allocation if needed */
unsigned manual_alloc : 1;
+ /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
+ unsigned vectored : 1;
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index e3fae26e0..e3357dfa1 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -31,6 +31,11 @@ struct io_close {
u32 file_slot;
};
+struct io_fixed_install {
+ struct file *file;
+ unsigned int o_flags;
+};
+
static bool io_openat_force_async(struct io_open *open)
{
/*
@@ -220,7 +225,6 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
struct files_struct *files = current->files;
struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
- struct fdtable *fdt;
struct file *file;
int ret = -EBADF;
@@ -230,13 +234,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
}
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (close->fd >= fdt->max_fds) {
- spin_unlock(&files->file_lock);
- goto err;
- }
- file = rcu_dereference_protected(fdt->fd[close->fd],
- lockdep_is_held(&files->file_lock));
+ file = files_lookup_fd_locked(files, close->fd);
if (!file || io_is_uring_fops(file)) {
spin_unlock(&files->file_lock);
goto err;
@@ -248,7 +246,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
}
- file = __close_fd_get_file(close->fd);
+ file = file_close_fd_locked(files, close->fd);
spin_unlock(&files->file_lock);
if (!file)
goto err;
@@ -261,3 +259,46 @@ err:
io_req_set_res(req, ret, 0);
return IOU_OK;
}
+
+int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_fixed_install *ifi;
+ unsigned int flags;
+
+ if (sqe->off || sqe->addr || sqe->len || sqe->buf_index ||
+ sqe->splice_fd_in || sqe->addr3)
+ return -EINVAL;
+
+ /* must be a fixed file */
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ flags = READ_ONCE(sqe->install_fd_flags);
+ if (flags & ~IORING_FIXED_FD_NO_CLOEXEC)
+ return -EINVAL;
+
+ /* ensure the task's creds are used when installing/receiving fds */
+ if (req->flags & REQ_F_CREDS)
+ return -EPERM;
+
+ /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */
+ ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
+ ifi->o_flags = O_CLOEXEC;
+ if (flags & IORING_FIXED_FD_NO_CLOEXEC)
+ ifi->o_flags = 0;
+
+ return 0;
+}
+
+int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_fixed_install *ifi;
+ int ret;
+
+ ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
+ ret = receive_fd(req->file, NULL, ifi->o_flags);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/openclose.h b/io_uring/openclose.h
index 4b1c28d3a..8a93c98ad 100644
--- a/io_uring/openclose.h
+++ b/io_uring/openclose.h
@@ -12,3 +12,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);
+
+int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 4c360ba87..c6f478962 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -226,8 +226,29 @@ enum {
IOU_POLL_NO_ACTION = 1,
IOU_POLL_REMOVE_POLL_USE_RES = 2,
IOU_POLL_REISSUE = 3,
+ IOU_POLL_REQUEUE = 4,
};
+static void __io_poll_execute(struct io_kiocb *req, int mask)
+{
+ unsigned flags = 0;
+
+ io_req_set_res(req, mask, 0);
+ req->io_task_work.func = io_poll_task_func;
+
+ trace_io_uring_task_add(req, mask);
+
+ if (!(req->flags & REQ_F_POLL_NO_LAZY))
+ flags = IOU_F_TWQ_LAZY_WAKE;
+ __io_req_task_work_add(req, flags);
+}
+
+static inline void io_poll_execute(struct io_kiocb *req, int res)
+{
+ if (io_poll_get_ownership(req))
+ __io_poll_execute(req, res);
+}
+
/*
* All poll tw should go through this. Checks for poll events, manages
* references, does rewait, etc.
@@ -309,6 +330,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
int ret = io_poll_issue(req, ts);
if (ret == IOU_STOP_MULTISHOT)
return IOU_POLL_REMOVE_POLL_USE_RES;
+ else if (ret == IOU_REQUEUE)
+ return IOU_POLL_REQUEUE;
if (ret < 0)
return ret;
}
@@ -331,8 +354,12 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
int ret;
ret = io_poll_check_events(req, ts);
- if (ret == IOU_POLL_NO_ACTION)
+ if (ret == IOU_POLL_NO_ACTION) {
return;
+ } else if (ret == IOU_POLL_REQUEUE) {
+ __io_poll_execute(req, 0);
+ return;
+ }
io_poll_remove_entries(req);
io_poll_tw_hash_eject(req, ts);
@@ -364,21 +391,6 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
}
}
-static void __io_poll_execute(struct io_kiocb *req, int mask)
-{
- io_req_set_res(req, mask, 0);
- req->io_task_work.func = io_poll_task_func;
-
- trace_io_uring_task_add(req, mask);
- io_req_task_work_add(req);
-}
-
-static inline void io_poll_execute(struct io_kiocb *req, int res)
-{
- if (io_poll_get_ownership(req))
- __io_poll_execute(req, res);
-}
-
static void io_poll_cancel_req(struct io_kiocb *req)
{
io_poll_mark_cancelled(req);
@@ -526,10 +538,11 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
poll->head = head;
poll->wait.private = (void *) wqe_private;
- if (poll->events & EPOLLEXCLUSIVE)
+ if (poll->events & EPOLLEXCLUSIVE) {
add_wait_queue_exclusive(head, &poll->wait);
- else
+ } else {
add_wait_queue(head, &poll->wait);
+ }
}
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
@@ -597,6 +610,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (issue_flags & IO_URING_F_UNLOCKED)
req->flags &= ~REQ_F_HASH_LOCKED;
+
+ /*
+ * Exclusive waits may only wake a limited amount of entries
+ * rather than all of them, this may interfere with lazy
+ * wake if someone does wait(events > 1). Ensure we don't do
+ * lazy wake for those, as we need to process each one as they
+ * come in.
+ */
+ if (poll->events & EPOLLEXCLUSIVE)
+ req->flags |= REQ_F_POLL_NO_LAZY;
+
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
if (unlikely(ipt->error || !ipt->nr_entries)) {
@@ -974,7 +998,6 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
struct io_hash_bucket *bucket;
struct io_kiocb *preq;
int ret2, ret = 0;
- struct io_tw_state ts = { .locked = true };
io_ring_submit_lock(ctx, issue_flags);
preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
@@ -1023,7 +1046,8 @@ found:
req_set_fail(preq);
io_req_set_res(preq, -ECANCELED, 0);
- io_req_task_complete(preq, &ts);
+ preq->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(preq);
out:
io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0) {
diff --git a/io_uring/poll.h b/io_uring/poll.h
index ff4d5d753..1dacae9e8 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -24,6 +24,15 @@ struct async_poll {
struct io_poll *double_poll;
};
+/*
+ * Must only be called inside issue_flags & IO_URING_F_MULTISHOT, or
+ * potentially other cases where we already "own" this poll request.
+ */
+static inline void io_poll_multishot_retry(struct io_kiocb *req)
+{
+ atomic_inc(&req->poll_refs);
+}
+
int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_poll_add(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/register.c b/io_uring/register.c
new file mode 100644
index 000000000..5e62c1208
--- /dev/null
+++ b/io_uring/register.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code related to the io_uring_register() syscall
+ *
+ * Copyright (C) 2023 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/refcount.h>
+#include <linux/bits.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+#include <linux/io_uring_types.h>
+
+#include "io_uring.h"
+#include "opdef.h"
+#include "tctx.h"
+#include "rsrc.h"
+#include "sqpoll.h"
+#include "register.h"
+#include "cancel.h"
+#include "kbuf.h"
+
+#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
+ IORING_REGISTER_LAST + IORING_OP_LAST)
+
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
+{
+ struct io_ev_fd *ev_fd;
+ __s32 __user *fds = arg;
+ int fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
+ return -EBUSY;
+
+ if (copy_from_user(&fd, fds, sizeof(*fds)))
+ return -EFAULT;
+
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
+
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+ return ret;
+ }
+
+ spin_lock(&ctx->completion_lock);
+ ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+ spin_unlock(&ctx->completion_lock);
+
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
+ atomic_set(&ev_fd->refs, 1);
+ atomic_set(&ev_fd->ops, 0);
+ return 0;
+}
+
+int io_eventfd_unregister(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
+ call_rcu(&ev_fd->rcu, io_eventfd_ops);
+ return 0;
+ }
+
+ return -ENXIO;
+}
+
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_probe *p;
+ size_t size;
+ int i, ret;
+
+ size = struct_size(p, ops, nr_args);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+ p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ ret = -EFAULT;
+ if (copy_from_user(p, arg, size))
+ goto out;
+ ret = -EINVAL;
+ if (memchr_inv(p, 0, size))
+ goto out;
+
+ p->last_op = IORING_OP_LAST - 1;
+ if (nr_args > IORING_OP_LAST)
+ nr_args = IORING_OP_LAST;
+
+ for (i = 0; i < nr_args; i++) {
+ p->ops[i].op = i;
+ if (!io_issue_defs[i].not_supported)
+ p->ops[i].flags = IO_URING_OP_SUPPORTED;
+ }
+ p->ops_len = i;
+
+ ret = 0;
+ if (copy_to_user(arg, p, size))
+ ret = -EFAULT;
+out:
+ kfree(p);
+ return ret;
+}
+
+int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
+{
+ const struct cred *creds;
+
+ creds = xa_erase(&ctx->personalities, id);
+ if (creds) {
+ put_cred(creds);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+
+static int io_register_personality(struct io_ring_ctx *ctx)
+{
+ const struct cred *creds;
+ u32 id;
+ int ret;
+
+ creds = get_current_cred();
+
+ ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
+ XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
+ if (ret < 0) {
+ put_cred(creds);
+ return ret;
+ }
+ return id;
+}
+
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
+{
+ struct io_uring_restriction *res;
+ size_t size;
+ int i, ret;
+
+ /* Restrictions allowed only if rings started disabled */
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ /* We allow only a single restrictions registration */
+ if (ctx->restrictions.registered)
+ return -EBUSY;
+
+ if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
+ return -EINVAL;
+
+ size = array_size(nr_args, sizeof(*res));
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ res = memdup_user(arg, size);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ ret = 0;
+
+ for (i = 0; i < nr_args; i++) {
+ switch (res[i].opcode) {
+ case IORING_RESTRICTION_REGISTER_OP:
+ if (res[i].register_op >= IORING_REGISTER_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].register_op,
+ ctx->restrictions.register_op);
+ break;
+ case IORING_RESTRICTION_SQE_OP:
+ if (res[i].sqe_op >= IORING_OP_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
+ ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
+ ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+out:
+ /* Reset all restrictions if an error happened */
+ if (ret != 0)
+ memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
+ else
+ ctx->restrictions.registered = true;
+
+ kfree(res);
+ return ret;
+}
+
+static int io_register_enable_rings(struct io_ring_ctx *ctx)
+{
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
+ WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
+ /*
+ * Lazy activation attempts would fail if it was polled before
+ * submitter_task is set.
+ */
+ if (wq_has_sleeper(&ctx->poll_wq))
+ io_activate_pollwq(ctx);
+ }
+
+ if (ctx->restrictions.registered)
+ ctx->restricted = 1;
+
+ ctx->flags &= ~IORING_SETUP_R_DISABLED;
+ if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
+ return 0;
+}
+
+static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
+ cpumask_var_t new_mask)
+{
+ int ret;
+
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ ret = io_wq_cpu_affinity(current->io_uring, new_mask);
+ } else {
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ return ret;
+}
+
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
+{
+ cpumask_var_t new_mask;
+ int ret;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_clear(new_mask);
+ if (len > cpumask_size())
+ len = cpumask_size();
+
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_bitmap(cpumask_bits(new_mask),
+ (const compat_ulong_t __user *)arg,
+ len * 8 /* CHAR_BIT */);
+ else
+#endif
+ ret = copy_from_user(new_mask, arg, len);
+
+ if (ret) {
+ free_cpumask_var(new_mask);
+ return -EFAULT;
+ }
+
+ ret = __io_register_iowq_aff(ctx, new_mask);
+ free_cpumask_var(new_mask);
+ return ret;
+}
+
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+{
+ return __io_register_iowq_aff(ctx, NULL);
+}
+
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
+ __must_hold(&ctx->uring_lock)
+{
+ struct io_tctx_node *node;
+ struct io_uring_task *tctx = NULL;
+ struct io_sq_data *sqd = NULL;
+ __u32 new_count[2];
+ int i, ret;
+
+ if (copy_from_user(new_count, arg, sizeof(new_count)))
+ return -EFAULT;
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i] > INT_MAX)
+ return -EINVAL;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ sqd = ctx->sq_data;
+ if (sqd) {
+ /*
+ * Observe the correct sqd->lock -> ctx->uring_lock
+ * ordering. Fine to drop uring_lock here, we hold
+ * a ref to the ctx.
+ */
+ refcount_inc(&sqd->refs);
+ mutex_unlock(&ctx->uring_lock);
+ mutex_lock(&sqd->lock);
+ mutex_lock(&ctx->uring_lock);
+ if (sqd->thread)
+ tctx = sqd->thread->io_uring;
+ }
+ } else {
+ tctx = current->io_uring;
+ }
+
+ BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
+
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i])
+ ctx->iowq_limits[i] = new_count[i];
+ ctx->iowq_limits_set = true;
+
+ if (tctx && tctx->io_wq) {
+ ret = io_wq_max_workers(tctx->io_wq, new_count);
+ if (ret)
+ goto err;
+ } else {
+ memset(new_count, 0, sizeof(new_count));
+ }
+
+ if (sqd) {
+ mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
+
+ if (copy_to_user(arg, new_count, sizeof(new_count)))
+ return -EFAULT;
+
+ /* that's it for SQPOLL, only the SQPOLL task creates requests */
+ if (sqd)
+ return 0;
+
+ /* now propagate the restriction to all registered users */
+ list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+ struct io_uring_task *tctx = node->task->io_uring;
+
+ if (WARN_ON_ONCE(!tctx->io_wq))
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ new_count[i] = ctx->iowq_limits[i];
+ /* ignore errors, it always returns zero anyway */
+ (void)io_wq_max_workers(tctx->io_wq, new_count);
+ }
+ return 0;
+err:
+ if (sqd) {
+ mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
+ return ret;
+}
+
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+ void __user *arg, unsigned nr_args)
+ __releases(ctx->uring_lock)
+ __acquires(ctx->uring_lock)
+{
+ int ret;
+
+ /*
+ * We don't quiesce the refs for register anymore and so it can't be
+ * dying as we're holding a file ref here.
+ */
+ if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
+ return -ENXIO;
+
+ if (ctx->submitter_task && ctx->submitter_task != current)
+ return -EEXIST;
+
+ if (ctx->restricted) {
+ opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
+ if (!test_bit(opcode, ctx->restrictions.register_op))
+ return -EACCES;
+ }
+
+ switch (opcode) {
+ case IORING_REGISTER_BUFFERS:
+ ret = -EFAULT;
+ if (!arg)
+ break;
+ ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
+ break;
+ case IORING_UNREGISTER_BUFFERS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_buffers_unregister(ctx);
+ break;
+ case IORING_REGISTER_FILES:
+ ret = -EFAULT;
+ if (!arg)
+ break;
+ ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
+ break;
+ case IORING_UNREGISTER_FILES:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_files_unregister(ctx);
+ break;
+ case IORING_REGISTER_FILES_UPDATE:
+ ret = io_register_files_update(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_EVENTFD:
+ ret = -EINVAL;
+ if (nr_args != 1)
+ break;
+ ret = io_eventfd_register(ctx, arg, 0);
+ break;
+ case IORING_REGISTER_EVENTFD_ASYNC:
+ ret = -EINVAL;
+ if (nr_args != 1)
+ break;
+ ret = io_eventfd_register(ctx, arg, 1);
+ break;
+ case IORING_UNREGISTER_EVENTFD:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_eventfd_unregister(ctx);
+ break;
+ case IORING_REGISTER_PROBE:
+ ret = -EINVAL;
+ if (!arg || nr_args > 256)
+ break;
+ ret = io_probe(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_personality(ctx);
+ break;
+ case IORING_UNREGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg)
+ break;
+ ret = io_unregister_personality(ctx, nr_args);
+ break;
+ case IORING_REGISTER_ENABLE_RINGS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_enable_rings(ctx);
+ break;
+ case IORING_REGISTER_RESTRICTIONS:
+ ret = io_register_restrictions(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_FILES2:
+ ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
+ break;
+ case IORING_REGISTER_FILES_UPDATE2:
+ ret = io_register_rsrc_update(ctx, arg, nr_args,
+ IORING_RSRC_FILE);
+ break;
+ case IORING_REGISTER_BUFFERS2:
+ ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
+ break;
+ case IORING_REGISTER_BUFFERS_UPDATE:
+ ret = io_register_rsrc_update(ctx, arg, nr_args,
+ IORING_RSRC_BUFFER);
+ break;
+ case IORING_REGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (!arg || !nr_args)
+ break;
+ ret = io_register_iowq_aff(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_unregister_iowq_aff(ctx);
+ break;
+ case IORING_REGISTER_IOWQ_MAX_WORKERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 2)
+ break;
+ ret = io_register_iowq_max_workers(ctx, arg);
+ break;
+ case IORING_REGISTER_RING_FDS:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_RING_FDS:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_pbuf_ring(ctx, arg);
+ break;
+ case IORING_UNREGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_unregister_pbuf_ring(ctx, arg);
+ break;
+ case IORING_REGISTER_SYNC_CANCEL:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_sync_cancel(ctx, arg);
+ break;
+ case IORING_REGISTER_FILE_ALLOC_RANGE:
+ ret = -EINVAL;
+ if (!arg || nr_args)
+ break;
+ ret = io_register_file_alloc_range(ctx, arg);
+ break;
+ case IORING_REGISTER_PBUF_STATUS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_pbuf_status(ctx, arg);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+ void __user *, arg, unsigned int, nr_args)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ struct file *file;
+ bool use_registered_ring;
+
+ use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
+ opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
+
+ if (opcode >= IORING_REGISTER_LAST)
+ return -EINVAL;
+
+ if (use_registered_ring) {
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+ * need only dereference our task private array to find it.
+ */
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
+ return -EINVAL;
+ fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+ file = tctx->registered_rings[fd];
+ if (unlikely(!file))
+ return -EBADF;
+ } else {
+ file = fget(fd);
+ if (unlikely(!file))
+ return -EBADF;
+ ret = -EOPNOTSUPP;
+ if (!io_is_uring_fops(file))
+ goto out_fput;
+ }
+
+ ctx = file->private_data;
+
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_uring_register(ctx, opcode, arg, nr_args);
+ mutex_unlock(&ctx->uring_lock);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
+out_fput:
+ if (!use_registered_ring)
+ fput(file);
+ return ret;
+}
diff --git a/io_uring/register.h b/io_uring/register.h
new file mode 100644
index 000000000..c9da997d5
--- /dev/null
+++ b/io_uring/register.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IORING_REGISTER_H
+#define IORING_REGISTER_H
+
+int io_eventfd_unregister(struct io_ring_ctx *ctx);
+int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
+
+#endif
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index dde501abd..4818b7923 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -24,7 +24,6 @@ struct io_rsrc_update {
};
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage);
@@ -157,7 +156,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node)
switch (node->type) {
case IORING_RSRC_FILE:
- io_rsrc_file_put(node->ctx, prsrc);
+ fput(prsrc->file);
break;
case IORING_RSRC_BUFFER:
io_rsrc_buf_put(node->ctx, prsrc);
@@ -402,23 +401,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
break;
}
/*
- * Don't allow io_uring instances to be registered. If
- * UNIX isn't enabled, then this causes a reference
- * cycle and this instance can never get freed. If UNIX
- * is enabled we'll handle it just fine, but there's
- * still no point in allowing a ring fd as it doesn't
- * support regular read/write anyway.
+ * Don't allow io_uring instances to be registered.
*/
if (io_is_uring_fops(file)) {
fput(file);
err = -EBADF;
break;
}
- err = io_scm_file_account(ctx, file);
- if (err) {
- fput(file);
- break;
- }
*io_get_tag_slot(data, i) = tag;
io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, i);
@@ -675,22 +664,12 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
for (i = 0; i < ctx->nr_user_files; i++) {
struct file *file = io_file_from_index(&ctx->file_table, i);
- /* skip scm accounted files, they'll be freed by ->ring_sock */
- if (!file || io_file_need_scm(file))
+ if (!file)
continue;
io_file_bitmap_clear(&ctx->file_table, i);
fput(file);
}
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
- kfree_skb(skb);
- }
-#endif
io_free_file_tables(&ctx->file_table);
io_file_table_set_alloc_range(ctx, 0, 0);
io_rsrc_data_free(ctx->file_data);
@@ -718,137 +697,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx)
return ret;
}
-/*
- * Ensure the UNIX gc is aware of our file set, so we are certain that
- * the io_uring can be safely unregistered on process exit, even if we have
- * loops in the file referencing. We account only files that can hold other
- * files because otherwise they can't form a loop and so are not interesting
- * for GC.
- */
-int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
-{
-#if defined(CONFIG_UNIX)
- struct sock *sk = ctx->ring_sock->sk;
- struct sk_buff_head *head = &sk->sk_receive_queue;
- struct scm_fp_list *fpl;
- struct sk_buff *skb;
-
- if (likely(!io_file_need_scm(file)))
- return 0;
-
- /*
- * See if we can merge this file into an existing skb SCM_RIGHTS
- * file set. If there's no room, fall back to allocating a new skb
- * and filling it in.
- */
- spin_lock_irq(&head->lock);
- skb = skb_peek(head);
- if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
- __skb_unlink(skb, head);
- else
- skb = NULL;
- spin_unlock_irq(&head->lock);
-
- if (!skb) {
- fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
- if (!fpl)
- return -ENOMEM;
-
- skb = alloc_skb(0, GFP_KERNEL);
- if (!skb) {
- kfree(fpl);
- return -ENOMEM;
- }
-
- fpl->user = get_uid(current_user());
- fpl->max = SCM_MAX_FD;
- fpl->count = 0;
-
- UNIXCB(skb).fp = fpl;
- skb->sk = sk;
- skb->destructor = io_uring_destruct_scm;
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- }
-
- fpl = UNIXCB(skb).fp;
- fpl->fp[fpl->count++] = get_file(file);
- unix_inflight(fpl->user, file);
- skb_queue_head(head, skb);
- fput(file);
-#endif
- return 0;
-}
-
-static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
-{
-#if defined(CONFIG_UNIX)
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff_head list, *head = &sock->sk_receive_queue;
- struct sk_buff *skb;
- int i;
-
- __skb_queue_head_init(&list);
-
- /*
- * Find the skb that holds this file in its SCM_RIGHTS. When found,
- * remove this entry and rearrange the file array.
- */
- skb = skb_dequeue(head);
- while (skb) {
- struct scm_fp_list *fp;
-
- fp = UNIXCB(skb).fp;
- for (i = 0; i < fp->count; i++) {
- int left;
-
- if (fp->fp[i] != file)
- continue;
-
- unix_notinflight(fp->user, fp->fp[i]);
- left = fp->count - 1 - i;
- if (left) {
- memmove(&fp->fp[i], &fp->fp[i + 1],
- left * sizeof(struct file *));
- }
- fp->count--;
- if (!fp->count) {
- kfree_skb(skb);
- skb = NULL;
- } else {
- __skb_queue_tail(&list, skb);
- }
- fput(file);
- file = NULL;
- break;
- }
-
- if (!file)
- break;
-
- __skb_queue_tail(&list, skb);
-
- skb = skb_dequeue(head);
- }
-
- if (skb_peek(&list)) {
- spin_lock_irq(&head->lock);
- while ((skb = __skb_dequeue(&list)) != NULL)
- __skb_queue_tail(head, skb);
- spin_unlock_irq(&head->lock);
- }
-#endif
-}
-
-static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
- struct file *file = prsrc->file;
-
- if (likely(!io_file_need_scm(file)))
- fput(file);
- else
- io_rsrc_file_scm_put(ctx, file);
-}
-
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
@@ -897,21 +745,12 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
goto fail;
/*
- * Don't allow io_uring instances to be registered. If UNIX
- * isn't enabled, then this causes a reference cycle and this
- * instance can never get freed. If UNIX is enabled we'll
- * handle it just fine, but there's still no point in allowing
- * a ring fd as it doesn't support regular read/write anyway.
+ * Don't allow io_uring instances to be registered.
*/
if (io_is_uring_fops(file)) {
fput(file);
goto fail;
}
- ret = io_scm_file_account(ctx, file);
- if (ret) {
- fput(file);
- goto fail;
- }
file_slot = io_fixed_file_slot(&ctx->file_table, i);
io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, i);
@@ -1037,39 +876,36 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
unsigned long start, end, nr_pages;
struct page **pages = NULL;
- int pret, ret = -ENOMEM;
+ int ret;
end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
+ WARN_ON(!nr_pages);
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
- goto done;
+ return ERR_PTR(-ENOMEM);
- ret = 0;
mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- pages);
- if (pret == nr_pages)
+ ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
+ mmap_read_unlock(current->mm);
+
+ /* success, mapped all pages */
+ if (ret == nr_pages) {
*npages = nr_pages;
- else
- ret = pret < 0 ? pret : -EFAULT;
+ return pages;
+ }
- mmap_read_unlock(current->mm);
- if (ret) {
+ /* partial map, or didn't map anything */
+ if (ret >= 0) {
/* if we did partial map, release any pages we did get */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- goto done;
- }
- ret = 0;
-done:
- if (ret < 0) {
- kvfree(pages);
- pages = ERR_PTR(ret);
+ if (ret)
+ unpin_user_pages(pages, ret);
+ ret = -EFAULT;
}
- return pages;
+ kvfree(pages);
+ return ERR_PTR(ret);
}
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 08ac0d8e0..c6f199bbe 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -75,21 +75,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx);
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags);
-int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file);
-
-static inline bool io_file_need_scm(struct file *filp)
-{
- return false;
-}
-
-static inline int io_scm_file_account(struct io_ring_ctx *ctx,
- struct file *file)
-{
- if (likely(!io_file_need_scm(file)))
- return 0;
- return __io_scm_file_account(ctx, file);
-}
-
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args);
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
@@ -117,17 +102,21 @@ static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
node->refs++;
}
+static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+{
+ lockdep_assert_held(&ctx->uring_lock);
+ req->rsrc_node = ctx->rsrc_node;
+ io_charge_rsrc_node(ctx, ctx->rsrc_node);
+}
+
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned int issue_flags)
{
if (!req->rsrc_node) {
io_ring_submit_lock(ctx, issue_flags);
-
- lockdep_assert_held(&ctx->uring_lock);
-
- req->rsrc_node = ctx->rsrc_node;
- io_charge_rsrc_node(ctx, ctx->rsrc_node);
+ __io_req_set_rsrc_node(req, ctx);
io_ring_submit_unlock(ctx, issue_flags);
}
}
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0a0c1c9db..c3c154790 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -10,7 +10,7 @@
#include <linux/poll.h>
#include <linux/nospec.h>
#include <linux/compat.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <uapi/linux/io_uring.h>
@@ -18,6 +18,7 @@
#include "opdef.h"
#include "kbuf.h"
#include "rsrc.h"
+#include "poll.h"
#include "rw.h"
struct io_rw {
@@ -83,18 +84,6 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
/* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- struct io_ring_ctx *ctx = req->ctx;
- u16 index;
-
- if (unlikely(req->buf_index >= ctx->nr_user_bufs))
- return -EFAULT;
- index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
- req->imu = ctx->user_bufs[index];
- io_req_set_rsrc_node(req, ctx, 0);
- }
-
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
ret = ioprio_check_cap(ioprio);
@@ -110,45 +99,74 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
+ return 0;
+}
+
+int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ int ret;
+
+ ret = io_prep_rw(req, sqe);
+ if (unlikely(ret))
+ return ret;
- /* Have to do this validation here, as this is in io_read() rw->len might
- * have chanaged due to buffer selection
+ /*
+ * Have to do this validation here, as this is in io_read() rw->len
+ * might have chanaged due to buffer selection
*/
- if (req->opcode == IORING_OP_READV && req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select_prep(req);
- if (ret)
- return ret;
- }
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ return io_iov_buffer_select_prep(req);
return 0;
}
-void io_readv_writev_cleanup(struct io_kiocb *req)
+int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_async_rw *io = req->async_data;
+ struct io_ring_ctx *ctx = req->ctx;
+ u16 index;
+ int ret;
- kfree(io->free_iovec);
+ ret = io_prep_rw(req, sqe);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+ index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
+ req->imu = ctx->user_bufs[index];
+ io_req_set_rsrc_node(req, ctx, 0);
+ return 0;
}
-static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+/*
+ * Multishot read is prepared just like a normal read/write request, only
+ * difference is that we set the MULTISHOT flag.
+ */
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- switch (ret) {
- case -EIOCBQUEUED:
- break;
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- case -ERESTARTNOHAND:
- case -ERESTART_RESTARTBLOCK:
- /*
- * We can't just restart the syscall, since previously
- * submitted sqes may already be in progress. Just fail this
- * IO with EINTR.
- */
- ret = -EINTR;
- fallthrough;
- default:
- kiocb->ki_complete(kiocb, ret);
- }
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ int ret;
+
+ /* must be used with provided buffers */
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return -EINVAL;
+
+ ret = io_prep_rw(req, sqe);
+ if (unlikely(ret))
+ return ret;
+
+ if (rw->addr || rw->len)
+ return -EINVAL;
+
+ req->flags |= REQ_F_APOLL_MULTISHOT;
+ return 0;
+}
+
+void io_readv_writev_cleanup(struct io_kiocb *req)
+{
+ struct io_async_rw *io = req->async_data;
+
+ kfree(io->free_iovec);
}
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
@@ -333,6 +351,33 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
smp_store_release(&req->iopoll_completed, 1);
}
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+ /* IO was queued async, completion will happen later */
+ if (ret == -EIOCBQUEUED)
+ return;
+
+ /* transform internal restart error codes */
+ if (unlikely(ret < 0)) {
+ switch (ret) {
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /*
+ * We can't just restart the syscall, since previously
+ * submitted sqes may already be in progress. Just fail
+ * this IO with EINTR.
+ */
+ ret = -EINTR;
+ break;
+ }
+ }
+
+ INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll,
+ io_complete_rw, kiocb, ret);
+}
+
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
unsigned int issue_flags)
{
@@ -388,8 +433,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
buf = u64_to_user_ptr(rw->addr);
sqe_len = rw->len;
- if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
- (req->flags & REQ_F_BUFFER_SELECT)) {
+ if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
if (io_do_buffer_select(req)) {
buf = io_buffer_select(req, &sqe_len, issue_flags);
if (!buf)
@@ -527,6 +571,9 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
{
if (!force && !io_cold_defs[req->opcode].prep_async)
return 0;
+ /* opcode type doesn't need async data */
+ if (!io_cold_defs[req->opcode].async_size)
+ return 0;
if (!req_has_async_data(req)) {
struct io_async_rw *iorw;
@@ -712,7 +759,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
return 0;
}
-int io_read(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_rw_state __s, *s = &__s;
@@ -780,8 +827,11 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
- /* if we can poll, just do that */
- if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ /*
+ * If we can poll, just do that. For a vectored read, we'll
+ * need to copy state first.
+ */
+ if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -857,7 +907,92 @@ done:
/* it's faster to check here then delegate to kfree */
if (iovec)
kfree(iovec);
- return kiocb_done(req, ret, issue_flags);
+ return ret;
+}
+
+int io_read(struct io_kiocb *req, unsigned int issue_flags)
+{
+ int ret;
+
+ ret = __io_read(req, issue_flags);
+ if (ret >= 0)
+ return kiocb_done(req, ret, issue_flags);
+
+ return ret;
+}
+
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ unsigned int cflags = 0;
+ int ret;
+
+ /*
+ * Multishot MUST be used on a pollable file
+ */
+ if (!file_can_poll(req->file))
+ return -EBADFD;
+
+ ret = __io_read(req, issue_flags);
+
+ /*
+ * If the file doesn't support proper NOWAIT, then disable multishot
+ * and stay in single shot mode.
+ */
+ if (!io_file_supports_nowait(req))
+ req->flags &= ~REQ_F_APOLL_MULTISHOT;
+
+ /*
+ * If we get -EAGAIN, recycle our buffer and just let normal poll
+ * handling arm it.
+ */
+ if (ret == -EAGAIN) {
+ /*
+ * Reset rw->len to 0 again to avoid clamping future mshot
+ * reads, in case the buffer size varies.
+ */
+ if (io_kbuf_recycle(req, issue_flags))
+ rw->len = 0;
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ return IOU_ISSUE_SKIP_COMPLETE;
+ return -EAGAIN;
+ }
+
+ /*
+ * Any successful return value will keep the multishot read armed.
+ */
+ if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) {
+ /*
+ * Put our buffer and post a CQE. If we fail to post a CQE, then
+ * jump to the termination path. This request is then done.
+ */
+ cflags = io_put_kbuf(req, issue_flags);
+ rw->len = 0; /* similarly to above, reset len to 0 */
+
+ if (io_fill_cqe_req_aux(req,
+ issue_flags & IO_URING_F_COMPLETE_DEFER,
+ ret, cflags | IORING_CQE_F_MORE)) {
+ if (issue_flags & IO_URING_F_MULTISHOT) {
+ /*
+ * Force retry, as we might have more data to
+ * be read and otherwise it won't get retried
+ * until (if ever) another poll is triggered.
+ */
+ io_poll_multishot_retry(req);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+ return -EAGAIN;
+ }
+ }
+
+ /*
+ * Either an error, or we've hit overflow posting the CQE. For any
+ * multishot request, hitting overflow will terminate it.
+ */
+ io_req_set_res(req, ret, cflags);
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ return IOU_STOP_MULTISHOT;
+ return IOU_OK;
}
int io_write(struct io_kiocb *req, unsigned int issue_flags)
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 4b89f9659..f9e89b4fe 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -16,6 +16,8 @@ struct io_async_rw {
};
int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read(struct io_kiocb *req, unsigned int issue_flags);
int io_readv_prep_async(struct io_kiocb *req);
int io_write(struct io_kiocb *req, unsigned int issue_flags);
@@ -23,3 +25,5 @@ int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 7c4469e95..3b659cd23 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -51,7 +51,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
struct file *in;
- long ret = 0;
+ ssize_t ret = 0;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
@@ -92,7 +92,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
struct file *in;
- long ret = 0;
+ ssize_t ret = 0;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 5fa19861c..c33fca585 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -2,7 +2,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/security.h>
#include <linux/nospec.h>
@@ -13,6 +13,45 @@
#include "rsrc.h"
#include "uring_cmd.h"
+static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
+ return;
+
+ cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_del(&req->hash_node);
+ io_ring_submit_unlock(ctx, issue_flags);
+}
+
+/*
+ * Mark this command as concelable, then io_uring_try_cancel_uring_cmd()
+ * will try to cancel this issued command by sending ->uring_cmd() with
+ * issue_flags of IO_URING_F_CANCEL.
+ *
+ * The command is guaranteed to not be done when calling ->uring_cmd()
+ * with IO_URING_F_CANCEL, but it is driver's responsibility to deal
+ * with race between io_uring canceling and normal completion.
+ */
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
+ cmd->flags |= IORING_URING_CMD_CANCELABLE;
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
+ io_ring_submit_unlock(ctx, issue_flags);
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
+
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -33,13 +72,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
}
EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
- __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
-
static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2)
{
@@ -56,6 +88,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ io_uring_cmd_del_cancelable(ioucmd, issue_flags);
+
if (ret < 0)
req_set_fail(req);
@@ -91,7 +125,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
- if (ioucmd->flags & ~IORING_URING_CMD_FIXED)
+ if (ioucmd->flags & ~IORING_URING_CMD_MASK)
return -EINVAL;
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
@@ -128,12 +162,13 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
issue_flags |= IO_URING_F_SQE128;
if (ctx->flags & IORING_SETUP_CQE32)
issue_flags |= IO_URING_F_CQE32;
+ if (ctx->compat)
+ issue_flags |= IO_URING_F_COMPAT;
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!file->f_op->uring_cmd_iopoll)
return -EOPNOTSUPP;
issue_flags |= IO_URING_F_IOPOLL;
req->iopoll_completed = 0;
- WRITE_ONCE(ioucmd->cookie, NULL);
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
@@ -165,6 +200,52 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
+static inline int io_uring_cmd_getsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optlen, optname, level, err;
+ void __user *optval;
+
+ level = READ_ONCE(cmd->sqe->level);
+ if (level != SOL_SOCKET)
+ return -EOPNOTSUPP;
+
+ optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval));
+ optname = READ_ONCE(cmd->sqe->optname);
+ optlen = READ_ONCE(cmd->sqe->optlen);
+
+ err = do_sock_getsockopt(sock, compat, level, optname,
+ USER_SOCKPTR(optval),
+ KERNEL_SOCKPTR(&optlen));
+ if (err)
+ return err;
+
+ /* On success, return optlen */
+ return optlen;
+}
+
+static inline int io_uring_cmd_setsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optname, optlen, level;
+ void __user *optval;
+ sockptr_t optval_s;
+
+ optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval));
+ optname = READ_ONCE(cmd->sqe->optname);
+ optlen = READ_ONCE(cmd->sqe->optlen);
+ level = READ_ONCE(cmd->sqe->level);
+ optval_s = USER_SOCKPTR(optval);
+
+ return do_sock_setsockopt(sock, compat, level, optname, optval_s,
+ optlen);
+}
+
+#if defined(CONFIG_NET)
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
@@ -186,8 +267,13 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (ret)
return ret;
return arg;
+ case SOCKET_URING_OP_GETSOCKOPT:
+ return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
+ case SOCKET_URING_OP_SETSOCKOPT:
+ return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
default:
return -EOPNOTSUPP;
}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
+#endif
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
new file mode 100644
index 000000000..77d340666
--- /dev/null
+++ b/io_uring/waitid.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for async notification of waitid
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "cancel.h"
+#include "waitid.h"
+#include "../kernel/exit.h"
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts);
+
+#define IO_WAITID_CANCEL_FLAG BIT(31)
+#define IO_WAITID_REF_MASK GENMASK(30, 0)
+
+struct io_waitid {
+ struct file *file;
+ int which;
+ pid_t upid;
+ int options;
+ atomic_t refs;
+ struct wait_queue_head *head;
+ struct siginfo __user *infop;
+ struct waitid_info info;
+};
+
+static void io_waitid_free(struct io_kiocb *req)
+{
+ struct io_waitid_async *iwa = req->async_data;
+
+ put_pid(iwa->wo.wo_pid);
+ kfree(req->async_data);
+ req->async_data = NULL;
+ req->flags &= ~REQ_F_ASYNC_DATA;
+}
+
+#ifdef CONFIG_COMPAT
+static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
+{
+ struct compat_siginfo __user *infop;
+ bool ret;
+
+ infop = (struct compat_siginfo __user *) iw->infop;
+
+ if (!user_write_access_begin(infop, sizeof(*infop)))
+ return false;
+
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user(iw->info.cause, &infop->si_code, Efault);
+ unsafe_put_user(iw->info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(iw->info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(iw->info.status, &infop->si_status, Efault);
+ ret = true;
+done:
+ user_write_access_end();
+ return ret;
+Efault:
+ ret = false;
+ goto done;
+}
+#endif
+
+static bool io_waitid_copy_si(struct io_kiocb *req, int signo)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ bool ret;
+
+ if (!iw->infop)
+ return true;
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ return io_waitid_compat_copy_si(iw, signo);
+#endif
+
+ if (!user_write_access_begin(iw->infop, sizeof(*iw->infop)))
+ return false;
+
+ unsafe_put_user(signo, &iw->infop->si_signo, Efault);
+ unsafe_put_user(0, &iw->infop->si_errno, Efault);
+ unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault);
+ unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault);
+ unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault);
+ unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault);
+ ret = true;
+done:
+ user_write_access_end();
+ return ret;
+Efault:
+ ret = false;
+ goto done;
+}
+
+static int io_waitid_finish(struct io_kiocb *req, int ret)
+{
+ int signo = 0;
+
+ if (ret > 0) {
+ signo = SIGCHLD;
+ ret = 0;
+ }
+
+ if (!io_waitid_copy_si(req, signo))
+ ret = -EFAULT;
+ io_waitid_free(req);
+ return ret;
+}
+
+static void io_waitid_complete(struct io_kiocb *req, int ret)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_tw_state ts = { .locked = true };
+
+ /* anyone completing better be holding a reference */
+ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
+
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ hlist_del_init(&req->hash_node);
+
+ ret = io_waitid_finish(req, ret);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ io_req_task_complete(req, &ts);
+}
+
+static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa = req->async_data;
+
+ /*
+ * Mark us canceled regardless of ownership. This will prevent a
+ * potential retry from a spurious wakeup.
+ */
+ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs);
+
+ /* claim ownership */
+ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+ return false;
+
+ spin_lock_irq(&iw->head->lock);
+ list_del_init(&iwa->wo.child_wait.entry);
+ spin_unlock_irq(&iw->head->lock);
+ io_waitid_complete(req, -ECANCELED);
+ return true;
+}
+
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ int nr = 0;
+
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
+ return -ENOENT;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+ if (req->cqe.user_data != cd->data &&
+ !(cd->flags & IORING_ASYNC_CANCEL_ANY))
+ continue;
+ if (__io_waitid_cancel(ctx, req))
+ nr++;
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
+ break;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (nr)
+ return nr;
+
+ return -ENOENT;
+}
+
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool found = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+ if (!io_match_task_safe(req, task, cancel_all))
+ continue;
+ hlist_del_init(&req->hash_node);
+ __io_waitid_cancel(ctx, req);
+ found = true;
+ }
+
+ return found;
+}
+
+static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa = req->async_data;
+
+ if (!atomic_sub_return(1, &iw->refs))
+ return false;
+
+ /*
+ * Wakeup triggered, racing with us. It was prevented from
+ * completing because of that, queue up the tw to do that.
+ */
+ req->io_task_work.func = io_waitid_cb;
+ io_req_task_work_add(req);
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ return true;
+}
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
+{
+ struct io_waitid_async *iwa = req->async_data;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ io_tw_lock(ctx, ts);
+
+ ret = __do_wait(&iwa->wo);
+
+ /*
+ * If we get -ERESTARTSYS here, we need to re-arm and check again
+ * to ensure we get another callback. If the retry works, then we can
+ * just remove ourselves from the waitqueue again and finish the
+ * request.
+ */
+ if (unlikely(ret == -ERESTARTSYS)) {
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+ /* Don't retry if cancel found it meanwhile */
+ ret = -ECANCELED;
+ if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) {
+ iw->head = &current->signal->wait_chldexit;
+ add_wait_queue(iw->head, &iwa->wo.child_wait);
+ ret = __do_wait(&iwa->wo);
+ if (ret == -ERESTARTSYS) {
+ /* retry armed, drop our ref */
+ io_waitid_drop_issue_ref(req);
+ return;
+ }
+
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ }
+ }
+
+ io_waitid_complete(req, ret);
+}
+
+static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait);
+ struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo);
+ struct io_kiocb *req = iwa->req;
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct task_struct *p = key;
+
+ if (!pid_child_should_wake(wo, p))
+ return 0;
+
+ /* cancel is in progress */
+ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+ return 1;
+
+ req->io_task_work.func = io_waitid_cb;
+ io_req_task_work_add(req);
+ list_del_init(&wait->entry);
+ return 1;
+}
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+ if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
+ return -EINVAL;
+
+ iw->which = READ_ONCE(sqe->len);
+ iw->upid = READ_ONCE(sqe->fd);
+ iw->options = READ_ONCE(sqe->file_index);
+ iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ return 0;
+}
+
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_waitid_async *iwa;
+ int ret;
+
+ if (io_alloc_async_data(req))
+ return -ENOMEM;
+
+ iwa = req->async_data;
+ iwa->req = req;
+
+ ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
+ iw->options, NULL);
+ if (ret)
+ goto done;
+
+ /*
+ * Mark the request as busy upfront, in case we're racing with the
+ * wakeup. If we are, then we'll notice when we drop this initial
+ * reference again after arming.
+ */
+ atomic_set(&iw->refs, 1);
+
+ /*
+ * Cancel must hold the ctx lock, so there's no risk of cancelation
+ * finding us until a) we remain on the list, and b) the lock is
+ * dropped. We only need to worry about racing with the wakeup
+ * callback.
+ */
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_add_head(&req->hash_node, &ctx->waitid_list);
+
+ init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
+ iwa->wo.child_wait.private = req->task;
+ iw->head = &current->signal->wait_chldexit;
+ add_wait_queue(iw->head, &iwa->wo.child_wait);
+
+ ret = __do_wait(&iwa->wo);
+ if (ret == -ERESTARTSYS) {
+ /*
+ * Nobody else grabbed a reference, it'll complete when we get
+ * a waitqueue callback, or if someone cancels it.
+ */
+ if (!io_waitid_drop_issue_ref(req)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+ /*
+ * Wakeup triggered, racing with us. It was prevented from
+ * completing because of that, queue up the tw to do that.
+ */
+ io_ring_submit_unlock(ctx, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+ hlist_del_init(&req->hash_node);
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ ret = io_waitid_finish(req, ret);
+
+ io_ring_submit_unlock(ctx, issue_flags);
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/waitid.h b/io_uring/waitid.h
new file mode 100644
index 000000000..956a8adaf
--- /dev/null
+++ b/io_uring/waitid.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../kernel/exit.h"
+
+struct io_waitid_async {
+ struct io_kiocb *req;
+ struct wait_opts wo;
+};
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags);
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all);