summaryrefslogtreecommitdiffstats
path: root/io_uring
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:50:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:50:03 +0000
commit01a69402cf9d38ff180345d55c2ee51c7e89fbc7 (patch)
treeb406c5242a088c4f59c6e4b719b783f43aca6ae9 /io_uring
parentAdding upstream version 6.7.12. (diff)
downloadlinux-upstream/6.8.9.tar.xz
linux-upstream/6.8.9.zip
Adding upstream version 6.8.9.upstream/6.8.9
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/Makefile2
-rw-r--r--io_uring/alloc_cache.h5
-rw-r--r--io_uring/io_uring.c766
-rw-r--r--io_uring/io_uring.h19
-rw-r--r--io_uring/kbuf.c142
-rw-r--r--io_uring/kbuf.h9
-rw-r--r--io_uring/net.c22
-rw-r--r--io_uring/opdef.c8
-rw-r--r--io_uring/openclose.c50
-rw-r--r--io_uring/openclose.h3
-rw-r--r--io_uring/register.c607
-rw-r--r--io_uring/register.h8
-rw-r--r--io_uring/rsrc.h14
-rw-r--r--io_uring/rw.c61
-rw-r--r--io_uring/splice.c4
-rw-r--r--io_uring/uring_cmd.c16
16 files changed, 933 insertions, 803 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index e5be47e4fc..2cdc518254 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
- notif.o waitid.o
+ notif.o waitid.o register.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 241245cb54..bf2fb26a65 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -16,8 +16,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
if (cache->nr_cached < cache->max_cached) {
cache->nr_cached++;
wq_stack_add_head(&entry->node, &cache->list);
- /* KASAN poisons object */
- kasan_slab_free_mempool(entry);
+ kasan_mempool_poison_object(entry);
return true;
}
return false;
@@ -34,7 +33,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
struct io_cache_entry *entry;
entry = container_of(cache->list.next, struct io_cache_entry, node);
- kasan_unpoison_range(entry, cache->elem_size);
+ kasan_mempool_unpoison_object(entry, cache->elem_size);
cache->list.next = cache->list.next->next;
cache->nr_cached--;
return entry;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13a9d9fcd2..dc0235ff47 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -69,6 +69,7 @@
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>
@@ -84,6 +85,7 @@
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
+#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
@@ -102,9 +104,6 @@
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
- IORING_REGISTER_LAST + IORING_OP_LAST)
-
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -128,11 +127,6 @@ enum {
IO_CHECK_CQ_DROPPED_BIT,
};
-enum {
- IO_EVENTFD_OP_SIGNAL_BIT,
- IO_EVENTFD_OP_FREE_BIT,
-};
-
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -143,6 +137,14 @@ struct io_defer_entry {
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
+/*
+ * No waiters. It's larger than any valid value of the tw counter
+ * so that tests against ->cq_wait_nr would fail and skip wake_up().
+ */
+#define IO_CQ_WAKE_INIT (-1U)
+/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */
+#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1)
+
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
@@ -150,6 +152,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_queue_sqe(struct io_kiocb *req);
struct kmem_cache *req_cachep;
+static struct workqueue_struct *iou_wq __ro_after_init;
static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;
@@ -309,6 +312,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
+ atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
@@ -350,7 +354,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
@@ -540,14 +543,13 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-
-static void io_eventfd_ops(struct rcu_head *rcu)
+void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
@@ -583,7 +585,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
goto out;
if (likely(eventfd_signal_allowed())) {
- eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
@@ -947,6 +949,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
u64 user_data = req->cqe.user_data;
struct io_uring_cqe *cqe;
+ lockdep_assert(!io_wq_current_is_worker());
+
if (!defer)
return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
@@ -1280,16 +1284,23 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev;
- struct llist_node *first;
+ struct llist_node *head;
+ /* See comment above IO_CQ_WAKE_INIT */
+ BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);
+
+ /*
+ * We don't know how many reuqests is there in the link and whether
+ * they can even be queued lazily, fall back to non-lazy.
+ */
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
flags &= ~IOU_F_TWQ_LAZY_WAKE;
- first = READ_ONCE(ctx->work_llist.first);
+ head = READ_ONCE(ctx->work_llist.first);
do {
nr_tw_prev = 0;
- if (first) {
- struct io_kiocb *first_req = container_of(first,
+ if (head) {
+ struct io_kiocb *first_req = container_of(head,
struct io_kiocb,
io_task_work.node);
/*
@@ -1298,17 +1309,29 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
*/
nr_tw_prev = READ_ONCE(first_req->nr_tw);
}
+
+ /*
+ * Theoretically, it can overflow, but that's fine as one of
+ * previous adds should've tried to wake the task.
+ */
nr_tw = nr_tw_prev + 1;
- /* Large enough to fail the nr_wait comparison below */
if (!(flags & IOU_F_TWQ_LAZY_WAKE))
- nr_tw = INT_MAX;
+ nr_tw = IO_CQ_WAKE_FORCE;
req->nr_tw = nr_tw;
- req->io_task_work.node.next = first;
- } while (!try_cmpxchg(&ctx->work_llist.first, &first,
+ req->io_task_work.node.next = head;
+ } while (!try_cmpxchg(&ctx->work_llist.first, &head,
&req->io_task_work.node));
- if (!first) {
+ /*
+ * cmpxchg implies a full barrier, which pairs with the barrier
+ * in set_current_state() on the io_cqring_wait() side. It's used
+ * to ensure that either we see updated ->cq_wait_nr, or waiters
+ * going to sleep will observe the work added to the list, which
+ * is similar to the wait/wawke task state sync.
+ */
+
+ if (!head) {
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
@@ -1316,14 +1339,12 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
}
nr_wait = atomic_read(&ctx->cq_wait_nr);
- /* no one is waiting */
- if (!nr_wait)
+ /* not enough or no one is waiting */
+ if (nr_tw < nr_wait)
return;
- /* either not enough or the previous add has already woken it up */
- if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
+ /* the previous add has already woken it up */
+ if (nr_tw_prev >= nr_wait)
return;
- /* pairs with set_current_state() in io_cqring_wait() */
- smp_mb__after_atomic();
wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
}
@@ -1872,14 +1893,15 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
- if (ret != IOU_ISSUE_SKIP_COMPLETE)
- return ret;
-
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
- io_iopoll_req_issued(req, issue_flags);
+ if (ret == IOU_ISSUE_SKIP_COMPLETE) {
+ ret = 0;
+ io_arm_ltimeout(req);
- return 0;
+ /* If the op doesn't have a file, we're not polling for it */
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+ io_iopoll_req_issued(req, issue_flags);
+ }
+ return ret;
}
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
@@ -1930,6 +1952,29 @@ fail:
goto fail;
}
+ /*
+ * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the
+ * submitter task context. Final request completions are handed to the
+ * right context, however this is not the case of auxiliary CQEs,
+ * which is the main mean of operation for multishot requests.
+ * Don't allow any multishot execution from io-wq. It's more restrictive
+ * than necessary and also cleaner.
+ */
+ if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ err = -EBADFD;
+ if (!file_can_poll(req->file))
+ goto fail;
+ if (req->file->f_flags & O_NONBLOCK ||
+ req->file->f_mode & FMODE_NOWAIT) {
+ err = -ECANCELED;
+ if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
+ goto fail;
+ return;
+ } else {
+ req->flags &= ~REQ_F_APOLL_MULTISHOT;
+ }
+ }
+
if (req->flags & REQ_F_FORCE_ASYNC) {
bool opcode_poll = def->pollin || def->pollout;
@@ -1990,9 +2035,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
slot = io_fixed_file_slot(&ctx->file_table, fd);
- file = io_slot_file(slot);
+ if (!req->rsrc_node)
+ __io_req_set_rsrc_node(req, ctx);
req->flags |= io_slot_flags(slot);
- io_req_set_rsrc_node(req, ctx, 0);
+ file = io_slot_file(slot);
out:
io_ring_submit_unlock(ctx, issue_flags);
return file;
@@ -2050,9 +2096,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret))
- io_arm_ltimeout(req);
- else
+ if (unlikely(ret))
io_queue_async(req, ret);
}
@@ -2124,6 +2168,13 @@ static void io_init_req_drain(struct io_kiocb *req)
}
}
+static __cold int io_init_fail_req(struct io_kiocb *req, int err)
+{
+ /* ensure per-opcode data is cleared if we fail before prep */
+ memset(&req->cmd.data, 0, sizeof(req->cmd.data));
+ return err;
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
@@ -2144,29 +2195,29 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
}
def = &io_issue_defs[opcode];
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (sqe_flags & IOSQE_BUFFER_SELECT) {
if (!def->buffer_select)
- return -EOPNOTSUPP;
+ return io_init_fail_req(req, -EOPNOTSUPP);
req->buf_index = READ_ONCE(sqe->buf_group);
}
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
if (sqe_flags & IOSQE_IO_DRAIN) {
if (ctx->drain_disabled)
- return -EOPNOTSUPP;
+ return io_init_fail_req(req, -EOPNOTSUPP);
io_init_req_drain(req);
}
}
if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ return io_init_fail_req(req, -EACCES);
/* knock it to the slow queue path, will be drained there */
if (ctx->drain_active)
req->flags |= REQ_F_FORCE_ASYNC;
@@ -2179,9 +2230,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
if (!def->ioprio && sqe->ioprio)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (def->needs_file) {
struct io_submit_state *state = &ctx->submit_state;
@@ -2205,12 +2256,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
get_cred(req->creds);
ret = security_uring_override_creds(req->creds);
if (ret) {
put_cred(req->creds);
- return ret;
+ return io_init_fail_req(req, ret);
}
req->flags |= REQ_F_CREDS;
}
@@ -2559,19 +2610,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (__io_cqring_events_user(ctx) >= min_events)
return 0;
- if (sig) {
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- sigsz);
- else
-#endif
- ret = set_user_sigmask(sig, sigsz);
-
- if (ret)
- return ret;
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -2588,6 +2626,19 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
}
+ if (sig) {
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+ sigsz);
+ else
+#endif
+ ret = set_user_sigmask(sig, sigsz);
+
+ if (ret)
+ return ret;
+ }
+
trace_io_uring_cqring_wait(ctx, min_events);
do {
int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
@@ -2603,7 +2654,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
ret = io_cqring_wait_schedule(ctx, &iowq);
__set_current_state(TASK_RUNNING);
- atomic_set(&ctx->cq_wait_nr, 0);
+ atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
/*
* Run task_work after scheduling and before io_should_wake().
@@ -2818,61 +2869,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return off;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
-
- spin_lock(&ctx->completion_lock);
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
-
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- atomic_set(&ev_fd->refs, 1);
- atomic_set(&ev_fd->ops, 0);
- return 0;
-}
-
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
- call_rcu(&ev_fd->rcu, io_eventfd_ops);
- return 0;
- }
-
- return -ENXIO;
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
@@ -2942,7 +2938,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
@@ -2964,7 +2959,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
percpu_ref_put(&ctx->refs);
}
-static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
+__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
@@ -3023,19 +3018,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
return mask;
}
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- const struct cred *creds;
-
- creds = xa_erase(&ctx->personalities, id);
- if (creds) {
- put_cred(creds);
- return 0;
- }
-
- return -EINVAL;
-}
-
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
@@ -3190,7 +3172,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
* noise and overhead, there's no discernable change in runtime
* over using system_wq.
*/
- queue_work(system_unbound_wq, &ctx->exit_work);
+ queue_work(iou_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
@@ -3472,14 +3454,15 @@ static void *io_uring_validate_mmap_request(struct file *file,
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
+ struct io_buffer_list *bl;
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- rcu_read_lock();
- ptr = io_pbuf_get_address(ctx, bgid);
- rcu_read_unlock();
- if (!ptr)
- return ERR_PTR(-EINVAL);
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return bl;
+ ptr = bl->buf_ring;
+ io_put_bl(ctx, bl);
break;
}
default:
@@ -3850,7 +3833,8 @@ static int io_uring_install_fd(struct file *file)
*/
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
- return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL);
}
@@ -4118,506 +4102,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_probe *p;
- size_t size;
- int i, ret;
-
- size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
- ret = -EINVAL;
- if (memchr_inv(p, 0, size))
- goto out;
-
- p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
-
- for (i = 0; i < nr_args; i++) {
- p->ops[i].op = i;
- if (!io_issue_defs[i].not_supported)
- p->ops[i].flags = IO_URING_OP_SUPPORTED;
- }
- p->ops_len = i;
-
- ret = 0;
- if (copy_to_user(arg, p, size))
- ret = -EFAULT;
-out:
- kfree(p);
- return ret;
-}
-
-static int io_register_personality(struct io_ring_ctx *ctx)
-{
- const struct cred *creds;
- u32 id;
- int ret;
-
- creds = get_current_cred();
-
- ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
- XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (ret < 0) {
- put_cred(creds);
- return ret;
- }
- return id;
-}
-
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
-{
- struct io_uring_restriction *res;
- size_t size;
- int i, ret;
-
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
- if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
- return -EINVAL;
-
- size = array_size(nr_args, sizeof(*res));
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- res = memdup_user(arg, size);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- ret = 0;
-
- for (i = 0; i < nr_args; i++) {
- switch (res[i].opcode) {
- case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
- break;
- case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
- break;
- case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
- break;
- case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
-
-out:
- /* Reset all restrictions if an error happened */
- if (ret != 0)
- memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
- else
- ctx->restrictions.registered = true;
-
- kfree(res);
- return ret;
-}
-
-static int io_register_enable_rings(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
- WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
- /*
- * Lazy activation attempts would fail if it was polled before
- * submitter_task is set.
- */
- if (wq_has_sleeper(&ctx->poll_wq))
- io_activate_pollwq(ctx);
- }
-
- if (ctx->restrictions.registered)
- ctx->restricted = 1;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- return 0;
-}
-
-static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
- cpumask_var_t new_mask)
-{
- int ret;
-
- if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_wq_cpu_affinity(current->io_uring, new_mask);
- } else {
- mutex_unlock(&ctx->uring_lock);
- ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
- mutex_lock(&ctx->uring_lock);
- }
-
- return ret;
-}
-
-static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
- void __user *arg, unsigned len)
-{
- cpumask_var_t new_mask;
- int ret;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_clear(new_mask);
- if (len > cpumask_size())
- len = cpumask_size();
-
- if (in_compat_syscall()) {
- ret = compat_get_bitmap(cpumask_bits(new_mask),
- (const compat_ulong_t __user *)arg,
- len * 8 /* CHAR_BIT */);
- } else {
- ret = copy_from_user(new_mask, arg, len);
- }
-
- if (ret) {
- free_cpumask_var(new_mask);
- return -EFAULT;
- }
-
- ret = __io_register_iowq_aff(ctx, new_mask);
- free_cpumask_var(new_mask);
- return ret;
-}
-
-static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
-{
- return __io_register_iowq_aff(ctx, NULL);
-}
-
-static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
- __must_hold(&ctx->uring_lock)
-{
- struct io_tctx_node *node;
- struct io_uring_task *tctx = NULL;
- struct io_sq_data *sqd = NULL;
- __u32 new_count[2];
- int i, ret;
-
- if (copy_from_user(new_count, arg, sizeof(new_count)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i] > INT_MAX)
- return -EINVAL;
-
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- sqd = ctx->sq_data;
- if (sqd) {
- /*
- * Observe the correct sqd->lock -> ctx->uring_lock
- * ordering. Fine to drop uring_lock here, we hold
- * a ref to the ctx.
- */
- refcount_inc(&sqd->refs);
- mutex_unlock(&ctx->uring_lock);
- mutex_lock(&sqd->lock);
- mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
- }
- } else {
- tctx = current->io_uring;
- }
-
- BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i])
- ctx->iowq_limits[i] = new_count[i];
- ctx->iowq_limits_set = true;
-
- if (tctx && tctx->io_wq) {
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
- } else {
- memset(new_count, 0, sizeof(new_count));
- }
-
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
-
- if (copy_to_user(arg, new_count, sizeof(new_count)))
- return -EFAULT;
-
- /* that's it for SQPOLL, only the SQPOLL task creates requests */
- if (sqd)
- return 0;
-
- /* now propagate the restriction to all registered users */
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- if (WARN_ON_ONCE(!tctx->io_wq))
- continue;
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- new_count[i] = ctx->iowq_limits[i];
- /* ignore errors, it always returns zero anyway */
- (void)io_wq_max_workers(tctx->io_wq, new_count);
- }
- return 0;
-err:
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
- return ret;
-}
-
-static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
- void __user *arg, unsigned nr_args)
- __releases(ctx->uring_lock)
- __acquires(ctx->uring_lock)
-{
- int ret;
-
- /*
- * We don't quiesce the refs for register anymore and so it can't be
- * dying as we're holding a file ref here.
- */
- if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
- return -ENXIO;
-
- if (ctx->submitter_task && ctx->submitter_task != current)
- return -EEXIST;
-
- if (ctx->restricted) {
- opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
- if (!test_bit(opcode, ctx->restrictions.register_op))
- return -EACCES;
- }
-
- switch (opcode) {
- case IORING_REGISTER_BUFFERS:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_BUFFERS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_buffers_unregister(ctx);
- break;
- case IORING_REGISTER_FILES:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_FILES:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_files_unregister(ctx);
- break;
- case IORING_REGISTER_FILES_UPDATE:
- ret = io_register_files_update(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_EVENTFD:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 0);
- break;
- case IORING_REGISTER_EVENTFD_ASYNC:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 1);
- break;
- case IORING_UNREGISTER_EVENTFD:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_eventfd_unregister(ctx);
- break;
- case IORING_REGISTER_PROBE:
- ret = -EINVAL;
- if (!arg || nr_args > 256)
- break;
- ret = io_probe(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_personality(ctx);
- break;
- case IORING_UNREGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg)
- break;
- ret = io_unregister_personality(ctx, nr_args);
- break;
- case IORING_REGISTER_ENABLE_RINGS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_enable_rings(ctx);
- break;
- case IORING_REGISTER_RESTRICTIONS:
- ret = io_register_restrictions(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_FILES2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_FILES_UPDATE2:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_BUFFERS2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_BUFFERS_UPDATE:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (!arg || !nr_args)
- break;
- ret = io_register_iowq_aff(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_unregister_iowq_aff(ctx);
- break;
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- ret = -EINVAL;
- if (!arg || nr_args != 2)
- break;
- ret = io_register_iowq_max_workers(ctx, arg);
- break;
- case IORING_REGISTER_RING_FDS:
- ret = io_ringfd_register(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_RING_FDS:
- ret = io_ringfd_unregister(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_register_pbuf_ring(ctx, arg);
- break;
- case IORING_UNREGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_unregister_pbuf_ring(ctx, arg);
- break;
- case IORING_REGISTER_SYNC_CANCEL:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_sync_cancel(ctx, arg);
- break;
- case IORING_REGISTER_FILE_ALLOC_RANGE:
- ret = -EINVAL;
- if (!arg || nr_args)
- break;
- ret = io_register_file_alloc_range(ctx, arg);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
- void __user *, arg, unsigned int, nr_args)
-{
- struct io_ring_ctx *ctx;
- long ret = -EBADF;
- struct file *file;
- bool use_registered_ring;
-
- use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
- opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
-
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
-
- if (use_registered_ring) {
- /*
- * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
- * need only dereference our task private array to find it.
- */
- struct io_uring_task *tctx = current->io_uring;
-
- if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
- return -EINVAL;
- fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- file = tctx->registered_rings[fd];
- if (unlikely(!file))
- return -EBADF;
- } else {
- file = fget(fd);
- if (unlikely(!file))
- return -EBADF;
- ret = -EOPNOTSUPP;
- if (!io_is_uring_fops(file))
- goto out_fput;
- }
-
- ctx = file->private_data;
-
- mutex_lock(&ctx->uring_lock);
- ret = __io_uring_register(ctx, opcode, arg, nr_args);
- mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
-out_fput:
- if (!use_registered_ring)
- fput(file);
- return ret;
-}
-
static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
@@ -4714,6 +4198,8 @@ static int __init io_uring_init(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
NULL);
+ iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
+
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0d66a7058d..d5495710c1 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -16,17 +16,6 @@
#endif
enum {
- /*
- * A hint to not wake right away but delay until there are enough of
- * tw's queued to match the number of CQEs the task is waiting for.
- *
- * Must not be used wirh requests generating more than one CQE.
- * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
- */
- IOU_F_TWQ_LAZY_WAKE = 1,
-};
-
-enum {
IOU_OK = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
@@ -95,6 +84,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr);
+enum {
+ IO_EVENTFD_OP_SIGNAL_BIT,
+ IO_EVENTFD_OP_FREE_BIT,
+};
+
+void io_eventfd_ops(struct rcu_head *rcu);
+void io_activate_pollwq(struct io_ring_ctx *ctx);
+
#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 72b6af1d2e..b2c4f82937 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -17,8 +17,6 @@
#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
-#define BGID_ARRAY 64
-
/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)
@@ -40,13 +38,9 @@ struct io_buf_free {
int inuse;
};
-static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl,
- unsigned int bgid)
+static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
- if (bl && bgid < BGID_ARRAY)
- return &bl[bgid];
-
return xa_load(&ctx->io_bl_xa, bgid);
}
@@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
{
lockdep_assert_held(&ctx->uring_lock);
- return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
+ return __io_buffer_get_list(ctx, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -67,11 +61,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
* always under the ->uring_lock, but the RCU lookup from mmap does.
*/
bl->bgid = bgid;
- smp_store_release(&bl->is_ready, 1);
-
- if (bgid < BGID_ARRAY)
- return 0;
-
+ atomic_set(&bl->refs, 1);
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
@@ -217,24 +207,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
return ret;
}
-static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
-{
- struct io_buffer_list *bl;
- int i;
-
- bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
- if (!bl)
- return -ENOMEM;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- INIT_LIST_HEAD(&bl[i].buf_list);
- bl[i].bgid = i;
- }
-
- smp_store_release(&ctx->io_bl, bl);
- return 0;
-}
-
/*
* Mark the given mapped range as free for reuse
*/
@@ -303,24 +275,24 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
return i;
}
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+ if (atomic_dec_and_test(&bl->refs)) {
+ __io_remove_buffers(ctx, bl, -1U);
+ kfree_rcu(bl, rcu);
+ }
+}
+
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
struct list_head *item, *tmp;
struct io_buffer *buf;
unsigned long index;
- int i;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- if (!ctx->io_bl)
- break;
- __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
- }
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
- __io_remove_buffers(ctx, bl, -1U);
- kfree_rcu(bl, rcu);
+ io_put_bl(ctx, bl);
}
/*
@@ -498,12 +470,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
io_ring_submit_lock(ctx, issue_flags);
- if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
- ret = io_init_bl_list(ctx);
- if (ret)
- goto err;
- }
-
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
@@ -516,14 +482,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (ret) {
/*
* Doesn't need rcu free as it was never visible, but
- * let's keep it consistent throughout. Also can't
- * be a lower indexed array group, as adding one
- * where lookup failed cannot happen.
+ * let's keep it consistent throughout.
*/
- if (p->bgid >= BGID_ARRAY)
- kfree_rcu(bl, rcu);
- else
- WARN_ON_ONCE(1);
+ kfree_rcu(bl, rcu);
goto err;
}
}
@@ -688,12 +649,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.ring_entries >= 65536)
return -EINVAL;
- if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
- int ret = io_init_bl_list(ctx);
- if (ret)
- return ret;
- }
-
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
@@ -742,31 +697,66 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!bl->is_mapped)
return -EINVAL;
- __io_remove_buffers(ctx, bl, -1U);
- if (bl->bgid >= BGID_ARRAY) {
- xa_erase(&ctx->io_bl_xa, bl->bgid);
- kfree_rcu(bl, rcu);
- }
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ io_put_bl(ctx, bl);
return 0;
}
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
+ struct io_uring_buf_status buf_status;
struct io_buffer_list *bl;
+ int i;
- bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
+ if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
+ return -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
+ if (buf_status.resv[i])
+ return -EINVAL;
+
+ bl = io_buffer_get_list(ctx, buf_status.buf_group);
+ if (!bl)
+ return -ENOENT;
+ if (!bl->is_mapped)
+ return -EINVAL;
+
+ buf_status.head = bl->head;
+ if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
+ return -EFAULT;
+
+ return 0;
+}
+
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+ unsigned long bgid)
+{
+ struct io_buffer_list *bl;
+ bool ret;
- if (!bl || !bl->is_mmap)
- return NULL;
/*
- * Ensure the list is fully setup. Only strictly needed for RCU lookup
- * via mmap, and in that case only for the array indexed groups. For
- * the xarray lookups, it's either visible and ready, or not at all.
+ * We have to be a bit careful here - we're inside mmap and cannot grab
+ * the uring_lock. This means the buffer_list could be simultaneously
+ * going away, if someone is trying to be sneaky. Look it up under rcu
+ * so we know it's not going away, and attempt to grab a reference to
+ * it. If the ref is already zero, then fail the mapping. If successful,
+ * the caller will call io_put_bl() to drop the the reference at at the
+ * end. This may then safely free the buffer_list (and drop the pages)
+ * at that point, vm_insert_pages() would've already grabbed the
+ * necessary vma references.
*/
- if (!smp_load_acquire(&bl->is_ready))
- return NULL;
-
- return bl->buf_ring;
+ rcu_read_lock();
+ bl = xa_load(&ctx->io_bl_xa, bgid);
+ /* must be a mmap'able buffer ring and have pages */
+ ret = false;
+ if (bl && bl->is_mmap)
+ ret = atomic_inc_not_zero(&bl->refs);
+ rcu_read_unlock();
+
+ if (ret)
+ return bl;
+
+ return ERR_PTR(-EINVAL);
}
/*
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 9be5960817..038091a004 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -25,12 +25,12 @@ struct io_buffer_list {
__u16 head;
__u16 mask;
+ atomic_t refs;
+
/* ring mapped provided buffers */
__u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
- /* bl is visible from an RCU point of view for lookup */
- __u8 is_ready;
};
struct io_buffer {
@@ -53,6 +53,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
@@ -60,7 +61,9 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+ unsigned long bgid);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{
diff --git a/io_uring/net.c b/io_uring/net.c
index 5a4001139e..46ea09e1e3 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -78,19 +78,6 @@ struct io_sr_msg {
*/
#define MULTISHOT_MAX_RETRY 32
-static inline bool io_check_multishot(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- /*
- * When ->locked_cq is set we only allow to post CQEs from the original
- * task context. Usual request completions will be handled in other
- * generic paths but multipoll may decide to post extra cqes.
- */
- return !(issue_flags & IO_URING_F_IOWQ) ||
- !(req->flags & REQ_F_APOLL_MULTISHOT) ||
- !req->ctx->task_complete;
-}
-
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
@@ -837,9 +824,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return io_setup_async_msg(req, kmsg, issue_flags);
- if (!io_check_multishot(req, issue_flags))
- return io_setup_async_msg(req, kmsg, issue_flags);
-
retry_multishot:
if (io_do_buffer_select(req)) {
void __user *buf;
@@ -935,9 +919,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
- if (!io_check_multishot(req, issue_flags))
- return -EAGAIN;
-
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -1274,6 +1255,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
if (req_has_async_data(req)) {
kmsg = req->async_data;
+ kmsg->msg.msg_control_user = sr->msg_control;
} else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
@@ -1386,8 +1368,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
- if (!io_check_multishot(req, issue_flags))
- return -EAGAIN;
retry:
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 799db44283..b1ee3a9c38 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -469,6 +469,11 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_FIXED_FD_INSTALL] = {
+ .needs_file = 1,
+ .prep = io_install_fixed_fd_prep,
+ .issue = io_install_fixed_fd,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -704,6 +709,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FUTEX_WAITV] = {
.name = "FUTEX_WAITV",
},
+ [IORING_OP_FIXED_FD_INSTALL] = {
+ .name = "FIXED_FD_INSTALL",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index fb73adb890..e3357dfa14 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -31,6 +31,11 @@ struct io_close {
u32 file_slot;
};
+struct io_fixed_install {
+ struct file *file;
+ unsigned int o_flags;
+};
+
static bool io_openat_force_async(struct io_open *open)
{
/*
@@ -241,7 +246,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
}
- file = __close_fd_get_file(close->fd);
+ file = file_close_fd_locked(files, close->fd);
spin_unlock(&files->file_lock);
if (!file)
goto err;
@@ -254,3 +259,46 @@ err:
io_req_set_res(req, ret, 0);
return IOU_OK;
}
+
+int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_fixed_install *ifi;
+ unsigned int flags;
+
+ if (sqe->off || sqe->addr || sqe->len || sqe->buf_index ||
+ sqe->splice_fd_in || sqe->addr3)
+ return -EINVAL;
+
+ /* must be a fixed file */
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ flags = READ_ONCE(sqe->install_fd_flags);
+ if (flags & ~IORING_FIXED_FD_NO_CLOEXEC)
+ return -EINVAL;
+
+ /* ensure the task's creds are used when installing/receiving fds */
+ if (req->flags & REQ_F_CREDS)
+ return -EPERM;
+
+ /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */
+ ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
+ ifi->o_flags = O_CLOEXEC;
+ if (flags & IORING_FIXED_FD_NO_CLOEXEC)
+ ifi->o_flags = 0;
+
+ return 0;
+}
+
+int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_fixed_install *ifi;
+ int ret;
+
+ ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
+ ret = receive_fd(req->file, NULL, ifi->o_flags);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/openclose.h b/io_uring/openclose.h
index 4b1c28d3a6..8a93c98ad0 100644
--- a/io_uring/openclose.h
+++ b/io_uring/openclose.h
@@ -12,3 +12,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);
+
+int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/register.c b/io_uring/register.c
new file mode 100644
index 0000000000..5e62c12089
--- /dev/null
+++ b/io_uring/register.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code related to the io_uring_register() syscall
+ *
+ * Copyright (C) 2023 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/refcount.h>
+#include <linux/bits.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+#include <linux/io_uring_types.h>
+
+#include "io_uring.h"
+#include "opdef.h"
+#include "tctx.h"
+#include "rsrc.h"
+#include "sqpoll.h"
+#include "register.h"
+#include "cancel.h"
+#include "kbuf.h"
+
+#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
+ IORING_REGISTER_LAST + IORING_OP_LAST)
+
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
+{
+ struct io_ev_fd *ev_fd;
+ __s32 __user *fds = arg;
+ int fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
+ return -EBUSY;
+
+ if (copy_from_user(&fd, fds, sizeof(*fds)))
+ return -EFAULT;
+
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
+
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+ return ret;
+ }
+
+ spin_lock(&ctx->completion_lock);
+ ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+ spin_unlock(&ctx->completion_lock);
+
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
+ atomic_set(&ev_fd->refs, 1);
+ atomic_set(&ev_fd->ops, 0);
+ return 0;
+}
+
+int io_eventfd_unregister(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
+ call_rcu(&ev_fd->rcu, io_eventfd_ops);
+ return 0;
+ }
+
+ return -ENXIO;
+}
+
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_probe *p;
+ size_t size;
+ int i, ret;
+
+ size = struct_size(p, ops, nr_args);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+ p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ ret = -EFAULT;
+ if (copy_from_user(p, arg, size))
+ goto out;
+ ret = -EINVAL;
+ if (memchr_inv(p, 0, size))
+ goto out;
+
+ p->last_op = IORING_OP_LAST - 1;
+ if (nr_args > IORING_OP_LAST)
+ nr_args = IORING_OP_LAST;
+
+ for (i = 0; i < nr_args; i++) {
+ p->ops[i].op = i;
+ if (!io_issue_defs[i].not_supported)
+ p->ops[i].flags = IO_URING_OP_SUPPORTED;
+ }
+ p->ops_len = i;
+
+ ret = 0;
+ if (copy_to_user(arg, p, size))
+ ret = -EFAULT;
+out:
+ kfree(p);
+ return ret;
+}
+
+int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
+{
+ const struct cred *creds;
+
+ creds = xa_erase(&ctx->personalities, id);
+ if (creds) {
+ put_cred(creds);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+
+static int io_register_personality(struct io_ring_ctx *ctx)
+{
+ const struct cred *creds;
+ u32 id;
+ int ret;
+
+ creds = get_current_cred();
+
+ ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
+ XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
+ if (ret < 0) {
+ put_cred(creds);
+ return ret;
+ }
+ return id;
+}
+
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
+{
+ struct io_uring_restriction *res;
+ size_t size;
+ int i, ret;
+
+ /* Restrictions allowed only if rings started disabled */
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ /* We allow only a single restrictions registration */
+ if (ctx->restrictions.registered)
+ return -EBUSY;
+
+ if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
+ return -EINVAL;
+
+ size = array_size(nr_args, sizeof(*res));
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ res = memdup_user(arg, size);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ ret = 0;
+
+ for (i = 0; i < nr_args; i++) {
+ switch (res[i].opcode) {
+ case IORING_RESTRICTION_REGISTER_OP:
+ if (res[i].register_op >= IORING_REGISTER_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].register_op,
+ ctx->restrictions.register_op);
+ break;
+ case IORING_RESTRICTION_SQE_OP:
+ if (res[i].sqe_op >= IORING_OP_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
+ ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
+ ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+out:
+ /* Reset all restrictions if an error happened */
+ if (ret != 0)
+ memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
+ else
+ ctx->restrictions.registered = true;
+
+ kfree(res);
+ return ret;
+}
+
+static int io_register_enable_rings(struct io_ring_ctx *ctx)
+{
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
+ WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
+ /*
+ * Lazy activation attempts would fail if it was polled before
+ * submitter_task is set.
+ */
+ if (wq_has_sleeper(&ctx->poll_wq))
+ io_activate_pollwq(ctx);
+ }
+
+ if (ctx->restrictions.registered)
+ ctx->restricted = 1;
+
+ ctx->flags &= ~IORING_SETUP_R_DISABLED;
+ if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
+ return 0;
+}
+
+static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
+ cpumask_var_t new_mask)
+{
+ int ret;
+
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ ret = io_wq_cpu_affinity(current->io_uring, new_mask);
+ } else {
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ return ret;
+}
+
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
+{
+ cpumask_var_t new_mask;
+ int ret;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_clear(new_mask);
+ if (len > cpumask_size())
+ len = cpumask_size();
+
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_bitmap(cpumask_bits(new_mask),
+ (const compat_ulong_t __user *)arg,
+ len * 8 /* CHAR_BIT */);
+ else
+#endif
+ ret = copy_from_user(new_mask, arg, len);
+
+ if (ret) {
+ free_cpumask_var(new_mask);
+ return -EFAULT;
+ }
+
+ ret = __io_register_iowq_aff(ctx, new_mask);
+ free_cpumask_var(new_mask);
+ return ret;
+}
+
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+{
+ return __io_register_iowq_aff(ctx, NULL);
+}
+
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
+ __must_hold(&ctx->uring_lock)
+{
+ struct io_tctx_node *node;
+ struct io_uring_task *tctx = NULL;
+ struct io_sq_data *sqd = NULL;
+ __u32 new_count[2];
+ int i, ret;
+
+ if (copy_from_user(new_count, arg, sizeof(new_count)))
+ return -EFAULT;
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i] > INT_MAX)
+ return -EINVAL;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ sqd = ctx->sq_data;
+ if (sqd) {
+ /*
+ * Observe the correct sqd->lock -> ctx->uring_lock
+ * ordering. Fine to drop uring_lock here, we hold
+ * a ref to the ctx.
+ */
+ refcount_inc(&sqd->refs);
+ mutex_unlock(&ctx->uring_lock);
+ mutex_lock(&sqd->lock);
+ mutex_lock(&ctx->uring_lock);
+ if (sqd->thread)
+ tctx = sqd->thread->io_uring;
+ }
+ } else {
+ tctx = current->io_uring;
+ }
+
+ BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
+
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i])
+ ctx->iowq_limits[i] = new_count[i];
+ ctx->iowq_limits_set = true;
+
+ if (tctx && tctx->io_wq) {
+ ret = io_wq_max_workers(tctx->io_wq, new_count);
+ if (ret)
+ goto err;
+ } else {
+ memset(new_count, 0, sizeof(new_count));
+ }
+
+ if (sqd) {
+ mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
+
+ if (copy_to_user(arg, new_count, sizeof(new_count)))
+ return -EFAULT;
+
+ /* that's it for SQPOLL, only the SQPOLL task creates requests */
+ if (sqd)
+ return 0;
+
+ /* now propagate the restriction to all registered users */
+ list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+ struct io_uring_task *tctx = node->task->io_uring;
+
+ if (WARN_ON_ONCE(!tctx->io_wq))
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ new_count[i] = ctx->iowq_limits[i];
+ /* ignore errors, it always returns zero anyway */
+ (void)io_wq_max_workers(tctx->io_wq, new_count);
+ }
+ return 0;
+err:
+ if (sqd) {
+ mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
+ return ret;
+}
+
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+ void __user *arg, unsigned nr_args)
+ __releases(ctx->uring_lock)
+ __acquires(ctx->uring_lock)
+{
+ int ret;
+
+ /*
+ * We don't quiesce the refs for register anymore and so it can't be
+ * dying as we're holding a file ref here.
+ */
+ if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
+ return -ENXIO;
+
+ if (ctx->submitter_task && ctx->submitter_task != current)
+ return -EEXIST;
+
+ if (ctx->restricted) {
+ opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
+ if (!test_bit(opcode, ctx->restrictions.register_op))
+ return -EACCES;
+ }
+
+ switch (opcode) {
+ case IORING_REGISTER_BUFFERS:
+ ret = -EFAULT;
+ if (!arg)
+ break;
+ ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
+ break;
+ case IORING_UNREGISTER_BUFFERS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_buffers_unregister(ctx);
+ break;
+ case IORING_REGISTER_FILES:
+ ret = -EFAULT;
+ if (!arg)
+ break;
+ ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
+ break;
+ case IORING_UNREGISTER_FILES:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_files_unregister(ctx);
+ break;
+ case IORING_REGISTER_FILES_UPDATE:
+ ret = io_register_files_update(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_EVENTFD:
+ ret = -EINVAL;
+ if (nr_args != 1)
+ break;
+ ret = io_eventfd_register(ctx, arg, 0);
+ break;
+ case IORING_REGISTER_EVENTFD_ASYNC:
+ ret = -EINVAL;
+ if (nr_args != 1)
+ break;
+ ret = io_eventfd_register(ctx, arg, 1);
+ break;
+ case IORING_UNREGISTER_EVENTFD:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_eventfd_unregister(ctx);
+ break;
+ case IORING_REGISTER_PROBE:
+ ret = -EINVAL;
+ if (!arg || nr_args > 256)
+ break;
+ ret = io_probe(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_personality(ctx);
+ break;
+ case IORING_UNREGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg)
+ break;
+ ret = io_unregister_personality(ctx, nr_args);
+ break;
+ case IORING_REGISTER_ENABLE_RINGS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_enable_rings(ctx);
+ break;
+ case IORING_REGISTER_RESTRICTIONS:
+ ret = io_register_restrictions(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_FILES2:
+ ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
+ break;
+ case IORING_REGISTER_FILES_UPDATE2:
+ ret = io_register_rsrc_update(ctx, arg, nr_args,
+ IORING_RSRC_FILE);
+ break;
+ case IORING_REGISTER_BUFFERS2:
+ ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
+ break;
+ case IORING_REGISTER_BUFFERS_UPDATE:
+ ret = io_register_rsrc_update(ctx, arg, nr_args,
+ IORING_RSRC_BUFFER);
+ break;
+ case IORING_REGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (!arg || !nr_args)
+ break;
+ ret = io_register_iowq_aff(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_unregister_iowq_aff(ctx);
+ break;
+ case IORING_REGISTER_IOWQ_MAX_WORKERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 2)
+ break;
+ ret = io_register_iowq_max_workers(ctx, arg);
+ break;
+ case IORING_REGISTER_RING_FDS:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_RING_FDS:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_pbuf_ring(ctx, arg);
+ break;
+ case IORING_UNREGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_unregister_pbuf_ring(ctx, arg);
+ break;
+ case IORING_REGISTER_SYNC_CANCEL:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_sync_cancel(ctx, arg);
+ break;
+ case IORING_REGISTER_FILE_ALLOC_RANGE:
+ ret = -EINVAL;
+ if (!arg || nr_args)
+ break;
+ ret = io_register_file_alloc_range(ctx, arg);
+ break;
+ case IORING_REGISTER_PBUF_STATUS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_pbuf_status(ctx, arg);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+ void __user *, arg, unsigned int, nr_args)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ struct file *file;
+ bool use_registered_ring;
+
+ use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
+ opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
+
+ if (opcode >= IORING_REGISTER_LAST)
+ return -EINVAL;
+
+ if (use_registered_ring) {
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+ * need only dereference our task private array to find it.
+ */
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
+ return -EINVAL;
+ fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+ file = tctx->registered_rings[fd];
+ if (unlikely(!file))
+ return -EBADF;
+ } else {
+ file = fget(fd);
+ if (unlikely(!file))
+ return -EBADF;
+ ret = -EOPNOTSUPP;
+ if (!io_is_uring_fops(file))
+ goto out_fput;
+ }
+
+ ctx = file->private_data;
+
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_uring_register(ctx, opcode, arg, nr_args);
+ mutex_unlock(&ctx->uring_lock);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
+out_fput:
+ if (!use_registered_ring)
+ fput(file);
+ return ret;
+}
diff --git a/io_uring/register.h b/io_uring/register.h
new file mode 100644
index 0000000000..c9da997d50
--- /dev/null
+++ b/io_uring/register.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IORING_REGISTER_H
+#define IORING_REGISTER_H
+
+int io_eventfd_unregister(struct io_ring_ctx *ctx);
+int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
+
+#endif
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 7238b9cfe3..c6f199bbee 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -102,17 +102,21 @@ static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
node->refs++;
}
+static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+{
+ lockdep_assert_held(&ctx->uring_lock);
+ req->rsrc_node = ctx->rsrc_node;
+ io_charge_rsrc_node(ctx, ctx->rsrc_node);
+}
+
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned int issue_flags)
{
if (!req->rsrc_node) {
io_ring_submit_lock(ctx, issue_flags);
-
- lockdep_assert_held(&ctx->uring_lock);
-
- req->rsrc_node = ctx->rsrc_node;
- io_charge_rsrc_node(ctx, ctx->rsrc_node);
+ __io_req_set_rsrc_node(req, ctx);
io_ring_submit_unlock(ctx, issue_flags);
}
}
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 70c5beb05d..c3c154790e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -10,7 +10,7 @@
#include <linux/poll.h>
#include <linux/nospec.h>
#include <linux/compat.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <uapi/linux/io_uring.h>
@@ -169,27 +169,6 @@ void io_readv_writev_cleanup(struct io_kiocb *req)
kfree(io->free_iovec);
}
-static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
-{
- switch (ret) {
- case -EIOCBQUEUED:
- break;
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- case -ERESTARTNOHAND:
- case -ERESTART_RESTARTBLOCK:
- /*
- * We can't just restart the syscall, since previously
- * submitted sqes may already be in progress. Just fail this
- * IO with EINTR.
- */
- ret = -EINTR;
- fallthrough;
- default:
- kiocb->ki_complete(kiocb, ret);
- }
-}
-
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -372,6 +351,33 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
smp_store_release(&req->iopoll_completed, 1);
}
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+ /* IO was queued async, completion will happen later */
+ if (ret == -EIOCBQUEUED)
+ return;
+
+ /* transform internal restart error codes */
+ if (unlikely(ret < 0)) {
+ switch (ret) {
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /*
+ * We can't just restart the syscall, since previously
+ * submitted sqes may already be in progress. Just fail
+ * this IO with EINTR.
+ */
+ ret = -EINTR;
+ break;
+ }
+ }
+
+ INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll,
+ io_complete_rw, kiocb, ret);
+}
+
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
unsigned int issue_flags)
{
@@ -926,12 +932,17 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!file_can_poll(req->file))
return -EBADFD;
- if (issue_flags & IO_URING_F_IOWQ)
- return -EAGAIN;
ret = __io_read(req, issue_flags);
/*
+ * If the file doesn't support proper NOWAIT, then disable multishot
+ * and stay in single shot mode.
+ */
+ if (!io_file_supports_nowait(req))
+ req->flags &= ~REQ_F_APOLL_MULTISHOT;
+
+ /*
* If we get -EAGAIN, recycle our buffer and just let normal poll
* handling arm it.
*/
@@ -950,7 +961,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
/*
* Any successful return value will keep the multishot read armed.
*/
- if (ret > 0) {
+ if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) {
/*
* Put our buffer and post a CQE. If we fail to post a CQE, then
* jump to the termination path. This request is then done.
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 7c4469e954..3b659cd23e 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -51,7 +51,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
struct file *in;
- long ret = 0;
+ ssize_t ret = 0;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
@@ -92,7 +92,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
struct file *in;
- long ret = 0;
+ ssize_t ret = 0;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 7d3ef62e62..c33fca585d 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -2,7 +2,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/security.h>
#include <linux/nospec.h>
@@ -52,12 +52,6 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
-struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
-{
- return cmd_to_io_kiocb(cmd)->task;
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
-
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -78,13 +72,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
}
EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
- __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
-
static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2)
{
@@ -182,7 +169,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
return -EOPNOTSUPP;
issue_flags |= IO_URING_F_IOPOLL;
req->iopoll_completed = 0;
- WRITE_ONCE(ioucmd->cookie, NULL);
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);