Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--  io_uring/io_uring.c  980
1 file changed, 230 insertions(+), 750 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ea772a02c..dc0235ff4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -60,7 +60,6 @@
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
-#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -70,6 +69,7 @@
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>
@@ -85,6 +85,7 @@
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
+#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
@@ -92,6 +93,8 @@
#include "cancel.h"
#include "net.h"
#include "notif.h"
+#include "waitid.h"
+#include "futex.h"
#include "timeout.h"
#include "poll.h"
@@ -101,9 +104,6 @@
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
- IORING_REGISTER_LAST + IORING_OP_LAST)
-
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -127,11 +127,6 @@ enum {
IO_CHECK_CQ_DROPPED_BIT,
};
-enum {
- IO_EVENTFD_OP_SIGNAL_BIT,
- IO_EVENTFD_OP_FREE_BIT,
-};
-
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -142,6 +137,14 @@ struct io_defer_entry {
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
+/*
+ * No waiters. It's larger than any valid value of the tw counter
+ * so that tests against ->cq_wait_nr would fail and skip wake_up().
+ */
+#define IO_CQ_WAKE_INIT (-1U)
+/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */
+#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1)
+
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
@@ -149,6 +152,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_queue_sqe(struct io_kiocb *req);
struct kmem_cache *req_cachep;
+static struct workqueue_struct *iou_wq __ro_after_init;
static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;
@@ -175,19 +179,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
};
#endif
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
- if (io_is_uring_fops(file)) {
- struct io_ring_ctx *ctx = file->private_data;
-
- return ctx->ring_sock->sk;
- }
-#endif
- return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
@@ -321,6 +312,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
+ atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
@@ -332,6 +324,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
sizeof(struct async_poll));
io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_msghdr));
+ io_futex_cache_init(ctx);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@@ -341,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
- INIT_LIST_HEAD(&ctx->io_buffers_pages);
INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -351,13 +343,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
+ INIT_HLIST_HEAD(&ctx->waitid_list);
+#ifdef CONFIG_FUTEX
+ INIT_HLIST_HEAD(&ctx->futex_list);
+#endif
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+ INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
return ctx;
err:
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
@@ -547,14 +543,13 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-
-static void io_eventfd_ops(struct rcu_head *rcu)
+void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
@@ -590,7 +585,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
goto out;
if (likely(eventfd_signal_allowed())) {
- eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
@@ -954,6 +949,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
u64 user_data = req->cqe.user_data;
struct io_uring_cqe *cqe;
+ lockdep_assert(!io_wq_current_is_worker());
+
if (!defer)
return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
@@ -1181,12 +1178,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
static unsigned int handle_tw_list(struct llist_node *node,
struct io_ring_ctx **ctx,
- struct io_tw_state *ts,
- struct llist_node *last)
+ struct io_tw_state *ts)
{
unsigned int count = 0;
- while (node && node != last) {
+ do {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1210,7 +1206,7 @@ static unsigned int handle_tw_list(struct llist_node *node,
*ctx = NULL;
cond_resched();
}
- }
+ } while (node);
return count;
}
@@ -1229,22 +1225,6 @@ static inline struct llist_node *io_llist_xchg(struct llist_head *head,
return xchg(&head->first, new);
}
-/**
- * io_llist_cmpxchg - possibly swap all entries in a lock-less list
- * @head: the head of lock-less list to delete all entries
- * @old: expected old value of the first entry of the list
- * @new: new entry as the head of the list
- *
- * perform a cmpxchg on the first entry of the list.
- */
-
-static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
- struct llist_node *old,
- struct llist_node *new)
-{
- return cmpxchg(&head->first, old, new);
-}
-
static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{
struct llist_node *node = llist_del_all(&tctx->task_list);
@@ -1279,9 +1259,7 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
- struct llist_node fake = {};
struct llist_node *node;
- unsigned int loops = 0;
unsigned int count = 0;
if (unlikely(current->flags & PF_EXITING)) {
@@ -1289,21 +1267,9 @@ void tctx_task_work(struct callback_head *cb)
return;
}
- do {
- loops++;
- node = io_llist_xchg(&tctx->task_list, &fake);
- count += handle_tw_list(node, &ctx, &ts, &fake);
-
- /* skip expensive cmpxchg if there are items in the list */
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
- io_submit_flush_completions(ctx);
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- }
- node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
- } while (node != &fake);
+ node = llist_del_all(&tctx->task_list);
+ if (node)
+ count = handle_tw_list(node, &ctx, &ts);
ctx_flush_and_put(ctx, &ts);
@@ -1311,23 +1277,30 @@ void tctx_task_work(struct callback_head *cb)
if (unlikely(atomic_read(&tctx->in_cancel)))
io_uring_drop_tctx_refs(current);
- trace_io_uring_task_work_run(tctx, count, loops);
+ trace_io_uring_task_work_run(tctx, count, 1);
}
static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev;
- struct llist_node *first;
+ struct llist_node *head;
+
+ /* See comment above IO_CQ_WAKE_INIT */
+ BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);
+ /*
+ * We don't know how many requests are in the link and whether they can
+ * even be queued lazily, so fall back to non-lazy.
+ */
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
flags &= ~IOU_F_TWQ_LAZY_WAKE;
- first = READ_ONCE(ctx->work_llist.first);
+ head = READ_ONCE(ctx->work_llist.first);
do {
nr_tw_prev = 0;
- if (first) {
- struct io_kiocb *first_req = container_of(first,
+ if (head) {
+ struct io_kiocb *first_req = container_of(head,
struct io_kiocb,
io_task_work.node);
/*
@@ -1336,17 +1309,29 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
*/
nr_tw_prev = READ_ONCE(first_req->nr_tw);
}
+
+ /*
+ * Theoretically, it can overflow, but that's fine as one of
+ * previous adds should've tried to wake the task.
+ */
nr_tw = nr_tw_prev + 1;
- /* Large enough to fail the nr_wait comparison below */
if (!(flags & IOU_F_TWQ_LAZY_WAKE))
- nr_tw = INT_MAX;
+ nr_tw = IO_CQ_WAKE_FORCE;
req->nr_tw = nr_tw;
- req->io_task_work.node.next = first;
- } while (!try_cmpxchg(&ctx->work_llist.first, &first,
+ req->io_task_work.node.next = head;
+ } while (!try_cmpxchg(&ctx->work_llist.first, &head,
&req->io_task_work.node));
- if (!first) {
+ /*
+ * cmpxchg implies a full barrier, which pairs with the barrier
+ * in set_current_state() on the io_cqring_wait() side. It's used
+ * to ensure that either we see updated ->cq_wait_nr, or waiters
+ * going to sleep will observe the work added to the list, which
+ * is similar to the wait/wake task state sync.
+ */
+
+ if (!head) {
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
@@ -1354,14 +1339,12 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
}
nr_wait = atomic_read(&ctx->cq_wait_nr);
- /* no one is waiting */
- if (!nr_wait)
+ /* not enough or no one is waiting */
+ if (nr_tw < nr_wait)
return;
- /* either not enough or the previous add has already woken it up */
- if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
+ /* the previous add has already woken it up */
+ if (nr_tw_prev >= nr_wait)
return;
- /* pairs with set_current_state() in io_cqring_wait() */
- smp_mb__after_atomic();
wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
}
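
An aside on the wake accounting above: because ->cq_wait_nr is initialised to IO_CQ_WAKE_INIT (-1U), the "no waiter" case loses every nr_tw < nr_wait comparison, while IO_CQ_WAKE_FORCE beats any realistic wait target. A minimal userspace C sketch of that decision, with simplified types; the names mirror the kernel code, but this is illustration only and not part of the patch:

#include <stdbool.h>
#include <stdio.h>

#define IO_CQ_WAKE_INIT  (-1U)                  /* "no waiters" sentinel */
#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) /* larger than any wait target */

/*
 * Should queueing the nr_tw-th work item wake the waiter?  nr_wait models
 * atomic_read(&ctx->cq_wait_nr), nr_tw_prev the counter stored in the
 * previous list head.
 */
static bool should_wake(unsigned nr_wait, unsigned nr_tw, unsigned nr_tw_prev)
{
	if (nr_tw < nr_wait)		/* not enough queued, or no waiter at all */
		return false;
	if (nr_tw_prev >= nr_wait)	/* a previous add already crossed the threshold */
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", should_wake(IO_CQ_WAKE_INIT, 3, 2));	/* 0: nobody waiting */
	printf("%d\n", should_wake(4, 4, 3));			/* 1: 4th item meets the target */
	printf("%d\n", should_wake(4, IO_CQ_WAKE_FORCE, 1));	/* 1: non-lazy add forces a wake */
	return 0;
}
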
@@ -1408,7 +1391,20 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
}
}
-static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
+static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
+ int min_events)
+{
+ if (llist_empty(&ctx->work_llist))
+ return false;
+ if (events < min_events)
+ return true;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+ return false;
+}
+
+static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
+ int min_events)
{
struct llist_node *node;
unsigned int loops = 0;
@@ -1437,18 +1433,20 @@ again:
}
loops++;
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
if (ts->locked) {
io_submit_flush_completions(ctx);
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
}
+
trace_io_uring_local_work_run(ctx, ret, loops);
return ret;
}
-static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
+static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
+ int min_events)
{
struct io_tw_state ts = { .locked = true, };
int ret;
@@ -1456,20 +1454,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
if (llist_empty(&ctx->work_llist))
return 0;
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
/* shouldn't happen! */
if (WARN_ON_ONCE(!ts.locked))
mutex_lock(&ctx->uring_lock);
return ret;
}
-static int io_run_local_work(struct io_ring_ctx *ctx)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
{
struct io_tw_state ts = {};
int ret;
ts.locked = mutex_trylock(&ctx->uring_lock);
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
if (ts.locked)
mutex_unlock(&ctx->uring_lock);
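
The min_events plumbing added above boils down to a single continue/stop predicate. A small standalone C model of io_run_local_work_continue(), illustration only; in the real code the flag is raised only under IORING_SETUP_TASKRUN_FLAG, and "work pending" is !llist_empty(&ctx->work_llist):

#include <stdbool.h>
#include <stdio.h>

/*
 * Keep running local task_work while work is pending and fewer than
 * min_events items were handled; once the target is met but work remains,
 * signal that the submitter should be told to come back (IORING_SQ_TASKRUN
 * in the kernel) and stop.
 */
static bool run_continue(bool work_pending, int events, int min_events,
			 bool *raise_taskrun)
{
	*raise_taskrun = false;
	if (!work_pending)
		return false;
	if (events < min_events)
		return true;
	*raise_taskrun = true;
	return false;
}

int main(void)
{
	bool flag;
	bool cont;

	cont = run_continue(true, 1, 4, &flag);
	printf("%d\n", cont);			/* 1: keep going */
	cont = run_continue(true, 4, 4, &flag);
	printf("%d %d\n", cont, flag);		/* 0 1: stop, raise the flag */
	cont = run_continue(false, 0, 4, &flag);
	printf("%d\n", cont);			/* 0: nothing queued */
	return 0;
}
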
@@ -1665,7 +1663,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
- (void) io_run_local_work_locked(ctx);
+ (void) io_run_local_work_locked(ctx, min);
if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
@@ -1895,14 +1893,15 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
- if (ret != IOU_ISSUE_SKIP_COMPLETE)
- return ret;
-
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
- io_iopoll_req_issued(req, issue_flags);
+ if (ret == IOU_ISSUE_SKIP_COMPLETE) {
+ ret = 0;
+ io_arm_ltimeout(req);
- return 0;
+ /* If the op doesn't have a file, we're not polling for it */
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+ io_iopoll_req_issued(req, issue_flags);
+ }
+ return ret;
}
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
@@ -1953,6 +1952,29 @@ fail:
goto fail;
}
+ /*
+ * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the
+ * submitter task context. Final request completions are handed to the
+ * right context; however, that is not the case for auxiliary CQEs,
+ * which are the main means of operation for multishot requests.
+ * Don't allow any multishot execution from io-wq. It's more restrictive
+ * than necessary and also cleaner.
+ */
+ if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ err = -EBADFD;
+ if (!file_can_poll(req->file))
+ goto fail;
+ if (req->file->f_flags & O_NONBLOCK ||
+ req->file->f_mode & FMODE_NOWAIT) {
+ err = -ECANCELED;
+ if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
+ goto fail;
+ return;
+ } else {
+ req->flags &= ~REQ_F_APOLL_MULTISHOT;
+ }
+ }
+
if (req->flags & REQ_F_FORCE_ASYNC) {
bool opcode_poll = def->pollin || def->pollout;
@@ -2013,9 +2035,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
slot = io_fixed_file_slot(&ctx->file_table, fd);
- file = io_slot_file(slot);
+ if (!req->rsrc_node)
+ __io_req_set_rsrc_node(req, ctx);
req->flags |= io_slot_flags(slot);
- io_req_set_rsrc_node(req, ctx, 0);
+ file = io_slot_file(slot);
out:
io_ring_submit_unlock(ctx, issue_flags);
return file;
@@ -2073,9 +2096,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret))
- io_arm_ltimeout(req);
- else
+ if (unlikely(ret))
io_queue_async(req, ret);
}
@@ -2147,6 +2168,13 @@ static void io_init_req_drain(struct io_kiocb *req)
}
}
+static __cold int io_init_fail_req(struct io_kiocb *req, int err)
+{
+ /* ensure per-opcode data is cleared if we fail before prep */
+ memset(&req->cmd.data, 0, sizeof(req->cmd.data));
+ return err;
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
@@ -2167,29 +2195,29 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
}
def = &io_issue_defs[opcode];
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (sqe_flags & IOSQE_BUFFER_SELECT) {
if (!def->buffer_select)
- return -EOPNOTSUPP;
+ return io_init_fail_req(req, -EOPNOTSUPP);
req->buf_index = READ_ONCE(sqe->buf_group);
}
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
if (sqe_flags & IOSQE_IO_DRAIN) {
if (ctx->drain_disabled)
- return -EOPNOTSUPP;
+ return io_init_fail_req(req, -EOPNOTSUPP);
io_init_req_drain(req);
}
}
if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ return io_init_fail_req(req, -EACCES);
/* knock it to the slow queue path, will be drained there */
if (ctx->drain_active)
req->flags |= REQ_F_FORCE_ASYNC;
@@ -2202,9 +2230,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
if (!def->ioprio && sqe->ioprio)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
if (def->needs_file) {
struct io_submit_state *state = &ctx->submit_state;
@@ -2228,12 +2256,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
- return -EINVAL;
+ return io_init_fail_req(req, -EINVAL);
get_cred(req->creds);
ret = security_uring_override_creds(req->creds);
if (ret) {
put_cred(req->creds);
- return ret;
+ return io_init_fail_req(req, ret);
}
req->flags |= REQ_F_CREDS;
}
@@ -2508,7 +2536,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
if (!llist_empty(&ctx->work_llist)) {
__set_current_state(TASK_RUNNING);
- if (io_run_local_work(ctx) > 0)
+ if (io_run_local_work(ctx, INT_MAX) > 0)
return 0;
}
if (io_run_task_work() > 0)
@@ -2531,7 +2559,7 @@ static bool current_pending_io(void)
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
- int io_wait, ret;
+ int ret;
if (unlikely(READ_ONCE(ctx->check_cq)))
return 1;
@@ -2549,7 +2577,6 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
- io_wait = current->in_iowait;
if (current_pending_io())
current->in_iowait = 1;
ret = 0;
@@ -2557,7 +2584,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
ret = -ETIME;
- current->in_iowait = io_wait;
+ current->in_iowait = 0;
return ret;
}
@@ -2576,26 +2603,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, min_events);
io_run_task_work();
io_cqring_overflow_flush(ctx);
/* if user messes with these they will just get an early return */
if (__io_cqring_events_user(ctx) >= min_events)
return 0;
- if (sig) {
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- sigsz);
- else
-#endif
- ret = set_user_sigmask(sig, sigsz);
-
- if (ret)
- return ret;
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -2612,13 +2626,25 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
}
+ if (sig) {
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+ sigsz);
+ else
+#endif
+ ret = set_user_sigmask(sig, sigsz);
+
+ if (ret)
+ return ret;
+ }
+
trace_io_uring_cqring_wait(ctx, min_events);
do {
+ int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
unsigned long check_cq;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
- int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
-
atomic_set(&ctx->cq_wait_nr, nr_wait);
set_current_state(TASK_INTERRUPTIBLE);
} else {
@@ -2628,7 +2654,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
ret = io_cqring_wait_schedule(ctx, &iowq);
__set_current_state(TASK_RUNNING);
- atomic_set(&ctx->cq_wait_nr, 0);
+ atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
/*
* Run task_work after scheduling and before io_should_wake().
@@ -2637,7 +2663,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
*/
io_run_task_work();
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, nr_wait);
/*
* Non-local task_work will be run on exit to userspace, but
@@ -2708,7 +2734,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
- int ret, i;
+ int ret, i, pinned;
*npages = 0;
@@ -2722,12 +2748,12 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
if (!page_array)
return ERR_PTR(-ENOMEM);
- ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- page_array);
- if (ret != nr_pages) {
-err:
- io_pages_free(&page_array, ret > 0 ? ret : 0);
- return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+
+ pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ page_array);
+ if (pinned != nr_pages) {
+ ret = (pinned < 0) ? pinned : -EFAULT;
+ goto free_pages;
}
page_addr = page_address(page_array[0]);
@@ -2741,7 +2767,7 @@ err:
* didn't support this feature.
*/
if (PageHighMem(page_array[i]))
- goto err;
+ goto free_pages;
/*
* No support for discontig pages for now, should either be a
@@ -2750,13 +2776,17 @@ err:
* just fail them with EINVAL.
*/
if (page_address(page_array[i]) != page_addr)
- goto err;
+ goto free_pages;
page_addr += PAGE_SIZE;
}
*pages = page_array;
*npages = nr_pages;
return page_to_virt(page_array[0]);
+
+free_pages:
+ io_pages_free(&page_array, pinned > 0 ? pinned : 0);
+ return ERR_PTR(ret);
}
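
The reworked error path above keeps the partial pin count ('pinned') separate from the error code ('ret') so cleanup only undoes what actually succeeded. The same pattern in a self-contained form; pin_many()/unpin_many() are made-up stand-ins for pin_user_pages_fast()/unpin_user_pages(), present only so the sketch compiles and runs:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stub: "pins" up to fail_after of nr objects. */
static int pin_many(int nr, int fail_after)
{
	return (fail_after < nr) ? fail_after : nr;
}

static void unpin_many(int nr)
{
	printf("unpinning %d\n", nr);
}

/* Track the partial count separately from the error code, as the patch
 * does, so the failure path unpins exactly what was pinned. */
static int pin_all_or_none(int nr, int fail_after)
{
	int pinned, ret;

	pinned = pin_many(nr, fail_after);
	if (pinned != nr) {
		ret = (pinned < 0) ? pinned : -EFAULT;
		goto free_pages;
	}
	return 0;

free_pages:
	unpin_many(pinned > 0 ? pinned : 0);
	return ret;
}

int main(void)
{
	printf("%d\n", pin_all_or_none(4, 4));	/* 0: everything pinned */
	printf("%d\n", pin_all_or_none(4, 2));	/* -EFAULT (-14) after unpinning 2 */
	return 0;
}
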
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
@@ -2778,14 +2808,15 @@ static void io_rings_free(struct io_ring_ctx *ctx)
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
- ctx->rings = NULL;
- ctx->sq_sqes = NULL;
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
ctx->n_sqe_pages = 0;
}
+
+ ctx->rings = NULL;
+ ctx->sq_sqes = NULL;
}
void *io_mem_alloc(size_t size)
@@ -2838,61 +2869,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return off;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
-
- spin_lock(&ctx->completion_lock);
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
-
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- atomic_set(&ev_fd->refs, 1);
- atomic_set(&ev_fd->ops, 0);
- return 0;
-}
-
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
- call_rcu(&ev_fd->rcu, io_eventfd_ops);
- return 0;
- }
-
- return -ENXIO;
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
@@ -2932,6 +2908,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+ io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
@@ -2944,13 +2921,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_rsrc_node_destroy(ctx, ctx->rsrc_node);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
-
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- ctx->ring_sock->file = NULL; /* so that iput() is called */
- sock_release(ctx->ring_sock);
- }
-#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
@@ -2968,7 +2938,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
@@ -2990,7 +2959,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
percpu_ref_put(&ctx->refs);
}
-static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
+__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
@@ -3049,19 +3018,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
return mask;
}
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- const struct cred *creds;
-
- creds = xa_erase(&ctx->personalities, id);
- if (creds) {
- put_cred(creds);
- return 0;
- }
-
- return -EINVAL;
-}
-
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
@@ -3216,7 +3172,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
* noise and overhead, there's no discernable change in runtime
* over using system_wq.
*/
- queue_work(system_unbound_wq, &ctx->exit_work);
+ queue_work(iou_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
@@ -3292,6 +3248,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
+static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool ret = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+ hash_node) {
+ struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+ struct io_uring_cmd);
+ struct file *file = req->file;
+
+ if (!cancel_all && req->task != task)
+ continue;
+
+ if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+ /* ->sqe isn't available if no async data */
+ if (!req_has_async_data(req))
+ cmd->sqe = NULL;
+ file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+ ret = true;
+ }
+ }
+ io_submit_flush_completions(ctx);
+
+ return ret;
+}
+
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
@@ -3335,10 +3322,13 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
- ret |= io_run_local_work(ctx) > 0;
+ ret |= io_run_local_work(ctx, INT_MAX) > 0;
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
+ ret |= io_waitid_remove_all(ctx, task, cancel_all);
+ ret |= io_futex_remove_all(ctx, task, cancel_all);
+ ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
@@ -3464,14 +3454,15 @@ static void *io_uring_validate_mmap_request(struct file *file,
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
+ struct io_buffer_list *bl;
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- rcu_read_lock();
- ptr = io_pbuf_get_address(ctx, bgid);
- rcu_read_unlock();
- if (!ptr)
- return ERR_PTR(-EINVAL);
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return bl;
+ ptr = bl->buf_ring;
+ io_put_bl(ctx, bl);
break;
}
default:
@@ -3694,7 +3685,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
* it should handle ownership problems if any.
*/
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
- (void)io_run_local_work_locked(ctx);
+ (void)io_run_local_work_locked(ctx, min_complete);
}
mutex_unlock(&ctx->uring_lock);
}
@@ -3838,32 +3829,13 @@ static int io_uring_install_fd(struct file *file)
/*
* Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this
- * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
- * we have to tie this fd to a socket for file garbage collection purposes.
+ * fd to gain access to the SQ/CQ ring details.
*/
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
- struct file *file;
-#if defined(CONFIG_UNIX)
- int ret;
-
- ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
- &ctx->ring_sock);
- if (ret)
- return ERR_PTR(ret);
-#endif
-
- file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL);
-#if defined(CONFIG_UNIX)
- if (IS_ERR(file)) {
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
- } else {
- ctx->ring_sock->file = file;
- }
-#endif
- return file;
}
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
@@ -4130,506 +4102,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_probe *p;
- size_t size;
- int i, ret;
-
- size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
- ret = -EINVAL;
- if (memchr_inv(p, 0, size))
- goto out;
-
- p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
-
- for (i = 0; i < nr_args; i++) {
- p->ops[i].op = i;
- if (!io_issue_defs[i].not_supported)
- p->ops[i].flags = IO_URING_OP_SUPPORTED;
- }
- p->ops_len = i;
-
- ret = 0;
- if (copy_to_user(arg, p, size))
- ret = -EFAULT;
-out:
- kfree(p);
- return ret;
-}
-
-static int io_register_personality(struct io_ring_ctx *ctx)
-{
- const struct cred *creds;
- u32 id;
- int ret;
-
- creds = get_current_cred();
-
- ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
- XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (ret < 0) {
- put_cred(creds);
- return ret;
- }
- return id;
-}
-
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
-{
- struct io_uring_restriction *res;
- size_t size;
- int i, ret;
-
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
- if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
- return -EINVAL;
-
- size = array_size(nr_args, sizeof(*res));
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- res = memdup_user(arg, size);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- ret = 0;
-
- for (i = 0; i < nr_args; i++) {
- switch (res[i].opcode) {
- case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
- break;
- case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
- break;
- case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
- break;
- case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
-
-out:
- /* Reset all restrictions if an error happened */
- if (ret != 0)
- memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
- else
- ctx->restrictions.registered = true;
-
- kfree(res);
- return ret;
-}
-
-static int io_register_enable_rings(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
- WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
- /*
- * Lazy activation attempts would fail if it was polled before
- * submitter_task is set.
- */
- if (wq_has_sleeper(&ctx->poll_wq))
- io_activate_pollwq(ctx);
- }
-
- if (ctx->restrictions.registered)
- ctx->restricted = 1;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- return 0;
-}
-
-static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
- cpumask_var_t new_mask)
-{
- int ret;
-
- if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_wq_cpu_affinity(current->io_uring, new_mask);
- } else {
- mutex_unlock(&ctx->uring_lock);
- ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
- mutex_lock(&ctx->uring_lock);
- }
-
- return ret;
-}
-
-static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
- void __user *arg, unsigned len)
-{
- cpumask_var_t new_mask;
- int ret;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_clear(new_mask);
- if (len > cpumask_size())
- len = cpumask_size();
-
- if (in_compat_syscall()) {
- ret = compat_get_bitmap(cpumask_bits(new_mask),
- (const compat_ulong_t __user *)arg,
- len * 8 /* CHAR_BIT */);
- } else {
- ret = copy_from_user(new_mask, arg, len);
- }
-
- if (ret) {
- free_cpumask_var(new_mask);
- return -EFAULT;
- }
-
- ret = __io_register_iowq_aff(ctx, new_mask);
- free_cpumask_var(new_mask);
- return ret;
-}
-
-static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
-{
- return __io_register_iowq_aff(ctx, NULL);
-}
-
-static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
- __must_hold(&ctx->uring_lock)
-{
- struct io_tctx_node *node;
- struct io_uring_task *tctx = NULL;
- struct io_sq_data *sqd = NULL;
- __u32 new_count[2];
- int i, ret;
-
- if (copy_from_user(new_count, arg, sizeof(new_count)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i] > INT_MAX)
- return -EINVAL;
-
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- sqd = ctx->sq_data;
- if (sqd) {
- /*
- * Observe the correct sqd->lock -> ctx->uring_lock
- * ordering. Fine to drop uring_lock here, we hold
- * a ref to the ctx.
- */
- refcount_inc(&sqd->refs);
- mutex_unlock(&ctx->uring_lock);
- mutex_lock(&sqd->lock);
- mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
- }
- } else {
- tctx = current->io_uring;
- }
-
- BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i])
- ctx->iowq_limits[i] = new_count[i];
- ctx->iowq_limits_set = true;
-
- if (tctx && tctx->io_wq) {
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
- } else {
- memset(new_count, 0, sizeof(new_count));
- }
-
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
-
- if (copy_to_user(arg, new_count, sizeof(new_count)))
- return -EFAULT;
-
- /* that's it for SQPOLL, only the SQPOLL task creates requests */
- if (sqd)
- return 0;
-
- /* now propagate the restriction to all registered users */
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- if (WARN_ON_ONCE(!tctx->io_wq))
- continue;
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- new_count[i] = ctx->iowq_limits[i];
- /* ignore errors, it always returns zero anyway */
- (void)io_wq_max_workers(tctx->io_wq, new_count);
- }
- return 0;
-err:
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
- return ret;
-}
-
-static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
- void __user *arg, unsigned nr_args)
- __releases(ctx->uring_lock)
- __acquires(ctx->uring_lock)
-{
- int ret;
-
- /*
- * We don't quiesce the refs for register anymore and so it can't be
- * dying as we're holding a file ref here.
- */
- if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
- return -ENXIO;
-
- if (ctx->submitter_task && ctx->submitter_task != current)
- return -EEXIST;
-
- if (ctx->restricted) {
- opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
- if (!test_bit(opcode, ctx->restrictions.register_op))
- return -EACCES;
- }
-
- switch (opcode) {
- case IORING_REGISTER_BUFFERS:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_BUFFERS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_buffers_unregister(ctx);
- break;
- case IORING_REGISTER_FILES:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_FILES:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_files_unregister(ctx);
- break;
- case IORING_REGISTER_FILES_UPDATE:
- ret = io_register_files_update(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_EVENTFD:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 0);
- break;
- case IORING_REGISTER_EVENTFD_ASYNC:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 1);
- break;
- case IORING_UNREGISTER_EVENTFD:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_eventfd_unregister(ctx);
- break;
- case IORING_REGISTER_PROBE:
- ret = -EINVAL;
- if (!arg || nr_args > 256)
- break;
- ret = io_probe(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_personality(ctx);
- break;
- case IORING_UNREGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg)
- break;
- ret = io_unregister_personality(ctx, nr_args);
- break;
- case IORING_REGISTER_ENABLE_RINGS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_enable_rings(ctx);
- break;
- case IORING_REGISTER_RESTRICTIONS:
- ret = io_register_restrictions(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_FILES2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_FILES_UPDATE2:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_BUFFERS2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_BUFFERS_UPDATE:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (!arg || !nr_args)
- break;
- ret = io_register_iowq_aff(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_unregister_iowq_aff(ctx);
- break;
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- ret = -EINVAL;
- if (!arg || nr_args != 2)
- break;
- ret = io_register_iowq_max_workers(ctx, arg);
- break;
- case IORING_REGISTER_RING_FDS:
- ret = io_ringfd_register(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_RING_FDS:
- ret = io_ringfd_unregister(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_register_pbuf_ring(ctx, arg);
- break;
- case IORING_UNREGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_unregister_pbuf_ring(ctx, arg);
- break;
- case IORING_REGISTER_SYNC_CANCEL:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_sync_cancel(ctx, arg);
- break;
- case IORING_REGISTER_FILE_ALLOC_RANGE:
- ret = -EINVAL;
- if (!arg || nr_args)
- break;
- ret = io_register_file_alloc_range(ctx, arg);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
- void __user *, arg, unsigned int, nr_args)
-{
- struct io_ring_ctx *ctx;
- long ret = -EBADF;
- struct file *file;
- bool use_registered_ring;
-
- use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
- opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
-
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
-
- if (use_registered_ring) {
- /*
- * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
- * need only dereference our task private array to find it.
- */
- struct io_uring_task *tctx = current->io_uring;
-
- if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
- return -EINVAL;
- fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- file = tctx->registered_rings[fd];
- if (unlikely(!file))
- return -EBADF;
- } else {
- file = fget(fd);
- if (unlikely(!file))
- return -EBADF;
- ret = -EOPNOTSUPP;
- if (!io_is_uring_fops(file))
- goto out_fput;
- }
-
- ctx = file->private_data;
-
- mutex_lock(&ctx->uring_lock);
- ret = __io_uring_register(ctx, opcode, arg, nr_args);
- mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
-out_fput:
- if (!use_registered_ring)
- fput(file);
- return ret;
-}
-
static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
@@ -4704,6 +4176,9 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+ /* top 8 bits are for internal use */
+ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
io_uring_optable_init();
/*
@@ -4719,6 +4194,11 @@ static int __init io_uring_init(void)
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
offsetof(struct io_kiocb, cmd.data),
sizeof_field(struct io_kiocb, cmd.data), NULL);
+ io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
+
+ iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);