diff options
Diffstat (limited to 'io_uring')
37 files changed, 1997 insertions, 1744 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile index 2e1d4e0379..fc1b23c524 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,13 +2,14 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o filetable.o \ - openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o register.o truncate.o +obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ + tctx.o filetable.o rw.o net.o poll.o \ + uring_cmd.o openclose.o sqpoll.o \ + xattr.o nop.o fs.o splice.o sync.o \ + msg_ring.o advise.o openclose.o \ + epoll.o statx.o timeout.o fdinfo.o \ + cancel.o waitid.o register.o \ + truncate.o memmap.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index bf2fb26a65..b7a38a2069 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -4,63 +4,58 @@ /* * Don't allow the cache to grow beyond this size. 
*/ -#define IO_ALLOC_CACHE_MAX 512 - -struct io_cache_entry { - struct io_wq_work_node node; -}; +#define IO_ALLOC_CACHE_MAX 128 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, - struct io_cache_entry *entry) + void *entry) { if (cache->nr_cached < cache->max_cached) { - cache->nr_cached++; - wq_stack_add_head(&entry->node, &cache->list); - kasan_mempool_poison_object(entry); + if (!kasan_mempool_poison_object(entry)) + return false; + cache->entries[cache->nr_cached++] = entry; return true; } return false; } -static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) -{ - return !cache->list.next; -} - -static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) +static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (cache->list.next) { - struct io_cache_entry *entry; + if (cache->nr_cached) { + void *entry = cache->entries[--cache->nr_cached]; - entry = container_of(cache->list.next, struct io_cache_entry, node); kasan_mempool_unpoison_object(entry, cache->elem_size); - cache->list.next = cache->list.next->next; - cache->nr_cached--; return entry; } return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, +/* returns false if the cache was initialized properly */ +static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, size_t size) { - cache->list.next = NULL; - cache->nr_cached = 0; - cache->max_cached = max_nr; - cache->elem_size = size; + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); + if (cache->entries) { + cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; + return false; + } + return true; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, - void (*free)(struct io_cache_entry *)) + void (*free)(const void *)) { - while (1) { - struct io_cache_entry *entry = io_alloc_cache_get(cache); + void *entry; + + if (!cache->entries) + return; - if (!entry) - 
break; + while ((entry = io_alloc_cache_get(cache)) != NULL) free(entry); - } - cache->nr_cached = 0; + + kvfree(cache->entries); + cache->entries = NULL; } #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index acfcdd7f05..a6e58a20ef 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -184,9 +184,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); + ret = io_async_cancel_one(node->task->io_uring, cd); if (ret != -ENOENT) { if (!all) break; diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 76b32e65c0..b33995e00b 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -27,10 +27,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { - if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq) + if (req->cancel_seq_set && sequence == req->work.cancel_seq) return true; - req->flags |= REQ_F_CANCEL_SEQ; + req->cancel_seq_set = true; req->work.cancel_seq = sequence; return false; } diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 8d444dd1b0..b1e0e0d853 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -50,9 +50,9 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, * Caller holds a reference to the file already, we don't need to do * anything else to get an extra reference. 
*/ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) { - struct io_ring_ctx *ctx = f->private_data; + struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 6e86e6188d..997c56d32e 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -84,12 +84,12 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, return ret; file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); + } else { + io_file_bitmap_set(&ctx->file_table, slot_index); } *io_get_tag_slot(ctx->file_data, slot_index) = 0; io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); return 0; } diff --git a/io_uring/futex.c b/io_uring/futex.c index 792a03df58..914848f46b 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -9,7 +9,7 @@ #include "../kernel/futex/futex.h" #include "io_uring.h" -#include "rsrc.h" +#include "alloc_cache.h" #include "futex.h" struct io_futex { @@ -27,27 +27,21 @@ struct io_futex { }; struct io_futex_data { - union { - struct futex_q q; - struct io_cache_entry cache; - }; + struct futex_q q; struct io_kiocb *req; }; -void io_futex_cache_init(struct io_ring_ctx *ctx) -{ - io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_futex_data)); -} +#define IO_FUTEX_ALLOC_CACHE_MAX 32 -static void io_futex_cache_entry_free(struct io_cache_entry *entry) +bool io_futex_cache_init(struct io_ring_ctx *ctx) { - kfree(container_of(entry, struct io_futex_data, cache)); + return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, + sizeof(struct io_futex_data)); } void io_futex_cache_free(struct io_ring_ctx *ctx) { - io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); + io_alloc_cache_free(&ctx->futex_cache, kfree); } 
static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) @@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) + if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) kfree(ifd); __io_futex_complete(req, ts); } @@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) { - struct io_cache_entry *entry; + struct io_futex_data *ifd; - entry = io_alloc_cache_get(&ctx->futex_cache); - if (entry) - return container_of(entry, struct io_futex_data, cache); + ifd = io_alloc_cache_get(&ctx->futex_cache); + if (ifd) + return ifd; return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); } diff --git a/io_uring/futex.h b/io_uring/futex.h index 0847e9e8a1..b8bb09873d 100644 --- a/io_uring/futex.h +++ b/io_uring/futex.h @@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -void io_futex_cache_init(struct io_ring_ctx *ctx); +bool io_futex_cache_init(struct io_ring_ctx *ctx); void io_futex_cache_free(struct io_ring_ctx *ctx); #else static inline int io_futex_cancel(struct io_ring_ctx *ctx, @@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx, { return false; } -static inline void io_futex_cache_init(struct io_ring_ctx *ctx) +static inline bool io_futex_cache_init(struct io_ring_ctx *ctx) { + return false; } static inline void io_futex_cache_free(struct io_ring_ctx *ctx) { diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 522196dfb0..22dac58503 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -23,12 +23,13 @@ #include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) +#define WORKER_INIT_LIMIT 3 enum { - 
IO_WORKER_F_UP = 1, /* up and active */ - IO_WORKER_F_RUNNING = 2, /* account as running */ - IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_BOUND = 8, /* is doing bounded work */ + IO_WORKER_F_UP = 0, /* up and active */ + IO_WORKER_F_RUNNING = 1, /* account as running */ + IO_WORKER_F_FREE = 2, /* worker on free list */ + IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { @@ -44,21 +45,21 @@ enum { */ struct io_worker { refcount_t ref; - unsigned flags; + int create_index; + unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; struct io_wq *wq; struct io_wq_work *cur_work; - struct io_wq_work *next_work; raw_spinlock_t lock; struct completion ref_done; unsigned long create_state; struct callback_head create_work; - int create_index; + int init_retries; union { struct rcu_head rcu; @@ -165,7 +166,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); } static void io_worker_ref_put(struct io_wq *wq) @@ -225,7 +226,7 @@ static void io_worker_exit(struct io_worker *worker) wait_for_completion(&worker->ref_done); raw_spin_lock(&wq->lock); - if (worker->flags & IO_WORKER_F_FREE) + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); raw_spin_unlock(&wq->lock); @@ -410,7 +411,7 @@ static void io_wq_dec_running(struct io_worker *worker) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; if (!atomic_dec_and_test(&acct->nr_running)) @@ -430,8 +431,8 @@ static void io_wq_dec_running(struct io_worker *worker) */ static void __io_worker_busy(struct io_wq *wq, 
struct io_worker *worker) { - if (worker->flags & IO_WORKER_F_FREE) { - worker->flags &= ~IO_WORKER_F_FREE; + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { + clear_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); raw_spin_unlock(&wq->lock); @@ -444,8 +445,8 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) __must_hold(wq->lock) { - if (!(worker->flags & IO_WORKER_F_FREE)) { - worker->flags |= IO_WORKER_F_FREE; + if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { + set_bit(IO_WORKER_F_FREE, &worker->flags); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } @@ -539,7 +540,6 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_lock(&worker->lock); worker->cur_work = work; - worker->next_work = NULL; raw_spin_unlock(&worker->lock); } @@ -564,10 +564,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * clear the stalled flag. */ work = io_get_next_work(acct, worker); - raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wq, worker); - /* * Make sure cancelation can find this, even before * it becomes the active work. That avoids a window @@ -576,11 +573,17 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * current work item for this worker. 
*/ raw_spin_lock(&worker->lock); - worker->next_work = work; + worker->cur_work = work; raw_spin_unlock(&worker->lock); - } else { - break; } + + raw_spin_unlock(&acct->lock); + + if (!work) + break; + + __io_worker_busy(wq, worker); + io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -631,7 +634,8 @@ static int io_wq_worker(void *data) bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + set_mask_bits(&worker->flags, 0, + BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); set_task_comm(current, buf); @@ -695,11 +699,11 @@ void io_wq_worker_running(struct task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (worker->flags & IO_WORKER_F_RUNNING) + if (test_bit(IO_WORKER_F_RUNNING, &worker->flags)) return; - worker->flags |= IO_WORKER_F_RUNNING; + set_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_inc_running(worker); } @@ -713,12 +717,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (!(worker->flags & IO_WORKER_F_RUNNING)) + if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags)) return; - worker->flags &= ~IO_WORKER_F_RUNNING; + clear_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_dec_running(worker); } @@ -732,7 +736,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, raw_spin_lock(&wq->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); list_add_tail_rcu(&worker->all_list, &wq->all_list); - worker->flags |= IO_WORKER_F_FREE; + set_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -742,7 +746,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data) return true; } -static inline bool 
io_should_retry_thread(long err) +static inline bool io_should_retry_thread(struct io_worker *worker, long err) { /* * Prevent perpetual task_work retry, if the task (or its group) is @@ -750,6 +754,8 @@ static inline bool io_should_retry_thread(long err) */ if (fatal_signal_pending(current)) return false; + if (worker->init_retries++ >= WORKER_INIT_LIMIT) + return false; switch (err) { case -EAGAIN: @@ -776,7 +782,7 @@ static void create_worker_cont(struct callback_head *cb) io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); @@ -838,12 +844,12 @@ fail: init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) - worker->flags |= IO_WORKER_F_BOUND; + set_bit(IO_WORKER_F_BOUND, &worker->flags); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wq, worker, tsk); - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; } else { @@ -924,8 +930,12 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); - struct io_cb_cancel_data match; - unsigned work_flags = work->flags; + unsigned long work_flags = work->flags; + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; bool do_create; /* @@ -963,10 +973,6 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ - match.fn = io_wq_work_match_item, - match.data = work, - match.cancel_all = false, - io_acct_cancel_pending_work(wq, acct, &match); } } @@ -1005,8 +1011,7 @@ static bool 
io_wq_worker_cancel(struct io_worker *worker, void *data) * may dereference the passed in work. */ raw_spin_lock(&worker->lock); - if (__io_wq_worker_cancel(worker, match, worker->cur_work) || - __io_wq_worker_cancel(worker, match, worker->next_work)) + if (__io_wq_worker_cancel(worker, match, worker->cur_work)) match->nr_running++; raw_spin_unlock(&worker->lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c170a2b8d2..896e707e06 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -63,7 +63,6 @@ #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> -#include <linux/highmem.h> #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/task_work.h> @@ -95,6 +94,8 @@ #include "waitid.h" #include "futex.h" #include "napi.h" +#include "uring_cmd.h" +#include "memmap.h" #include "timeout.h" #include "poll.h" @@ -170,17 +171,9 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {}, }; #endif -static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) -{ - if (!wq_list_empty(&ctx->submit_state.compl_reqs) || - ctx->submit_state.cqes_count) - __io_submit_flush_completions(ctx); -} - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -253,14 +246,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - struct io_tw_state ts = { .locked = true, }; + struct io_tw_state ts = {}; percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &ts); - if (WARN_ON_ONCE(!ts.locked)) - return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -284,6 +275,7 @@ static __cold struct io_ring_ctx 
*io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; + bool ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -312,14 +304,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_HLIST_HEAD(&ctx->io_buf_list); - io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll)); - io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); - io_futex_cache_init(ctx); + ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_rw)); + ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct uring_cache)); + ret |= io_futex_cache_init(ctx); + if (ret) + goto err; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -337,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; - INIT_WQ_LIST(&ctx->locked_free_list); INIT_HLIST_HEAD(&ctx->waitid_list); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); @@ -349,6 +345,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; err: + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + 
io_alloc_cache_free(&ctx->uring_cache, kfree); + io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); xa_destroy(&ctx->io_bl_xa); @@ -379,7 +381,7 @@ static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); + io_kbuf_drop(req); spin_unlock(&req->ctx->completion_lock); } @@ -471,7 +473,7 @@ static void io_prep_async_work(struct io_kiocb *req) /* don't serialize this request if the fs doesn't need it */ if (should_hash && (req->file->f_flags & O_DIRECT) && - (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) + (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); @@ -498,7 +500,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) +static void io_queue_iowq(struct io_kiocb *req) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -666,28 +668,14 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) io_commit_cqring_flush(ctx); } -static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) -{ - struct io_overflow_cqe *ocqe; - LIST_HEAD(list); - - spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->cq_overflow_list, &list); - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - spin_unlock(&ctx->completion_lock); - - while (!list_empty(&list)) { - ocqe = list_first_entry(&list, struct io_overflow_cqe, list); - list_del(&ocqe->list); - kfree(ocqe); - } -} - -static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { size_t cqe_size = sizeof(struct io_uring_cqe); - if (__io_cqring_events(ctx) == ctx->cq_entries) + lockdep_assert_held(&ctx->uring_lock); + + /* don't abort if we're 
dying, entries must get freed */ + if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; if (ctx->flags & IORING_SETUP_CQE32) @@ -698,11 +686,14 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; - if (!io_get_cqe_overflow(ctx, &cqe, true)) - break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); - memcpy(cqe, &ocqe->cqe, cqe_size); + + if (!dying) { + if (!io_get_cqe_overflow(ctx, &cqe, true)) + break; + memcpy(cqe, &ocqe->cqe, cqe_size); + } list_del(&ocqe->list); kfree(ocqe); } @@ -714,20 +705,17 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_unlock_post(ctx); } -static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) +static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - __io_cqring_overflow_flush(ctx); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); + if (ctx->rings) + __io_cqring_overflow_flush(ctx, true); } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - io_cqring_do_overflow_flush(ctx); + mutex_lock(&ctx->uring_lock); + __io_cqring_overflow_flush(ctx, false); + mutex_unlock(&ctx->uring_lock); } /* can be called by any task */ @@ -817,7 +805,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -void io_req_cqe_overflow(struct io_kiocb *req) +static void io_req_cqe_overflow(struct io_kiocb *req) { io_cqring_event_overflow(req->ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags, @@ -890,151 +878,71 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -static void __io_flush_post_cqes(struct io_ring_ctx *ctx) - 
__must_hold(&ctx->uring_lock) -{ - struct io_submit_state *state = &ctx->submit_state; - unsigned int i; - - lockdep_assert_held(&ctx->uring_lock); - for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; - - if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - spin_unlock(&ctx->completion_lock); - } else { - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - } - } - } - state->cqes_count = 0; -} - -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled && allow_overflow) + if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); io_cq_unlock_post(ctx); return filled; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) -{ - return __io_post_aux_cqe(ctx, user_data, res, cflags, true); -} - /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. 
*/ -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; - u64 user_data = req->cqe.user_data; - struct io_uring_cqe *cqe; + bool posted; lockdep_assert(!io_wq_current_is_worker()); - - if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, false); - lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { - __io_cq_lock(ctx); - __io_flush_post_cqes(ctx); - /* no need to flush - flush is deferred */ - __io_cq_unlock_post(ctx); - } - - /* For defered completions this is not as strict as it is otherwise, - * however it's main job is to prevent unbounded posted completions, - * and in that it works just as well. - */ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - return false; - - cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; - cqe->user_data = user_data; - cqe->res = res; - cqe->flags = cflags; - return true; + __io_cq_lock(ctx); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + ctx->submit_state.cq_flush = true; + __io_cq_unlock_post(ctx); + return posted; } -static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) +static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *rsrc_node = NULL; + + /* + * All execution paths but io-wq use the deferred completions by + * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here. + */ + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ))) + return; + + /* + * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires + * the submitter task context, IOPOLL protects with uring_lock. 
+ */ + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return; + } io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) { if (!io_fill_cqe_req(ctx, req)) io_req_cqe_overflow(req); } + io_cq_unlock_post(ctx); /* - * If we're the last reference to this request, add to our locked - * free_list cache. + * We don't free the request here because we know it's called from + * io-wq only, which holds a reference, so it cannot be the last put. */ - if (req_ref_put_and_test(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) { - if (req->flags & IO_DISARM_MASK) - io_disarm_next(req); - if (req->link) { - io_req_task_queue(req->link); - req->link = NULL; - } - } - io_put_kbuf_comp(req); - if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - io_put_file(req); - - rsrc_node = req->rsrc_node; - /* - * Selected buffer deallocation in io_clean_op() assumes that - * we don't hold ->completion_lock. Clean them here to avoid - * deadlocks. 
- */ - io_put_task_remote(req->task); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - } - io_cq_unlock_post(ctx); - - if (rsrc_node) { - io_ring_submit_lock(ctx, issue_flags); - io_put_rsrc_node(ctx, rsrc_node); - io_ring_submit_unlock(ctx, issue_flags); - } -} - -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (ctx->task_complete && ctx->submitter_task != current) { - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) || - !(ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req, issue_flags); - } else { - mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); - mutex_unlock(&ctx->uring_lock); - } + req_ref_put(req); } void io_req_defer_failed(struct io_kiocb *req, s32 res) @@ -1065,15 +973,6 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_submit_state *state) -{ - spin_lock(&ctx->completion_lock); - wq_list_splice(&ctx->locked_free_list, &state->free_list); - ctx->locked_free_nr = 0; - spin_unlock(&ctx->completion_lock); -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. @@ -1085,18 +984,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - int ret, i; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. 
- */ - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (!io_req_cache_empty(ctx)) - return true; - } + int ret; ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -1112,8 +1000,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); - for (i = 0; i < ret; i++) { - struct io_kiocb *req = reqs[i]; + while (ret--) { + struct io_kiocb *req = reqs[ret]; io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); @@ -1163,11 +1051,9 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ts->locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - ts->locked = false; - } + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } @@ -1191,8 +1077,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, if (req->ctx != ctx) { ctx_flush_and_put(ctx, &ts); ctx = req->ctx; - /* if not contended, grab and improve batching */ - ts.locked = mutex_trylock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, @@ -1374,8 +1259,8 @@ static void io_req_normal_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sqd = ctx->sq_data; - if (wq_has_sleeper(&sqd->wait)) - wake_up(&sqd->wait); + if (sqd->thread) + __set_notify_signal(sqd->thread); return; } @@ -1453,11 +1338,9 @@ again: if (io_run_local_work_continue(ctx, ret, min_events)) goto again; - if (ts->locked) { - io_submit_flush_completions(ctx); - if (io_run_local_work_continue(ctx, ret, min_events)) - goto again; - } + io_submit_flush_completions(ctx); + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; trace_io_uring_local_work_run(ctx, ret, loops); return ret; 
@@ -1466,17 +1349,11 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) { - struct io_tw_state ts = { .locked = true, }; - int ret; + struct io_tw_state ts = {}; if (llist_empty(&ctx->work_llist)) return 0; - - ret = __io_run_local_work(ctx, &ts, min_events); - /* shouldn't happen! */ - if (WARN_ON_ONCE(!ts.locked)) - mutex_lock(&ctx->uring_lock); - return ret; + return __io_run_local_work(ctx, &ts, min_events); } static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) @@ -1484,11 +1361,9 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) struct io_tw_state ts = {}; int ret; - ts.locked = mutex_trylock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); ret = __io_run_local_work(ctx, &ts, min_events); - if (ts.locked) - mutex_unlock(&ctx->uring_lock); - + mutex_unlock(&ctx->uring_lock); return ret; } @@ -1505,7 +1380,7 @@ void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, ts); + io_queue_iowq(req); else io_queue_sqe(req); } @@ -1550,7 +1425,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) kfree(apoll); req->flags &= ~REQ_F_POLLED; } @@ -1560,10 +1435,9 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, io_clean_op(req); } io_put_file(req); - - io_req_put_rsrc_locked(req, ctx); - + io_put_rsrc_node(ctx, req->rsrc_node); io_put_task(req->task); + node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); @@ -1576,9 +1450,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node; __io_cq_lock(ctx); - /* must come first to preserve CQE ordering in failure cases */ - if (state->cqes_count) - 
__io_flush_post_cqes(ctx); __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1600,6 +1471,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + ctx->submit_state.cq_flush = false; } static unsigned io_cqring_events(struct io_ring_ctx *ctx) @@ -1642,13 +1514,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) unsigned int nr_events = 0; unsigned long check_cq; + lockdep_assert_held(&ctx->uring_lock); + if (!io_allowed_run_tw(ctx)) return -EEXIST; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx); + __io_cqring_overflow_flush(ctx, false); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. @@ -1711,10 +1585,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (ts->locked) - io_req_complete_defer(req); - else - io_req_complete_post(req, IO_URING_F_UNLOCKED); + io_req_complete_defer(req); } /* @@ -1785,8 +1656,10 @@ io_req_flags_t io_file_get_flags(struct file *file) bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); - req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1794,25 +1667,6 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - /* assign early for deferred execution for 
non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!cdef->prep_async) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (!def->manual_alloc) { - if (io_alloc_async_data(req)) - return -EAGAIN; - } - return cdef->prep_async(req); -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -2093,7 +1947,7 @@ static void io_queue_async(struct io_kiocb *req, int ret) break; case IO_APOLL_ABORTED: io_kbuf_recycle(req, 0); - io_queue_iowq(req, NULL); + io_queue_iowq(req); break; case IO_APOLL_OK: break; @@ -2130,17 +1984,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { - int ret = io_req_prep_async(req); - - if (unlikely(ret)) { - io_req_defer_failed(req, ret); - return; - } - if (unlikely(req->ctx->drain_active)) io_drain_req(req); else - io_queue_iowq(req, NULL); + io_queue_iowq(req); } } @@ -2211,6 +2058,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->rsrc_node = NULL; req->task = current; + req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; @@ -2346,10 +2194,6 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. 
*/ if (unlikely(link->head)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; @@ -2597,8 +2441,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx, min_events); io_run_task_work(); - io_cqring_overflow_flush(ctx); - /* if user messes with these they will just get an early return */ + + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) + io_cqring_do_overflow_flush(ctx); if (__io_cqring_events_user(ctx) >= min_events) return 0; @@ -2698,89 +2543,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -void io_mem_free(void *ptr) -{ - if (!ptr) - return; - - folio_put(virt_to_folio(ptr)); -} - -static void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array = *pages; - int i; - - if (!page_array) - return; - - for (i = 0; i < npages; i++) - unpin_user_page(page_array[i]); - kvfree(page_array); - *pages = NULL; -} - -static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) -{ - struct page **page_array; - unsigned int nr_pages; - void *page_addr; - int ret, i, pinned; - - *npages = 0; - - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > USHRT_MAX) - return ERR_PTR(-EINVAL); - page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!page_array) - return ERR_PTR(-ENOMEM); - - - pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - page_array); - if (pinned != nr_pages) { - ret = (pinned < 0) ? 
pinned : -EFAULT; - goto free_pages; - } - - page_addr = page_address(page_array[0]); - for (i = 0; i < nr_pages; i++) { - ret = -EINVAL; - - /* - * Can't support mapping user allocated ring memory on 32-bit - * archs where it could potentially reside in highmem. Just - * fail those with -EINVAL, just like we did on kernels that - * didn't support this feature. - */ - if (PageHighMem(page_array[i])) - goto free_pages; - - /* - * No support for discontig pages for now, should either be a - * single normal page, or a huge page. Later on we can add - * support for remapping discontig pages, for now we will - * just fail them with EINVAL. - */ - if (page_address(page_array[i]) != page_addr) - goto free_pages; - page_addr += PAGE_SIZE; - } - - *pages = page_array; - *npages = nr_pages; - return page_to_virt(page_array[0]); - -free_pages: - io_pages_free(&page_array, pinned > 0 ? pinned : 0); - return ERR_PTR(ret); -} - static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size_t size) { @@ -2798,30 +2560,23 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, + true); + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, + true); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); ctx->n_sqe_pages = 0; + vunmap(ctx->rings); + vunmap(ctx->sq_sqes); } ctx->rings = NULL; ctx->sq_sqes = NULL; } -void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - void *ret; - - ret = (void *) __get_free_pages(gfp, get_order(size)); - if (ret) - return ret; - return ERR_PTR(-ENOMEM); -} - static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, unsigned int 
cq_entries, size_t *sq_offset) { @@ -2843,13 +2598,11 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries #endif if (ctx->flags & IORING_SETUP_NO_SQARRAY) { - if (sq_offset) - *sq_offset = SIZE_MAX; + *sq_offset = SIZE_MAX; return off; } - if (sq_offset) - *sq_offset = off; + *sq_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) @@ -2867,7 +2620,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) int nr = 0; mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); @@ -2879,11 +2631,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_rsrc_node_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct io_rsrc_node, cache)); -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -2898,8 +2645,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); @@ -2915,13 +2664,12 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); - io_kbuf_mmap_list_free(ctx); percpu_ref_exit(&ctx->refs); 
free_uid(ctx->user); @@ -3145,17 +2893,8 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); - if (ctx->rings) - io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); - /* - * If we failed setting up the ctx, we might not have any rings - * and therefore did not submit any requests - */ - if (ctx->rings) - io_kill_timeouts(ctx, NULL, true); - flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); @@ -3241,37 +2980,6 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool ret = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, - hash_node) { - struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, - struct io_uring_cmd); - struct file *file = req->file; - - if (!cancel_all && req->task != task) - continue; - - if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; - file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); - ret = true; - } - } - io_submit_flush_completions(ctx); - - return ret; -} - static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) @@ -3326,6 +3034,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work() > 0; + else + ret |= flush_delayed_work(&ctx->fallback_work); return ret; } @@ -3361,8 +3071,11 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) bool loop = false; io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, 
!cancel_all)) + break; + /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); + inflight = tctx_inflight(tctx, false); if (!inflight) break; @@ -3424,137 +3137,6 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -static void *io_uring_validate_mmap_request(struct file *file, - loff_t pgoff, size_t sz) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - - switch (offset & IORING_OFF_MMAP_MASK) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - ptr = ctx->rings; - break; - case IORING_OFF_SQES: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - ptr = ctx->sq_sqes; - break; - case IORING_OFF_PBUF_RING: { - struct io_buffer_list *bl; - unsigned int bgid; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return bl; - ptr = bl->buf_ring; - io_put_bl(ctx, bl); - break; - } - default: - return ERR_PTR(-EINVAL); - } - - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); - - return ptr; -} - -#ifdef CONFIG_MMU - -static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - size_t sz = vma->vm_end - vma->vm_start; - unsigned long pfn; - void *ptr; - - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); -} - -static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - /* - * Do not allow to map to user-provided 
address to avoid breaking the - * aliasing rules. Userspace is not able to guess the offset address of - * kernel kmalloc()ed memory area. - */ - if (addr) - return -EINVAL; - - ptr = io_uring_validate_mmap_request(filp, pgoff, len); - if (IS_ERR(ptr)) - return -ENOMEM; - - /* - * Some architectures have strong cache aliasing requirements. - * For such architectures we need a coherent mapping which aliases - * kernel memory *and* userspace memory. To achieve that: - * - use a NULL file pointer to reference physical memory, and - * - use the kernel virtual address of the shared io_uring context - * (instead of the userspace-provided address, which has to be 0UL - * anyway). - * - use the same pgoff which the get_unmapped_area() uses to - * calculate the page colouring. - * For architectures without such aliasing requirements, the - * architecture will return any suitable mapping because addr is 0. - */ - filp = NULL; - flags |= MAP_SHARED; - pgoff = 0; /* has been translated to ptr above */ -#ifdef SHM_COLOUR - addr = (uintptr_t) ptr; - pgoff = addr >> PAGE_SHIFT; -#else - addr = 0UL; -#endif - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -#else /* !CONFIG_MMU */ - -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; -} - -static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) -{ - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; -} - -static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - ptr = io_uring_validate_mmap_request(file, pgoff, len); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - return (unsigned long) ptr; -} - -#endif /* !CONFIG_MMU */ - static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) { if (flags & IORING_ENTER_EXT_ARG) { @@ -3647,8 +3229,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx); - if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; @@ -3737,11 +3317,9 @@ out: static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, + .get_unmapped_area = io_uring_get_unmapped_area, #ifndef CONFIG_MMU - .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, -#else - .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS @@ -3770,7 +3348,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_mem_alloc(size); + rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); else rings = io_rings_map(ctx, p->cq_off.user_addr, size); @@ -3795,7 +3373,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, } if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_mem_alloc(size); + ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); else ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); @@ -3994,7 +3572,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, 
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | + IORING_FEAT_RECVSEND_BUNDLE; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6426ee3822..726e6367af 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -62,16 +62,12 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) } bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); -void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); - struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -79,7 +75,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); @@ -97,7 +92,6 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int 
io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); -int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); @@ -110,9 +104,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -void *io_mem_alloc(size_t size); -void io_mem_free(void *ptr); - enum { IO_EVENTFD_OP_SIGNAL_BIT, IO_EVENTFD_OP_FREE_BIT, @@ -121,9 +112,9 @@ enum { void io_eventfd_ops(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); -#if defined(CONFIG_PROVE_LOCKING) static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { +#if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); if (ctx->flags & IORING_SETUP_IOPOLL) { @@ -142,18 +133,21 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) else lockdep_assert(current == ctx->submitter_task); } -} -#else -static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) -{ -} #endif +} static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, 0); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || + ctx->submit_state.cq_flush) + __io_submit_flush_completions(ctx); +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) @@ -340,15 +334,12 @@ static inline int io_run_task_work(void) static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); + return task_work_pending(current) || !llist_empty(&ctx->work_llist); } static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!ts->locked) { - mutex_lock(&ctx->uring_lock); - 
ts->locked = true; - } + lockdep_assert_held(&ctx->uring_lock); } /* @@ -442,7 +433,7 @@ static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) return true; - if (file_can_poll(req->file)) { + if (req->file && file_can_poll(req->file)) { req->flags |= REQ_F_CAN_POLL; return true; } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3aa16e27f5..c95dc1736d 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/poll.h> +#include <linux/vmalloc.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -14,8 +15,7 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" - -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) +#include "memmap.h" /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) @@ -31,25 +31,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_buf_free { - struct hlist_node list; - void *mem; - size_t size; - int inuse; -}; - -static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - return xa_load(&ctx->io_bl_xa, bgid); -} - static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, bgid); + return xa_load(&ctx->io_bl_xa, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -130,6 +117,27 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return NULL; } +static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + struct iovec *iov) +{ + void __user *buf; + + buf = io_provided_buffer_select(req, len, bl); + if (unlikely(!buf)) + return -ENOBUFS; + + iov[0].iov_base = buf; + iov[0].iov_len = *len; + return 0; +} + +static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, + __u16 head, 
__u16 mask) +{ + return &br->bufs[head & mask]; +} + static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) @@ -145,19 +153,10 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (head + 1 == tail) req->flags |= REQ_F_BL_EMPTY; - head &= bl->mask; - /* mmaped buffers are always contig */ - if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } + buf = io_ring_head_to_buf(br, head, bl->mask); if (*len == 0 || *len > buf->len) *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; + req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; @@ -172,6 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). 
*/ + req->flags &= ~REQ_F_BUFFERS_COMMIT; req->buf_list = NULL; bl->head++; } @@ -198,22 +198,134 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -/* - * Mark the given mapped range as free for reuse - */ -static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +/* cap it at a reasonable 256, will be one page even for 4K */ +#define PEEK_MAX_IMPORT 256 + +static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_buffer_list *bl) { - struct io_buf_free *ibf; + struct io_uring_buf_ring *br = bl->buf_ring; + struct iovec *iov = arg->iovs; + int nr_iovs = arg->nr_iovs; + __u16 nr_avail, tail, head; + struct io_uring_buf *buf; + + tail = smp_load_acquire(&br->tail); + head = bl->head; + nr_avail = min_t(__u16, tail - head, UIO_MAXIOV); + if (unlikely(!nr_avail)) + return -ENOBUFS; + + buf = io_ring_head_to_buf(br, head, bl->mask); + if (arg->max_len) { + int needed; + + needed = (arg->max_len + buf->len - 1) / buf->len; + needed = min(needed, PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } + + /* + * only alloc a bigger array if we know we have data to map, eg not + * a speculative peek operation. 
+ */ + if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) { + iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + if (arg->mode & KBUF_MODE_FREE) + kfree(arg->iovs); + arg->iovs = iov; + nr_iovs = nr_avail; + } else if (nr_avail < nr_iovs) { + nr_iovs = nr_avail; + } + + /* set it to max, if not set, so we can use it unconditionally */ + if (!arg->max_len) + arg->max_len = INT_MAX; + + req->buf_index = buf->bid; + do { + /* truncate end piece, if needed */ + if (buf->len > arg->max_len) + buf->len = arg->max_len; + + iov->iov_base = u64_to_user_ptr(buf->addr); + iov->iov_len = buf->len; + iov++; + + arg->out_len += buf->len; + arg->max_len -= buf->len; + if (!arg->max_len) + break; + + buf = io_ring_head_to_buf(br, ++head, bl->mask); + } while (--nr_iovs); + + if (head == tail) + req->flags |= REQ_F_BL_EMPTY; + + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + return iov - arg->iovs; +} + +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = -ENOENT; + + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + goto out_unlock; - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - if (bl->buf_ring == ibf->mem) { - ibf->inuse = 0; - return; + if (bl->is_buf_ring) { + ret = io_ring_buffers_peek(req, arg, bl); + /* + * Don't recycle these buffers if we need to go through poll. + * Nobody else can use them anyway, and holding on to provided + * buffers for a send/write operation would happen on the app + * side anyway with normal buffers. Besides, we already + * committed them, they cannot be put back in the queue. 
+ */ + if (ret > 0) { + req->flags |= REQ_F_BL_NO_RECYCLE; + req->buf_list->head += ret; } + } else { + ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + } +out_unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + return -ENOENT; + + if (bl->is_buf_ring) { + ret = io_ring_buffers_peek(req, arg, bl); + if (ret > 0) + req->flags |= REQ_F_BUFFERS_COMMIT; + return ret; } - /* can't happen... */ - WARN_ON_ONCE(1); + /* don't support multiple buffer selections for legacy */ + return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } static int __io_remove_buffers(struct io_ring_ctx *ctx, @@ -227,22 +339,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_buf_ring) { i = bl->buf_ring->tail - bl->head; - if (bl->is_mmap) { - /* - * io_kbuf_list_free() will free the page(s) at - * ->release() time. 
- */ - io_kbuf_mark_free(ctx, bl); - bl->buf_ring = NULL; - bl->is_mmap = 0; - } else if (bl->buf_nr_pages) { + if (bl->buf_nr_pages) { int j; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (!bl->is_mmap) { + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + } + io_pages_unmap(bl->buf_ring, &bl->buf_pages, + &bl->buf_nr_pages, bl->is_mmap); + bl->is_mmap = 0; } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); @@ -498,9 +604,9 @@ err: static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - struct io_uring_buf_ring *br; + struct io_uring_buf_ring *br = NULL; struct page **pages; - int i, nr_pages; + int nr_pages, ret; pages = io_pin_pages(reg->ring_addr, flex_array_size(br, bufs, reg->ring_entries), @@ -508,18 +614,12 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, if (IS_ERR(pages)) return PTR_ERR(pages); - /* - * Apparently some 32-bit boxes (ARM) will return highmem pages, - * which then need to be mapped. We could support that, but it'd - * complicate the code and slowdown the common cases quite a bit. - * So just error out, returning -EINVAL just like we did on kernels - * that didn't support mapped buffer rings. - */ - for (i = 0; i < nr_pages; i++) - if (PageHighMem(pages[i])) - goto error_unpin; + br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!br) { + ret = -ENOMEM; + goto error_unpin; + } - br = page_address(pages[0]); #ifdef SHM_COLOUR /* * On platforms that have specific aliasing requirements, SHM_COLOUR @@ -530,8 +630,10 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, * should use IOU_PBUF_RING_MMAP instead, and liburing will handle * this transparently. 
*/ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { + ret = -EINVAL; goto error_unpin; + } #endif bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; @@ -540,69 +642,26 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->is_mmap = 0; return 0; error_unpin: - for (i = 0; i < nr_pages; i++) - unpin_user_page(pages[i]); + unpin_user_pages(pages, nr_pages); kvfree(pages); - return -EINVAL; -} - -/* - * See if we have a suitable region that we can reuse, rather than allocate - * both a new io_buf_free and mem region again. We leave it on the list as - * even a reused entry will need freeing at ring release. - */ -static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, - size_t ring_size) -{ - struct io_buf_free *ibf, *best = NULL; - size_t best_dist; - - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - size_t dist; - - if (ibf->inuse || ibf->size < ring_size) - continue; - dist = ibf->size - ring_size; - if (!best || dist < best_dist) { - best = ibf; - if (!dist) - break; - best_dist = dist; - } - } - - return best; + vunmap(br); + return ret; } static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - struct io_buf_free *ibf; size_t ring_size; - void *ptr; ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - /* Reuse existing entry, if we can */ - ibf = io_lookup_buf_free_entry(ctx, ring_size); - if (!ibf) { - ptr = io_mem_alloc(ring_size); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - /* Allocate and store deferred free entry */ - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); - if (!ibf) { - io_mem_free(ptr); - return -ENOMEM; - } - ibf->mem = ptr; - ibf->size = ring_size; - hlist_add_head(&ibf->list, &ctx->io_buf_list); + bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); + if (IS_ERR(bl->buf_ring)) { + bl->buf_ring = NULL; + return -ENOMEM; } - 
ibf->inuse = 1; - bl->buf_ring = ibf->mem; + bl->is_buf_ring = 1; bl->is_mmap = 1; return 0; @@ -750,18 +809,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, return ERR_PTR(-EINVAL); } -/* - * Called at or after ->release(), free the mmap'ed buffers that we used - * for memory mapped provided buffer rings. - */ -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) { - struct io_buf_free *ibf; - struct hlist_node *tmp; + struct io_ring_ctx *ctx = file->private_data; + loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; + struct io_buffer_list *bl; + int bgid, ret; - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { - hlist_del(&ibf->list); - io_mem_free(ibf->mem); - kfree(ibf); - } + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return PTR_ERR(bl); + + ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); + io_put_bl(ctx, bl); + return ret; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index df365b8860..b90aca3a57 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -41,8 +41,26 @@ struct io_buffer { __u16 bgid; }; +enum { + /* can alloc a bigger vec */ + KBUF_MODE_EXPAND = 1, + /* if bigger vec allocated, free old one */ + KBUF_MODE_FREE = 2, +}; + +struct buf_sel_arg { + struct iovec *iovs; + size_t out_len; + size_t max_len; + int nr_iovs; + int mode; +}; + void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags); +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -55,8 +73,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, 
void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); - void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); @@ -64,6 +80,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid); +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { @@ -76,7 +93,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) */ if (req->buf_list) { req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; + req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } return false; @@ -100,11 +117,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -static inline void __io_put_kbuf_ring(struct io_kiocb *req) +static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) { - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->buf_list->head++; + struct io_buffer_list *bl = req->buf_list; + + if (bl) { + if (req->flags & REQ_F_BUFFERS_COMMIT) { + bl->head += nr; + req->flags &= ~REQ_F_BUFFERS_COMMIT; + } + req->buf_index = bl->bgid; } req->flags &= ~REQ_F_BUFFER_RING; } @@ -113,7 +135,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, 1); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); @@ -121,22 +143,18 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, } } -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +static inline void io_kbuf_drop(struct io_kiocb *req) { - unsigned int ret; 
- lockdep_assert_held(&req->ctx->completion_lock); if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; + return; - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); - return ret; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) +static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) { unsigned int ret; @@ -145,9 +163,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); if (req->flags & REQ_F_BUFFER_RING) - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, nbufs); else __io_put_kbuf(req, issue_flags); return ret; } + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) +{ + return __io_put_kbufs(req, 1, issue_flags); +} + +static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) +{ + return __io_put_kbufs(req, nbufs, issue_flags); +} #endif diff --git a/io_uring/memmap.c b/io_uring/memmap.c new file mode 100644 index 0000000000..a0f32a255f --- /dev/null +++ b/io_uring/memmap.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> +#include <asm/shmparam.h> + +#include "memmap.h" +#include "kbuf.h" + +static void *io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) +{ + struct page *page; + int i, order; + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-ENOMEM); + else if (order) + gfp |= __GFP_COMP; + + page = alloc_pages(gfp, order); + if (!page) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_pages; i++) + pages[i] = page + i; + 
+ return page_address(page); +} + +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, + gfp_t gfp) +{ + void *ret; + int i; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(gfp); + if (!pages[i]) + goto err; + } + + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (ret) + return ret; +err: + while (i--) + put_page(pages[i]); + return ERR_PTR(-ENOMEM); +} + +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages; + int nr_pages; + void *ret; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) + goto done; + + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) { +done: + *out_pages = pages; + *npages = nr_pages; + return ret; + } + + kvfree(pages); + *out_pages = NULL; + *npages = 0; + return ret; +} + +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages) +{ + bool do_vunmap = false; + + if (!ptr) + return; + + if (put_pages && *npages) { + struct page **to_free = *pages; + int i; + + /* + * Only did vmap for the non-compound multiple page case. + * For the compound page, we just need to put the head. 
+ */ + if (PageCompound(to_free[0])) + *npages = 1; + else if (*npages > 1) + do_vunmap = true; + for (i = 0; i < *npages; i++) + put_page(to_free[i]); + } + if (do_vunmap) + vunmap(ptr); + kvfree(*pages); + *pages = NULL; + *npages = 0; +} + +void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + kvfree(pages); + return ERR_PTR(ret); +} + +void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) +{ + struct page **page_array; + unsigned int nr_pages; + void *page_addr; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + + nr_pages = 0; + page_array = io_pin_pages(uaddr, size, &nr_pages); + if (IS_ERR(page_array)) + return page_array; + + page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); + if (page_addr) { + *pages = page_array; + *npages = nr_pages; + return page_addr; + } + + io_pages_free(&page_array, nr_pages); + return ERR_PTR(-ENOMEM); +} + +static void *io_uring_validate_mmap_request(struct file *file, loff_t 
pgoff, + size_t sz) +{ + struct io_ring_ctx *ctx = file->private_data; + loff_t offset = pgoff << PAGE_SHIFT; + + switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + return ctx->rings; + case IORING_OFF_SQES: + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + return ctx->sq_sqes; + case IORING_OFF_PBUF_RING: { + struct io_buffer_list *bl; + unsigned int bgid; + void *ptr; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return bl; + ptr = bl->buf_ring; + io_put_bl(ctx, bl); + return ptr; + } + } + + return ERR_PTR(-EINVAL); +} + +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages) +{ + unsigned long nr_pages = npages; + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); +} + +#ifdef CONFIG_MMU + +__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct io_ring_ctx *ctx = file->private_data; + size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned int npages; + void *ptr; + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); + case IORING_OFF_PBUF_RING: + return io_pbuf_mmap(file, vma); + } + + return -EINVAL; +} + +unsigned long io_uring_get_unmapped_area(struct file *filp, 
unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + void *ptr; + + /* + * Do not allow to map to user-provided address to avoid breaking the + * aliasing rules. Userspace is not able to guess the offset address of + * kernel kmalloc()ed memory area. + */ + if (addr) + return -EINVAL; + + ptr = io_uring_validate_mmap_request(filp, pgoff, len); + if (IS_ERR(ptr)) + return -ENOMEM; + + /* + * Some architectures have strong cache aliasing requirements. + * For such architectures we need a coherent mapping which aliases + * kernel memory *and* userspace memory. To achieve that: + * - use a NULL file pointer to reference physical memory, and + * - use the kernel virtual address of the shared io_uring context + * (instead of the userspace-provided address, which has to be 0UL + * anyway). + * - use the same pgoff which the get_unmapped_area() uses to + * calculate the page colouring. + * For architectures without such aliasing requirements, the + * architecture will return any suitable mapping because addr is 0. + */ + filp = NULL; + flags |= MAP_SHARED; + pgoff = 0; /* has been translated to ptr above */ +#ifdef SHM_COLOUR + addr = (uintptr_t) ptr; + pgoff = addr >> PAGE_SHIFT; +#else + addr = 0UL; +#endif + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); +} + +#else /* !CONFIG_MMU */ + +int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; +} + +unsigned int io_uring_nommu_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; +} + +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + void *ptr; + + ptr = io_uring_validate_mmap_request(file, pgoff, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + return (unsigned long) ptr; +} + +#endif /* !CONFIG_MMU */ diff --git a/io_uring/memmap.h b/io_uring/memmap.h new file mode 100644 index 0000000000..5cec5b7ac4 --- /dev/null +++ b/io_uring/memmap.h @@ -0,0 +1,25 @@ +#ifndef IO_URING_MEMMAP_H +#define IO_URING_MEMMAP_H + +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +void io_pages_free(struct page ***pages, int npages); +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages); + +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size); +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages); + +void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size); + +#ifndef CONFIG_MMU +unsigned int io_uring_nommu_mmap_capabilities(struct file *file); +#endif +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +int io_uring_mmap(struct file *file, struct vm_area_struct *vma); + +#endif diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index cd6dcf634b..81c4a9d437 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -83,7 +83,7 @@ static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) return -EOWNERDEAD; init_task_work(&msg->tw, func); - if (task_work_add(ctx->submitter_task, &msg->tw, TWA_SIGNAL)) + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) return -EOWNERDEAD; return 
IOU_ISSUE_SKIP_COMPLETE; @@ -147,13 +147,11 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; - io_double_unlock_ctx(target_ctx); - } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; } + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) + ret = 0; + if (target_ctx->flags & IORING_SETUP_IOPOLL) + io_double_unlock_ctx(target_ctx); return ret; } diff --git a/io_uring/napi.c b/io_uring/napi.c index 883a1a6659..080d10e0e0 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -222,6 +222,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) }; struct io_uring_napi napi; + if (ctx->flags & IORING_SETUP_IOPOLL) + return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) @@ -261,12 +263,14 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) } /* - * __io_napi_adjust_timeout() - Add napi id to the busy poll list + * __io_napi_adjust_timeout() - adjust busy loop timeout * @ctx: pointer to io-uring context structure * @iowq: pointer to io wait queue * @ts: pointer to timespec or NULL * * Adjust the busy loop timeout according to timespec and busy poll timeout. + * If the specified NAPI timeout is bigger than the wait timeout, then adjust + * the NAPI timeout accordingly. 
*/ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct timespec64 *ts) @@ -274,16 +278,16 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); if (ts) { - struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); - - if (timespec64_compare(ts, &poll_to_ts) > 0) { - *ts = timespec64_sub(*ts, poll_to_ts); - } else { - u64 to = timespec64_to_ns(ts); - - do_div(to, 1000); - ts->tv_sec = 0; - ts->tv_nsec = 0; + struct timespec64 poll_to_ts; + + poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + if (timespec64_compare(ts, &poll_to_ts) < 0) { + s64 poll_to_ns = timespec64_to_ns(ts); + if (poll_to_ns > 0) { + u64 val = poll_to_ns + 999; + do_div(val, (s64) 1000); + poll_to = val; + } } } diff --git a/io_uring/net.c b/io_uring/net.c index 4afb475d41..09bb82bc20 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -28,6 +28,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + int iou_flags; u32 file_slot; unsigned long nofile; }; @@ -57,7 +58,7 @@ struct io_sr_msg { struct user_msghdr __user *umsg; void __user *buf; }; - unsigned len; + int len; unsigned done_io; unsigned msg_flags; unsigned nr_multishot_loops; @@ -115,80 +116,85 @@ static bool io_net_retry(struct socket *sock, int flags) return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } +static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) +{ + if (kmsg->free_iov) { + kfree(kmsg->free_iov); + kmsg->free_iov_nr = 0; + kmsg->free_iov = NULL; + } +} + static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; + struct iovec *iov; - if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED) + /* can't recycle, ensure we free the iovec if we have one */ + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_netmsg_iovec_free(hdr); return; + } /* Let 
normal cleanup path reap it if we fail adding to the cache */ - if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + iov = hdr->free_iov; + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { + if (iov) + kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } } -static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, - unsigned int issue_flags) +static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct io_async_msghdr *hdr; - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->netmsg_cache); - if (entry) { - hdr = container_of(entry, struct io_async_msghdr, cache); - hdr->free_iov = NULL; - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; + hdr = io_alloc_cache_get(&ctx->netmsg_cache); + if (hdr) { + if (hdr->free_iov) { + kasan_mempool_unpoison_object(hdr->free_iov, + hdr->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; } + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; } if (!io_alloc_async_data(req)) { hdr = req->async_data; + hdr->free_iov_nr = 0; hdr->free_iov = NULL; return hdr; } return NULL; } -static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) +/* assign new iovec to kmsg, if we need to */ +static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, + struct iovec *iov) { - /* ->prep_async is always called from the submission context */ - return io_msg_alloc_async(req, 0); + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; + if (kmsg->free_iov) + kfree(kmsg->free_iov); + kmsg->free_iov = iov; + } + return 0; } -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg, - unsigned int issue_flags) +static inline void io_mshot_prep_retry(struct io_kiocb *req, + struct 
io_async_msghdr *kmsg) { - struct io_async_msghdr *async_msg; - - if (req_has_async_data(req)) - return -EAGAIN; - async_msg = io_msg_alloc_async(req, issue_flags); - if (!async_msg) { - kfree(kmsg->free_iov); - return -ENOMEM; - } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - if (async_msg->msg.msg_name) - async_msg->msg.msg_name = &async_msg->addr; - - if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) - return -EAGAIN; - - /* if were using fast_iov, set it to the new one */ - if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { - size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; - async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; - } + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - return -EAGAIN; + req->flags &= ~REQ_F_BL_EMPTY; + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ + req->buf_index = sr->buf_group; } #ifdef CONFIG_COMPAT @@ -198,7 +204,16 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct compat_iovec __user *uiov; - int ret; + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; @@ -207,9 +222,9 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - iomsg->free_iov = NULL; if (msg->msg_iovlen == 0) { - sr->len = 0; + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { @@ -225,14 +240,12 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return 0; } - iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, 
true); + nr_segs, &iov, &iomsg->msg.msg_iter, true); if (unlikely(ret < 0)) return ret; - return 0; + return io_net_vec_assign(req, iomsg, iov); } #endif @@ -240,7 +253,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct user_msghdr *msg, int ddir) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - int ret; + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) return -EFAULT; @@ -256,9 +278,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { - sr->len = iomsg->fast_iov[0].iov_len = 0; - iomsg->fast_iov[0].iov_base = NULL; - iomsg->free_iov = NULL; + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { ret = -EINVAL; goto ua_end; @@ -266,10 +287,9 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, /* we only need the length for provided buffers */ if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) goto ua_end; - unsafe_get_user(iomsg->fast_iov[0].iov_len, - &msg->msg_iov[0].iov_len, ua_end); - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; + unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, + ua_end); + sr->len = iov->iov_len; } ret = 0; ua_end: @@ -278,13 +298,12 @@ ua_end: } user_access_end(); - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, false); + ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, + &iov, &iomsg->msg.msg_iter, false); if (unlikely(ret < 0)) return ret; - return 0; + return io_net_vec_assign(req, iomsg, iov); } static int io_sendmsg_copy_hdr(struct io_kiocb *req, @@ -320,60 +339,58 @@ static 
int io_sendmsg_copy_hdr(struct io_kiocb *req, return ret; } -int io_send_prep_async(struct io_kiocb *req) +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) { - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; - int ret; + struct io_async_msghdr *io = req->async_data; - if (req_has_async_data(req)) - return 0; - zc->done_io = 0; - if (!zc->addr) - return 0; - io = io_msg_alloc_async_prep(req); - if (!io) - return -ENOMEM; - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); - return ret; + io_netmsg_iovec_free(io); } -static int io_setup_async_addr(struct io_kiocb *req, - struct sockaddr_storage *addr_storage, - unsigned int issue_flags) +static int io_send_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr *kmsg = req->async_data; + int ret; - if (!sr->addr || req_has_async_data(req)) - return -EAGAIN; - io = io_msg_alloc_async(req, issue_flags); - if (!io) - return -ENOMEM; - memcpy(&io->addr, addr_storage, sizeof(io->addr)); - return -EAGAIN; + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_ubuf = NULL; + + if (sr->addr) { + ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); + if (unlikely(ret < 0)) + return ret; + kmsg->msg.msg_name = &kmsg->addr; + kmsg->msg.msg_namelen = sr->addr_len; + } + if (!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret < 0)) + return ret; + } + return 0; } -int io_sendmsg_prep_async(struct io_kiocb *req) +static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg; int ret; - sr->done_io = 0; - if (!io_msg_alloc_async_prep(req)) + kmsg = io_msg_alloc_async(req); + if (unlikely(!kmsg)) return -ENOMEM; - ret 
= io_sendmsg_copy_hdr(req, req->async_data); + if (!is_msg) + return io_send_setup(req); + ret = io_sendmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); -} +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -393,34 +410,114 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~SENDMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_SENDMSG) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + sr->msg_flags |= MSG_WAITALL; + sr->buf_group = req->buf_index; + req->buf_list = NULL; + } + if (req->flags & REQ_F_BUFFER_SELECT && sr->len) + return -EINVAL; #ifdef CONFIG_COMPAT if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - return 0; + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG); } static void io_req_msg_cleanup(struct io_kiocb *req, - struct io_async_msghdr *kmsg, unsigned int issue_flags) { req->flags &= ~REQ_F_NEED_CLEANUP; - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); io_netmsg_recycle(req, issue_flags); } +/* + * For bundle completions, we need to figure out how many segments we consumed. + * A bundle could be using a single ITER_UBUF if that's all we mapped, or it + * could be using an ITER_IOVEC. If the latter, then if we consumed all of + * the segments, then it's a trivial question to answer. 
If we have residual + * data in the iter, then loop the segments to figure out how much we + * transferred. + */ +static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) +{ + struct iovec *iov; + int nbufs; + + /* no data is always zero segments, and a ubuf is always 1 segment */ + if (ret <= 0) + return 0; + if (iter_is_ubuf(&kmsg->msg.msg_iter)) + return 1; + + iov = kmsg->free_iov; + if (!iov) + iov = &kmsg->fast_iov; + + /* if all data was transferred, it's basic pointer math */ + if (!iov_iter_count(&kmsg->msg.msg_iter)) + return iter_iov(&kmsg->msg.msg_iter) - iov; + + /* short transfer, count segments */ + nbufs = 0; + do { + int this_len = min_t(int, iov[nbufs].iov_len, ret); + + nbufs++; + ret -= this_len; + } while (ret); + + return nbufs; +} + +static inline bool io_send_finish(struct io_kiocb *req, int *ret, + struct io_async_msghdr *kmsg, + unsigned issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + bool bundle_finished = *ret <= 0; + unsigned int cflags; + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { + cflags = io_put_kbuf(req, issue_flags); + goto finish; + } + + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); + + if (bundle_finished || req->flags & REQ_F_BL_EMPTY) + goto finish; + + /* + * Fill CQE for this receive and see if we should keep trying to + * receive from this socket. + */ + if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { + io_mshot_prep_retry(req, kmsg); + return false; + } + + /* Otherwise stop bundle and use the current result. 
*/ +finish: + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; +} + int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int min_ret = 0; @@ -430,19 +527,9 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - kmsg->msg.msg_control_user = sr->msg_control; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -450,23 +537,25 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } - io_req_msg_cleanup(req, kmsg, issue_flags); + io_req_msg_cleanup(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -477,65 +566,79 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) int io_send(struct io_kiocb *req, unsigned int issue_flags) { - struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct 
msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int min_ret = 0; int ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_ubuf = NULL; - - if (sr->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; - - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); - if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; - } - msg.msg_namelen = sr->addr_len; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); - if (unlikely(ret)) - return ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + +retry_bundle: + if (io_do_buffer_select(req)) { + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .max_len = INT_MAX, + .nr_iovs = 1, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode = KBUF_MODE_FREE; + } + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) + arg.nr_iovs = 1; + else + arg.mode |= KBUF_MODE_EXPAND; + + ret = io_buffers_select(req, &arg, issue_flags); + if (unlikely(ret < 0)) + return ret; + + sr->len = arg.out_len; + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, + arg.out_len); + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + } + + /* + * If MSG_WAITALL is set, or this is a bundle send, then we need + * the full amount. 
If just bundle is set, if we do a short send + * then we complete the bundle sequence rather than continue on. + */ + if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = flags; + ret = sock_sendmsg(sock, &kmsg->msg); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -545,8 +648,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; + + if (!io_send_finish(req, &ret, kmsg, issue_flags)) + goto retry_bundle; + + io_req_msg_cleanup(req, issue_flags); + return ret; } static int io_recvmsg_mshot_prep(struct io_kiocb *req, @@ -611,23 +718,42 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, msg.msg_controllen); } -int io_recvmsg_prep_async(struct io_kiocb *req) +static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *iomsg; + struct io_async_msghdr *kmsg; int ret; - sr->done_io = 0; - if (!io_msg_alloc_async_prep(req)) + kmsg = io_msg_alloc_async(req); + if (unlikely(!kmsg)) return -ENOMEM; - iomsg = req->async_data; - ret = io_recvmsg_copy_hdr(req, iomsg); + + if (req->opcode == IORING_OP_RECV) { + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_get_inq = 1; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_iocb = NULL; + kmsg->msg.msg_ubuf = NULL; + + if 
(!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + return 0; + } + + ret = io_recvmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ + IORING_RECVSEND_BUNDLE) int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -641,21 +767,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~(RECVMSG_FLAGS)) + if (sr->flags & ~RECVMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (sr->flags & IORING_RECV_MULTISHOT) { - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sr->msg_flags & MSG_WAITALL) - return -EINVAL; - if (req->opcode == IORING_OP_RECV && sr->len) - return -EINVAL; - req->flags |= REQ_F_APOLL_MULTISHOT; + if (req->flags & REQ_F_BUFFER_SELECT) { /* * Store the buffer group for this multishot receive separately, * as if we end up doing an io-wq based issue that selects a @@ -665,6 +784,20 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * restore it. 
*/ sr->buf_group = req->buf_index; + req->buf_list = NULL; + } + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_RECVMSG) + return -EINVAL; } #ifdef CONFIG_COMPAT @@ -672,17 +805,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_CMSG_COMPAT; #endif sr->nr_multishot_loops = 0; - return 0; -} - -static inline void io_recv_prep_retry(struct io_kiocb *req) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - - req->flags &= ~REQ_F_BL_EMPTY; - sr->done_io = 0; - sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; + return io_recvmsg_prep_setup(req); } /* @@ -692,28 +815,36 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) * again (for multishot). */ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, - struct msghdr *msg, bool mshot_finished, - unsigned issue_flags) + struct io_async_msghdr *kmsg, + bool mshot_finished, unsigned issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); unsigned int cflags; - cflags = io_put_kbuf(req, issue_flags); - if (msg->msg_inq > 0) + if (sr->flags & IORING_RECVSEND_BUNDLE) + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), + issue_flags); + else + cflags = io_put_kbuf(req, issue_flags); + + if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; + /* bundle with no more immediate buffers, we're done */ + if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) + goto finish; + /* * Fill CQE for this receive and see if we should keep trying to * receive from this socket. 
*/ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && - io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE)) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - io_recv_prep_retry(req); + io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) return false; /* mshot retries exceeded, force a requeue */ @@ -728,12 +859,14 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } /* Finish the request / stop multishot. */ +finish: io_req_set_res(req, *ret, cflags); if (issue_flags & IO_URING_F_MULTISHOT) *ret = IOU_STOP_MULTISHOT; else *ret = IOU_OK; + io_req_msg_cleanup(req, issue_flags); return true; } @@ -824,7 +957,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -835,18 +968,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags; if (force_nonblock) @@ -888,17 +1012,16 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, 
kmsg, issue_flags); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) { + if (issue_flags & IO_URING_F_MULTISHOT) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } - return ret; + return -EAGAIN; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -914,21 +1037,80 @@ retry_multishot: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) goto retry_multishot; - if (mshot_finished) - io_req_msg_cleanup(req, kmsg, issue_flags); - else if (ret == -EAGAIN) - return io_setup_async_msg(req, kmsg, issue_flags); - return ret; } +static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, + size_t *len, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + /* + * If the ring isn't locked, then don't use the peek interface + * to grab multiple buffers as we will lock/unlock between + * this selection and posting the buffers. 
+ */ + if (!(issue_flags & IO_URING_F_UNLOCKED) && + sr->flags & IORING_RECVSEND_BUNDLE) { + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .nr_iovs = 1, + .mode = KBUF_MODE_EXPAND, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode |= KBUF_MODE_FREE; + } + + if (kmsg->msg.msg_inq > 0) + arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + + ret = io_buffers_peek(req, &arg); + if (unlikely(ret < 0)) + return ret; + + /* special case 1 vec, can be a fast path */ + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + sr->len = arg.iovs[0].iov_len; + goto map_ubuf; + } + iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, + arg.out_len); + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + } else { + void __user *buf; + + *len = sr->len; + buf = io_buffer_select(req, len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; + sr->len = *len; +map_ubuf: + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + + return 0; +} + int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -943,40 +1125,27 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - msg.msg_ubuf = NULL; - flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; retry_multishot: if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - sr->len = len; + ret = 
io_recv_buf_select(req, kmsg, &len, issue_flags); + if (unlikely(ret)) { + kmsg->msg.msg_inq = -1; + goto out_free; + } + sr->buf = NULL; } - ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_inq = -1; - msg.msg_flags = 0; + kmsg->msg.msg_flags = 0; + kmsg->msg.msg_inq = -1; if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = sock_recvmsg(sock, &msg, flags); + ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { if (issue_flags & IO_URING_F_MULTISHOT) { @@ -996,7 +1165,7 @@ retry_multishot: if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } @@ -1008,7 +1177,7 @@ out_free: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags)) goto retry_multishot; return ret; @@ -1017,14 +1186,10 @@ out_free: void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr *io = req->async_data; - if (req_has_async_data(req)) { - io = req->async_data; - /* might be ->fast_iov if *msg_copy_hdr failed */ - if (io->free_iov != io->fast_iov) - kfree(io->free_iov); - } + if (req_has_async_data(req)) + io_netmsg_iovec_free(io); if (zc->notif) { io_notif_flush(zc->notif); zc->notif = NULL; @@ -1041,6 +1206,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *notif; zc->done_io = 0; + req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; @@ -1061,8 +1227,11 @@ int io_send_zc_prep(struct 
io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->flags & ~IO_ZC_FLAGS_VALID) return -EINVAL; if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { - io_notif_set_extended(notif); - io_notif_to_data(notif)->zc_report = true; + struct io_notif_data *nd = io_notif_to_data(notif); + + nd->zc_report = true; + nd->zc_used = false; + nd->zc_copied = false; } } @@ -1090,7 +1259,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); - zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -1098,7 +1267,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; #endif - return 0; + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); } static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, @@ -1159,11 +1328,34 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, return ret; } +static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu, + (u64)(uintptr_t)sr->buf, sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter; + } else { + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + ret = io_notif_account_mem(sr->notif, sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + } + + return ret; +} + int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) { - struct sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct 
msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1174,67 +1366,37 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - if (zc->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; - - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); - if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; - } - msg.msg_namelen = zc->addr_len; - } - if (!(req->flags & REQ_F_POLLED) && (zc->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; - if (zc->flags & IORING_RECVSEND_FIXED_BUF) { - ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu, - (u64)(uintptr_t)zc->buf, zc->len); + if (!zc->done_io) { + ret = io_send_zc_import(req, kmsg); if (unlikely(ret)) return ret; - msg.sg_from_iter = io_sg_from_iter; - } else { - io_notif_set_extended(zc->notif); - ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); - if (unlikely(ret)) - return ret; - ret = io_notif_account_mem(zc->notif, zc->len); - if (unlikely(ret)) - return ret; - msg.sg_from_iter = io_sg_from_iter_iovec; } - msg_flags = zc->msg_flags | MSG_ZEROCOPY; + msg_flags = zc->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = msg_flags; - msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = msg_flags; + kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; + ret = sock_sendmsg(sock, &kmsg->msg); if (unlikely(ret < 
min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; - if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { + if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { zc->len -= ret; zc->buf += ret; zc->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1252,7 +1414,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); - req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1261,63 +1423,46 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; - io_notif_set_extended(sr->notif); - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - if (req_has_async_data(req)) { - kmsg = req->async_data; - kmsg->msg.msg_control_user = sr->msg_control; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; - flags = sr->msg_flags | MSG_ZEROCOPY; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; kmsg->msg.sg_from_iter = 
io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov = NULL; - } - io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -1329,7 +1474,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); - req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1347,10 +1492,12 @@ void io_sendrecv_fail(struct io_kiocb *req) req->cqe.flags |= IORING_CQE_F_MORE; } +#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \ + IORING_ACCEPT_POLL_FIRST) + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); - unsigned flags; if (sqe->len || sqe->buf_index) return -EINVAL; @@ -1359,15 +1506,15 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) + accept->iou_flags = READ_ONCE(sqe->ioprio); + if (accept->iou_flags & ~ACCEPT_FLAGS) return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); if (accept->file_slot) { if (accept->flags & SOCK_CLOEXEC) return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && + 
if (accept->iou_flags & IORING_ACCEPT_MULTISHOT && accept->file_slot != IORING_FILE_INDEX_ALLOC) return -EINVAL; } @@ -1375,8 +1522,10 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT) req->flags |= REQ_F_APOLL_MULTISHOT; + if (accept->iou_flags & IORING_ACCEPT_DONTWAIT) + req->flags |= REQ_F_NOWAIT; return 0; } @@ -1384,24 +1533,34 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; bool fixed = !!accept->file_slot; + struct proto_accept_arg arg = { + .flags = force_nonblock ? O_NONBLOCK : 0, + }; struct file *file; + unsigned cflags; int ret, fd; + if (!(req->flags & REQ_F_POLLED) && + accept->iou_flags & IORING_ACCEPT_POLL_FIRST) + return -EAGAIN; + retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) return fd; } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + arg.err = 0; + arg.is_empty = -1; + file = do_accept(req->file, &arg, accept->addr, accept->addr_len, accept->flags); if (IS_ERR(file)) { if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { + if (ret == -EAGAIN && force_nonblock && + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { /* * if it's multishot and polled, we don't need to * return EAGAIN to arm the poll infra since it @@ -1422,18 +1581,26 @@ retry: accept->file_slot); } + cflags = 0; + if (!arg.is_empty) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, 0); + io_req_set_res(req, ret, cflags); return IOU_OK; } 
if (ret < 0) return ret; - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, IORING_CQE_F_MORE)) - goto retry; + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) + goto retry; + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; + } - io_req_set_res(req, ret, 0); + io_req_set_res(req, ret, cflags); return IOU_STOP_MULTISHOT; } @@ -1491,17 +1658,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); + struct io_async_msghdr *io; if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -1509,32 +1669,26 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); conn->in_progress = conn->seen_econnaborted = false; - return 0; + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr); } int io_connect(struct io_kiocb *req, unsigned int issue_flags) { struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); - struct io_async_connect __io, *io; + struct io_async_msghdr *io = req->async_data; unsigned file_flags; int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(connect->addr, - connect->addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; - } - file_flags = 
force_nonblock ? O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, &io->address, - connect->addr_len, file_flags); + ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, + file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED) && force_nonblock) { if (ret == -EINPROGRESS) { @@ -1544,13 +1698,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) goto out; connect->seen_econnaborted = true; } - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (connect->in_progress) { @@ -1568,12 +1715,20 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) out: if (ret < 0) req_set_fail(req); + io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); return IOU_OK; } -void io_netmsg_cache_free(struct io_cache_entry *entry) +void io_netmsg_cache_free(const void *entry) { - kfree(container_of(entry, struct io_async_msghdr, cache)); + struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; + + if (kmsg->free_iov) { + kasan_mempool_unpoison_object(kmsg->free_iov, + kmsg->free_iov_nr * sizeof(struct iovec)); + io_netmsg_iovec_free(kmsg); + } + kfree(kmsg); } #endif diff --git a/io_uring/net.h b/io_uring/net.h index 191009979b..0eb1c1920f 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,22 +3,15 @@ #include <linux/net.h> #include <linux/uio.h> -#include "alloc_cache.h" - struct io_async_msghdr { #if defined(CONFIG_NET) - union { - struct iovec fast_iov[UIO_FASTIOV]; - struct { - struct iovec fast_iov_one; - __kernel_size_t controllen; - int namelen; - __kernel_size_t payloadlen; - }; - struct io_cache_entry cache; - }; + struct iovec fast_iov; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; + int free_iov_nr; + int namelen; + __kernel_size_t controllen; + __kernel_size_t payloadlen; struct sockaddr __user *uaddr; struct 
msghdr msg; struct sockaddr_storage addr; @@ -27,22 +20,15 @@ struct io_async_msghdr { #if defined(CONFIG_NET) -struct io_async_connect { - struct sockaddr_storage address; -}; - int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); -int io_sendmsg_prep_async(struct io_kiocb *req); void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); int io_send(struct io_kiocb *req, unsigned int issue_flags); -int io_send_prep_async(struct io_kiocb *req); -int io_recvmsg_prep_async(struct io_kiocb *req); int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); int io_recv(struct io_kiocb *req, unsigned int issue_flags); @@ -55,7 +41,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags); int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_socket(struct io_kiocb *req, unsigned int issue_flags); -int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); @@ -64,9 +49,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); -void io_netmsg_cache_free(struct io_cache_entry *entry); +void io_netmsg_cache_free(const void *entry); #else -static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +static inline void io_netmsg_cache_free(const void *entry) { } #endif diff --git a/io_uring/nop.c b/io_uring/nop.c index d956599a3c..a5bcf3d698 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -10,16 +10,34 @@ #include "io_uring.h" #include "nop.h" +struct io_nop { + /* NOTE: 
kiocb has the file as the first member, so don't do it here */ + struct file *file; + int result; +}; + int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + unsigned int flags; + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + + flags = READ_ONCE(sqe->nop_flags); + if (flags & ~IORING_NOP_INJECT_RESULT) + return -EINVAL; + + if (flags & IORING_NOP_INJECT_RESULT) + nop->result = READ_ONCE(sqe->len); + else + nop->result = 0; return 0; } -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - io_req_set_res(req, 0, 0); + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + + if (nop->result < 0) + req_set_fail(req); + io_req_set_res(req, nop->result, 0); return IOU_OK; } diff --git a/io_uring/notif.c b/io_uring/notif.c index d3e703c37a..28859ae3ee 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,35 +9,36 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) +static const struct ubuf_info_ops io_ubuf_ops; + +static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); - struct io_ring_ctx *ctx = notif->ctx; - if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) - notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + do { + notif = cmd_to_io_kiocb(nd); - if (nd->account_pages && ctx->user) { - __io_unaccount_mem(ctx->user, nd->account_pages); - nd->account_pages = 0; - } - io_req_task_complete(notif, ts); -} + lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); -static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) -{ - struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); - struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + + if 
(nd->account_pages && notif->ctx->user) { + __io_unaccount_mem(notif->ctx->user, nd->account_pages); + nd->account_pages = 0; + } - if (refcount_dec_and_test(&uarg->refcnt)) - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + nd = nd->next; + io_req_task_complete(notif, ts); + } while (nd); } -static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); + struct io_kiocb *notif = cmd_to_io_kiocb(nd); + unsigned tw_flags; if (nd->zc_report) { if (success && !nd->zc_used && skb) @@ -45,23 +46,64 @@ static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, else if (!success && !nd->zc_copied) WRITE_ONCE(nd->zc_copied, true); } - io_tx_ubuf_callback(skb, uarg, success); + + if (!refcount_dec_and_test(&uarg->refcnt)) + return; + + if (nd->head != nd) { + io_tx_ubuf_complete(skb, &nd->head->uarg, success); + return; + } + + tw_flags = nd->next ? 
0 : IOU_F_TWQ_LAZY_WAKE; + notif->io_task_work.func = io_notif_tw_complete; + __io_req_task_work_add(notif, tw_flags); } -void io_notif_set_extended(struct io_kiocb *notif) +static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) { - struct io_notif_data *nd = io_notif_to_data(notif); + struct io_notif_data *nd, *prev_nd; + struct io_kiocb *prev_notif, *notif; + struct ubuf_info *prev_uarg = skb_zcopy(skb); - if (nd->uarg.callback != io_tx_ubuf_callback_ext) { - nd->account_pages = 0; - nd->zc_report = false; - nd->zc_used = false; - nd->zc_copied = false; - nd->uarg.callback = io_tx_ubuf_callback_ext; - notif->io_task_work.func = io_notif_complete_tw_ext; + nd = container_of(uarg, struct io_notif_data, uarg); + notif = cmd_to_io_kiocb(nd); + + if (!prev_uarg) { + net_zcopy_get(&nd->uarg); + skb_zcopy_init(skb, &nd->uarg); + return 0; } + /* handle it separately as we can't link a notif to itself */ + if (unlikely(prev_uarg == &nd->uarg)) + return 0; + /* we can't join two links together, just request a fresh skb */ + if (unlikely(nd->head != nd || nd->next)) + return -EEXIST; + /* don't mix zc providers */ + if (unlikely(prev_uarg->ops != &io_ubuf_ops)) + return -EEXIST; + + prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); + prev_notif = cmd_to_io_kiocb(nd); + + /* make sure all noifications can be finished in the same task_work */ + if (unlikely(notif->ctx != prev_notif->ctx || + notif->task != prev_notif->task)) + return -EEXIST; + + nd->head = prev_nd->head; + nd->next = prev_nd->next; + prev_nd->next = nd; + net_zcopy_get(&nd->head->uarg); + return 0; } +static const struct ubuf_info_ops io_ubuf_ops = { + .complete = io_tx_ubuf_complete, + .link_skb = io_link_skb, +}; + struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { @@ -76,11 +118,15 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; - notif->io_task_work.func = 
io_req_task_complete; nd = io_notif_to_data(notif); + nd->zc_report = false; + nd->account_pages = 0; + nd->next = NULL; + nd->head = nd; + nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; - nd->uarg.callback = io_tx_ubuf_callback; + nd->uarg.ops = &io_ubuf_ops; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 86d32bd9f8..f3589cfef4 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -13,14 +13,19 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; - unsigned long account_pages; + + struct io_notif_data *next; + struct io_notif_data *head; + + unsigned account_pages; bool zc_report; bool zc_used; bool zc_copied; }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); -void io_notif_set_extended(struct io_kiocb *notif); +void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { @@ -32,9 +37,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + io_tx_ubuf_complete(NULL, &nd->uarg, true); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9c080aadc5..760006ccc4 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,7 +67,8 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_readv, .issue = io_read, }, [IORING_OP_WRITEV] = { @@ -81,7 +82,8 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_writev, .issue = io_write, }, [IORING_OP_FSYNC] = { @@ -99,7 +101,8 @@ 
const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_read_fixed, .issue = io_read, }, [IORING_OP_WRITE_FIXED] = { @@ -112,7 +115,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_write_fixed, .issue = io_write, }, [IORING_OP_POLL_ADD] = { @@ -138,8 +142,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, #else @@ -152,8 +156,8 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, #else @@ -162,6 +166,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_timeout_prep, .issue = io_timeout, }, @@ -191,6 +196,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -199,6 +205,7 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_connect_prep, .issue = io_connect, #else @@ -239,7 +246,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_read, .issue = io_read, }, [IORING_OP_WRITE] = { @@ -252,7 +260,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 
1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_write, .issue = io_write, }, [IORING_OP_FADVISE] = { @@ -272,8 +281,9 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, + .buffer_select = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, #else @@ -288,6 +298,7 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recv, #else @@ -403,6 +414,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = 2 * sizeof(struct io_uring_sqe), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, @@ -412,8 +424,8 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_send_zc, #else @@ -425,8 +437,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, #else @@ -439,10 +451,12 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .audit_skip = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_read_mshot_prep, .issue = io_read_mshot, }, [IORING_OP_WAITID] = { + .async_size = sizeof(struct io_waitid_async), .prep = io_waitid_prep, .issue = io_waitid, }, @@ -488,16 +502,12 @@ const struct io_cold_def io_cold_defs[] = { .name = "NOP", }, [IORING_OP_READV] = { - .async_size = sizeof(struct io_async_rw), .name = "READV", - .prep_async = io_readv_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, 
[IORING_OP_WRITEV] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITEV", - .prep_async = io_writev_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, @@ -505,13 +515,13 @@ const struct io_cold_def io_cold_defs[] = { .name = "FSYNC", }, [IORING_OP_READ_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "READ_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITE_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { @@ -526,8 +536,6 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG] = { .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -535,14 +543,11 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_RECVMSG] = { .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_recvmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "TIMEOUT", }, [IORING_OP_TIMEOUT_REMOVE] = { @@ -555,15 +560,10 @@ const struct io_cold_def io_cold_defs[] = { .name = "ASYNC_CANCEL", }, [IORING_OP_LINK_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "LINK_TIMEOUT", }, [IORING_OP_CONNECT] = { .name = "CONNECT", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), - .prep_async = io_connect_prep_async, -#endif }, [IORING_OP_FALLOCATE] = { .name = "FALLOCATE", @@ -583,13 +583,13 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { - .async_size = sizeof(struct io_async_rw), .name = "READ", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE] = { - .async_size 
= sizeof(struct io_async_rw), .name = "WRITE", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { @@ -601,14 +601,14 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SEND] = { .name = "SEND", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, - .prep_async = io_send_prep_async, #endif }, [IORING_OP_RECV] = { .name = "RECV", #if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, @@ -679,14 +679,10 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", - .async_size = 2 * sizeof(struct io_uring_sqe), - .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_send_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif @@ -694,18 +690,16 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG_ZC] = { .name = "SENDMSG_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WAITID] = { .name = "WAITID", - .async_size = sizeof(struct io_waitid_async), }, [IORING_OP_FUTEX_WAIT] = { .name = "FUTEX_WAIT", @@ -731,6 +725,14 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_uring_op_supported(u8 opcode) +{ + if (opcode < IORING_OP_LAST && + io_issue_defs[opcode].prep != io_eopnotsupp_prep) + return true; + return false; +} + void __init io_uring_optable_init(void) { int i; diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 9e5435ec27..14456436ff 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -17,8 +17,6 @@ struct io_issue_def { unsigned 
poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; /* supports ioprio */ @@ -27,22 +25,19 @@ struct io_issue_def { unsigned iopoll : 1; /* have to be put into the iopoll list */ unsigned iopoll_queue : 1; - /* opcode specific path will handle ->async_data allocation if needed */ - unsigned manual_alloc : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; + /* size of async data needed, if any */ + unsigned short async_size; + int (*issue)(struct io_kiocb *, unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); }; struct io_cold_def { - /* size of async data needed, if any */ - unsigned short async_size; - const char *name; - int (*prep_async)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; @@ -50,5 +45,7 @@ struct io_cold_def { extern const struct io_issue_def io_issue_defs[]; extern const struct io_cold_def io_cold_defs[]; +bool io_uring_op_supported(u8 opcode); + void io_uring_optable_init(void); #endif diff --git a/io_uring/poll.c b/io_uring/poll.c index 6db1dcb2c7..1f63b60e85 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -14,6 +14,7 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "alloc_cache.h" #include "refs.h" #include "napi.h" #include "opdef.h" @@ -322,8 +323,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_fill_cqe_req_aux(req, ts->locked, mask, - IORING_CQE_F_MORE)) { + if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } @@ -347,6 +347,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) v &= IO_POLL_REF_MASK; } while (atomic_sub_return(v, &req->poll_refs) & 
IO_POLL_REF_MASK); + io_napi_add(req); return IOU_POLL_NO_ACTION; } @@ -687,17 +688,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->apoll_cache); - if (entry == NULL) + apoll = io_alloc_cache_get(&ctx->apoll_cache); + if (!apoll) goto alloc_apoll; - apoll = container_of(entry, struct async_poll, cache); apoll->poll.retries = APOLL_MAX_RETRY; } else { alloc_apoll: @@ -1056,8 +1055,3 @@ out: io_req_set_res(req, ret, 0); return IOU_OK; } - -void io_apoll_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct async_poll, cache)); -} diff --git a/io_uring/poll.h b/io_uring/poll.h index 1dacae9e81..b0e3745f5a 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "alloc_cache.h" +#define IO_POLL_ALLOC_CACHE_MAX 32 enum { IO_APOLL_OK, @@ -17,10 +17,7 @@ struct io_poll { }; struct async_poll { - union { - struct io_poll poll; - struct io_cache_entry cache; - }; + struct io_poll poll; struct io_poll *double_poll; }; @@ -46,6 +43,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); -void io_apoll_cache_free(struct io_cache_entry *entry); - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/refs.h b/io_uring/refs.h index 1336de3f2a..63982ead9f 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -33,6 +33,13 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + 
WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_dec(&req->refs); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { diff --git a/io_uring/register.c b/io_uring/register.c index 99c37775f9..11517b34cf 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -113,7 +113,7 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_issue_defs[i].not_supported) + if (io_uring_op_supported(i)) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; @@ -355,8 +355,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, } if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } if (copy_to_user(arg, new_count, sizeof(new_count))) @@ -368,8 +370,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, /* now propagate the restriction to all registered users */ list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - + tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) continue; @@ -381,8 +382,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return 0; err: if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } return ret; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4818b79231..570bfa6a31 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -13,8 +13,10 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" +#include "memmap.h" struct io_rsrc_update { struct file *file; @@ -169,7 +171,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - if 
(!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) kfree(node); } @@ -197,12 +199,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; - struct io_cache_entry *entry; - entry = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (entry) { - ref_node = container_of(entry, struct io_rsrc_node, cache); - } else { + ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (!ref_node) { ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; @@ -250,6 +249,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { + __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; @@ -872,42 +872,6 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) -{ - unsigned long start, end, nr_pages; - struct page **pages = NULL; - int ret; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - WARN_ON(!nr_pages); - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - mmap_read_lock(current->mm); - ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); - mmap_read_unlock(current->mm); - - /* success, mapped all pages */ - if (ret == nr_pages) { - *npages = nr_pages; - return pages; - } - - /* partial map, or didn't map anything */ - if (ret >= 0) { - /* if we did partial map, release any pages we did get */ - if (ret) - unpin_user_pages(pages, ret); - ret = -EFAULT; - } - kvfree(pages); - return ERR_PTR(ret); -} - static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page 
**last_hpage) @@ -1104,7 +1068,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * branch doesn't expect non PAGE_SIZE'd chunks. */ iter->bvec = bvec; - iter->nr_segs = bvec->bv_len; iter->count -= offset; iter->iov_offset = offset; } else { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index e210002389..c032ca3436 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,8 +2,6 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H -#include "alloc_cache.h" - #define IO_NODE_ALLOC_CACHE_MAX 32 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) @@ -36,10 +34,7 @@ struct io_rsrc_data { }; struct io_rsrc_node { - union { - struct io_cache_entry cache; - struct io_ring_ctx *ctx; - }; + struct io_ring_ctx *ctx; int refs; bool empty; u16 type; @@ -88,12 +83,6 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node io_rsrc_node_ref_zero(node); } -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) -{ - io_put_rsrc_node(ctx, req->rsrc_node); -} - static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { diff --git a/io_uring/rw.c b/io_uring/rw.c index c8d4828743..1a2128459c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -18,6 +18,7 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" +#include "alloc_cache.h" #include "rsrc.h" #include "poll.h" #include "rw.h" @@ -75,7 +76,179 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_import_iovec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct iovec *iov; + void __user *buf; + int nr_segs, ret; + size_t sqe_len; + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { + if 
(io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + return import_ubuf(ddir, buf, sqe_len, &io->iter); + } + + if (io->free_iovec) { + nr_segs = io->free_iov_nr; + iov = io->free_iovec; + } else { + iov = &io->fast_iov; + nr_segs = 1; + } + ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ret; + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io->free_iov_nr = io->iter.nr_segs; + kfree(io->free_iovec); + io->free_iovec = iov; + } + return 0; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + int ret; + + ret = __io_import_iovec(rw, req, io, issue_flags); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&io->iter, &io->iter_state); + return 0; +} + +static void io_rw_iovec_free(struct io_async_rw *rw) +{ + if (rw->free_iovec) { + kfree(rw->free_iovec); + rw->free_iov_nr = 0; + rw->free_iovec = NULL; + } +} + +static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_rw *rw = req->async_data; + struct iovec *iov; + + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_rw_iovec_free(rw); + return; + } + iov = rw->free_iovec; + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { + if (iov) + kasan_mempool_poison_object(iov); + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + /* + * Disable quick recycling for anything that's gone through io-wq. + * In theory, this should be fine to cleanup. However, some read or + * write iter handling touches the iovec AFTER having called into the + * handler, eg to reexpand or revert. 
This means we can have: + * + * task io-wq + * issue + * punt to io-wq + * issue + * blkdev_write_iter() + * ->ki_complete() + * io_complete_rw() + * queue tw complete + * run tw + * req_rw_cleanup + * iov_iter_count() <- look at iov_iter again + * + * which can lead to a UAF. This is only possible for io-wq offload + * as the cleanup can run in parallel. As io-wq is not the fast path, + * just leave cleanup to the end. + * + * This is really a bug in the core code that does this, any issue + * path should assume that a successful (or -EIOCBQUEUED) return can + * mean that the underlying data can be gone at any time. But that + * should be fixed seperately, and then this check could be killed. + */ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags &= ~REQ_F_NEED_CLEANUP; + io_rw_recycle(req, issue_flags); + } +} + +static int io_rw_alloc_async(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_async_rw *rw; + + rw = io_alloc_cache_get(&ctx->rw_cache); + if (rw) { + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; + } + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = rw; + goto done; + } + + if (!io_alloc_async_data(req)) { + rw = req->async_data; + rw->free_iovec = NULL; + rw->free_iov_nr = 0; +done: + rw->bytes_done = 0; + return 0; + } + + return -ENOMEM; +} + +static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) +{ + struct io_async_rw *rw; + int ret; + + if (io_rw_alloc_async(req)) + return -ENOMEM; + + if (!do_import || io_do_buffer_select(req)) + return 0; + + rw = req->async_data; + ret = io_import_iovec(ddir, req, rw, 0); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&rw->iter, &rw->iter_state); + return 0; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; 
@@ -100,34 +273,58 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return 0; + return io_prep_rw_setup(req, ddir, do_import); +} + +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw(req, sqe, ITER_DEST, true); } -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + return io_prep_rw(req, sqe, ITER_SOURCE, true); +} + +static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, do_import); if (unlikely(ret)) return ret; + if (do_import) + return 0; /* * Have to do this validation here, as this is in io_read() rw->len * might have chanaged due to buffer selection */ - if (req->flags & REQ_F_BUFFER_SELECT) - return io_iov_buffer_select_prep(req); + return io_iov_buffer_select_prep(req); +} - return 0; +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_DEST); +} + +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_SOURCE); } -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_ring_ctx *ctx = req->ctx; + struct io_async_rw *io; u16 index; int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, false); if (unlikely(ret)) return ret; @@ -136,7 +333,21 @@ int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); req->imu = ctx->user_bufs[index]; 
io_req_set_rsrc_node(req, ctx, 0); - return 0; + + io = req->async_data; + ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len); + iov_iter_save_state(&io->iter, &io->iter_state); + return ret; +} + +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_DEST); +} + +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_SOURCE); } /* @@ -152,7 +363,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ITER_DEST, false); if (unlikely(ret)) return ret; @@ -165,9 +376,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); + io_rw_iovec_free(req->async_data); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) @@ -187,21 +396,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - #ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; + iov_iter_restore(&io->iter, &io->iter_state); } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -230,9 +430,8 @@ static bool io_rw_should_reissue(struct io_kiocb *req) return true; } #else -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { - return false; } static bool io_rw_should_reissue(struct io_kiocb *req) { @@ 
-311,11 +510,10 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_io_end(req); - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) + req->cqe.flags |= io_put_kbuf(req, 0); - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } + io_req_rw_cleanup(req, 0); io_req_task_complete(req, ts); } @@ -396,6 +594,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, issue_flags)); + io_req_rw_cleanup(req, issue_flags); return IOU_OK; } } else { @@ -404,71 +603,12 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, final_ret); + io_resubmit_prep(req); + return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; } -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_ubuf(ddir, buf, sqe_len, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - ret = 
__import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (IS_ERR(*iovec)) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; @@ -540,89 +680,6 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) return ret; } -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.__iov = io->s.fast_iov; - if (iter->__iov != fast_iov) { - iov_off = iter_iov(iter) - fast_iov; - io->s.iter.__iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_cold_defs[req->opcode].prep_async) - return 0; - /* opcode type doesn't need async data */ - if (!io_cold_defs[req->opcode].async_size) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and 
mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - iorw->bytes_done = 0; - iorw->free_iovec = NULL; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - if (iov) { - iorw->free_iovec = iov; - req->flags |= REQ_F_NEED_CLEANUP; - } - - return 0; -} - -int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_DEST); -} - -int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_SOURCE); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -683,7 +740,8 @@ static bool io_rw_should_retry(struct io_kiocb *req) * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks */ - if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + if (io_file_can_poll(req) || + !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC)) return false; wait->wait.func = io_async_buf_func; @@ -701,7 +759,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) struct file *file = rw->kiocb.ki_filp; if (likely(file->f_op->read_iter)) - return call_read_iter(file, &rw->kiocb, iter); + return file->f_op->read_iter(&rw->kiocb, iter); else if (file->f_op->read) return loop_rw_iter(READ, rw, iter); else @@ -762,54 +820,28 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct 
io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; + ssize_t ret; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); + if (io_do_buffer_select(req)) { + ret = io_import_iovec(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. - */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; } + ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } + if (unlikely(!io_file_supports_nowait(req))) + return -EAGAIN; kiocb->ki_flags |= IOCB_NOWAIT; } else { /* Ensure we clear previously set non-block flag */ @@ -819,20 +851,15 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; - /* - * If we can 
poll, just do that. For a vectored read, we'll - * need to copy state first. - */ - if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored) + /* If we can poll, just do that. */ + if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -842,8 +869,6 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - if (iovec) - kfree(iovec); return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { @@ -856,21 +881,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - iovec = NULL; - if (ret2) { - ret = ret > 0 ? ret : ret2; - goto done; - } - - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. - */ + iov_iter_restore(&io->iter, &io->iter_state); do { /* @@ -878,11 +889,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. 
*/ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) + iov_iter_advance(&io->iter, ret); + if (!iov_iter_count(&io->iter)) break; io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -890,24 +901,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); + iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); return ret; } @@ -970,9 +979,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ - if (io_fill_cqe_req_aux(req, - issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, cflags | IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (issue_flags & IO_URING_F_MULTISHOT) { /* * Force retry, as we might have more data to @@ -991,6 +998,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * multishot request, hitting overflow will terminate it. 
*/ io_req_set_res(req, ret, cflags); + io_req_rw_cleanup(req, issue_flags); if (issue_flags & IO_URING_F_MULTISHOT) return IOU_STOP_MULTISHOT; return IOU_OK; @@ -998,42 +1006,28 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) int io_write(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; + goto ret_eagain; - /* File path supports NOWAIT for non-direct_IO only for block devices. */ + /* Check if we can support NOWAIT. 
*/ if (!(kiocb->ki_flags & IOCB_DIRECT) && - !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; + !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) && + (req->flags & REQ_F_ISREG)) + goto ret_eagain; kiocb->ki_flags |= IOCB_NOWAIT; } else { @@ -1044,19 +1038,17 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } if (req->flags & REQ_F_ISREG) kiocb_start_write(kiocb); kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); + ret2 = req->file->f_op->write_iter(kiocb, &io->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &io->iter); else ret2 = -EINVAL; @@ -1077,11 +1069,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; + goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { - struct io_async_rw *io; - trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, req->cqe.res, ret2); @@ -1090,34 +1080,22 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) * in the worker. Also update bytes_done to account for * the bytes already written. */ - iov_iter_save_state(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, true); - - io = req->async_data; - if (io) - io->bytes_done += ret2; + iov_iter_save_state(&io->iter, &io->iter_state); + io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); - return ret ? 
ret : -EAGAIN; + return -EAGAIN; } done: - ret = kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, issue_flags); } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - if (!ret) { - if (kiocb->ki_flags & IOCB_WRITE) - io_req_end_write(req); - return -EAGAIN; - } - return ret; +ret_eagain: + iov_iter_restore(&io->iter, &io->iter_state); + if (kiocb->ki_flags & IOCB_WRITE) + io_req_end_write(req); + return -EAGAIN; } - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; } void io_rw_fail(struct io_kiocb *req) @@ -1191,6 +1169,8 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) break; nr_events++; req->cqe.flags = io_put_kbuf(req, 0); + if (req->opcode != IORING_OP_URING_CMD) + io_req_rw_cleanup(req, 0); } if (unlikely(!nr_events)) return 0; @@ -1204,3 +1184,15 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) __io_submit_flush_completions(ctx); return nr_events; } + +void io_rw_cache_free(const void *entry) +{ + struct io_async_rw *rw = (struct io_async_rw *) entry; + + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + io_rw_iovec_free(rw); + } + kfree(rw); +} diff --git a/io_uring/rw.h b/io_uring/rw.h index f9e89b4fe4..3f432dc754 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,28 +2,27 @@ #include <linux/pagemap.h> -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; size_t bytes_done; + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov; + struct iovec *free_iovec; + int free_iov_nr; struct wait_page_queue wpq; }; -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe 
*sqe); -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); -int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); -int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); +void io_rw_cache_free(const void *entry); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 3983708cef..b3722e5275 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -238,11 +238,13 @@ static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) if (*retry_list) { *retry_list = io_handle_tw_list(*retry_list, &count, max_entries); if (count >= max_entries) - return count; + goto out; max_entries -= count; } - *retry_list = tctx_task_work_run(tctx, max_entries, &count); +out: + if (task_work_pending(current)) + task_work_run(); return count; } @@ -291,6 +293,14 @@ static int io_sq_thread(void *data) sqd->sq_cpu = raw_smp_processor_id(); } + /* + * Force audit context to get setup, in case we do prep side async + * operations that would trigger an audit call before any issue side + * audit has been done. 
+ */ + audit_uring_entry(IORING_OP_NOP); + audit_uring_exit(true, 0); + mutex_lock(&sqd->lock); while (1) { bool cap_entries, sqt_spin = false; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7fd7dbb211..9973876d91 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,10 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { - bool filled; - filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, - IORING_CQE_F_MORE); - if (filled) { + if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); @@ -301,7 +298,6 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -313,7 +309,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t .data = prev->cqe.user_data, }; - ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); + ret = io_try_cancel(req->task->io_uring, &cd, 0); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, ts); @@ -541,7 +537,6 @@ static int __io_timeout_prep(struct io_kiocb *req, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; - INIT_LIST_HEAD(&timeout->list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -644,7 +639,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) - __must_hold(&req->ctx->timeout_lock) + __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; diff --git 
a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 42f63adfa5..a54163a839 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,6 +3,7 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> +#include <linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> #include <net/sock.h> @@ -11,9 +12,71 @@ #include <asm/ioctls.h> #include "io_uring.h" +#include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" +static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct uring_cache *cache; + + cache = io_alloc_cache_get(&ctx->uring_cache); + if (cache) { + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = cache; + return cache; + } + if (!io_alloc_async_data(req)) + return req->async_data; + return NULL; +} + +static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct uring_cache *cache = req->async_data; + + if (issue_flags & IO_URING_F_UNLOCKED) + return; + if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { + ioucmd->sqe = NULL; + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool ret = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, + hash_node) { + struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, + struct io_uring_cmd); + struct file *file = req->file; + + if (!cancel_all && req->task != task) + continue; + + if (cmd->flags & IORING_URING_CMD_CANCELABLE) { + /* ->sqe isn't available if no async data */ + if (!req_has_async_data(req)) + cmd->sqe = NULL; + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | + IO_URING_F_COMPLETE_DEFER); + ret = true; + } + } + 
io_submit_flush_completions(ctx); + return ret; +} + static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -56,9 +119,9 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; - ioucmd->task_work_cb(ioucmd, issue_flags); + /* task_work executor checks the deferred list completion */ + ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -97,23 +160,38 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); + io_req_uring_cleanup(req, issue_flags); if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); + } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED)) + return; + io_req_complete_defer(req); } else { - struct io_tw_state ts = { - .locked = !(issue_flags & IO_URING_F_UNLOCKED), - }; - io_req_task_complete(req, &ts); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -int io_uring_cmd_prep_async(struct io_kiocb *req) +static int io_uring_cmd_prep_setup(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct uring_cache *cache; + + cache = io_uring_async_get(req); + if (unlikely(!cache)) + return -ENOMEM; + + if (!(req->flags & REQ_F_FORCE_ASYNC)) { + /* defer memcpy until we need it */ + ioucmd->sqe = sqe; + return 0; + } - memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx)); +
memcpy(req->async_data, sqe, uring_sqe_size(req->ctx)); ioucmd->sqe = req->async_data; return 0; } @@ -140,9 +218,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); } - ioucmd->sqe = sqe; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; + + return io_uring_cmd_prep_setup(req, sqe); } int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) @@ -174,22 +252,20 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } + struct uring_cache *cache = req->async_data; - if (ret != -EIOCBQUEUED) { - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return ret; + if (ioucmd->sqe != (void *) cache) + memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); + return -EAGAIN; + } else if (ret == -EIOCBQUEUED) { + return -EIOCBQUEUED; } - return IOU_ISSUE_SKIP_COMPLETE; + if (ret < 0) + req_set_fail(req); + io_req_uring_cleanup(req, issue_flags); + io_req_set_res(req, ret, 0); + return ret < 0 ? 
ret : IOU_OK; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 8117684ec3..a361f98664 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,5 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +struct uring_cache { + struct io_uring_sqe sqes[2]; +}; + int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_uring_cmd_prep_async(struct io_kiocb *req); + +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 77d340666c..6362ec20ab 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,7 +118,7 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = { .locked = true }; + struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); |