summaryrefslogtreecommitdiffstats
path: root/io_uring
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-07-01 17:13:54 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-07-01 17:13:54 +0000
commit2957e9a7ea070524508a846205689431cb5c101f (patch)
tree42f079ff82e701ebcb76829974b4caca3e5b6798 /io_uring
parentAdding upstream version 6.9.2. (diff)
downloadlinux-2957e9a7ea070524508a846205689431cb5c101f.tar.xz
linux-2957e9a7ea070524508a846205689431cb5c101f.zip
Adding upstream version 6.9.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/cancel.h4
-rw-r--r--io_uring/io-wq.c70
-rw-r--r--io_uring/io_uring.c1
-rw-r--r--io_uring/io_uring.h4
-rw-r--r--io_uring/napi.c24
-rw-r--r--io_uring/net.c1
-rw-r--r--io_uring/nop.c2
-rw-r--r--io_uring/rsrc.c2
-rw-r--r--io_uring/sqpoll.c14
9 files changed, 71 insertions, 51 deletions
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index 76b32e65c0..b33995e00b 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -27,10 +27,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence)
{
- if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq)
+ if (req->cancel_seq_set && sequence == req->work.cancel_seq)
return true;
- req->flags |= REQ_F_CANCEL_SEQ;
+ req->cancel_seq_set = true;
req->work.cancel_seq = sequence;
return false;
}
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 522196dfb0..8a99aabcac 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -25,10 +25,10 @@
#define WORKER_IDLE_TIMEOUT (5 * HZ)
enum {
- IO_WORKER_F_UP = 1, /* up and active */
- IO_WORKER_F_RUNNING = 2, /* account as running */
- IO_WORKER_F_FREE = 4, /* worker on free list */
- IO_WORKER_F_BOUND = 8, /* is doing bounded work */
+ IO_WORKER_F_UP = 0, /* up and active */
+ IO_WORKER_F_RUNNING = 1, /* account as running */
+ IO_WORKER_F_FREE = 2, /* worker on free list */
+ IO_WORKER_F_BOUND = 3, /* is doing bounded work */
};
enum {
@@ -44,7 +44,8 @@ enum {
*/
struct io_worker {
refcount_t ref;
- unsigned flags;
+ int create_index;
+ unsigned long flags;
struct hlist_nulls_node nulls_node;
struct list_head all_list;
struct task_struct *task;
@@ -58,7 +59,6 @@ struct io_worker {
unsigned long create_state;
struct callback_head create_work;
- int create_index;
union {
struct rcu_head rcu;
@@ -165,7 +165,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
{
- return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND);
+ return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags));
}
static void io_worker_ref_put(struct io_wq *wq)
@@ -225,7 +225,7 @@ static void io_worker_exit(struct io_worker *worker)
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wq->lock);
- if (worker->flags & IO_WORKER_F_FREE)
+ if (test_bit(IO_WORKER_F_FREE, &worker->flags))
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
raw_spin_unlock(&wq->lock);
@@ -410,7 +410,7 @@ static void io_wq_dec_running(struct io_worker *worker)
struct io_wq_acct *acct = io_wq_get_acct(worker);
struct io_wq *wq = worker->wq;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
if (!atomic_dec_and_test(&acct->nr_running))
@@ -430,8 +430,8 @@ static void io_wq_dec_running(struct io_worker *worker)
*/
static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
{
- if (worker->flags & IO_WORKER_F_FREE) {
- worker->flags &= ~IO_WORKER_F_FREE;
+ if (test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+ clear_bit(IO_WORKER_F_FREE, &worker->flags);
raw_spin_lock(&wq->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
raw_spin_unlock(&wq->lock);
@@ -444,8 +444,8 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
__must_hold(wq->lock)
{
- if (!(worker->flags & IO_WORKER_F_FREE)) {
- worker->flags |= IO_WORKER_F_FREE;
+ if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+ set_bit(IO_WORKER_F_FREE, &worker->flags);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
}
}
@@ -564,10 +564,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
* clear the stalled flag.
*/
work = io_get_next_work(acct, worker);
- raw_spin_unlock(&acct->lock);
if (work) {
- __io_worker_busy(wq, worker);
-
/*
* Make sure cancelation can find this, even before
* it becomes the active work. That avoids a window
@@ -578,9 +575,15 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
raw_spin_lock(&worker->lock);
worker->next_work = work;
raw_spin_unlock(&worker->lock);
- } else {
- break;
}
+
+ raw_spin_unlock(&acct->lock);
+
+ if (!work)
+ break;
+
+ __io_worker_busy(wq, worker);
+
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -631,7 +634,8 @@ static int io_wq_worker(void *data)
bool exit_mask = false, last_timeout = false;
char buf[TASK_COMM_LEN];
- worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+ set_mask_bits(&worker->flags, 0,
+ BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));
snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
@@ -695,11 +699,11 @@ void io_wq_worker_running(struct task_struct *tsk)
if (!worker)
return;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
- if (worker->flags & IO_WORKER_F_RUNNING)
+ if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
return;
- worker->flags |= IO_WORKER_F_RUNNING;
+ set_bit(IO_WORKER_F_RUNNING, &worker->flags);
io_wq_inc_running(worker);
}
@@ -713,12 +717,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
if (!worker)
return;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
- if (!(worker->flags & IO_WORKER_F_RUNNING))
+ if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags))
return;
- worker->flags &= ~IO_WORKER_F_RUNNING;
+ clear_bit(IO_WORKER_F_RUNNING, &worker->flags);
io_wq_dec_running(worker);
}
@@ -732,7 +736,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker,
raw_spin_lock(&wq->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
list_add_tail_rcu(&worker->all_list, &wq->all_list);
- worker->flags |= IO_WORKER_F_FREE;
+ set_bit(IO_WORKER_F_FREE, &worker->flags);
raw_spin_unlock(&wq->lock);
wake_up_new_task(tsk);
}
@@ -838,7 +842,7 @@ fail:
init_completion(&worker->ref_done);
if (index == IO_WQ_ACCT_BOUND)
- worker->flags |= IO_WORKER_F_BOUND;
+ set_bit(IO_WORKER_F_BOUND, &worker->flags);
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
@@ -924,8 +928,12 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
struct io_wq_acct *acct = io_work_get_acct(wq, work);
- struct io_cb_cancel_data match;
- unsigned work_flags = work->flags;
+ unsigned long work_flags = work->flags;
+ struct io_cb_cancel_data match = {
+ .fn = io_wq_work_match_item,
+ .data = work,
+ .cancel_all = false,
+ };
bool do_create;
/*
@@ -963,10 +971,6 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
raw_spin_unlock(&wq->lock);
/* fatal condition, failed to create the first worker */
- match.fn = io_wq_work_match_item,
- match.data = work,
- match.cancel_all = false,
-
io_acct_cancel_pending_work(wq, acct, &match);
}
}
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c170a2b8d2..8a216b1d6d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2211,6 +2211,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->file = NULL;
req->rsrc_node = NULL;
req->task = current;
+ req->cancel_seq_set = false;
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 6426ee3822..bb6492b667 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -340,7 +340,7 @@ static inline int io_run_task_work(void)
static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
{
- return task_work_pending(current) || !wq_list_empty(&ctx->work_llist);
+ return task_work_pending(current) || !llist_empty(&ctx->work_llist);
}
static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
@@ -442,7 +442,7 @@ static inline bool io_file_can_poll(struct io_kiocb *req)
{
if (req->flags & REQ_F_CAN_POLL)
return true;
- if (file_can_poll(req->file)) {
+ if (req->file && file_can_poll(req->file)) {
req->flags |= REQ_F_CAN_POLL;
return true;
}
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 883a1a6659..8c18ede595 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -261,12 +261,14 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
}
/*
- * __io_napi_adjust_timeout() - Add napi id to the busy poll list
+ * __io_napi_adjust_timeout() - adjust busy loop timeout
* @ctx: pointer to io-uring context structure
* @iowq: pointer to io wait queue
* @ts: pointer to timespec or NULL
*
* Adjust the busy loop timeout according to timespec and busy poll timeout.
+ * If the specified NAPI timeout is bigger than the wait timeout, then adjust
+ * the NAPI timeout accordingly.
*/
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
struct timespec64 *ts)
@@ -274,16 +276,16 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
if (ts) {
- struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
-
- if (timespec64_compare(ts, &poll_to_ts) > 0) {
- *ts = timespec64_sub(*ts, poll_to_ts);
- } else {
- u64 to = timespec64_to_ns(ts);
-
- do_div(to, 1000);
- ts->tv_sec = 0;
- ts->tv_nsec = 0;
+ struct timespec64 poll_to_ts;
+
+ poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
+ if (timespec64_compare(ts, &poll_to_ts) < 0) {
+ s64 poll_to_ns = timespec64_to_ns(ts);
+ if (poll_to_ns > 0) {
+ u64 val = poll_to_ns + 999;
+ do_div(val, (s64) 1000);
+ poll_to = val;
+ }
}
}
diff --git a/io_uring/net.c b/io_uring/net.c
index 4afb475d41..3896e6220d 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1041,6 +1041,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_kiocb *notif;
zc->done_io = 0;
+ req->flags |= REQ_F_POLL_NO_LAZY;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index d956599a3c..1a4e312dfe 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -12,6 +12,8 @@
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ if (READ_ONCE(sqe->rw_flags))
+ return -EINVAL;
return 0;
}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4818b79231..956e2c715e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -250,6 +250,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
ret = io_run_task_work_sig(ctx);
if (ret < 0) {
+ __set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock);
if (list_empty(&ctx->rsrc_ref_list))
ret = 0;
@@ -1104,7 +1105,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* branch doesn't expect non PAGE_SIZE'd chunks.
*/
iter->bvec = bvec;
- iter->nr_segs = bvec->bv_len;
iter->count -= offset;
iter->iov_offset = offset;
} else {
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 3983708cef..b3722e5275 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -238,11 +238,13 @@ static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries)
if (*retry_list) {
*retry_list = io_handle_tw_list(*retry_list, &count, max_entries);
if (count >= max_entries)
- return count;
+ goto out;
max_entries -= count;
}
-
*retry_list = tctx_task_work_run(tctx, max_entries, &count);
+out:
+ if (task_work_pending(current))
+ task_work_run();
return count;
}
@@ -291,6 +293,14 @@ static int io_sq_thread(void *data)
sqd->sq_cpu = raw_smp_processor_id();
}
+ /*
+ * Force audit context to get setup, in case we do prep side async
+ * operations that would trigger an audit call before any issue side
+ * audit has been done.
+ */
+ audit_uring_entry(IORING_OP_NOP);
+ audit_uring_exit(true, 0);
+
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;