author	Jens Axboe <axboe@kernel.dk>	2022-06-16 11:20:12 -0600
committer	Jens Axboe <axboe@kernel.dk>	2022-06-17 08:26:05 -0600
commit	9adb5df42261fb7c7058755d288b3bc794769a14 (patch)
tree	cadc11db2225f7ea1d3fa678ac16e6dc8b19e384
parent	389bdfc03121c9a075da31d090d7249d032ba508 (diff)
io_uring: switch to per-cpu task_work (for-5.20/io_uring-tw)
We see contention on the task_work locking and list management for
networked workloads, where it's not uncommon to have task_work arriving
from multiple CPUs in the system.
The task_work handling ends up with the original task, but to save on the
overhead of repeatedly re-adding the task_work callback (an expensive
cmpxchg), pending work is wrapped in a per-tctx task_list which belongs to
the original submitter. Having many networked requests inflight can mean
that there's a lot of addition activity on that one structure.
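To make that concrete, here is a minimal userspace sketch of the pre-patch
scheme, assuming plain C with pthreads; the names (tw_list, tw_add,
work_item) are illustrative, not io_uring code. Only the idle-to-running
transition pays the expensive arming step (task_work_add() and its cmpxchg
in the kernel); every other completion is just a locked list append:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct work_item {
	struct work_item *next;
};

struct tw_list {
	pthread_spinlock_t lock;	/* models tw->task_lock */
	struct work_item *head;
	struct work_item **tail;	/* models the wq_list tail pointer */
	bool running;			/* models tw->task_running */
};

static void tw_list_init(struct tw_list *tw)
{
	pthread_spin_init(&tw->lock, PTHREAD_PROCESS_PRIVATE);
	tw->head = NULL;
	tw->tail = &tw->head;
	tw->running = false;
}

/*
 * Any completing thread appends here; returns true only on the
 * idle-to-running transition, i.e. when the caller must arm the
 * deferred-run callback. Mirrors the !running path in the diff below.
 */
static bool tw_add(struct tw_list *tw, struct work_item *w)
{
	bool need_arm;

	pthread_spin_lock(&tw->lock);
	w->next = NULL;
	*tw->tail = w;
	tw->tail = &w->next;
	need_arm = !tw->running;
	tw->running = true;
	pthread_spin_unlock(&tw->lock);
	return need_arm;
}

The flip side is that every completer, whichever CPU it runs on, takes that
single lock, which is exactly the contention described above.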
Move from a single per-tctx target list to a per-cpu one instead. This
allows multiple completers to add task_work without having to
synchronize on the same lock and list.
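A rough userspace model of the per-CPU split, under the same caveats: plain
C with pthreads plus Linux's sched_getcpu(), with a calloc()'d array slot
per CPU standing in for alloc_percpu()/this_cpu_ptr(); all names are
illustrative:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <unistd.h>

struct work_item {
	struct work_item *next;
};

struct cpu_tw {
	pthread_spinlock_t lock;
	struct work_item *head, **tail;
	bool running;
};

/* Models struct io_uring_task after the patch: tw is per-CPU. */
struct tctx_model {
	struct cpu_tw *tw;
	long nr_cpu;
};

static int tctx_init(struct tctx_model *t)
{
	long cpu;

	t->nr_cpu = sysconf(_SC_NPROCESSORS_CONF);	/* "possible" CPUs */
	if (t->nr_cpu < 1)
		t->nr_cpu = 1;
	t->tw = calloc(t->nr_cpu, sizeof(*t->tw));	/* alloc_percpu() analogue */
	if (!t->tw)
		return -1;
	for (cpu = 0; cpu < t->nr_cpu; cpu++) {		/* for_each_possible_cpu() */
		pthread_spin_init(&t->tw[cpu].lock, PTHREAD_PROCESS_PRIVATE);
		t->tw[cpu].tail = &t->tw[cpu].head;
	}
	return 0;
}

/*
 * this_cpu_ptr() analogue: each completer picks the slot for the CPU it
 * runs on, so completers on different CPUs take different locks. Unlike
 * the kernel (which disables IRQs to pin the CPU), userspace can migrate
 * between sched_getcpu() and the lock; that only costs locality, not
 * correctness, since every slot is still properly locked.
 */
static bool tw_add(struct tctx_model *t, struct work_item *w)
{
	int cpu = sched_getcpu();
	struct cpu_tw *tw = &t->tw[(cpu < 0 ? 0 : cpu) % t->nr_cpu];
	bool need_arm;

	pthread_spin_lock(&tw->lock);
	w->next = NULL;
	*tw->tail = w;
	tw->tail = &w->next;
	need_arm = !tw->running;
	tw->running = true;
	pthread_spin_unlock(&tw->lock);
	return need_arm;
}

In the actual patch each per-CPU tctx_tw also carries its own callback_head
and a back-pointer to the owning tctx, so tctx_task_work() can recover the
tctx from whichever tw instance was armed, as the diff below shows.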
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--	io_uring/io_uring.c	10
-rw-r--r--	io_uring/tctx.c	23
-rw-r--r--	io_uring/tctx.h	5
3 files changed, 29 insertions, 9 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13d177ab9cd8..fd166fb249eb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1011,7 +1011,6 @@ void tctx_task_work(struct callback_head *cb)
 	bool uring_locked = false;
 	struct io_ring_ctx *ctx = NULL;
 	struct tctx_tw *tw = container_of(cb, struct tctx_tw, task_work);
-	struct io_uring_task *tctx = container_of(tw, struct io_uring_task, tw);
 
 	while (1) {
 		struct io_wq_work_node *node;
@@ -1035,7 +1034,7 @@ void tctx_task_work(struct callback_head *cb)
 	ctx_flush_and_put(ctx, &uring_locked);
 
 	/* relaxed read is enough as only the task itself sets ->in_idle */
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	if (unlikely(atomic_read(&tw->tctx->in_idle)))
 		io_uring_drop_tctx_refs(current);
 }
 
@@ -1043,12 +1042,15 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct tctx_tw *tw = &tctx->tw;
 	struct io_wq_work_node *node;
 	unsigned long flags;
+	struct tctx_tw *tw;
 	bool running;
 
-	spin_lock_irqsave(&tw->task_lock, flags);
+	local_irq_save(flags);
+	tw = this_cpu_ptr(tctx->tw);
+
+	spin_lock(&tw->task_lock);
 	wq_list_add_tail(&req->io_task_work.node, &tw->task_list);
 	running = tw->task_running;
 	if (!running)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 00a2fc8ed110..7dc653b19e61 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -53,6 +53,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->cached_refs);
 
 	percpu_counter_destroy(&tctx->inflight);
+	free_percpu(tctx->tw);
 	kfree(tctx);
 	tsk->io_uring = NULL;
 }
@@ -61,7 +62,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 					struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx;
-	int ret;
+	int ret, cpu;
 
 	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
 	if (unlikely(!tctx))
@@ -73,22 +74,36 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 		return ret;
 	}
 
+	tctx->tw = alloc_percpu(struct tctx_tw);
+	if (!tctx->tw) {
+		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	tctx->io_wq = io_init_wq_offload(ctx, task);
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		free_percpu(tctx->tw);
 		kfree(tctx);
 		return ret;
 	}
 
+	for_each_possible_cpu(cpu) {
+		struct tctx_tw *tw = per_cpu_ptr(tctx->tw, cpu);
+
+		spin_lock_init(&tw->task_lock);
+		INIT_WQ_LIST(&tw->task_list);
+		init_task_work(&tw->task_work, tctx_task_work);
+		tw->tctx = tctx;
+	}
+
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
-	spin_lock_init(&tctx->tw.task_lock);
-	INIT_WQ_LIST(&tctx->tw.task_list);
-	init_task_work(&tctx->tw.task_work, tctx_task_work);
 	return 0;
 }
 
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index b1cab2e84b16..c50432906dc8 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/percpu.h>
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -9,6 +11,7 @@ struct tctx_tw {
 	spinlock_t task_lock;
 	struct io_wq_work_list task_list;
 	struct callback_head task_work;
+	struct io_uring_task *tctx;
 	bool task_running;
 };
 
@@ -23,7 +26,7 @@ struct io_uring_task {
 	atomic_t inflight_tracked;
 	atomic_t in_idle;
 
-	struct tctx_tw tw;
+	struct tctx_tw __percpu *tw;
 
 	struct file *registered_rings[IO_RINGFD_REG_MAX];
 };