From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 16 Jun 2022 17:20:12 +0000 (-0600)
Subject: io_uring: switch to per-cpu task_work
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=refs%2Fheads%2Ffor-5.20%2Fio_uring-tw;p=linux-2.6-block.git

io_uring: switch to per-cpu task_work

We see contention on the task_work locking and list management for
networked workloads, where it's not uncommon to have task_work arriving
from multiple CPUs in the system. The task_work handling ends up with
the original task, but to save on the overhead of repeatedly re-adding
that (which is an expensive cmpxchg), it's wrapped in a per-tctx
task_list which belongs to the original submitter. Having many
networked requests inflight can mean that there's a lot of addition
activity on the structure.

Move from a single per-tctx target list to a per-cpu one instead. This
allows multiple completers to add task_work without having to
synchronize on the same lock and list.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
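Below the fold, not part of the commit message: a minimal user-space sketch
of the pattern the patch moves to, i.e. one small locked list per CPU (here,
per thread) that completers append to, with the original task splicing and
running all of them. Every name in it (tw_queue, tw_add, tw_run_all,
NR_QUEUES) is hypothetical and this is not the kernel code; note also that
the kernel side does local_irq_save() before this_cpu_ptr(), presumably so
the task cannot migrate between picking its per-cpu list and taking that
list's lock.

/*
 * Sketch only: per-"CPU" locked lists, single consumer draining them.
 * Build: cc -O2 -pthread tw_sketch.c (all names hypothetical).
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_QUEUES		4	/* stand-in for nr_cpu_ids */
#define ITEMS_PER_PRODUCER	1000

struct tw_node {
	struct tw_node *next;
	int value;
};

/* rough analogue of struct tctx_tw: one lock + list per "CPU" */
struct tw_queue {
	pthread_mutex_t lock;
	struct tw_node *head;
};

static struct tw_queue queues[NR_QUEUES];

/* completer side: only touches "its own" queue's lock */
static void tw_add(int cpu, int value)
{
	struct tw_node *node = malloc(sizeof(*node));

	node->value = value;
	pthread_mutex_lock(&queues[cpu].lock);
	node->next = queues[cpu].head;
	queues[cpu].head = node;
	pthread_mutex_unlock(&queues[cpu].lock);
}

static void *producer(void *arg)
{
	int cpu = (int)(long)arg;

	for (int i = 0; i < ITEMS_PER_PRODUCER; i++)
		tw_add(cpu, i);
	return NULL;
}

/* "original task" side: splice out each per-CPU list and run it */
static long tw_run_all(void)
{
	long handled = 0;

	for (int cpu = 0; cpu < NR_QUEUES; cpu++) {
		struct tw_node *node;

		pthread_mutex_lock(&queues[cpu].lock);
		node = queues[cpu].head;
		queues[cpu].head = NULL;
		pthread_mutex_unlock(&queues[cpu].lock);

		while (node) {
			struct tw_node *next = node->next;

			handled++;	/* "run" the task_work item */
			free(node);
			node = next;
		}
	}
	return handled;
}

int main(void)
{
	pthread_t threads[NR_QUEUES];

	for (int i = 0; i < NR_QUEUES; i++) {
		pthread_mutex_init(&queues[i].lock, NULL);
		pthread_create(&threads[i], NULL, producer, (void *)(long)i);
	}
	for (int i = 0; i < NR_QUEUES; i++)
		pthread_join(threads[i], NULL);

	printf("handled %ld items\n", tw_run_all());
	return 0;
}
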
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13d177ab9cd8..fd166fb249eb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1011,7 +1011,6 @@ void tctx_task_work(struct callback_head *cb)
 	bool uring_locked = false;
 	struct io_ring_ctx *ctx = NULL;
 	struct tctx_tw *tw = container_of(cb, struct tctx_tw, task_work);
-	struct io_uring_task *tctx = container_of(tw, struct io_uring_task, tw);
 
 	while (1) {
 		struct io_wq_work_node *node;
@@ -1035,7 +1034,7 @@ void tctx_task_work(struct callback_head *cb)
 	ctx_flush_and_put(ctx, &uring_locked);
 
 	/* relaxed read is enough as only the task itself sets ->in_idle */
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	if (unlikely(atomic_read(&tw->tctx->in_idle)))
 		io_uring_drop_tctx_refs(current);
 }
 
@@ -1043,12 +1042,15 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct tctx_tw *tw = &tctx->tw;
 	struct io_wq_work_node *node;
 	unsigned long flags;
+	struct tctx_tw *tw;
 	bool running;
 
-	spin_lock_irqsave(&tw->task_lock, flags);
+	local_irq_save(flags);
+	tw = this_cpu_ptr(tctx->tw);
+
+	spin_lock(&tw->task_lock);
 	wq_list_add_tail(&req->io_task_work.node, &tw->task_list);
 	running = tw->task_running;
 	if (!running)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 00a2fc8ed110..7dc653b19e61 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -53,6 +53,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->cached_refs);
 
 	percpu_counter_destroy(&tctx->inflight);
+	free_percpu(tctx->tw);
 	kfree(tctx);
 	tsk->io_uring = NULL;
 }
@@ -61,7 +62,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 				       struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx;
-	int ret;
+	int ret, cpu;
 
 	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
 	if (unlikely(!tctx))
@@ -73,22 +74,36 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 		return ret;
 	}
 
+	tctx->tw = alloc_percpu(struct tctx_tw);
+	if (!tctx->tw) {
+		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	tctx->io_wq = io_init_wq_offload(ctx, task);
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		free_percpu(tctx->tw);
 		kfree(tctx);
 		return ret;
 	}
 
+	for_each_possible_cpu(cpu) {
+		struct tctx_tw *tw = per_cpu_ptr(tctx->tw, cpu);
+
+		spin_lock_init(&tw->task_lock);
+		INIT_WQ_LIST(&tw->task_list);
+		init_task_work(&tw->task_work, tctx_task_work);
+		tw->tctx = tctx;
+	}
+
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
-	spin_lock_init(&tctx->tw.task_lock);
-	INIT_WQ_LIST(&tctx->tw.task_list);
-	init_task_work(&tctx->tw.task_work, tctx_task_work);
 	return 0;
 }
 
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index b1cab2e84b16..c50432906dc8 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/percpu.h>
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -9,6 +11,7 @@ struct tctx_tw {
 	spinlock_t task_lock;
 	struct io_wq_work_list task_list;
 	struct callback_head task_work;
+	struct io_uring_task *tctx;
 	bool task_running;
 };
 
@@ -23,7 +26,7 @@ struct io_uring_task {
 	atomic_t inflight_tracked;
 	atomic_t in_idle;
 
-	struct tctx_tw tw;
+	struct tctx_tw __percpu *tw;
 
 	struct file *registered_rings[IO_RINGFD_REG_MAX];
 };