From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 16 Jun 2022 17:20:12 +0000 (-0600)
Subject: io_uring: switch to per-cpu task_work
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=refs%2Fheads%2Ffor-5.20%2Fio_uring-tw;p=linux-2.6-block.git

io_uring: switch to per-cpu task_work

We see contention on the task_work locking and list management for
networked workloads, where it's not uncommon to have task_work arriving
from multiple CPUs in the system. The task_work handling ends up with
the original task, but to save on the overhead of repeatedly re-adding
that (which is an expensive cmpxchg), it's wrapped in a per-tctx
task_list which belongs to the original submitter. Having many
networked requests inflight can mean that there's a lot of addition
activity on the structure.

Move from a single per-tctx target list to a per-cpu one instead. This
allows multiple completers to add task_work without having to
synchronize on the same lock and list.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
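Below the fold, not part of the commit message: a minimal user-space sketch
of the pattern the patch moves to, i.e. one small locked list per CPU (here,
per thread) that completers append to, with the original task splicing and
running all of them. Every name in it (tw_queue, tw_add, tw_run_all,
NR_QUEUES) is hypothetical and this is not the kernel code; note also that
the kernel side does local_irq_save() before this_cpu_ptr(), presumably so
the task cannot migrate between picking its per-cpu list and taking that
list's lock.

/*
 * Sketch only: per-"CPU" locked lists, single consumer draining them.
 * Build: cc -O2 -pthread tw_sketch.c (all names hypothetical).
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_QUEUES		4	/* stand-in for nr_cpu_ids */
#define ITEMS_PER_PRODUCER	1000

struct tw_node {
	struct tw_node *next;
	int value;
};

/* rough analogue of struct tctx_tw: one lock + list per "CPU" */
struct tw_queue {
	pthread_mutex_t lock;
	struct tw_node *head;
};

static struct tw_queue queues[NR_QUEUES];

/* completer side: only touches "its own" queue's lock */
static void tw_add(int cpu, int value)
{
	struct tw_node *node = malloc(sizeof(*node));

	node->value = value;
	pthread_mutex_lock(&queues[cpu].lock);
	node->next = queues[cpu].head;
	queues[cpu].head = node;
	pthread_mutex_unlock(&queues[cpu].lock);
}

static void *producer(void *arg)
{
	int cpu = (int)(long)arg;

	for (int i = 0; i < ITEMS_PER_PRODUCER; i++)
		tw_add(cpu, i);
	return NULL;
}

/* "original task" side: splice out each per-CPU list and run it */
static long tw_run_all(void)
{
	long handled = 0;

	for (int cpu = 0; cpu < NR_QUEUES; cpu++) {
		struct tw_node *node;

		pthread_mutex_lock(&queues[cpu].lock);
		node = queues[cpu].head;
		queues[cpu].head = NULL;
		pthread_mutex_unlock(&queues[cpu].lock);

		while (node) {
			struct tw_node *next = node->next;

			handled++;	/* "run" the task_work item */
			free(node);
			node = next;
		}
	}
	return handled;
}

int main(void)
{
	pthread_t threads[NR_QUEUES];

	for (int i = 0; i < NR_QUEUES; i++) {
		pthread_mutex_init(&queues[i].lock, NULL);
		pthread_create(&threads[i], NULL, producer, (void *)(long)i);
	}
	for (int i = 0; i < NR_QUEUES; i++)
		pthread_join(threads[i], NULL);

	printf("handled %ld items\n", tw_run_all());
	return 0;
}
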
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13d177ab9cd8..fd166fb249eb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1011,7 +1011,6 @@ void tctx_task_work(struct callback_head *cb)
 	bool uring_locked = false;
 	struct io_ring_ctx *ctx = NULL;
 	struct tctx_tw *tw = container_of(cb, struct tctx_tw, task_work);
-	struct io_uring_task *tctx = container_of(tw, struct io_uring_task, tw);
 
 	while (1) {
 		struct io_wq_work_node *node;
@@ -1035,7 +1034,7 @@ void tctx_task_work(struct callback_head *cb)
 	ctx_flush_and_put(ctx, &uring_locked);
 
 	/* relaxed read is enough as only the task itself sets ->in_idle */
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	if (unlikely(atomic_read(&tw->tctx->in_idle)))
 		io_uring_drop_tctx_refs(current);
 }
 
@@ -1043,12 +1042,15 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct tctx_tw *tw = &tctx->tw;
 	struct io_wq_work_node *node;
 	unsigned long flags;
+	struct tctx_tw *tw;
 	bool running;
 
-	spin_lock_irqsave(&tw->task_lock, flags);
+	local_irq_save(flags);
+	tw = this_cpu_ptr(tctx->tw);
+
+	spin_lock(&tw->task_lock);
 	wq_list_add_tail(&req->io_task_work.node, &tw->task_list);
 	running = tw->task_running;
 	if (!running)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 00a2fc8ed110..7dc653b19e61 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -53,6 +53,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->cached_refs);
 
 	percpu_counter_destroy(&tctx->inflight);
+	free_percpu(tctx->tw);
 	kfree(tctx);
 	tsk->io_uring = NULL;
 }
@@ -61,7 +62,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 				       struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx;
-	int ret;
+	int ret, cpu;
 
 	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
 	if (unlikely(!tctx))
@@ -73,22 +74,36 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 		return ret;
 	}
 
+	tctx->tw = alloc_percpu(struct tctx_tw);
+	if (!tctx->tw) {
+		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	tctx->io_wq = io_init_wq_offload(ctx, task);
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		free_percpu(tctx->tw);
 		kfree(tctx);
 		return ret;
 	}
 
+	for_each_possible_cpu(cpu) {
+		struct tctx_tw *tw = per_cpu_ptr(tctx->tw, cpu);
+
+		spin_lock_init(&tw->task_lock);
+		INIT_WQ_LIST(&tw->task_list);
+		init_task_work(&tw->task_work, tctx_task_work);
+		tw->tctx = tctx;
+	}
+
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
-	spin_lock_init(&tctx->tw.task_lock);
-	INIT_WQ_LIST(&tctx->tw.task_list);
-	init_task_work(&tctx->tw.task_work, tctx_task_work);
 	return 0;
 }
 
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index b1cab2e84b16..c50432906dc8 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/percpu.h>
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -9,6 +11,7 @@ struct tctx_tw {
 	spinlock_t task_lock;
 	struct io_wq_work_list task_list;
 	struct callback_head task_work;
+	struct io_uring_task *tctx;
 	bool task_running;
 };
 
@@ -23,7 +26,7 @@ struct io_uring_task {
 	atomic_t inflight_tracked;
 	atomic_t in_idle;
 
-	struct tctx_tw tw;
+	struct tctx_tw __percpu *tw;
 
 	struct file *registered_rings[IO_RINGFD_REG_MAX];
 };