From 779f4c5754c5a9d30dfe909fad2a2546afd70621 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 19 Feb 2021 12:33:30 -0700
Subject: io-wq: make hashed work map + lock per-ctx

Before the io-wq thread change, we maintained a hash work map and lock
per-node per-ring. That wasn't ideal, as we really wanted it to be per
ring. But now that we have per-task workers, the hash map ends up being
just per-task. That'll work just fine for the normal case of having one
task use a ring, but if you share the ring between tasks, then it's
considerably worse than it was before.

Make the hash map per ctx instead, which provides full per-ctx buffered
write serialization on hashed writes.

Signed-off-by: Jens Axboe
---
 fs/io-wq.c    | 10 ++++++----
 fs/io-wq.h    |  2 ++
 fs/io_uring.c |  4 ++++
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b0d09f60200b..9d5ab6d744dc 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -85,7 +85,6 @@ struct io_wqe {
 	struct {
 		raw_spinlock_t lock;
 		struct io_wq_work_list work_list;
-		unsigned long hash_map;
 		unsigned flags;
 	} ____cacheline_aligned_in_smp;
 
@@ -111,6 +110,9 @@ struct io_wq {
 	struct task_struct *manager;
 	struct user_struct *user;
+
+	unsigned long *hash_map;
+
 	refcount_t refs;
 	struct completion done;
 
@@ -353,8 +355,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 
 		/* hashed, can run if not already running */
 		hash = io_get_work_hash(work);
-		if (!(wqe->hash_map & BIT(hash))) {
-			wqe->hash_map |= BIT(hash);
+		if (!test_and_set_bit(hash, wqe->wq->hash_map)) {
 			/* all items with this hash lie in [work, tail] */
 			tail = wqe->hash_tail[hash];
 			wqe->hash_tail[hash] = NULL;
@@ -452,7 +453,7 @@ get_next:
 
 		if (hash != -1U && !next_hashed) {
 			raw_spin_lock_irq(&wqe->lock);
-			wqe->hash_map &= ~BIT_ULL(hash);
+			clear_bit(hash, wq->hash_map);
 			wqe->flags &= ~IO_WQE_FLAG_STALLED;
 			/* skip unnecessary unlock-lock wqe->lock */
 			if (!work)
@@ -975,6 +976,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	if (ret)
 		goto err_wqes;
 
+	wq->hash_map = data->hash_map;
 	wq->free_work = data->free_work;
 	wq->do_work = data->do_work;
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3c63a99d1629..d5f4e1ae2d5f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -96,6 +96,8 @@ typedef void (io_wq_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
 	struct user_struct *user;
+	unsigned long *hash_map;
+
 	io_wq_work_fn *do_work;
 	free_work_fn *free_work;
 };
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 239eacec3f3a..e71bc4e3bf08 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -359,6 +359,9 @@ struct io_ring_ctx {
 		unsigned		cached_cq_overflow;
 		unsigned long		sq_check_overflow;
 
+		/* hashed buffered write serialization */
+		unsigned long		hash_map;
+
 		struct list_head	defer_list;
 		struct list_head	timeout_list;
 		struct list_head	cq_overflow_list;
@@ -7796,6 +7799,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
 	unsigned int concurrency;
 
 	data.user = ctx->user;
+	data.hash_map = &ctx->hash_map;
 	data.free_work = io_free_work;
 	data.do_work = io_wq_submit_work;
 
--
cgit v1.2.3
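
For readers outside the io_uring tree, the mechanism being centralized works
like this: io-wq hashes buffered writes (typically by the file's inode) so
that two workers never run writes to the same file concurrently. A worker may
only pick up hashed work after atomically claiming that hash bucket's bit in
the map, and it releases the bucket by clearing the bit once the chain is
drained; with the map now living in the ctx rather than per task, that
guarantee holds across every task sharing the ring. Below is a minimal
user-space sketch of the claim/release pattern, illustrative only: it stands
in for the kernel's test_and_set_bit()/clear_bit() with GCC __atomic
builtins, and the worker names are hypothetical.

/*
 * User-space sketch (not kernel code) of hashed-work serialization:
 * one shared bitmap per ring, one bit per hash bucket. Only the
 * worker that flips a bucket's bit from 0 to 1 may run work for
 * that bucket; it clears the bit when the chain is done.
 */
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

/* Stand-in for the kernel's test_and_set_bit(): returns the old bit. */
static bool test_and_set_bit(unsigned int nr, unsigned long *map)
{
	unsigned long mask = 1UL << (nr % BITS_PER_LONG);

	return __atomic_fetch_or(&map[nr / BITS_PER_LONG], mask,
				 __ATOMIC_ACQ_REL) & mask;
}

/* Stand-in for the kernel's clear_bit(). */
static void clear_bit(unsigned int nr, unsigned long *map)
{
	unsigned long mask = 1UL << (nr % BITS_PER_LONG);

	__atomic_fetch_and(&map[nr / BITS_PER_LONG], ~mask, __ATOMIC_RELEASE);
}

int main(void)
{
	unsigned long hash_map = 0;	/* one map per ctx, shared by all workers */
	unsigned int hash = 3;		/* hypothetical bucket for one inode */

	/* Worker A claims bucket 3 and may run the hashed work chain. */
	if (!test_and_set_bit(hash, &hash_map))
		printf("worker A runs bucket %u\n", hash);

	/* Worker B sees the bit already set and must defer (stalled). */
	if (test_and_set_bit(hash, &hash_map))
		printf("worker B defers bucket %u\n", hash);

	/* Worker A finishes the chain and releases the bucket. */
	clear_bit(hash, &hash_map);
	return 0;
}

The 0-to-1 transition is the whole mutual exclusion: only the worker that
observed the bit as clear owns the bucket. In the kernel the claim still
happens under the per-wqe lock, but because the map is now shared beyond a
single wqe, the plain read-modify-write (hash_map |= BIT(hash)) is replaced
with atomic bit operations, as the diff above shows.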