io-wq: make hashed work map + lock per-ctx
author Jens Axboe <axboe@kernel.dk>
Fri, 19 Feb 2021 19:33:30 +0000 (12:33 -0700)
committer Jens Axboe <axboe@kernel.dk>
Fri, 19 Feb 2021 20:07:17 +0000 (13:07 -0700)
Before the io-wq thread change, we maintained a hash work map and lock
per-node per-ring. That wasn't ideal, as we really wanted it to be per
ring. But now that we have per-task workers, the hash map ends up being
just per-task. That'll work just fine for the normal case of having
one task use a ring, but if you share the ring between tasks, then it's
considerably worse than it was before.

Make the hash map per ctx instead, which provides full per-ctx buffered
write serialization on hashed writes.
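
For context, "hashed" work here means buffered file writes: io_uring hashes
them on the target file's inode via io_wq_hash_work(), so writes to the same
file always map to the same bucket and run serially. Below is a minimal,
illustrative sketch of how a worker claims and releases a bucket in the
ctx-owned bitmap; the claim/release helper names are made up for the sketch,
while the real logic lives in io_get_next_work() and io_worker_handle_work()
in the diff below.

#include <linux/bitops.h>

/*
 * Sketch only, not the actual io-wq code. hash_map is the single bitmap
 * embedded in struct io_ring_ctx, so workers created for different tasks
 * sharing the ring contend on the same bits.
 */
static bool io_claim_hash(unsigned long *hash_map, unsigned int hash)
{
	/* true if this worker now owns the bucket and may run the chain */
	return !test_and_set_bit(hash, hash_map);
}

static void io_release_hash(unsigned long *hash_map, unsigned int hash)
{
	/* the chain for this bucket is drained; let another worker have it */
	clear_bit(hash, hash_map);
}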

Signed-off-by: Jens Axboe <axboe@kernel.dk>
fs/io-wq.c
fs/io-wq.h
fs/io_uring.c
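
Taken together, the three files below wire one ring-owned bitmap into every
io-wq instance created for that ring. The sketch shows only the fields
touched by this patch; the real structs are of course much larger.

/* fs/io_uring.c: the ring owns the bitmap */
struct io_ring_ctx {
	unsigned long	hash_map;	/* one bit per hash bucket */
};

/* fs/io-wq.h: filled in by io_init_wq_offload() */
struct io_wq_data {
	unsigned long	*hash_map;	/* points at &ctx->hash_map */
};

/* fs/io-wq.c: each io-wq borrows the ring's bitmap */
struct io_wq {
	unsigned long	*hash_map;	/* copied from data->hash_map */
};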

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b0d09f60200bc92ebf6b4da30885db4bf612fb12..9d5ab6d744dcf2b4ef55ea40d3ff894f8e465044 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -85,7 +85,6 @@ struct io_wqe {
        struct {
                raw_spinlock_t lock;
                struct io_wq_work_list work_list;
-               unsigned long hash_map;
                unsigned flags;
        } ____cacheline_aligned_in_smp;
 
@@ -111,6 +110,9 @@ struct io_wq {
 
        struct task_struct *manager;
        struct user_struct *user;
+
+       unsigned long *hash_map;
+
        refcount_t refs;
        struct completion done;
 
@@ -353,8 +355,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 
                /* hashed, can run if not already running */
                hash = io_get_work_hash(work);
-               if (!(wqe->hash_map & BIT(hash))) {
-                       wqe->hash_map |= BIT(hash);
+               if (!test_and_set_bit(hash, wqe->wq->hash_map)) {
                        /* all items with this hash lie in [work, tail] */
                        tail = wqe->hash_tail[hash];
                        wqe->hash_tail[hash] = NULL;
@@ -452,7 +453,7 @@ get_next:
 
                        if (hash != -1U && !next_hashed) {
                                raw_spin_lock_irq(&wqe->lock);
-                               wqe->hash_map &= ~BIT_ULL(hash);
+                               clear_bit(hash, wq->hash_map);
                                wqe->flags &= ~IO_WQE_FLAG_STALLED;
                                /* skip unnecessary unlock-lock wqe->lock */
                                if (!work)
@@ -975,6 +976,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
        if (ret)
                goto err_wqes;
 
+       wq->hash_map = data->hash_map;
        wq->free_work = data->free_work;
        wq->do_work = data->do_work;
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3c63a99d16294ed1a4b0680a1e551990640dbf0a..d5f4e1ae2d5fcf499f821ecc72a3bb06b12e1bf3 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -96,6 +96,8 @@ typedef void (io_wq_work_fn)(struct io_wq_work *);
 struct io_wq_data {
        struct user_struct *user;
 
+       unsigned long *hash_map;
+
        io_wq_work_fn *do_work;
        free_work_fn *free_work;
 };
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 239eacec3f3ae3d2595aa043a056b885c27c5708..e71bc4e3bf08760d374337b2fcdfd2ca4114105a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -359,6 +359,9 @@ struct io_ring_ctx {
                unsigned                cached_cq_overflow;
                unsigned long           sq_check_overflow;
 
+               /* hashed buffered write serialization */
+               unsigned long           hash_map;
+
                struct list_head        defer_list;
                struct list_head        timeout_list;
                struct list_head        cq_overflow_list;
@@ -7796,6 +7799,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
        unsigned int concurrency;
 
        data.user = ctx->user;
+       data.hash_map = &ctx->hash_map;
        data.free_work = io_free_work;
        data.do_work = io_wq_submit_work;