iomap: support IOCB_DIO_DEFER
author     Jens Axboe <axboe@kernel.dk>
           Sat, 8 Jul 2023 16:01:50 +0000 (10:01 -0600)
committer  Jens Axboe <axboe@kernel.dk>
           Tue, 18 Jul 2023 19:40:06 +0000 (13:40 -0600)
If IOCB_DIO_DEFER is set, use it to set the kiocb->dio_complete handler
and the data for that callback. Rather than punt the completion to a
workqueue, we pass the handler and data back to the issuer and get called
back from a safe task context.
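
For reference, here is a minimal issuer-side sketch of how the deferred
completion is meant to be consumed. Only kiocb, ki_complete, dio_complete,
->private and IOCB_DIO_DEFER come from this series; struct my_req,
defer_to_task_context() and finish_req() are hypothetical stand-ins for the
issuer's own request bookkeeping:

struct my_req {
	struct kiocb kiocb;
	/* issuer-private state ... */
};

/* ->ki_complete handler of an issuer that set IOCB_DIO_DEFER on submit */
static void my_rw_complete(struct kiocb *iocb, long res)
{
	struct my_req *req = container_of(iocb, struct my_req, kiocb);

	if (iocb->dio_complete) {
		/*
		 * 'res' is not meaningful here; the real completion value
		 * comes from ->dio_complete() when it runs in task context.
		 */
		defer_to_task_context(req);	/* hypothetical helper */
		return;
	}
	finish_req(req, res);			/* hypothetical helper */
}

/* runs later from a safe task context on the issuer side */
static void my_req_task_work(struct my_req *req)
{
	struct kiocb *iocb = &req->kiocb;

	finish_req(req, iocb->dio_complete(iocb->private));
}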

Using the following fio job to randomly dio write 4k blocks at
queue depths of 1..16:

fio --name=dio-write --filename=/data1/file --time_based=1 \
--runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
--cpus_allowed=4 --ioengine=io_uring --iodepth=16

shows the following results before and after this patch:

        Stock   Patched    Diff
=======================================
QD1      155K    162K     + 4.5%
QD2      290K    313K     + 7.9%
QD4      533K    597K     +12.0%
QD8      604K    827K     +36.9%
QD16     615K    845K     +37.4%

which shows nice wins all around. If we factor in per-IOP efficiency,
the wins look even nicer. This becomes apparent as queue depth rises,
where the offloaded workqueue completions run out of steam.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
fs/iomap/direct-io.c

index 92b9b9db8b6767623b9555bf015eb5c0e0a54899..ed615177e1f6758f478c331fbb52026ab7cf3a19 100644
@@ -131,6 +131,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
 
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+       return iomap_dio_complete(data);
+}
+
 static void iomap_dio_complete_work(struct work_struct *work)
 {
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -167,6 +172,25 @@ void iomap_dio_bio_end_io(struct bio *bio)
                } else if ((dio->flags & IOMAP_DIO_INLINE_COMP) && in_task()) {
                        WRITE_ONCE(dio->iocb->private, NULL);
                        iomap_dio_complete_work(&dio->aio.work);
+               } else if ((dio->flags & IOMAP_DIO_INLINE_COMP) &&
+                          (iocb->ki_flags & IOCB_DIO_DEFER)) {
+                       /* only polled IO cares about private cleared */
+                       iocb->private = dio;
+                       iocb->dio_complete = iomap_dio_deferred_complete;
+                       /*
+                        * Invoke ->ki_complete() directly. We've assigned
+                        * our dio_complete callback handler, and since the
+                        * issuer set IOCB_DIO_DEFER, we know their
+                        * ki_complete handler will notice ->dio_complete
+                        * being set and will defer calling that handler
+                        * until it can be done from a safe task context.
+                        *
+                        * Note that the 'res' being passed in here is
+                        * not important for this case. The actual completion
+                        * value of the request will be gotten from dio_complete
+                        * when that is run by the issuer.
+                        */
+                       iocb->ki_complete(iocb, 0);
                } else {
                        struct inode *inode = file_inode(iocb->ki_filp);