block: support to account io_ticks precisely
author Yu Kuai <yukuai3@huawei.com>
Thu, 9 May 2024 12:37:16 +0000 (20:37 +0800)
committer Jens Axboe <axboe@kernel.dk>
Thu, 9 May 2024 13:59:44 +0000 (07:59 -0600)
Currently, io_ticks is accounted based on sampling: specifically,
update_io_ticks() will always account io_ticks by 1 jiffy from
bdev_start_io_acct()/blk_account_io_start(), so the result can be
inaccurate. For example (HZ is 250):

Test script:
fio -filename=/dev/sda -bs=4k -rw=write -direct=1 -name=test -thinktime=4ms

Test result: util is about 90%, while the disk is really idle.

This behaviour was introduced by commit 5b18b5a73760 ("block: delete
part_round_stats and switch to less precise counting"). However, a key
point was missed: that commit also improved performance a lot:

Before the commit:
part_round_stats:
  if (part->stamp != now)
   stats |= 1;

  part_in_flight()
  -> there can be lots of tasks here in 1 jiffy.
  part_round_stats_single()
   __part_stat_add()
  part->stamp = now;

After the commit:
update_io_ticks:
  stamp = part->bd_stamp;
  if (time_after(now, stamp))
   if (try_cmpxchg())
    __part_stat_add()
    -> only one task can reach here in 1 jiffy.

Hence, in order to account io_ticks precisely, we only need to check
whether there is IO in flight at most once per jiffy. Note that for
rq-based devices, iterating tags should not be used here because
'tags->lock' is grabbed in blk_mq_find_and_get_req(); hence
part_stat_local_inc/dec() and part_in_flight() are used to track
inflight IO. The additional overhead is quite small:

 - per-cpu add/dec for each IO for rq-based devices;
 - per-cpu sum once per jiffy;

It was verified with null-blk that there is no performance degradation
under heavy IO pressure.

Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20240509123717.3223892-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-core.c
block/blk-merge.c
block/blk-mq.c
block/blk.h
block/genhd.c

index 8efee8faa4b63c2a765232ca7198aecd5d73e8c6..8566bbd8aeba2c41ce6dcd29ab2bd680139fe3e3 100644 (file)
@@ -984,10 +984,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
        unsigned long stamp;
 again:
        stamp = READ_ONCE(part->bd_stamp);
-       if (unlikely(time_after(now, stamp))) {
-               if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
-                       __part_stat_add(part, io_ticks, end ? now - stamp : 1);
-       }
+       if (unlikely(time_after(now, stamp)) &&
+           likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+           (end || part_in_flight(part)))
+               __part_stat_add(part, io_ticks, now - stamp);
+
        if (part->bd_partno) {
                part = bdev_whole(part);
                goto again;
index f64115d72f3d4359713ebbb9d85170cdadab4890..8534c35e04976a8a59cefd9d321d7522768f4ae6 100644 (file)
@@ -780,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req)
        if (blk_do_io_stat(req)) {
                part_stat_lock();
                part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+               part_stat_local_dec(req->part,
+                                   in_flight[op_is_write(req_op(req))]);
                part_stat_unlock();
        }
 }
index 9f677ea85a52df604d000f0873c1e040d72c9c50..8e01e4b32e100f45a5346d22fc652736a33cd0cd 100644 (file)
@@ -996,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
                update_io_ticks(req->part, jiffies, true);
                part_stat_inc(req->part, ios[sgrp]);
                part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+               part_stat_local_dec(req->part,
+                                   in_flight[op_is_write(req_op(req))]);
                part_stat_unlock();
        }
 }
@@ -1018,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req)
 
                part_stat_lock();
                update_io_ticks(req->part, jiffies, false);
+               part_stat_local_inc(req->part,
+                                   in_flight[op_is_write(req_op(req))]);
                part_stat_unlock();
        }
 }
index d5107e65355e271a02a9640e88e8e4e4c088d83b..3870bdcd5cad4d20921f08538bc2f1b13a8897fa 100644 (file)
@@ -366,6 +366,7 @@ static inline bool blk_do_io_stat(struct request *rq)
 }
 
 void update_io_ticks(struct block_device *part, unsigned long now, bool end);
+unsigned int part_in_flight(struct block_device *part);
 
 static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 {
index dec2ee338fb44ae3d94010d21f3617e3ffd5337a..8f1163d2d17166ed71bd33a1a565e118f2bc3855 100644 (file)
@@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part,
        }
 }
 
-static unsigned int part_in_flight(struct block_device *part)
+unsigned int part_in_flight(struct block_device *part)
 {
        unsigned int inflight = 0;
        int cpu;