blk-mq: remove hybrid polling
Author:     Keith Busch <kbusch@kernel.org>
AuthorDate: Mon, 20 Mar 2023 19:49:26 +0000 (12:49 -0700)
Commit:     Jens Axboe <axboe@kernel.dk>
CommitDate: Mon, 20 Mar 2023 21:30:03 +0000 (15:30 -0600)
io_uring provides the only way user space can poll completions, and that
always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
code, so remove it and everything supporting it.

Hybrid polling was effectively killed off by commit 9650b453a3d4b1 ("block:
ignore RWF_HIPRI hint for sync dio"), but remained potentially reachable
through io_uring until commit d729cf9acb93119 ("io_uring: don't sleep when
polling for I/O"). Hybrid polling probably should not have been reachable
through that async interface in the first place.
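
To make the dead-code claim concrete, here is the gate this patch removes
(condensed from the block/blk-mq.c hunk below; not a standalone build unit).
io_uring's io_do_iopoll() always passed BLK_POLL_NOSLEEP, so the first
condition failed regardless of the sysfs io_poll_delay setting:

	int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
			struct io_comp_batch *iob, unsigned int flags)
	{
		/* io_uring always sets BLK_POLL_NOSLEEP, so this is never true */
		if (!(flags & BLK_POLL_NOSLEEP) &&
		    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
			if (blk_mq_poll_hybrid(q, cookie))
				return 1;
		}
		return blk_mq_poll_classic(q, cookie, iob, flags);
	}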

Fixes: 9650b453a3d4 ("block: ignore RWF_HIPRI hint for sync dio")
Fixes: d729cf9acb93 ("io_uring: don't sleep when polling for I/O")
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20230320194926.3353144-1-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Documentation/ABI/stable/sysfs-block
block/blk-core.c
block/blk-mq-debugfs.c
block/blk-mq.c
block/blk-stat.c
block/blk-sysfs.c
include/linux/blk-mq.h
include/linux/blkdev.h
io_uring/rw.c

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 282de3680367d4a3b3c86b1bdb54352d18c0fedd..c57e5b7cb5326e4639051d8200c60c2dbdbf35bf 100644
@@ -336,18 +336,11 @@ What:             /sys/block/<disk>/queue/io_poll_delay
 Date:          November 2016
 Contact:       linux-block@vger.kernel.org
 Description:
-               [RW] If polling is enabled, this controls what kind of polling
-               will be performed. It defaults to -1, which is classic polling.
+               [RW] This was used to control what kind of polling will be
+               performed.  It is now fixed to -1, which is classic polling.
                In this mode, the CPU will repeatedly ask for completions
-               without giving up any time.  If set to 0, a hybrid polling mode
-               is used, where the kernel will attempt to make an educated guess
-               at when the IO will complete. Based on this guess, the kernel
-               will put the process issuing IO to sleep for an amount of time,
-               before entering a classic poll loop. This mode might be a little
-               slower than pure classic polling, but it will be more efficient.
-               If set to a value larger than 0, the kernel will put the process
-               issuing IO to sleep for this amount of microseconds before
-               entering classic polling.
+               without giving up any time.
+               <deprecated>
 
 
 What:          /sys/block/<disk>/queue/io_timeout
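
To observe the new behavior from user space, a minimal read sketch (assumes
a disk named nvme0n1 for illustration; any block device path works). Writes
to the attribute still succeed for backward compatibility but are ignored,
per the block/blk-sysfs.c hunk below:

	#include <stdio.h>

	int main(void)
	{
		const char *path = "/sys/block/nvme0n1/queue/io_poll_delay";
		char buf[16];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("io_poll_delay = %s", buf);	/* always "-1" now */
		fclose(f);
		return 0;
	}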
diff --git a/block/blk-core.c b/block/blk-core.c
index 9e5e0277a4d95a5e2f09b6cafaf4d8cd97ee5983..269765d16cfd9c65732bd6804dda54cbe0009958 100644
@@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 
 static void blk_free_queue(struct request_queue *q)
 {
-       if (q->poll_stat)
-               blk_stat_remove_callback(q, q->poll_cb);
-       blk_stat_free_callback(q->poll_cb);
-
        blk_free_queue_stats(q->stats);
-       kfree(q->poll_stat);
-
        if (queue_is_mq(q))
                blk_mq_release(q);
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b01818f8e216e35b45e766aff923d5a51e84f7c7..212a7f301e730279f48b5df63dd6a8a3d75918ad 100644
 #include "blk-mq-tag.h"
 #include "blk-rq-qos.h"
 
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
-       if (stat->nr_samples) {
-               seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
-                          stat->nr_samples, stat->mean, stat->min, stat->max);
-       } else {
-               seq_puts(m, "samples=0");
-       }
-}
-
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
-       struct request_queue *q = data;
-       int bucket;
-
-       if (!q->poll_stat)
-               return 0;
-
-       for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
-               seq_printf(m, "read  (%d Bytes): ", 1 << (9 + bucket));
-               print_stat(m, &q->poll_stat[2 * bucket]);
-               seq_puts(m, "\n");
-
-               seq_printf(m, "write (%d Bytes): ",  1 << (9 + bucket));
-               print_stat(m, &q->poll_stat[2 * bucket + 1]);
-               seq_puts(m, "\n");
-       }
        return 0;
 }
 
@@ -282,7 +257,6 @@ static const char *const rqf_name[] = {
        RQF_NAME(STATS),
        RQF_NAME(SPECIAL_PAYLOAD),
        RQF_NAME(ZONE_WRITE_LOCKED),
-       RQF_NAME(MQ_POLL_SLEPT),
        RQF_NAME(TIMED_OUT),
        RQF_NAME(ELV),
        RQF_NAME(RESV),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a875b1cdff9b5bc1f44541b5630d00eaccfc1a58..4e30459df81517ce9fc126882d2764e2f4bc9ad1 100644
 
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
-       int ddir, sectors, bucket;
-
-       ddir = rq_data_dir(rq);
-       sectors = blk_rq_stats_sectors(rq);
-
-       bucket = ddir + 2 * ilog2(sectors);
-
-       if (bucket < 0)
-               return -1;
-       else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
-               return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
-       return bucket;
-}
-
-#define BLK_QC_T_SHIFT         16
-#define BLK_QC_T_INTERNAL      (1U << 31)
-
 static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
                blk_qc_t qc)
 {
-       return xa_load(&q->hctx_table,
-                       (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
-}
-
-static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
-               blk_qc_t qc)
-{
-       unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
-
-       if (qc & BLK_QC_T_INTERNAL)
-               return blk_mq_tag_to_rq(hctx->sched_tags, tag);
-       return blk_mq_tag_to_rq(hctx->tags, tag);
+       return xa_load(&q->hctx_table, qc);
 }
 
 static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 {
-       return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
-               (rq->tag != -1 ?
-                rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+       return rq->mq_hctx->queue_num;
 }
 
 /*
@@ -1038,10 +1002,8 @@ static inline void blk_account_io_start(struct request *req)
 
 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 {
-       if (rq->rq_flags & RQF_STATS) {
-               blk_mq_poll_stats_start(rq->q);
+       if (rq->rq_flags & RQF_STATS)
                blk_stat_add(rq, now);
-       }
 
        blk_mq_sched_completed_request(rq, now);
        blk_account_io_done(rq, now);
@@ -4222,14 +4184,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        /* mark the queue as mq asap */
        q->mq_ops = set->ops;
 
-       q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
-                                            blk_mq_poll_stats_bkt,
-                                            BLK_MQ_POLL_STATS_BKTS, q);
-       if (!q->poll_cb)
-               goto err_exit;
-
        if (blk_mq_alloc_ctxs(q))
-               goto err_poll;
+               goto err_exit;
 
        /* init q->mq_kobj and sw queues' kobjects */
        blk_mq_sysfs_init(q);
@@ -4257,11 +4213,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
        q->nr_requests = set->queue_depth;
 
-       /*
-        * Default to classic polling
-        */
-       q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-
        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
        blk_mq_add_queue_tag_set(set, q);
        blk_mq_map_swqueue(q);
@@ -4269,9 +4220,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 err_hctxs:
        blk_mq_release(q);
-err_poll:
-       blk_stat_free_callback(q->poll_cb);
-       q->poll_cb = NULL;
 err_exit:
        q->mq_ops = NULL;
        return -ENOMEM;
@@ -4768,138 +4716,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
-       if (q->poll_stat)
-               return true;
-
-       return blk_stats_alloc_enable(q);
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
-       /*
-        * We don't arm the callback if polling stats are not enabled or the
-        * callback is already active.
-        */
-       if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
-               return;
-
-       blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
-       struct request_queue *q = cb->data;
-       int bucket;
-
-       for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
-               if (cb->stat[bucket].nr_samples)
-                       q->poll_stat[bucket] = cb->stat[bucket];
-       }
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
-                                      struct request *rq)
-{
-       unsigned long ret = 0;
-       int bucket;
-
-       /*
-        * If stats collection isn't on, don't sleep but turn it on for
-        * future users
-        */
-       if (!blk_poll_stats_enable(q))
-               return 0;
-
-       /*
-        * As an optimistic guess, use half of the mean service time
-        * for this type of request. We can (and should) make this smarter.
-        * For instance, if the completion latencies are tight, we can
-        * get closer than just half the mean. This is especially
-        * important on devices where the completion latencies are longer
-        * than ~10 usec. We do use the stats for the relevant IO size
-        * if available which does lead to better estimates.
-        */
-       bucket = blk_mq_poll_stats_bkt(rq);
-       if (bucket < 0)
-               return ret;
-
-       if (q->poll_stat[bucket].nr_samples)
-               ret = (q->poll_stat[bucket].mean + 1) / 2;
-
-       return ret;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
-{
-       struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
-       struct request *rq = blk_qc_to_rq(hctx, qc);
-       struct hrtimer_sleeper hs;
-       enum hrtimer_mode mode;
-       unsigned int nsecs;
-       ktime_t kt;
-
-       /*
-        * If a request has completed on queue that uses an I/O scheduler, we
-        * won't get back a request from blk_qc_to_rq.
-        */
-       if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
-               return false;
-
-       /*
-        * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
-        *
-        *  0:  use half of prev avg
-        * >0:  use this specific value
-        */
-       if (q->poll_nsec > 0)
-               nsecs = q->poll_nsec;
-       else
-               nsecs = blk_mq_poll_nsecs(q, rq);
-
-       if (!nsecs)
-               return false;
-
-       rq->rq_flags |= RQF_MQ_POLL_SLEPT;
-
-       /*
-        * This will be replaced with the stats tracking code, using
-        * 'avg_completion_time / 2' as the pre-sleep target.
-        */
-       kt = nsecs;
-
-       mode = HRTIMER_MODE_REL;
-       hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
-       hrtimer_set_expires(&hs.timer, kt);
-
-       do {
-               if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
-                       break;
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               hrtimer_sleeper_start_expires(&hs, mode);
-               if (hs.task)
-                       io_schedule();
-               hrtimer_cancel(&hs.timer);
-               mode = HRTIMER_MODE_ABS;
-       } while (hs.task && !signal_pending(current));
-
-       __set_current_state(TASK_RUNNING);
-       destroy_hrtimer_on_stack(&hs.timer);
-
-       /*
-        * If we sleep, have the caller restart the poll loop to reset the
-        * state.  Like for the other success return cases, the caller is
-        * responsible for checking if the IO completed.  If the IO isn't
-        * complete, we'll get called again and will go straight to the busy
-        * poll loop.
-        */
-       return true;
-}
-
-static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
-                              struct io_comp_batch *iob, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+               unsigned int flags)
 {
        struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
        long state = get_current_state();
@@ -4926,17 +4744,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
        return 0;
 }
 
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
-               unsigned int flags)
-{
-       if (!(flags & BLK_POLL_NOSLEEP) &&
-           q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
-               if (blk_mq_poll_hybrid(q, cookie))
-                       return 1;
-       }
-       return blk_mq_poll_classic(q, cookie, iob, flags);
-}
-
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
        return rq->mq_ctx->cpu;
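
A side effect worth noting (sketch condensed from this patch, not standalone
code): the poll cookie no longer has to identify an individual request. The
old encoding packed the hctx index and the request tag into blk_qc_t so
blk_mq_poll_hybrid() could look the request back up and sleep on it; classic
polling only needs the hardware queue, so the cookie reduces to the queue
number:

	/* old: hctx index in the high bits, (possibly internal) tag in the low bits */
	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
	       (rq->tag != -1 ? rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));

	/* new: just the hctx index, decoded with xa_load(&q->hctx_table, qc) */
	return rq->mq_hctx->queue_num;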
diff --git a/block/blk-stat.c b/block/blk-stat.c
index c6ca16abf911e25f682699dde6b9ec41c429e16d..74a1a8c32d86f8014c01a6db252f2899ff8c162e 100644
@@ -231,21 +231,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
 
        kfree(stats);
 }
-
-bool blk_stats_alloc_enable(struct request_queue *q)
-{
-       struct blk_rq_stat *poll_stat;
-
-       poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
-                               GFP_ATOMIC);
-       if (!poll_stat)
-               return false;
-
-       if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
-               kfree(poll_stat);
-               return true;
-       }
-
-       blk_stat_add_callback(q, q->poll_cb);
-       return false;
-}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f1fce1c7fa44b819caa60a305c7582e0d51f70ee..1a743b4f29582d627cbca4e128bac7d85d01143a 100644
@@ -408,35 +408,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
-       int val;
-
-       if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
-               val = BLK_MQ_POLL_CLASSIC;
-       else
-               val = q->poll_nsec / 1000;
-
-       return sprintf(page, "%d\n", val);
+       return sprintf(page, "%d\n", -1);
 }
 
 static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
                                size_t count)
 {
-       int err, val;
-
-       if (!q->mq_ops || !q->mq_ops->poll)
-               return -EINVAL;
-
-       err = kstrtoint(page, 10, &val);
-       if (err < 0)
-               return err;
-
-       if (val == BLK_MQ_POLL_CLASSIC)
-               q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-       else if (val >= 0)
-               q->poll_nsec = val * 1000;
-       else
-               return -EINVAL;
-
        return count;
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index dd5ce1137f04aeed0ce86b732b14d34c36710716..1dacb2c81fdda1d3a1fe384e61adcaf112166d39 100644
@@ -57,8 +57,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD    ((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT      ((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT          ((__force req_flags_t)(1 << 21))
 /* queue has elevator attached */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d1aee08f8c1811d645856ce4b92e24cb4cd236ab..6ede578dfbc6421722072296d1ef7b24d49ab6f3 100644
@@ -44,12 +44,6 @@ extern const struct device_type disk_type;
 extern struct device_type part_type;
 extern struct class block_class;
 
-/* Must be consistent with blk_mq_poll_stats_bkt() */
-#define BLK_MQ_POLL_STATS_BKTS 16
-
-/* Doing classic polling */
-#define BLK_MQ_POLL_CLASSIC -1
-
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -468,10 +462,6 @@ struct request_queue {
 #endif
 
        unsigned int            rq_timeout;
-       int                     poll_nsec;
-
-       struct blk_stat_callback        *poll_cb;
-       struct blk_rq_stat      *poll_stat;
 
        struct timer_list       timeout;
        struct work_struct      timeout_work;
@@ -870,8 +860,6 @@ blk_status_t errno_to_blk_status(int errno);
 
 /* only poll the hardware once, don't continue until a completion was found */
 #define BLK_POLL_ONESHOT               (1 << 0)
-/* do not sleep to wait for the expected completion time */
-#define BLK_POLL_NOSLEEP               (1 << 1)
 int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
 int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
                        unsigned int flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e20097e2833c9e3a6c2f78995c2b7947..a099dc0543d9555e92a20b7da22dccaa8d7b9572 100644
@@ -1002,7 +1002,7 @@ void io_rw_fail(struct io_kiocb *req)
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
        struct io_wq_work_node *pos, *start, *prev;
-       unsigned int poll_flags = BLK_POLL_NOSLEEP;
+       unsigned int poll_flags = 0;
        DEFINE_IO_COMP_BATCH(iob);
        int nr_events = 0;
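
For completeness, the surviving user-space path: a minimal polled-read
sketch using liburing (liburing, /dev/nvme0n1, and a device with poll
queues enabled are assumptions for illustration, not part of this patch;
error handling is trimmed). IORING_SETUP_IOPOLL rings reap completions by
busy-polling through io_do_iopoll(), which after this change always runs
classic polling:

	#define _GNU_SOURCE	/* for O_DIRECT */
	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <liburing.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		void *buf;
		int fd;

		/* O_DIRECT is required for polled block I/O */
		fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
		if (fd < 0)
			return 1;
		if (posix_memalign(&buf, 4096, 4096))
			return 1;
		if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL))
			return 1;

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_read(sqe, fd, buf, 4096, 0);
		io_uring_submit(&ring);

		/* busy-polls the device via io_do_iopoll() -> blk_mq_poll() */
		if (io_uring_wait_cqe(&ring, &cqe) == 0)
			io_uring_cqe_seen(&ring, cqe);

		io_uring_queue_exit(&ring);
		close(fd);
		free(buf);
		return 0;
	}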