block: add a struct io_comp_batch argument to fops->iopoll()
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 108a352051be5fea9c7a5c132576844c3f3ee8a7..79c25b64e8b090f0f153404f12ed591923a6d9d1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
 #include <linux/kmemleak.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/smp.h>
+#include <linux/interrupt.h>
 #include <linux/llist.h>
 #include <linux/list_sort.h>
 #include <linux/cpu.h>
@@ -63,6 +65,32 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
        return bucket;
 }
 
+#define BLK_QC_T_SHIFT         16
+#define BLK_QC_T_INTERNAL      (1U << 31)
+
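+/*
+ * A poll cookie encodes the hardware queue index in the bits above
+ * BLK_QC_T_SHIFT and the tag in the low bits; BLK_QC_T_INTERNAL marks a
+ * scheduler (internal) tag so blk_qc_to_rq() knows whether to look the
+ * request up in sched_tags or in the driver tags.
+ */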
+static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
+               blk_qc_t qc)
+{
+       return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
+}
+
+static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
+               blk_qc_t qc)
+{
+       unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
+
+       if (qc & BLK_QC_T_INTERNAL)
+               return blk_mq_tag_to_rq(hctx->sched_tags, tag);
+       return blk_mq_tag_to_rq(hctx->tags, tag);
+}
+
+static inline blk_qc_t blk_rq_to_qc(struct request *rq)
+{
+       return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
+               (rq->tag != -1 ?
+                rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+}
+
 /*
  * Check if any of the ctx, dispatch list or elevator
  * have pending work in this hardware queue.
@@ -188,9 +216,11 @@ void blk_mq_freeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
-void blk_mq_unfreeze_queue(struct request_queue *q)
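+/*
+ * force_atomic keeps q_usage_counter in atomic mode when the final freeze
+ * reference is dropped; queue/disk teardown relies on this when unfreezing
+ * a queue that is going away.
+ */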
+void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 {
        mutex_lock(&q->mq_freeze_lock);
+       if (force_atomic)
+               q->q_usage_counter.data->force_atomic = true;
        q->mq_freeze_depth--;
        WARN_ON_ONCE(q->mq_freeze_depth < 0);
        if (!q->mq_freeze_depth) {
@@ -199,6 +229,11 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
        }
        mutex_unlock(&q->mq_freeze_lock);
 }
+
+void blk_mq_unfreeze_queue(struct request_queue *q)
+{
+       __blk_mq_unfreeze_queue(q, false);
+}
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
 /*
@@ -271,16 +306,22 @@ void blk_mq_wake_waiters(struct request_queue *q)
  */
 static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
-       return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
+       return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
 }
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                unsigned int tag, u64 alloc_time_ns)
 {
+       struct blk_mq_ctx *ctx = data->ctx;
+       struct blk_mq_hw_ctx *hctx = data->hctx;
+       struct request_queue *q = data->q;
+       struct elevator_queue *e = q->elevator;
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
+       unsigned int rq_flags = 0;
 
-       if (data->q->elevator) {
+       if (e) {
+               rq_flags = RQF_ELV;
                rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
@@ -288,51 +329,51 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                rq->internal_tag = BLK_MQ_NO_TAG;
        }
 
+       if (data->flags & BLK_MQ_REQ_PM)
+               rq_flags |= RQF_PM;
+       if (blk_queue_io_stat(q))
+               rq_flags |= RQF_IO_STAT;
+       rq->rq_flags = rq_flags;
+
+       if (blk_mq_need_time_stamp(rq))
+               rq->start_time_ns = ktime_get_ns();
+       else
+               rq->start_time_ns = 0;
        /* csd/requeue_work/fifo_time is initialized before use */
-       rq->q = data->q;
-       rq->mq_ctx = data->ctx;
-       rq->mq_hctx = data->hctx;
-       rq->rq_flags = 0;
+       rq->q = q;
+       rq->mq_ctx = ctx;
+       rq->mq_hctx = hctx;
        rq->cmd_flags = data->cmd_flags;
-       if (data->flags & BLK_MQ_REQ_PM)
-               rq->rq_flags |= RQF_PM;
-       if (blk_queue_io_stat(data->q))
-               rq->rq_flags |= RQF_IO_STAT;
-       INIT_LIST_HEAD(&rq->queuelist);
-       INIT_HLIST_NODE(&rq->hash);
-       RB_CLEAR_NODE(&rq->rb_node);
        rq->rq_disk = NULL;
        rq->part = NULL;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
        rq->alloc_time_ns = alloc_time_ns;
 #endif
-       if (blk_mq_need_time_stamp(rq))
-               rq->start_time_ns = ktime_get_ns();
-       else
-               rq->start_time_ns = 0;
        rq->io_start_time_ns = 0;
        rq->stats_sectors = 0;
        rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
 #endif
-       blk_crypto_rq_set_defaults(rq);
-       /* tag was already set */
-       WRITE_ONCE(rq->deadline, 0);
-
        rq->timeout = 0;
-
        rq->end_io = NULL;
        rq->end_io_data = NULL;
 
-       data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
+       blk_crypto_rq_set_defaults(rq);
+       INIT_LIST_HEAD(&rq->queuelist);
+       /* tag was already set */
+       WRITE_ONCE(rq->deadline, 0);
        refcount_set(&rq->ref, 1);
 
-       if (!op_is_flush(data->cmd_flags)) {
+       if (rq->rq_flags & RQF_ELV) {
                struct elevator_queue *e = data->q->elevator;
 
                rq->elv.icq = NULL;
-               if (e && e->type->ops.prepare_request) {
+               INIT_HLIST_NODE(&rq->hash);
+               RB_CLEAR_NODE(&rq->rb_node);
+
+               if (!op_is_flush(data->cmd_flags) &&
+                   e->type->ops.prepare_request) {
                        if (e->type->icq_cache)
                                blk_mq_sched_assign_ioc(rq);
 
@@ -341,15 +382,41 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                }
        }
 
-       data->hctx->queued++;
        return rq;
 }
 
-static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
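+/*
+ * Batched allocation: grab up to data->nr_tags tags with one sbitmap
+ * operation, set up a request for every bit in the returned mask, stash
+ * them on data->cached_rq and hand the first one back to the caller.
+ */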
+static inline struct request *
+__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
+               u64 alloc_time_ns)
+{
+       unsigned int tag, tag_offset;
+       struct request *rq;
+       unsigned long tags;
+       int i, nr = 0;
+
+       tags = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
+       if (unlikely(!tags))
+               return NULL;
+
+       for (i = 0; tags; i++) {
+               if (!(tags & (1UL << i)))
+                       continue;
+               tag = tag_offset + i;
+               tags &= ~(1UL << i);
+               rq = blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
+               rq_list_add(data->cached_rq, rq);
+               nr++;
+       }
+       data->nr_tags -= nr;
+
+       return rq_list_pop(data->cached_rq);
+}
+
+static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 {
        struct request_queue *q = data->q;
        struct elevator_queue *e = q->elevator;
        u64 alloc_time_ns = 0;
+       struct request *rq;
        unsigned int tag;
 
        /* alloc_time includes depth and tag waits */
@@ -378,6 +445,16 @@ retry:
        if (!e)
                blk_mq_tag_busy(data->hctx);
 
+       /*
+        * Try batched alloc if we want more than 1 tag.
+        */
+       if (data->nr_tags > 1) {
+               rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
+               if (rq)
+                       return rq;
+               data->nr_tags = 1;
+       }
+
        /*
         * Waiting allocations only fail because of an inactive hctx.  In that
         * case just retry the hctx assignment and tag allocation as CPU hotplug
@@ -387,15 +464,16 @@ retry:
        if (tag == BLK_MQ_NO_TAG) {
                if (data->flags & BLK_MQ_REQ_NOWAIT)
                        return NULL;
-
                /*
-                * Give up the CPU and sleep for a random short time to ensure
-                * that thread using a realtime scheduling class are migrated
-                * off the CPU, and thus off the hctx that is going away.
+                * Give up the CPU and sleep for a random short time to
+                * ensure that threads using a realtime scheduling class
+                * are migrated off the CPU, and thus off the hctx that
+                * is going away.
                 */
                msleep(3);
                goto retry;
        }
+
        return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
 }
 
@@ -406,6 +484,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                .q              = q,
                .flags          = flags,
                .cmd_flags      = op,
+               .nr_tags        = 1,
        };
        struct request *rq;
        int ret;
@@ -414,7 +493,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (ret)
                return ERR_PTR(ret);
 
-       rq = __blk_mq_alloc_request(&data);
+       rq = __blk_mq_alloc_requests(&data);
        if (!rq)
                goto out_queue_exit;
        rq->__data_len = 0;
@@ -434,6 +513,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                .q              = q,
                .flags          = flags,
                .cmd_flags      = op,
+               .nr_tags        = 1,
        };
        u64 alloc_time_ns = 0;
        unsigned int cpu;
@@ -507,12 +587,12 @@ static void __blk_mq_free_request(struct request *rq)
 void blk_mq_free_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
-       struct elevator_queue *e = q->elevator;
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
-       if (rq->rq_flags & RQF_ELVPRIV) {
-               if (e && e->type->ops.finish_request)
+       if (rq->rq_flags & (RQF_ELVPRIV | RQF_ELV)) {
+               struct elevator_queue *e = q->elevator;
+
+               if (e->type->ops.finish_request)
                        e->type->ops.finish_request(rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
@@ -520,7 +600,6 @@ void blk_mq_free_request(struct request *rq)
                }
        }
 
-       ctx->rq_completed[rq_is_sync(rq)]++;
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
                __blk_mq_dec_active_requests(hctx);
 
@@ -535,21 +614,173 @@ void blk_mq_free_request(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
-inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
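+/*
+ * Free any requests still sitting in a plug's request cache.  The cached
+ * requests do not hold a queue usage reference of their own, so take one
+ * here to balance the blk_queue_exit() done by blk_mq_free_request().
+ */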
+void blk_mq_free_plug_rqs(struct blk_plug *plug)
 {
-       u64 now = 0;
+       struct request *rq;
 
-       if (blk_mq_need_time_stamp(rq))
-               now = ktime_get_ns();
+       while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) {
+               percpu_ref_get(&rq->q->q_usage_counter);
+               blk_mq_free_request(rq);
+       }
+}
+
+static void req_bio_endio(struct request *rq, struct bio *bio,
+                         unsigned int nbytes, blk_status_t error)
+{
+       if (error)
+               bio->bi_status = error;
+
+       if (unlikely(rq->rq_flags & RQF_QUIET))
+               bio_set_flag(bio, BIO_QUIET);
+
+       bio_advance(bio, nbytes);
+
+       if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+               /*
+                * Partial zone append completions cannot be supported as the
+                * BIO fragments may end up not being written sequentially.
+                */
+               if (bio->bi_iter.bi_size)
+                       bio->bi_status = BLK_STS_IOERR;
+               else
+                       bio->bi_iter.bi_sector = rq->__sector;
+       }
+
+       /* don't actually finish bio if it's part of flush sequence */
+       if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
+               bio_endio(bio);
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+       if (req->part && blk_do_io_stat(req)) {
+               const int sgrp = op_stat_group(req_op(req));
+
+               part_stat_lock();
+               part_stat_add(req->part, sectors[sgrp], bytes >> 9);
+               part_stat_unlock();
+       }
+}
+
+/**
+ * blk_update_request - Complete multiple bytes without completing the request
+ * @req:      the request being processed
+ * @error:    block status code
+ * @nr_bytes: number of bytes to complete for @req
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @req, but doesn't complete
+ *     the request structure even if @req doesn't have leftover.
+ *     If @req has leftover, sets it up for the next range of segments.
+ *
+ *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
+ *     %false return from this function.
+ *
+ * Note:
+ *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
+ *      except in the consistency check at the end of this function.
+ *
+ * Return:
+ *     %false - this request doesn't have any more data
+ *     %true  - this request has more data
+ **/
+bool blk_update_request(struct request *req, blk_status_t error,
+               unsigned int nr_bytes)
+{
+       int total_bytes;
+
+       trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
+
+       if (!req->bio)
+               return false;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+       if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+           error == BLK_STS_OK)
+               req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
+       if (unlikely(error && !blk_rq_is_passthrough(req) &&
+                    !(req->rq_flags & RQF_QUIET)))
+               blk_print_req_error(req, error);
+
+       blk_account_io_completion(req, nr_bytes);
+
+       total_bytes = 0;
+       while (req->bio) {
+               struct bio *bio = req->bio;
+               unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
+
+               if (bio_bytes == bio->bi_iter.bi_size)
+                       req->bio = bio->bi_next;
+
+               /* Completion has already been traced */
+               bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+               req_bio_endio(req, bio, bio_bytes, error);
+
+               total_bytes += bio_bytes;
+               nr_bytes -= bio_bytes;
+
+               if (!nr_bytes)
+                       break;
+       }
+
+       /*
+        * completely done
+        */
+       if (!req->bio) {
+               /*
+                * Reset counters so that the request stacking driver
+                * can find how many bytes remain in the request
+                * later.
+                */
+               req->__data_len = 0;
+               return false;
+       }
+
+       req->__data_len -= total_bytes;
+
+       /* update sector only for requests with clear definition of sector */
+       if (!blk_rq_is_passthrough(req))
+               req->__sector += total_bytes >> 9;
+
+       /* mixed attributes always follow the first bio */
+       if (req->rq_flags & RQF_MIXED_MERGE) {
+               req->cmd_flags &= ~REQ_FAILFAST_MASK;
+               req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
+       }
 
-       if (rq->rq_flags & RQF_STATS) {
-               blk_mq_poll_stats_start(rq->q);
-               blk_stat_add(rq, now);
+       if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
+               /*
+                * If total number of sectors is less than the first segment
+                * size, something has gone terribly wrong.
+                */
+               if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+                       blk_dump_rq_flags(req, "request botched");
+                       req->__data_len = blk_rq_cur_bytes(req);
+               }
+
+               /* recalculate the number of segments */
+               req->nr_phys_segments = blk_recalc_rq_segments(req);
        }
 
-       blk_mq_sched_completed_request(rq, now);
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
 
-       blk_account_io_done(rq, now);
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+{
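+       /*
+        * Only read the clock when something consumes the timestamp: poll
+        * stats (RQF_STATS), I/O accounting (RQF_IO_STAT) or an I/O
+        * scheduler (RQF_ELV); see blk_mq_need_time_stamp().
+        */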
+       if (blk_mq_need_time_stamp(rq)) {
+               u64 now = ktime_get_ns();
+
+               if (rq->rq_flags & RQF_STATS) {
+                       blk_mq_poll_stats_start(rq->q);
+                       blk_stat_add(rq, now);
+               }
+
+               blk_mq_sched_completed_request(rq, now);
+               blk_account_io_done(rq, now);
+       }
 
        if (rq->end_io) {
                rq_qos_done(rq->q, rq);
@@ -651,7 +882,7 @@ bool blk_mq_complete_request_remote(struct request *rq)
         * For a polled request, always complete locally, it's pointless
         * to redirect the completion.
         */
-       if (rq->cmd_flags & REQ_HIPRI)
+       if (rq->cmd_flags & REQ_POLLED)
                return false;
 
        if (blk_mq_complete_need_ipi(rq)) {
@@ -716,7 +947,14 @@ void blk_mq_start_request(struct request *rq)
        trace_block_rq_issue(rq);
 
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-               rq->io_start_time_ns = ktime_get_ns();
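+               /*
+                * With CONFIG_BLK_CGROUP the bio already carries an issue
+                * timestamp, so reuse it as the I/O start time instead of
+                * reading the clock again.
+                */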
+               u64 start_time;
+#ifdef CONFIG_BLK_CGROUP
+               if (rq->bio)
+                       start_time = bio_issue_time(&rq->bio->bi_issue);
+               else
+#endif
+                       start_time = ktime_get_ns();
+               rq->io_start_time_ns = start_time;
                rq->stats_sectors = blk_rq_sectors(rq);
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
@@ -731,6 +969,8 @@ void blk_mq_start_request(struct request *rq)
        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                q->integrity.profile->prepare_fn(rq);
 #endif
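+       /*
+        * Store the poll cookie in the bio so that polling can later find
+        * the hardware queue and request to poll for.
+        */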
+       if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
+               WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq));
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
@@ -1052,24 +1292,16 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
        return data.rq;
 }
 
-static inline unsigned int queued_to_index(unsigned int queued)
-{
-       if (!queued)
-               return 0;
-
-       return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
-}
-
 static bool __blk_mq_get_driver_tag(struct request *rq)
 {
-       struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
+       struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
        unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
        int tag;
 
        blk_mq_tag_busy(rq->mq_hctx);
 
        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
-               bt = rq->mq_hctx->tags->breserved_tags;
+               bt = &rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
        } else {
                if (!hctx_may_queue(rq->mq_hctx, bt))
@@ -1112,7 +1344,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                struct sbitmap_queue *sbq;
 
                list_del_init(&wait->entry);
-               sbq = hctx->tags->bitmap_tags;
+               sbq = &hctx->tags->bitmap_tags;
                atomic_dec(&sbq->ws_active);
        }
        spin_unlock(&hctx->dispatch_wait_lock);
@@ -1130,7 +1362,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
 {
-       struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
+       struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
        struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;
@@ -1383,8 +1615,6 @@ out:
        if (!list_empty(&zone_list))
                list_splice_tail_init(&zone_list, list);
 
-       hctx->dispatched[queued_to_index(queued)]++;
-
        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
@@ -1957,19 +2187,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
-                                           struct request *rq,
-                                           blk_qc_t *cookie, bool last)
+                                           struct request *rq, bool last)
 {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .last = last,
        };
-       blk_qc_t new_cookie;
        blk_status_t ret;
 
-       new_cookie = request_to_qc_t(hctx, rq);
-
        /*
         * For OK queue, we are done. For error, caller may kill it.
         * Any other error (busy), just add it to our list as we
@@ -1979,7 +2205,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
        switch (ret) {
        case BLK_STS_OK:
                blk_mq_update_dispatch_busy(hctx, false);
-               *cookie = new_cookie;
                break;
        case BLK_STS_RESOURCE:
        case BLK_STS_DEV_RESOURCE:
@@ -1988,7 +2213,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                break;
        default:
                blk_mq_update_dispatch_busy(hctx, false);
-               *cookie = BLK_QC_T_NONE;
                break;
        }
 
@@ -1997,7 +2221,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                                                struct request *rq,
-                                               blk_qc_t *cookie,
                                                bool bypass_insert, bool last)
 {
        struct request_queue *q = rq->q;
@@ -2017,7 +2240,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                goto insert;
        }
 
-       if (q->elevator && !bypass_insert)
+       if ((rq->rq_flags & RQF_ELV) && !bypass_insert)
                goto insert;
 
        budget_token = blk_mq_get_dispatch_budget(q);
@@ -2031,7 +2254,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                goto insert;
        }
 
-       return __blk_mq_issue_directly(hctx, rq, cookie, last);
+       return __blk_mq_issue_directly(hctx, rq, last);
 insert:
        if (bypass_insert)
                return BLK_STS_RESOURCE;
@@ -2045,7 +2268,6 @@ insert:
  * blk_mq_try_issue_directly - Try to send a request directly to device driver.
  * @hctx: Pointer of the associated hardware queue.
  * @rq: Pointer to request to be sent.
- * @cookie: Request queue cookie.
  *
  * If the device has enough resources to accept a new request now, send the
  * request directly to device driver. Else, insert at hctx->dispatch queue, so
@@ -2053,7 +2275,7 @@ insert:
  * queue have higher priority.
  */
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-               struct request *rq, blk_qc_t *cookie)
+               struct request *rq)
 {
        blk_status_t ret;
        int srcu_idx;
@@ -2062,7 +2284,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
        hctx_lock(hctx, &srcu_idx);
 
-       ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
+       ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
                blk_mq_request_bypass_insert(rq, false, true);
        else if (ret != BLK_STS_OK)
@@ -2075,11 +2297,10 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 {
        blk_status_t ret;
        int srcu_idx;
-       blk_qc_t unused_cookie;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
        hctx_lock(hctx, &srcu_idx);
-       ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
+       ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
        hctx_unlock(hctx, srcu_idx);
 
        return ret;
@@ -2136,14 +2357,14 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
 }
 
 /*
- * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
  * queues. This is important for md arrays to benefit from merging
  * requests.
  */
 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
 {
        if (plug->multiple_queues)
-               return BLK_MAX_REQUEST_COUNT * 4;
+               return BLK_MAX_REQUEST_COUNT * 2;
        return BLK_MAX_REQUEST_COUNT;
 }
 
@@ -2159,27 +2380,21 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
  *
  * It will not queue the request if there is an error with the bio, or at the
  * request creation.
- *
- * Returns: Request queue cookie.
  */
-blk_qc_t blk_mq_submit_bio(struct bio *bio)
+void blk_mq_submit_bio(struct bio *bio)
 {
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
-       struct blk_mq_alloc_data data = {
-               .q              = q,
-       };
        struct request *rq;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
-       unsigned int nr_segs;
-       blk_qc_t cookie;
+       unsigned int nr_segs = 1;
        blk_status_t ret;
-       bool hipri;
 
        blk_queue_bounce(q, &bio);
-       __blk_queue_split(&bio, &nr_segs);
+       if (blk_may_split(q, bio))
+               __blk_queue_split(q, &bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
                goto queue_exit;
@@ -2193,23 +2408,35 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
 
        rq_qos_throttle(q, bio);
 
-       hipri = bio->bi_opf & REQ_HIPRI;
-
-       data.cmd_flags = bio->bi_opf;
-       rq = __blk_mq_alloc_request(&data);
-       if (unlikely(!rq)) {
-               rq_qos_cleanup(q, bio);
-               if (bio->bi_opf & REQ_NOWAIT)
-                       bio_wouldblock_error(bio);
-               goto queue_exit;
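+       /*
+        * Use a request pre-allocated into the plug cache if one is there;
+        * otherwise allocate one, asking for plug->nr_ios tags at once so
+        * that later bios submitted under this plug can reuse the surplus.
+        */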
+       plug = blk_mq_plug(q, bio);
+       if (plug && plug->cached_rq) {
+               rq = rq_list_pop(&plug->cached_rq);
+               INIT_LIST_HEAD(&rq->queuelist);
+       } else {
+               struct blk_mq_alloc_data data = {
+                       .q              = q,
+                       .nr_tags        = 1,
+                       .cmd_flags      = bio->bi_opf,
+               };
+
+               if (plug) {
+                       data.nr_tags = plug->nr_ios;
+                       plug->nr_ios = 1;
+                       data.cached_rq = &plug->cached_rq;
+               }
+               rq = __blk_mq_alloc_requests(&data);
+               if (unlikely(!rq)) {
+                       rq_qos_cleanup(q, bio);
+                       if (bio->bi_opf & REQ_NOWAIT)
+                               bio_wouldblock_error(bio);
+                       goto queue_exit;
+               }
        }
 
        trace_block_getrq(bio);
 
        rq_qos_track(q, rq, bio);
 
-       cookie = request_to_qc_t(data.hctx, rq);
-
        blk_mq_bio_to_request(rq, bio, nr_segs);
 
        ret = blk_crypto_init_request(rq);
@@ -2217,16 +2444,16 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                bio->bi_status = ret;
                bio_endio(bio);
                blk_mq_free_request(rq);
-               return BLK_QC_T_NONE;
+               return;
        }
 
-       plug = blk_mq_plug(q, bio);
        if (unlikely(is_flush_fua)) {
+               struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
                /* Bypass scheduler for flush requests */
                blk_insert_flush(rq);
-               blk_mq_run_hw_queue(data.hctx, true);
+               blk_mq_run_hw_queue(hctx, true);
        } else if (plug && (q->nr_hw_queues == 1 ||
-                  blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
+                  blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
                   q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
                /*
                 * Use plugging if we have a ->commit_rqs() hook as well, as
@@ -2250,7 +2477,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                }
 
                blk_add_rq_to_plug(plug, rq);
-       } else if (q->elevator) {
+       } else if (rq->rq_flags & RQF_ELV) {
                /* Insert the request at the IO scheduler queue */
                blk_mq_sched_insert_request(rq, false, true, true);
        } else if (plug && !blk_queue_nomerges(q)) {
@@ -2271,29 +2498,25 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                trace_block_plug(q);
 
                if (same_queue_rq) {
-                       data.hctx = same_queue_rq->mq_hctx;
                        trace_block_unplug(q, 1, true);
-                       blk_mq_try_issue_directly(data.hctx, same_queue_rq,
-                                       &cookie);
+                       blk_mq_try_issue_directly(same_queue_rq->mq_hctx,
+                                                 same_queue_rq);
                }
        } else if ((q->nr_hw_queues > 1 && is_sync) ||
-                       !data.hctx->dispatch_busy) {
+                  !rq->mq_hctx->dispatch_busy) {
                /*
                 * There is no scheduler and we can try to send directly
                 * to the hardware.
                 */
-               blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+               blk_mq_try_issue_directly(rq->mq_hctx, rq);
        } else {
                /* Default case. */
                blk_mq_sched_insert_request(rq, false, true, true);
        }
 
-       if (!hipri)
-               return BLK_QC_T_NONE;
-       return cookie;
+       return;
 queue_exit:
        blk_queue_exit(q);
-       return BLK_QC_T_NONE;
 }
 
 static size_t order_to_size(unsigned int order)
@@ -2302,19 +2525,22 @@ static size_t order_to_size(unsigned int order)
 }
 
 /* called before freeing request pool in @tags */
-static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
-               struct blk_mq_tags *tags, unsigned int hctx_idx)
+static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
+                                   struct blk_mq_tags *tags)
 {
-       struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
        struct page *page;
        unsigned long flags;
 
+       /* There is no need to clear the driver tags' own mapping */
+       if (drv_tags == tags)
+               return;
+
        list_for_each_entry(page, &tags->page_list, lru) {
                unsigned long start = (unsigned long)page_address(page);
                unsigned long end = start + order_to_size(page->private);
                int i;
 
-               for (i = 0; i < set->queue_depth; i++) {
+               for (i = 0; i < drv_tags->nr_tags; i++) {
                        struct request *rq = drv_tags->rqs[i];
                        unsigned long rq_addr = (unsigned long)rq;
 
@@ -2338,9 +2564,15 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
 {
+       struct blk_mq_tags *drv_tags;
        struct page *page;
 
-       if (tags->rqs && set->ops->exit_request) {
+       if (blk_mq_is_shared_tags(set->flags))
+               drv_tags = set->shared_tags;
+       else
+               drv_tags = set->tags[hctx_idx];
+
+       if (tags->static_rqs && set->ops->exit_request) {
                int i;
 
                for (i = 0; i < tags->nr_tags; i++) {
@@ -2353,7 +2585,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                }
        }
 
-       blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+       blk_mq_clear_rq_mapping(drv_tags, tags);
 
        while (!list_empty(&tags->page_list)) {
                page = list_first_entry(&tags->page_list, struct page, lru);
@@ -2367,21 +2599,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
        }
 }
 
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 {
        kfree(tags->rqs);
        tags->rqs = NULL;
        kfree(tags->static_rqs);
        tags->static_rqs = NULL;
 
-       blk_mq_free_tags(tags, flags);
+       blk_mq_free_tags(tags);
 }
 
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-                                       unsigned int hctx_idx,
-                                       unsigned int nr_tags,
-                                       unsigned int reserved_tags,
-                                       unsigned int flags)
+static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+                                              unsigned int hctx_idx,
+                                              unsigned int nr_tags,
+                                              unsigned int reserved_tags)
 {
        struct blk_mq_tags *tags;
        int node;
@@ -2390,7 +2621,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
 
-       tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
+       tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
+                               BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
        if (!tags)
                return NULL;
 
@@ -2398,7 +2630,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 node);
        if (!tags->rqs) {
-               blk_mq_free_tags(tags, flags);
+               blk_mq_free_tags(tags);
                return NULL;
        }
 
@@ -2407,7 +2639,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                        node);
        if (!tags->static_rqs) {
                kfree(tags->rqs);
-               blk_mq_free_tags(tags, flags);
+               blk_mq_free_tags(tags);
                return NULL;
        }
 
@@ -2429,8 +2661,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
        return 0;
 }
 
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-                    unsigned int hctx_idx, unsigned int depth)
+static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
+                           struct blk_mq_tags *tags,
+                           unsigned int hctx_idx, unsigned int depth)
 {
        unsigned int i, j, entries_per_page, max_order = 4;
        size_t rq_size, left;
@@ -2841,37 +3074,58 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
        }
 }
 
-static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
-                                       int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                                            unsigned int hctx_idx,
+                                            unsigned int depth)
 {
-       unsigned int flags = set->flags;
-       int ret = 0;
+       struct blk_mq_tags *tags;
+       int ret;
 
-       set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
-                                       set->queue_depth, set->reserved_tags, flags);
-       if (!set->tags[hctx_idx])
-               return false;
+       tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
+       if (!tags)
+               return NULL;
 
-       ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
-                               set->queue_depth);
-       if (!ret)
-               return true;
+       ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
+       if (ret) {
+               blk_mq_free_rq_map(tags);
+               return NULL;
+       }
 
-       blk_mq_free_rq_map(set->tags[hctx_idx], flags);
-       set->tags[hctx_idx] = NULL;
-       return false;
+       return tags;
 }
 
-static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
-                                        unsigned int hctx_idx)
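+/*
+ * With shared tags every hardware queue maps to the set-wide shared_tags;
+ * otherwise each hctx gets its own tag map and request pool.
+ */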
+static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+                                      int hctx_idx)
 {
-       unsigned int flags = set->flags;
+       if (blk_mq_is_shared_tags(set->flags)) {
+               set->tags[hctx_idx] = set->shared_tags;
 
-       if (set->tags && set->tags[hctx_idx]) {
-               blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-               blk_mq_free_rq_map(set->tags[hctx_idx], flags);
-               set->tags[hctx_idx] = NULL;
+               return true;
        }
+
+       set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
+                                                      set->queue_depth);
+
+       return set->tags[hctx_idx];
+}
+
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                            struct blk_mq_tags *tags,
+                            unsigned int hctx_idx)
+{
+       if (tags) {
+               blk_mq_free_rqs(set, tags, hctx_idx);
+               blk_mq_free_rq_map(tags);
+       }
+}
+
+static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+                                     unsigned int hctx_idx)
+{
+       if (!blk_mq_is_shared_tags(set->flags))
+               blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
+
+       set->tags[hctx_idx] = NULL;
 }
 
 static void blk_mq_map_swqueue(struct request_queue *q)
@@ -2904,7 +3158,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                        hctx_idx = set->map[j].mq_map[i];
                        /* unmapped hw queue can be remapped after CPU topo changed */
                        if (!set->tags[hctx_idx] &&
-                           !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+                           !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
                                /*
                                 * If tags initialization fails for some hctx,
                                 * that hctx won't be brought online.  In this
@@ -2951,8 +3205,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                         * fallback in case of a new remap fails
                         * allocation
                         */
-                       if (i && set->tags[i])
-                               blk_mq_free_map_and_requests(set, i);
+                       if (i)
+                               __blk_mq_free_map_and_rqs(set, i);
 
                        hctx->tags = NULL;
                        continue;
@@ -3248,8 +3502,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx = hctxs[j];
 
                if (hctx) {
-                       if (hctx->tags)
-                               blk_mq_free_map_and_requests(set, j);
+                       __blk_mq_free_map_and_rqs(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
                        hctxs[j] = NULL;
                }
@@ -3336,8 +3589,16 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
        int i;
 
+       if (blk_mq_is_shared_tags(set->flags)) {
+               set->shared_tags = blk_mq_alloc_map_and_rqs(set,
+                                               BLK_MQ_NO_HCTX_IDX,
+                                               set->queue_depth);
+               if (!set->shared_tags)
+                       return -ENOMEM;
+       }
+
        for (i = 0; i < set->nr_hw_queues; i++) {
-               if (!__blk_mq_alloc_map_and_request(set, i))
+               if (!__blk_mq_alloc_map_and_rqs(set, i))
                        goto out_unwind;
                cond_resched();
        }
@@ -3346,7 +3607,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 
 out_unwind:
        while (--i >= 0)
-               blk_mq_free_map_and_requests(set, i);
+               __blk_mq_free_map_and_rqs(set, i);
+
+       if (blk_mq_is_shared_tags(set->flags)) {
+               blk_mq_free_map_and_rqs(set, set->shared_tags,
+                                       BLK_MQ_NO_HCTX_IDX);
+       }
 
        return -ENOMEM;
 }
@@ -3356,7 +3622,7 @@ out_unwind:
  * may reduce the depth asked for, if memory is tight. set->queue_depth
  * will be updated to reflect the allocated depth.
  */
-static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
 {
        unsigned int depth;
        int err;
@@ -3522,27 +3788,15 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (ret)
                goto out_free_mq_map;
 
-       ret = blk_mq_alloc_map_and_requests(set);
+       ret = blk_mq_alloc_set_map_and_rqs(set);
        if (ret)
                goto out_free_mq_map;
 
-       if (blk_mq_is_sbitmap_shared(set->flags)) {
-               atomic_set(&set->active_queues_shared_sbitmap, 0);
-
-               if (blk_mq_init_shared_sbitmap(set)) {
-                       ret = -ENOMEM;
-                       goto out_free_mq_rq_maps;
-               }
-       }
-
        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);
 
        return 0;
 
-out_free_mq_rq_maps:
-       for (i = 0; i < set->nr_hw_queues; i++)
-               blk_mq_free_map_and_requests(set, i);
 out_free_mq_map:
        for (i = 0; i < set->nr_maps; i++) {
                kfree(set->map[i].mq_map);
@@ -3575,10 +3829,12 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
        int i, j;
 
        for (i = 0; i < set->nr_hw_queues; i++)
-               blk_mq_free_map_and_requests(set, i);
+               __blk_mq_free_map_and_rqs(set, i);
 
-       if (blk_mq_is_sbitmap_shared(set->flags))
-               blk_mq_exit_shared_sbitmap(set);
+       if (blk_mq_is_shared_tags(set->flags)) {
+               blk_mq_free_map_and_rqs(set, set->shared_tags,
+                                       BLK_MQ_NO_HCTX_IDX);
+       }
 
        for (j = 0; j < set->nr_maps; j++) {
                kfree(set->map[j].mq_map);
@@ -3613,20 +3869,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
                 * If we're using an MQ scheduler, just update the scheduler
                 * queue depth. This is similar to what the old code would do.
                 */
-               if (!hctx->sched_tags) {
-                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
-                                                       false);
-                       if (!ret && blk_mq_is_sbitmap_shared(set->flags))
-                               blk_mq_tag_resize_shared_sbitmap(set, nr);
-               } else {
+               if (hctx->sched_tags) {
                        ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
-                                                       nr, true);
-                       if (blk_mq_is_sbitmap_shared(set->flags)) {
-                               hctx->sched_tags->bitmap_tags =
-                                       &q->sched_bitmap_tags;
-                               hctx->sched_tags->breserved_tags =
-                                       &q->sched_breserved_tags;
-                       }
+                                                     nr, true);
+               } else {
+                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
+                                                     false);
                }
                if (ret)
                        break;
@@ -3635,9 +3883,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        }
        if (!ret) {
                q->nr_requests = nr;
-               if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
-                       sbitmap_queue_resize(&q->sched_bitmap_tags,
-                                            nr - set->reserved_tags);
+               if (blk_mq_is_shared_tags(set->flags)) {
+                       if (q->elevator)
+                               blk_mq_tag_update_sched_shared_tags(q);
+                       else
+                               blk_mq_tag_resize_shared_tags(set, nr);
+               }
        }
 
        blk_mq_unquiesce_queue(q);
@@ -3856,15 +4107,20 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
        return ret;
 }
 
-static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
-                                    struct request *rq)
+static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
 {
+       struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
+       struct request *rq = blk_qc_to_rq(hctx, qc);
        struct hrtimer_sleeper hs;
        enum hrtimer_mode mode;
        unsigned int nsecs;
        ktime_t kt;
 
-       if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
+       /*
+        * If a request has completed on a queue that uses an I/O scheduler,
+        * we won't get back a request from blk_qc_to_rq().
+        */
+       if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
                return false;
 
        /*
@@ -3906,92 +4162,37 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 
        __set_current_state(TASK_RUNNING);
        destroy_hrtimer_on_stack(&hs.timer);
-       return true;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q,
-                              struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
-{
-       struct request *rq;
-
-       if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
-               return false;
-
-       if (!blk_qc_t_is_internal(cookie))
-               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-       else {
-               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
-               /*
-                * With scheduling, if the request has completed, we'll
-                * get a NULL return here, as we clear the sched tag when
-                * that happens. The request still remains valid, like always,
-                * so we should be safe with just the NULL check.
-                */
-               if (!rq)
-                       return false;
-       }
-
-       return blk_mq_poll_hybrid_sleep(q, rq);
-}
-
-/**
- * blk_poll - poll for IO completions
- * @q:  the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- *    Poll for completions on the passed in queue. Returns number of
- *    completed entries found. If @spin is true, then blk_poll will continue
- *    looping until at least one completion is found, unless the task is
- *    otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
-{
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int state;
-
-       if (!blk_qc_t_valid(cookie) ||
-           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-               return 0;
-
-       if (current->plug)
-               blk_flush_plug_list(current->plug, false);
-
-       hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
 
        /*
-        * If we sleep, have the caller restart the poll loop to reset
-        * the state. Like for the other success return cases, the
-        * caller is responsible for checking if the IO completed. If
-        * the IO isn't complete, we'll get called again and will go
-        * straight to the busy poll loop. If specified not to spin,
-        * we also should not sleep.
+        * If we sleep, have the caller restart the poll loop to reset the
+        * state.  Like for the other success return cases, the caller is
+        * responsible for checking if the IO completed.  If the IO isn't
+        * complete, we'll get called again and will go straight to the busy
+        * poll loop.
         */
-       if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
-               return 1;
+       return true;
+}
 
-       hctx->poll_considered++;
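+/*
+ * Classic polling: keep calling the driver's ->poll() until it finds a
+ * completion, the task is marked runnable, need_resched() triggers, or
+ * BLK_POLL_ONESHOT limits us to a single pass.
+ */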
+static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
+                              struct io_comp_batch *iob, unsigned int flags)
+{
+       struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
+       long state = get_current_state();
+       int ret;
 
-       state = get_current_state();
        do {
-               int ret;
-
-               hctx->poll_invoked++;
-
-               ret = q->mq_ops->poll(hctx);
+               ret = q->mq_ops->poll(hctx, iob);
                if (ret > 0) {
-                       hctx->poll_success++;
                        __set_current_state(TASK_RUNNING);
                        return ret;
                }
 
                if (signal_pending_state(state, current))
                        __set_current_state(TASK_RUNNING);
-
                if (task_is_running(current))
                        return 1;
-               if (ret < 0 || !spin)
+
+               if (ret < 0 || (flags & BLK_POLL_ONESHOT))
                        break;
                cpu_relax();
        } while (!need_resched());
@@ -3999,7 +4200,17 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
        __set_current_state(TASK_RUNNING);
        return 0;
 }
-EXPORT_SYMBOL_GPL(blk_poll);
+
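+/*
+ * Poll entry point: try hybrid (sleeping) polling first unless the caller
+ * passed BLK_POLL_NOSLEEP or classic polling is configured, then fall back
+ * to classic polling and pass the io_comp_batch through to ->poll().
+ */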
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+               unsigned int flags)
+{
+       if (!(flags & BLK_POLL_NOSLEEP) &&
+           q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
+               if (blk_mq_poll_hybrid(q, cookie))
+                       return 1;
+       }
+       return blk_mq_poll_classic(q, cookie, iob, flags);
+}
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {