diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8001fe9e3434734ad92c8109ef8fa860906238ce..f8c7ca3e8947378484a6d3f9745c363c78ab2879 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,9 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
 
 #include <trace/events/block.h>
 
@@ -216,8 +219,29 @@ struct mapped_device {
 
        struct kthread_worker kworker;
        struct task_struct *kworker_task;
+
+       /* for request-based merge heuristic in dm_request_fn() */
+       unsigned seq_rq_merge_deadline_usecs;
+       int last_rq_rw;
+       sector_t last_rq_pos;
+       ktime_t last_rq_start_time;
+
+       /* for blk-mq request-based DM support */
+       struct blk_mq_tag_set tag_set;
+       bool use_blk_mq;
 };
 
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+       return md->use_blk_mq;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -250,35 +274,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
-static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+static unsigned __dm_get_module_param(unsigned *module_param,
                                      unsigned def, unsigned max)
 {
-       unsigned ios = ACCESS_ONCE(*reserved_ios);
-       unsigned modified_ios = 0;
+       unsigned param = ACCESS_ONCE(*module_param);
+       unsigned modified_param = 0;
 
-       if (!ios)
-               modified_ios = def;
-       else if (ios > max)
-               modified_ios = max;
+       if (!param)
+               modified_param = def;
+       else if (param > max)
+               modified_param = max;
 
-       if (modified_ios) {
-               (void)cmpxchg(reserved_ios, ios, modified_ios);
-               ios = modified_ios;
+       if (modified_param) {
+               (void)cmpxchg(module_param, param, modified_param);
+               param = modified_param;
        }
 
-       return ios;
+       return param;
 }
 
 unsigned dm_get_reserved_bio_based_ios(void)
 {
-       return __dm_get_reserved_ios(&reserved_bio_based_ios,
+       return __dm_get_module_param(&reserved_bio_based_ios,
                                     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
 unsigned dm_get_reserved_rq_based_ios(void)
 {
-       return __dm_get_reserved_ios(&reserved_rq_based_ios,
+       return __dm_get_module_param(&reserved_rq_based_ios,
                                     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
@@ -1017,6 +1041,11 @@ static void end_clone_bio(struct bio *clone, int error)
        blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+       return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1024,10 +1053,13 @@ static void end_clone_bio(struct bio *clone, int error)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
+       int nr_requests_pending;
+
        atomic_dec(&md->pending[rw]);
 
        /* nudge anyone waiting on suspend queue */
-       if (!md_in_flight(md))
+       nr_requests_pending = md_in_flight(md);
+       if (!nr_requests_pending)
                wake_up(&md->wait);
 
        /*
@@ -1036,8 +1068,13 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
         * back into ->request_fn() could deadlock attempting to grab the
         * queue lock again.
         */
-       if (run_queue)
-               blk_run_queue_async(md->queue);
+       if (run_queue) {
+               if (md->queue->mq_ops)
+                       blk_mq_run_hw_queues(md->queue, true);
+               else if (!nr_requests_pending ||
+                        (nr_requests_pending >= md->queue->nr_congestion_on))
+                       blk_run_queue_async(md->queue);
+       }
 
        /*
         * dm_put() must be at the end of this function. See the comment above
@@ -1048,13 +1085,18 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 static void free_rq_clone(struct request *clone)
 {
        struct dm_rq_target_io *tio = clone->end_io_data;
+       struct mapped_device *md = tio->md;
 
        blk_rq_unprep_clone(clone);
-       if (clone->q && clone->q->mq_ops)
+
+       if (clone->q->mq_ops)
                tio->ti->type->release_clone_rq(clone);
-       else
-               free_clone_request(tio->md, clone);
-       free_rq_tio(tio);
+       else if (!md->queue->mq_ops)
+               /* request_fn queue stacked on request_fn queue(s) */
+               free_clone_request(md, clone);
+
+       if (!md->queue->mq_ops)
+               free_rq_tio(tio);
 }
 
 /*
@@ -1083,17 +1125,22 @@ static void dm_end_request(struct request *clone, int error)
        }
 
        free_rq_clone(clone);
-       blk_end_request_all(rq, error);
+       if (!rq->q->mq_ops)
+               blk_end_request_all(rq, error);
+       else
+               blk_mq_end_request(rq, error);
        rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
 {
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_rq_target_io *tio = tio_from_request(rq);
        struct request *clone = tio->clone;
 
-       rq->special = NULL;
-       rq->cmd_flags &= ~REQ_DONTPREP;
+       if (!rq->q->mq_ops) {
+               rq->special = NULL;
+               rq->cmd_flags &= ~REQ_DONTPREP;
+       }
 
        if (clone)
                free_rq_clone(clone);
@@ -1102,18 +1149,29 @@ static void dm_unprep_request(struct request *rq)
 /*
  * Requeue the original request of a clone.
  */
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-                                                struct request *rq)
+static void old_requeue_request(struct request *rq)
 {
-       int rw = rq_data_dir(rq);
        struct request_queue *q = rq->q;
        unsigned long flags;
 
-       dm_unprep_request(rq);
-
        spin_lock_irqsave(q->queue_lock, flags);
        blk_requeue_request(q, rq);
        spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+                                                struct request *rq)
+{
+       int rw = rq_data_dir(rq);
+
+       dm_unprep_request(rq);
+
+       if (!rq->q->mq_ops)
+               old_requeue_request(rq);
+       else {
+               blk_mq_requeue_request(rq);
+               blk_mq_kick_requeue_list(rq->q);
+       }
 
        rq_completed(md, rw, false);
 }
@@ -1125,35 +1183,44 @@ static void dm_requeue_unmapped_request(struct request *clone)
        dm_requeue_unmapped_original_request(tio->md, tio->orig);
 }
 
-static void __stop_queue(struct request_queue *q)
-{
-       blk_stop_queue(q);
-}
-
-static void stop_queue(struct request_queue *q)
+static void old_stop_queue(struct request_queue *q)
 {
        unsigned long flags;
 
+       if (blk_queue_stopped(q))
+               return;
+
        spin_lock_irqsave(q->queue_lock, flags);
-       __stop_queue(q);
+       blk_stop_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void __start_queue(struct request_queue *q)
+static void stop_queue(struct request_queue *q)
 {
-       if (blk_queue_stopped(q))
-               blk_start_queue(q);
+       if (!q->mq_ops)
+               old_stop_queue(q);
+       else
+               blk_mq_stop_hw_queues(q);
 }
 
-static void start_queue(struct request_queue *q)
+static void old_start_queue(struct request_queue *q)
 {
        unsigned long flags;
 
        spin_lock_irqsave(q->queue_lock, flags);
-       __start_queue(q);
+       if (blk_queue_stopped(q))
+               blk_start_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void start_queue(struct request_queue *q)
+{
+       if (!q->mq_ops)
+               old_start_queue(q);
+       else
+               blk_mq_start_stopped_hw_queues(q, true);
+}
+
 static void dm_done(struct request *clone, int error, bool mapped)
 {
        int r = error;
@@ -1192,13 +1259,20 @@ static void dm_done(struct request *clone, int error, bool mapped)
 static void dm_softirq_done(struct request *rq)
 {
        bool mapped = true;
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_rq_target_io *tio = tio_from_request(rq);
        struct request *clone = tio->clone;
+       int rw;
 
        if (!clone) {
-               blk_end_request_all(rq, tio->error);
-               rq_completed(tio->md, rq_data_dir(rq), false);
-               free_rq_tio(tio);
+               rw = rq_data_dir(rq);
+               if (!rq->q->mq_ops) {
+                       blk_end_request_all(rq, tio->error);
+                       rq_completed(tio->md, rw, false);
+                       free_rq_tio(tio);
+               } else {
+                       blk_mq_end_request(rq, tio->error);
+                       rq_completed(tio->md, rw, false);
+               }
                return;
        }
 
@@ -1214,7 +1288,7 @@ static void dm_softirq_done(struct request *rq)
  */
 static void dm_complete_request(struct request *rq, int error)
 {
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_rq_target_io *tio = tio_from_request(rq);
 
        tio->error = error;
        blk_complete_request(rq);
@@ -1233,7 +1307,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
 }
 
 /*
- * Called with the clone's queue lock held
+ * Called with the clone's queue lock held (for non-blk-mq)
  */
 static void end_clone_request(struct request *clone, int error)
 {
@@ -1693,7 +1767,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static void _dm_request(struct request_queue *q, struct bio *bio)
+static void dm_make_request(struct request_queue *q, struct bio *bio)
 {
        int rw = bio_data_dir(bio);
        struct mapped_device *md = q->queuedata;
@@ -1725,16 +1799,6 @@ int dm_request_based(struct mapped_device *md)
        return blk_queue_stackable(md->queue);
 }
 
-static void dm_request(struct request_queue *q, struct bio *bio)
-{
-       struct mapped_device *md = q->queuedata;
-
-       if (dm_request_based(md))
-               blk_queue_bio(q, bio);
-       else
-               _dm_request(q, bio);
-}
-
 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
        int r;
@@ -1787,15 +1851,25 @@ static int setup_clone(struct request *clone, struct request *rq,
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
                                struct dm_rq_target_io *tio, gfp_t gfp_mask)
 {
-       struct request *clone = alloc_clone_request(md, gfp_mask);
+       /*
+        * Do not allocate a clone if tio->clone was already set
+        * (see: dm_mq_queue_rq).
+        */
+       bool alloc_clone = !tio->clone;
+       struct request *clone;
 
-       if (!clone)
-               return NULL;
+       if (alloc_clone) {
+               clone = alloc_clone_request(md, gfp_mask);
+               if (!clone)
+                       return NULL;
+       } else
+               clone = tio->clone;
 
        blk_rq_init(NULL, clone);
        if (setup_clone(clone, rq, tio, gfp_mask)) {
                /* -ENOMEM */
-               free_clone_request(md, clone);
+               if (alloc_clone)
+                       free_clone_request(md, clone);
                return NULL;
        }
 
@@ -1804,6 +1878,19 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 
 static void map_tio_request(struct kthread_work *work);
 
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+                    struct mapped_device *md)
+{
+       tio->md = md;
+       tio->ti = NULL;
+       tio->clone = NULL;
+       tio->orig = rq;
+       tio->error = 0;
+       memset(&tio->info, 0, sizeof(tio->info));
+       if (md->kworker_task)
+               init_kthread_work(&tio->work, map_tio_request);
+}
+
 static struct dm_rq_target_io *prep_tio(struct request *rq,
                                        struct mapped_device *md, gfp_t gfp_mask)
 {
@@ -1815,13 +1902,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
        if (!tio)
                return NULL;
 
-       tio->md = md;
-       tio->ti = NULL;
-       tio->clone = NULL;
-       tio->orig = rq;
-       tio->error = 0;
-       memset(&tio->info, 0, sizeof(tio->info));
-       init_kthread_work(&tio->work, map_tio_request);
+       init_tio(tio, rq, md);
 
        table = dm_get_live_table(md, &srcu_idx);
        if (!dm_table_mq_request_based(table)) {
@@ -1865,11 +1946,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
  * DM_MAPIO_REQUEUE : the original request needs to be requeued
  * < 0              : the request was completed due to failure
  */
-static int map_request(struct dm_target *ti, struct request *rq,
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
                       struct mapped_device *md)
 {
        int r;
-       struct dm_rq_target_io *tio = rq->special;
+       struct dm_target *ti = tio->ti;
        struct request *clone = NULL;
 
        if (tio->clone) {
@@ -1884,7 +1965,7 @@ static int map_request(struct dm_target *ti, struct request *rq,
                }
                if (IS_ERR(clone))
                        return DM_MAPIO_REQUEUE;
-               if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+               if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
                        /* -ENOMEM */
                        ti->type->release_clone_rq(clone);
                        return DM_MAPIO_REQUEUE;
@@ -1925,15 +2006,24 @@ static void map_tio_request(struct kthread_work *work)
        struct request *rq = tio->orig;
        struct mapped_device *md = tio->md;
 
-       if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+       if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
                dm_requeue_unmapped_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
 {
-       blk_start_request(orig);
+       if (!orig->q->mq_ops)
+               blk_start_request(orig);
+       else
+               blk_mq_start_request(orig);
        atomic_inc(&md->pending[rq_data_dir(orig)]);
 
+       if (md->seq_rq_merge_deadline_usecs) {
+               md->last_rq_pos = rq_end_sector(orig);
+               md->last_rq_rw = rq_data_dir(orig);
+               md->last_rq_start_time = ktime_get();
+       }
+
        /*
         * Hold the md reference here for the in-flight I/O.
         * We can't rely on the reference count by device opener,
@@ -1944,6 +2034,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
        dm_get(md);
 }
 
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+       return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+                                                    const char *buf, size_t count)
+{
+       unsigned deadline;
+
+       if (!dm_request_based(md) || md->use_blk_mq)
+               return count;
+
+       if (kstrtouint(buf, 10, &deadline))
+               return -EINVAL;
+
+       if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
+               deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+
+       md->seq_rq_merge_deadline_usecs = deadline;
+
+       return count;
+}
+
+static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+       ktime_t kt_deadline;
+
+       if (!md->seq_rq_merge_deadline_usecs)
+               return false;
+
+       kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+       kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+
+       return !ktime_after(ktime_get(), kt_deadline);
+}
+
 /*
  * q->request_fn for request-based dm.
  * Called with the queue lock held.
@@ -1967,7 +2096,7 @@ static void dm_request_fn(struct request_queue *q)
        while (!blk_queue_stopped(q)) {
                rq = blk_peek_request(q);
                if (!rq)
-                       goto delay_and_out;
+                       goto out;
 
                /* always use block 0 to find the target for flushes for now */
                pos = 0;
@@ -1986,12 +2115,17 @@ static void dm_request_fn(struct request_queue *q)
                        continue;
                }
 
+               if (dm_request_peeked_before_merge_deadline(md) &&
+                   md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+                   md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
+                       goto delay_and_out;
+
                if (ti->type->busy && ti->type->busy(ti))
                        goto delay_and_out;
 
                dm_start_request(md, rq);
 
-               tio = rq->special;
+               tio = tio_from_request(rq);
                /* Establish tio->ti before queuing work (map_tio_request) */
                tio->ti = ti;
                queue_kthread_work(&md->kworker, &tio->work);
@@ -2001,33 +2135,11 @@ static void dm_request_fn(struct request_queue *q)
        goto out;
 
 delay_and_out:
-       blk_delay_queue(q, HZ / 10);
+       blk_delay_queue(q, HZ / 100);
 out:
        dm_put_live_table(md, srcu_idx);
 }
 
-int dm_underlying_device_busy(struct request_queue *q)
-{
-       return blk_lld_busy(q);
-}
-EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
-
-static int dm_lld_busy(struct request_queue *q)
-{
-       int r;
-       struct mapped_device *md = q->queuedata;
-       struct dm_table *map = dm_get_live_table_fast(md);
-
-       if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
-               r = 1;
-       else
-               r = dm_table_any_busy_target(map);
-
-       dm_put_live_table_fast(md);
-
-       return r;
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
        int r = bdi_bits;
@@ -2110,7 +2222,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 {
        /*
         * Request-based dm devices cannot be stacked on top of bio-based dm
-        * devices.  The type of this dm device has not been decided yet.
+        * devices.  The type of this dm device may not have been decided yet.
         * The type is decided at the first table loading time.
         * To prevent problematic device stacking, clear the queue flag
         * for request stacking support until then.
@@ -2118,13 +2230,21 @@ static void dm_init_md_queue(struct mapped_device *md)
         * This queue is new, so no concurrency on the queue_flags.
         */
        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+}
+
+static void dm_init_old_md_queue(struct mapped_device *md)
+{
+       md->use_blk_mq = false;
+       dm_init_md_queue(md);
 
+       /*
+        * Initialize aspects of queue that aren't relevant for blk-mq
+        */
        md->queue->queuedata = md;
        md->queue->backing_dev_info.congested_fn = dm_any_congested;
        md->queue->backing_dev_info.congested_data = md;
-       blk_queue_make_request(md->queue, dm_request);
+
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-       blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 }
 
 /*
@@ -2156,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor)
        if (r < 0)
                goto bad_io_barrier;
 
+       md->use_blk_mq = use_blk_mq;
        md->type = DM_TYPE_NONE;
        mutex_init(&md->suspend_lock);
        mutex_init(&md->type_lock);
@@ -2267,6 +2388,8 @@ static void free_dev(struct mapped_device *md)
        del_gendisk(md->disk);
        put_disk(md->disk);
        blk_cleanup_queue(md->queue);
+       if (md->use_blk_mq)
+               blk_mq_free_tag_set(&md->tag_set);
        bdput(md->bdev);
        free_minor(minor);
 
@@ -2278,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-       if (md->io_pool && md->bs) {
+       if (md->bs) {
                /* The md already has necessary mempools. */
                if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
                        /*
@@ -2310,7 +2433,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
        p->bs = NULL;
 
 out:
-       /* mempool bind completed, now no need any mempools in the table */
+       /* mempool bind completed, no longer need any mempools in the table */
        dm_table_free_md_mempools(t);
 }
 
@@ -2357,7 +2480,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q)
        if (!q->merge_bvec_fn)
                return 0;
 
-       if (q->make_request_fn == dm_request) {
+       if (q->make_request_fn == dm_make_request) {
                dev_md = q->queuedata;
                if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
                        return 0;
@@ -2426,7 +2549,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
         * This must be done before setting the queue restrictions,
         * because request-based dm may be run just after the setting.
         */
-       if (dm_table_request_based(t) && !blk_queue_stopped(q))
+       if (dm_table_request_based(t))
                stop_queue(q);
 
        __bind_mempools(md, t);
@@ -2508,14 +2631,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
        return md->type;
 }
 
-static bool dm_md_type_request_based(struct mapped_device *md)
-{
-       unsigned table_type = dm_get_md_type(md);
-
-       return (table_type == DM_TYPE_REQUEST_BASED ||
-               table_type == DM_TYPE_MQ_REQUEST_BASED);
-}
-
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 {
        return md->immutable_target_type;
@@ -2532,6 +2647,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
+static void init_rq_based_worker_thread(struct mapped_device *md)
+{
+       /* Initialize the request-based DM worker thread */
+       init_kthread_worker(&md->kworker);
+       md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+                                      "kdmwork-%s", dm_device_name(md));
+}
+
 /*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
@@ -2540,27 +2663,160 @@ static int dm_init_request_based_queue(struct mapped_device *md)
        struct request_queue *q = NULL;
 
        if (md->queue->elevator)
-               return 1;
+               return 0;
 
        /* Fully initialize the queue */
        q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
        if (!q)
-               return 0;
+               return -EINVAL;
+
+       /* disable dm_request_fn's merge heuristic by default */
+       md->seq_rq_merge_deadline_usecs = 0;
 
        md->queue = q;
-       dm_init_md_queue(md);
+       dm_init_old_md_queue(md);
        blk_queue_softirq_done(md->queue, dm_softirq_done);
        blk_queue_prep_rq(md->queue, dm_prep_fn);
-       blk_queue_lld_busy(md->queue, dm_lld_busy);
 
-       /* Also initialize the request-based DM worker thread */
-       init_kthread_worker(&md->kworker);
-       md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
-                                      "kdmwork-%s", dm_device_name(md));
+       init_rq_based_worker_thread(md);
 
        elv_register_queue(md->queue);
 
-       return 1;
+       return 0;
+}
+
+static int dm_mq_init_request(void *data, struct request *rq,
+                             unsigned int hctx_idx, unsigned int request_idx,
+                             unsigned int numa_node)
+{
+       struct mapped_device *md = data;
+       struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+       /*
+        * Must initialize md member of tio, otherwise it won't
+        * be available in dm_mq_queue_rq.
+        */
+       tio->md = md;
+
+       return 0;
+}
+
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+                         const struct blk_mq_queue_data *bd)
+{
+       struct request *rq = bd->rq;
+       struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+       struct mapped_device *md = tio->md;
+       int srcu_idx;
+       struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+       struct dm_target *ti;
+       sector_t pos;
+
+       /* always use block 0 to find the target for flushes for now */
+       pos = 0;
+       if (!(rq->cmd_flags & REQ_FLUSH))
+               pos = blk_rq_pos(rq);
+
+       ti = dm_table_find_target(map, pos);
+       if (!dm_target_is_valid(ti)) {
+               dm_put_live_table(md, srcu_idx);
+               DMERR_LIMIT("request attempted access beyond the end of device");
+               /*
+                * Must perform the setup that rq_completed() requires
+                * before returning BLK_MQ_RQ_QUEUE_ERROR
+                */
+               dm_start_request(md, rq);
+               return BLK_MQ_RQ_QUEUE_ERROR;
+       }
+       dm_put_live_table(md, srcu_idx);
+
+       if (ti->type->busy && ti->type->busy(ti))
+               return BLK_MQ_RQ_QUEUE_BUSY;
+
+       dm_start_request(md, rq);
+
+       /* Init tio using md established in .init_request */
+       init_tio(tio, rq, md);
+
+       /*
+        * Establish tio->ti before queuing work (map_tio_request)
+        * or making direct call to map_request().
+        */
+       tio->ti = ti;
+
+       /* Clone the request if underlying devices aren't blk-mq */
+       if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
+               /* clone request is allocated at the end of the pdu */
+               tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
+               if (!clone_rq(rq, md, tio, GFP_ATOMIC))
+                       return BLK_MQ_RQ_QUEUE_BUSY;
+               queue_kthread_work(&md->kworker, &tio->work);
+       } else {
+               /* Direct call is fine since .queue_rq allows allocations */
+               if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
+                       dm_requeue_unmapped_original_request(md, rq);
+       }
+
+       return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static struct blk_mq_ops dm_mq_ops = {
+       .queue_rq = dm_mq_queue_rq,
+       .map_queue = blk_mq_map_queue,
+       .complete = dm_softirq_done,
+       .init_request = dm_mq_init_request,
+};
+
+static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+{
+       unsigned md_type = dm_get_md_type(md);
+       struct request_queue *q;
+       int err;
+
+       memset(&md->tag_set, 0, sizeof(md->tag_set));
+       md->tag_set.ops = &dm_mq_ops;
+       md->tag_set.queue_depth = BLKDEV_MAX_RQ;
+       md->tag_set.numa_node = NUMA_NO_NODE;
+       md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       md->tag_set.nr_hw_queues = 1;
+       if (md_type == DM_TYPE_REQUEST_BASED) {
+               /* make the memory for non-blk-mq clone part of the pdu */
+               md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
+       } else
+               md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+       md->tag_set.driver_data = md;
+
+       err = blk_mq_alloc_tag_set(&md->tag_set);
+       if (err)
+               return err;
+
+       q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+       if (IS_ERR(q)) {
+               err = PTR_ERR(q);
+               goto out_tag_set;
+       }
+       md->queue = q;
+       dm_init_md_queue(md);
+
+       /* backfill 'mq' sysfs registration normally done in blk_register_queue */
+       blk_mq_register_disk(md->disk);
+
+       if (md_type == DM_TYPE_REQUEST_BASED)
+               init_rq_based_worker_thread(md);
+
+       return 0;
+
+out_tag_set:
+       blk_mq_free_tag_set(&md->tag_set);
+       return err;
+}
+
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+       if (type == DM_TYPE_BIO_BASED)
+               return type;
+
+       return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
 }
 
 /*
@@ -2568,9 +2824,29 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-       if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
-               DMWARN("Cannot initialize queue for request-based mapped device");
-               return -EINVAL;
+       int r;
+       unsigned md_type = filter_md_type(dm_get_md_type(md), md);
+
+       switch (md_type) {
+       case DM_TYPE_REQUEST_BASED:
+               r = dm_init_request_based_queue(md);
+               if (r) {
+                       DMWARN("Cannot initialize queue for request-based mapped device");
+                       return r;
+               }
+               break;
+       case DM_TYPE_MQ_REQUEST_BASED:
+               r = dm_init_request_based_blk_mq_queue(md);
+               if (r) {
+                       DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+                       return r;
+               }
+               break;
+       case DM_TYPE_BIO_BASED:
+               dm_init_old_md_queue(md);
+               blk_queue_make_request(md->queue, dm_make_request);
+               blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+               break;
        }
 
        return 0;
@@ -2654,7 +2930,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
        set_bit(DMF_FREEING, &md->flags);
        spin_unlock(&_minor_lock);
 
-       if (dm_request_based(md))
+       if (dm_request_based(md) && md->kworker_task)
                flush_kthread_worker(&md->kworker);
 
        /*
@@ -2908,7 +3184,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
         */
        if (dm_request_based(md)) {
                stop_queue(md->queue);
-               flush_kthread_worker(&md->kworker);
+               if (md->kworker_task)
+                       flush_kthread_worker(&md->kworker);
        }
 
        flush_workqueue(md->wq);
@@ -3206,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
 {
        return md->disk;
 }
+EXPORT_SYMBOL_GPL(dm_disk);
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
@@ -3253,16 +3531,19 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+                                           unsigned integrity, unsigned per_bio_data_size)
 {
        struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-       struct kmem_cache *cachep;
+       struct kmem_cache *cachep = NULL;
        unsigned int pool_size = 0;
        unsigned int front_pad;
 
        if (!pools)
                return NULL;
 
+       type = filter_md_type(type, md);
+
        switch (type) {
        case DM_TYPE_BIO_BASED:
                cachep = _io_cache;
@@ -3270,13 +3551,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
                break;
        case DM_TYPE_REQUEST_BASED:
+               cachep = _rq_tio_cache;
                pool_size = dm_get_reserved_rq_based_ios();
                pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
                if (!pools->rq_pool)
                        goto out;
                /* fall through to setup remaining rq-based pools */
        case DM_TYPE_MQ_REQUEST_BASED:
-               cachep = _rq_tio_cache;
                if (!pool_size)
                        pool_size = dm_get_reserved_rq_based_ios();
                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
@@ -3284,12 +3565,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                WARN_ON(per_bio_data_size != 0);
                break;
        default:
-               goto out;
+               BUG();
        }
 
-       pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-       if (!pools->io_pool)
-               goto out;
+       if (cachep) {
+               pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+               if (!pools->io_pool)
+                       goto out;
+       }
 
        pools->bs = bioset_create_nobvec(pool_size, front_pad);
        if (!pools->bs)
@@ -3346,6 +3629,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
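
A minimal user-space C sketch (not part of the patch) of how the two tunables introduced above might be exercised. The paths are assumptions: the use_blk_mq module parameter should surface under /sys/module/dm_mod/parameters/, and the merge-deadline attribute is expected to be registered by dm-sysfs.c (not shown in this diff) as rq_based_seq_io_merge_deadline under /sys/block/<dm-device>/dm/; the device name used here is hypothetical.

/*
 * Sketch: poke the new request-based DM tunables from user space.
 * Assumed sysfs paths -- see the note above.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Write a string to a sysfs file; returns 0 on success, -1 on error. */
static int sysfs_write(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");
        int ret = 0;

        if (!f) {
                fprintf(stderr, "open %s: %s\n", path, strerror(errno));
                return -1;
        }
        if (fputs(val, f) == EOF)
                ret = -1;
        if (fclose(f) != 0)
                ret = -1;
        if (ret)
                fprintf(stderr, "write %s failed\n", path);
        return ret;
}

int main(void)
{
        const char *dm_dev = "dm-0";    /* hypothetical device name */
        char path[128];

        /*
         * Request blk-mq based DM for devices created from now on
         * (alloc_dev() samples use_blk_mq when the mapped_device is made).
         */
        sysfs_write("/sys/module/dm_mod/parameters/use_blk_mq", "1");

        /*
         * Sequential-merge deadline in microseconds; only honoured on the
         * old request_fn path (the store helper ignores blk-mq devices)
         * and capped at MAX_SEQ_RQ_MERGE_DEADLINE_USECS (100000).
         */
        snprintf(path, sizeof(path),
                 "/sys/block/%s/dm/rq_based_seq_io_merge_deadline", dm_dev);
        sysfs_write(path, "100");

        return 0;
}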