Merge branch 'for-linus' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 9 May 2015 02:49:35 +0000 (19:49 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 9 May 2015 02:49:35 +0000 (19:49 -0700)
Pull block fixes from Jens Axboe:
 "A collection of fixes since the merge window;

   - fix for a double elevator module release, from Chao Yu.  Ancient bug.

   - the splice() MORE flag fix from Christophe Leroy.

   - a fix for NVMe, correcting a patch that went in during the merge
     window.  From Keith.

   - two fixes for blk-mq CPU hotplug handling, from Ming Lei.

   - bdi vs blockdev lifetime fix from Neil Brown, fixing an oops in md.

   - two blk-mq fixes from Shaohua, fixing a race on queue stop and a
     bad merge issue with FUA writes.

   - division-by-zero fix for writeback from Tejun.

   - a block bounce page accounting fix, making sure we inc/dec the
     NR_BOUNCE counter on the bounce page after bouncing, so that the
     pre/post IO accounting matches up.  From Wang YanQing"

* 'for-linus' of git://git.kernel.dk/linux-block:
  splice: sendfile() at once fails for big files
  blk-mq: don't lose requests if a stopped queue restarts
  blk-mq: fix FUA request hang
  block: destroy bdi before blockdev is unregistered.
  block:bounce: fix call inc_|dec_zone_page_state on different pages confuse value of NR_BOUNCE
  elevator: fix double release of elevator module
  writeback: use |1 instead of +1 to protect against div by zero
  blk-mq: fix CPU hotplug handling
  blk-mq: fix race between timeout and CPU hotplug
  NVMe: Fix VPD B0 max sectors translation

block/blk-core.c
block/blk-mq.c
block/blk-sysfs.c
block/bounce.c
block/elevator.c
drivers/block/loop.c
drivers/block/nvme-scsi.c
drivers/md/md.c
fs/splice.c
include/linux/blk_types.h
mm/page-writeback.c

diff --git a/block/blk-core.c b/block/blk-core.c
index fd154b94447a25788f48d5e8cc04bc803d1efdb8..7871603f0a29bba5dbdc6548174595bf827fa912 100644
@@ -552,6 +552,8 @@ void blk_cleanup_queue(struct request_queue *q)
                q->queue_lock = &q->__queue_lock;
        spin_unlock_irq(lock);
 
+       bdi_destroy(&q->backing_dev_info);
+
        /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa8600ad31413b59db37392628bffc..e68b71b85a7eaf0e3097debe8bf4dc4078e7a038 100644
@@ -677,8 +677,11 @@ static void blk_mq_rq_timer(unsigned long priv)
                data.next = blk_rq_timeout(round_jiffies_up(data.next));
                mod_timer(&q->timeout, data.next);
        } else {
-               queue_for_each_hw_ctx(q, hctx, i)
-                       blk_mq_tag_idle(hctx);
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       /* the hctx may be unmapped, so check it here */
+                       if (blk_mq_hw_queue_mapped(hctx))
+                               blk_mq_tag_idle(hctx);
+               }
        }
 }
 
@@ -855,6 +858,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                spin_lock(&hctx->lock);
                list_splice(&rq_list, &hctx->dispatch);
                spin_unlock(&hctx->lock);
+               /*
+                * A driver returning BLK_MQ_RQ_QUEUE_BUSY is expected to
+                * have stopped the queue, but the queue may have been
+                * stopped and restarted again before we get here.  A
+                * restart dispatches requests, and the requests in rq_list
+                * were not yet on hctx->dispatch then, so they may be lost.
+                *
+                * blk_mq_run_hw_queue() already checks the STOPPED bit.
+                */
+               blk_mq_run_hw_queue(hctx, true);
        }
 }
 
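A minimal user-space sketch of the window this hunk closes (all names
here are illustrative, not blk-mq API): a submitter that parks work
after seeing "stopped" can race with a restart that already drained the
list, so the queue is kicked once more after the work is visible -- a
no-op if the queue really is still stopped.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct queue {
            pthread_mutex_t lock;
            bool stopped;
            int parked;             /* stand-in for hctx->dispatch */
    };

    static void run_queue(struct queue *q)
    {
            pthread_mutex_lock(&q->lock);
            if (!q->stopped && q->parked) {
                    printf("dispatching %d parked request(s)\n", q->parked);
                    q->parked = 0;
            }
            pthread_mutex_unlock(&q->lock);
    }

    static void park_request(struct queue *q)
    {
            /* Submitter saw "stopped" and parks the request ... */
            pthread_mutex_lock(&q->lock);
            q->parked++;
            pthread_mutex_unlock(&q->lock);
            /*
             * ... but a restart may have run in between and seen an
             * empty list.  Kick the queue again now that the request
             * is visible, as blk_mq_run_hw_queue(hctx, true) does.
             */
            run_queue(q);
    }

    int main(void)
    {
            struct queue q = { PTHREAD_MUTEX_INITIALIZER, false, 0 };
            park_request(&q);
            return 0;
    }
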
@@ -1571,22 +1584,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
        return NOTIFY_OK;
 }
 
-static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
-{
-       struct request_queue *q = hctx->queue;
-       struct blk_mq_tag_set *set = q->tag_set;
-
-       if (set->tags[hctx->queue_num])
-               return NOTIFY_OK;
-
-       set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
-       if (!set->tags[hctx->queue_num])
-               return NOTIFY_STOP;
-
-       hctx->tags = set->tags[hctx->queue_num];
-       return NOTIFY_OK;
-}
-
 static int blk_mq_hctx_notify(void *data, unsigned long action,
                              unsigned int cpu)
 {
@@ -1594,8 +1591,11 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
 
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
                return blk_mq_hctx_cpu_offline(hctx, cpu);
-       else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-               return blk_mq_hctx_cpu_online(hctx, cpu);
+
+       /*
+        * In case of CPU online, tags may be reallocated
+        * in blk_mq_map_swqueue() after mapping is updated.
+        */
 
        return NOTIFY_OK;
 }
@@ -1775,6 +1775,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
        unsigned int i;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
+       struct blk_mq_tag_set *set = q->tag_set;
 
        queue_for_each_hw_ctx(q, hctx, i) {
                cpumask_clear(hctx->cpumask);
@@ -1803,16 +1804,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                 * disable it and free the request entries.
                 */
                if (!hctx->nr_ctx) {
-                       struct blk_mq_tag_set *set = q->tag_set;
-
                        if (set->tags[i]) {
                                blk_mq_free_rq_map(set, set->tags[i], i);
                                set->tags[i] = NULL;
-                               hctx->tags = NULL;
                        }
+                       hctx->tags = NULL;
                        continue;
                }
 
+               /* unmapped hw queue can be remapped after CPU topo changed */
+               if (!set->tags[i])
+                       set->tags[i] = blk_mq_init_rq_map(set, i);
+               hctx->tags = set->tags[i];
+               WARN_ON(!hctx->tags);
+
                /*
                 * Set the map size to the number of mapped software queues.
                 * This is more accurate and more efficient than looping
@@ -2090,9 +2095,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_start(q);
-       list_for_each_entry(q, &all_q_list, all_q_node)
+       list_for_each_entry(q, &all_q_list, all_q_node) {
                blk_mq_freeze_queue_wait(q);
 
+               /*
+                * timeout handler can't touch hw queue during the
+                * reinitialization
+                */
+               del_timer_sync(&q->timeout);
+       }
+
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_queue_reinit(q);
 
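Taken together, the reinit path now follows a quiesce-then-reconfigure
order (a sketch of the sequence above, using the same functions, not a
new API): freeze first, then silence the timeout handler, and only then
rebuild the mappings the handler would otherwise walk.

    list_for_each_entry(q, &all_q_list, all_q_node)
            blk_mq_freeze_queue_start(q);   /* stop new I/O entering */
    list_for_each_entry(q, &all_q_list, all_q_node) {
            blk_mq_freeze_queue_wait(q);    /* wait out in-flight I/O */
            del_timer_sync(&q->timeout);    /* timeout handler quiesced */
    }
    list_for_each_entry(q, &all_q_list, all_q_node)
            blk_mq_queue_reinit(q);         /* safe to remap hw queues */
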
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index faaf36ade7ebdc2fdd363f174978bfb5683a4f9a..2b8fd302f677a967d87994f8a7532aab8dfe6569 100644
@@ -522,8 +522,6 @@ static void blk_release_queue(struct kobject *kobj)
 
        blk_trace_shutdown(q);
 
-       bdi_destroy(&q->backing_dev_info);
-
        ida_simple_remove(&blk_queue_ida, q->id);
        call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
diff --git a/block/bounce.c b/block/bounce.c
index ab21ba203d5c7744f4da2afbba85ed45dde86b98..ed9dd80671204bdebc4005544097fb05b6c90c62 100644
@@ -221,8 +221,8 @@ bounce:
                if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
                        continue;
 
-               inc_zone_page_state(to->bv_page, NR_BOUNCE);
                to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+               inc_zone_page_state(to->bv_page, NR_BOUNCE);
 
                if (rw == WRITE) {
                        char *vto, *vfrom;
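
The swap matters because the completion side decrements NR_BOUNCE on the
bounce pages it frees, so the increment must land on the mempool page,
not on the original page being bounced.  Roughly, from the corresponding
free path in bounce.c (paraphrased from memory, not part of this patch):

    bio_for_each_segment_all(bvec, bio, i) {
            org_vec = bio_orig->bi_io_vec + i;
            if (bvec->bv_page == org_vec->bv_page)
                    continue;       /* segment was not bounced */
            dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
            mempool_free(bvec->bv_page, pool);
    }
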
diff --git a/block/elevator.c b/block/elevator.c
index 59794d0d38e34604a24b6e7a63bf309570b2f8fb..8985038f398ce503261dc4a29390a63c9f7b5b44 100644
@@ -157,7 +157,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 
        eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
        if (unlikely(!eq))
-               goto err;
+               return NULL;
 
        eq->type = e;
        kobject_init(&eq->kobj, &elv_ktype);
@@ -165,10 +165,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
        hash_init(eq->hash);
 
        return eq;
-err:
-       kfree(eq);
-       elevator_put(e);
-       return NULL;
 }
 EXPORT_SYMBOL(elevator_alloc);
 
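The removed error path was more than dead code: with eq == NULL the
kfree() was harmless, but the elevator_put(e) dropped the module
reference that is also dropped further up the call chain when
elevator_alloc() fails, releasing the elevator module twice.  In
miniature (hypothetical caller, shown for illustration):

    eq = elevator_alloc(q, e);
    if (!eq) {
            elevator_put(e);        /* caller's put -- before this fix,
                                       elevator_alloc() had already put
                                       the reference on this path */
            return -ENOMEM;
    }
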
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ae3fcb4199e9b7d85d2475d40ab4f209258a1cc5..d7173cb1ea76c206f1fcedbc96994e45901aa322 100644
@@ -1620,8 +1620,8 @@ out:
 
 static void loop_remove(struct loop_device *lo)
 {
-       del_gendisk(lo->lo_disk);
        blk_cleanup_queue(lo->lo_queue);
+       del_gendisk(lo->lo_disk);
        blk_mq_free_tag_set(&lo->tag_set);
        put_disk(lo->lo_disk);
        kfree(lo);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 6b736b00f63ebbbf01db7eb037695cbd77bca8c8..88f13c525712f700d05428e741cafeeac752ad2d 100644
@@ -944,7 +944,8 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
                                        u8 *inq_response, int alloc_len)
 {
-       __be32 max_sectors = cpu_to_be32(queue_max_hw_sectors(ns->queue));
+       __be32 max_sectors = cpu_to_be32(
+               nvme_block_nr(ns, queue_max_hw_sectors(ns->queue)));
        __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors);
        __be32 discard_desc_count = cpu_to_be32(0x100);
 
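queue_max_hw_sectors() counts 512-byte sectors, while the VPD B0
"maximum transfer length" field is in logical blocks, so the raw value
overstated the limit for any namespace with LBAs larger than 512 bytes.
nvme_block_nr() performs the unit conversion; a worked example, assuming
a 4KiB-LBA namespace (lba_shift == 12):

    /* nvme_block_nr(ns, s) == s >> (ns->lba_shift - 9) */
    unsigned long max_hw_sectors = 2048;            /* 2048 * 512B = 1 MiB  */
    unsigned long max_blocks = 2048 >> (12 - 9);    /*  256 * 4096B = 1 MiB */
    /* the unconverted 2048 would have advertised 8 MiB */
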
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4f31e195e26ebcc4233c9b333624d8b73191826..593a02476c781a2b5ee7e491b9188a108630b506 100644
@@ -4818,12 +4818,12 @@ static void md_free(struct kobject *ko)
        if (mddev->sysfs_state)
                sysfs_put(mddev->sysfs_state);
 
+       if (mddev->queue)
+               blk_cleanup_queue(mddev->queue);
        if (mddev->gendisk) {
                del_gendisk(mddev->gendisk);
                put_disk(mddev->gendisk);
        }
-       if (mddev->queue)
-               blk_cleanup_queue(mddev->queue);
 
        kfree(mddev);
 }
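
The md and loop hunks are the same fix seen from two drivers:
blk_cleanup_queue() now destroys the bdi (blk-core.c hunk above), and
that has to happen before del_gendisk() unregisters the block device,
otherwise writeback can still reach a bdi that is on its way out.  As a
rough teardown template (driver names hypothetical):

    static void mydrv_remove(struct mydrv *d)
    {
            blk_cleanup_queue(d->queue);    /* destroys the bdi first */
            del_gendisk(d->disk);           /* then drop the blockdev */
            put_disk(d->disk);
    }
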
diff --git a/fs/splice.c b/fs/splice.c
index 476024bb6546527887517868b122c9305dc32d07..bfe62ae40f40920e6b95fa8ce16cc3130b8b0972 100644
@@ -1161,7 +1161,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
        long ret, bytes;
        umode_t i_mode;
        size_t len;
-       int i, flags;
+       int i, flags, more;
 
        /*
         * We require the input being a regular file, as we don't want to
@@ -1204,6 +1204,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
         * Don't block on output, we have to drain the direct pipe.
         */
        sd->flags &= ~SPLICE_F_NONBLOCK;
+       more = sd->flags & SPLICE_F_MORE;
 
        while (len) {
                size_t read_len;
@@ -1216,6 +1217,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                read_len = ret;
                sd->total_len = read_len;
 
+               /*
+                * If more data is pending, set SPLICE_F_MORE.
+                * If this is the last chunk and SPLICE_F_MORE was
+                * not set initially, clear it.
+                */
+               if (read_len < len)
+                       sd->flags |= SPLICE_F_MORE;
+               else if (!more)
+                       sd->flags &= ~SPLICE_F_MORE;
                /*
                 * NOTE: nonblocking mode only applies to the input. We
                 * must not do the output in nonblocking mode as then we
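
The effect is visible from ordinary user space: one sendfile() of a file
larger than the internal pipe-sized chunks that splice_direct_to_actor()
loops over.  Every chunk but the last now carries SPLICE_F_MORE, and the
caller's own MORE flag is restored only for the final chunk.  A plain
caller exercising that path (standard sendfile usage, nothing
patch-specific):

    #include <sys/sendfile.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Send all of 'fd' (a regular file) to 'sock'; the kernel splits
     * each call into the pipe-sized chunks flagged above. */
    static int send_file(int sock, int fd)
    {
            struct stat st;
            off_t off = 0;

            if (fstat(fd, &st) < 0)
                    return -1;
            while (off < st.st_size) {
                    ssize_t n = sendfile(sock, fd, &off, st.st_size - off);
                    if (n <= 0)
                            return -1;
            }
            return 0;
    }
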
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a1b25e35ea5f9fc2978b7f62917c6b4e39c3dc75..b7299febc4b4adfee00cb8b05d6fbf6558f01547 100644
@@ -220,7 +220,7 @@ enum rq_flag_bits {
 
 /* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
-       (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+       (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_FLUSH_SEQ)
 
 #define REQ_RAHEAD             (1ULL << __REQ_RAHEAD)
 #define REQ_THROTTLED          (1ULL << __REQ_THROTTLED)
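
The mask is consumed as a single bit test: any flag in it disqualifies a
request from merging.  Adding REQ_FLUSH_SEQ means a FUA write that is
being stepped through the flush state machine can no longer be merged
with a normal request, which is the hang being fixed.  The gate, in
sketch form (the kernel's actual check is equivalent in spirit):

    static inline bool rq_may_merge(u64 cmd_flags)
    {
            return !(cmd_flags & REQ_NOMERGE_FLAGS);
    }
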
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5daf5568b9e149ea9dce0383b0452bd30ad67f84..eb59f7eea50827fc09e1c4f7a432b59ff2241d17 100644
@@ -580,7 +580,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
        long x;
 
        x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
-                   limit - setpoint + 1);
+                     (limit - setpoint) | 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -807,7 +807,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
         * scale global setpoint to bdi's:
         *      bdi_setpoint = setpoint * bdi_thresh / thresh
         */
-       x = div_u64((u64)bdi_thresh << 16, thresh + 1);
+       x = div_u64((u64)bdi_thresh << 16, thresh | 1);
        bdi_setpoint = setpoint * (u64)x >> 16;
        /*
         * Use span=(8*write_bw) in single bdi case as indicated by
@@ -822,7 +822,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 
        if (bdi_dirty < x_intercept - span / 4) {
                pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-                                   x_intercept - bdi_setpoint + 1);
+                                     (x_intercept - bdi_setpoint) | 1);
        } else
                pos_ratio /= 4;
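
Why "| 1" instead of "+ 1": these differences can transiently go
negative or zero (e.g. setpoint one past limit).  With "+ 1", a
difference of -1 -- or its unsigned wrap, the type's maximum -- becomes
exactly 0 and divides by zero, while "| 1" always leaves bit 0 set, so
it is the identity on odd values, rounds even ones up by one, and can
never produce 0.  A quick check:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long d[] = { 0, 1, 2, ULONG_MAX };

            for (int i = 0; i < 4; i++)
                    printf("d=%lu  d+1=%lu  d|1=%lu\n",
                           d[i], d[i] + 1, d[i] | 1);
            /* d=0:         d+1=1, d|1=1
             * d=ULONG_MAX: d+1=0 (!), d|1=ULONG_MAX -> no zero divide */
            return 0;
    }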