nvme: provide fallback for discard alloc failure
authorJens Axboe <axboe@kernel.dk>
Wed, 12 Dec 2018 16:18:11 +0000 (09:18 -0700)
committerChristoph Hellwig <hch@lst.de>
Thu, 13 Dec 2018 08:59:00 +0000 (09:59 +0100)
When boxes are run near (or to) OOM, we have a problem with the discard
page allocation in nvme. If we fail allocating the special page, we
return busy, and it'll get retried. But since ordering is honored for
dispatch requests, we can keep retrying this same IO and failing. Behind
that IO could be requests that want to free memory, but they never get
the chance.

Allocate a fixed discard page per controller for a safe fallback, and use
that if the initial allocation fails.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
drivers/nvme/host/core.c
drivers/nvme/host/nvme.h

index 3043cedb24e175b5a0ac1fbc44b98e5005a30039..168f2c1eaf604fa182f5fb15191b914aed1c406e 100644 (file)
@@ -564,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
        struct nvme_dsm_range *range;
        struct bio *bio;
 
-       range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
-       if (!range)
-               return BLK_STS_RESOURCE;
+       range = kmalloc_array(segments, sizeof(*range),
+                               GFP_ATOMIC | __GFP_NOWARN);
+       if (!range) {
+               /*
+                * If we fail allocation our range, fallback to the controller
+                * discard page. If that's also busy, it's safe to return
+                * busy, as we know we can make progress once that's freed.
+                */
+               if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+                       return BLK_STS_RESOURCE;
+
+               range = page_address(ns->ctrl->discard_page);
+       }
 
        __rq_for_each_bio(bio, req) {
                u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -581,7 +591,10 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
        }
 
        if (WARN_ON_ONCE(n != segments)) {
-               kfree(range);
+               if (virt_to_page(range) == ns->ctrl->discard_page)
+                       clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+               else
+                       kfree(range);
                return BLK_STS_IOERR;
        }
 
@@ -664,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req)
                                blk_rq_bytes(req) >> ns->lba_shift);
        }
        if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-               kfree(page_address(req->special_vec.bv_page) +
-                     req->special_vec.bv_offset);
+               struct nvme_ns *ns = req->rq_disk->private_data;
+               struct page *page = req->special_vec.bv_page;
+
+               if (page == ns->ctrl->discard_page)
+                       clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+               else
+                       kfree(page_address(page) + req->special_vec.bv_offset);
        }
 }
 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -3578,6 +3596,7 @@ static void nvme_free_ctrl(struct device *dev)
        ida_simple_remove(&nvme_instance_ida, ctrl->instance);
        kfree(ctrl->effects);
        nvme_mpath_uninit(ctrl);
+       kfree(ctrl->discard_page);
 
        if (subsys) {
                mutex_lock(&subsys->lock);
@@ -3618,6 +3637,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
        memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
        ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 
+       BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+                       PAGE_SIZE);
+       ctrl->discard_page = alloc_page(GFP_KERNEL);
+       if (!ctrl->discard_page) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
        if (ret < 0)
                goto out;
@@ -3655,6 +3682,8 @@ out_free_name:
 out_release_instance:
        ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 out:
+       if (ctrl->discard_page)
+               __free_page(ctrl->discard_page);
        return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
index c78a9dc7fdcf63e30e6418918e7aa9a42591ff4f..39b52f4d9b245ff4d858a4fa4eb7352e12f37782 100644 (file)
@@ -241,6 +241,9 @@ struct nvme_ctrl {
        u16 maxcmd;
        int nr_reconnects;
        struct nvmf_ctrl_options *opts;
+
+       struct page *discard_page;
+       unsigned long discard_page_busy;
 };
 
 struct nvme_subsystem {