NVMe: Retry failed commands with non-fatal errors
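
Rather than failing an I/O back to the block layer as soon as the controller
reports an error, requeue it and retry, provided the status does not carry the
Do Not Retry bit, the bio did not ask for fail-fast semantics, and the command
has not already been outstanding longer than IOD_TIMEOUT (four times the I/O
timeout). Retried iods sit on a new per-queue iod_bio list and are resubmitted
from the polling kthread via nvme_resubmit_iods(). To make requeueing possible,
PRP setup no longer writes into the command (nvme_setup_prps() saves prp2 in
iod->first_dma), command construction moves into nvme_submit_iod(), and
completion callbacks take the nvme_queue instead of the nvme_dev. The retry
test amounts to the check below; the helper is a hypothetical illustration
only, the patch open-codes it in bio_completion():

        /*
         * Hypothetical helper, for illustration only: the patch open-codes
         * this test inside bio_completion().
         */
        static bool nvme_should_retry(struct nvme_iod *iod, struct bio *bio,
                                                                u16 status)
        {
                if (!status)                            /* command succeeded */
                        return false;
                if (status & NVME_SC_DNR)               /* controller: Do Not Retry */
                        return false;
                if (bio->bi_rw & REQ_FAILFAST_MASK)     /* caller wants fail-fast */
                        return false;
                /* give up once the I/O has been pending for IOD_TIMEOUT */
                return (jiffies - iod->start_time) < IOD_TIMEOUT;
        }

When the check holds, bio_completion() parks the iod on nvmeq->iod_bio, rearms
the sq_full waitqueue if necessary, and wakes the kthread instead of ending
the bio.
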
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 596e2abd7971e127ca638fa4c0e13ff418a07ab5..efa9c8f4a7a76b320840250c6f73b3a0bb9705c9 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -50,6 +50,7 @@
 #define SQ_SIZE(depth)         (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)         (depth * sizeof(struct nvme_completion))
 #define ADMIN_TIMEOUT  (60 * HZ)
+#define IOD_TIMEOUT    (4 * NVME_IO_TIMEOUT)
 
 unsigned char io_timeout = 30;
 module_param(io_timeout, byte, 0644);
@@ -94,6 +95,7 @@ struct nvme_queue {
        wait_queue_head_t sq_full;
        wait_queue_t sq_cong_wait;
        struct bio_list sq_cong;
+       struct list_head iod_bio;
        u32 __iomem *q_db;
        u16 q_depth;
        u16 cq_vector;
@@ -128,7 +130,7 @@ static inline void _nvme_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 }
 
-typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
+typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
                                                struct nvme_completion *);
 
 struct nvme_cmd_info {
@@ -200,7 +202,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
 #define CMD_CTX_FLUSH          (0x318 + CMD_CTX_BASE)
 #define CMD_CTX_ABORT          (0x31C + CMD_CTX_BASE)
 
-static void special_completion(struct nvme_dev *dev, void *ctx,
+static void special_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        if (ctx == CMD_CTX_CANCELLED)
@@ -208,26 +210,26 @@ static void special_completion(struct nvme_dev *dev, void *ctx,
        if (ctx == CMD_CTX_FLUSH)
                return;
        if (ctx == CMD_CTX_ABORT) {
-               ++dev->abort_limit;
+               ++nvmeq->dev->abort_limit;
                return;
        }
        if (ctx == CMD_CTX_COMPLETED) {
-               dev_warn(&dev->pci_dev->dev,
+               dev_warn(nvmeq->q_dmadev,
                                "completed id %d twice on queue %d\n",
                                cqe->command_id, le16_to_cpup(&cqe->sq_id));
                return;
        }
        if (ctx == CMD_CTX_INVALID) {
-               dev_warn(&dev->pci_dev->dev,
+               dev_warn(nvmeq->q_dmadev,
                                "invalid id %d completed on queue %d\n",
                                cqe->command_id, le16_to_cpup(&cqe->sq_id));
                return;
        }
 
-       dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
+       dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
 }
 
-static void async_completion(struct nvme_dev *dev, void *ctx,
+static void async_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        struct async_cmd_info *cmdinfo = ctx;
@@ -357,6 +359,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
                iod->npages = -1;
                iod->length = nbytes;
                iod->nents = 0;
+               iod->first_dma = 0ULL;
                iod->start_time = jiffies;
        }
 
@@ -405,19 +408,31 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
        part_stat_unlock();
 }
 
-static void bio_completion(struct nvme_dev *dev, void *ctx,
+static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        struct nvme_iod *iod = ctx;
        struct bio *bio = iod->private;
        u16 status = le16_to_cpup(&cqe->status) >> 1;
 
+       if (unlikely(status)) {
+               if (!(status & NVME_SC_DNR ||
+                               bio->bi_rw & REQ_FAILFAST_MASK) &&
+                               (jiffies - iod->start_time) < IOD_TIMEOUT) {
+                       if (!waitqueue_active(&nvmeq->sq_full))
+                               add_wait_queue(&nvmeq->sq_full,
+                                                       &nvmeq->sq_cong_wait);
+                       list_add_tail(&iod->node, &nvmeq->iod_bio);
+                       wake_up(&nvmeq->sq_full);
+                       return;
+               }
+       }
        if (iod->nents) {
-               dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+               dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
                        bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
                nvme_end_io_acct(bio, iod->start_time);
        }
-       nvme_free_iod(dev, iod);
+       nvme_free_iod(nvmeq->dev, iod);
        if (status)
                bio_endio(bio, -EIO);
        else
@@ -425,8 +440,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx,
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
-int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
-                       struct nvme_iod *iod, int total_len, gfp_t gfp)
+int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
+                                                               gfp_t gfp)
 {
        struct dma_pool *pool;
        int length = total_len;
@@ -439,7 +454,6 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
        dma_addr_t prp_dma;
        int nprps, i;
 
-       cmd->prp1 = cpu_to_le64(dma_addr);
        length -= (PAGE_SIZE - offset);
        if (length <= 0)
                return total_len;
@@ -454,7 +468,7 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
        }
 
        if (length <= PAGE_SIZE) {
-               cmd->prp2 = cpu_to_le64(dma_addr);
+               iod->first_dma = dma_addr;
                return total_len;
        }
 
@@ -469,13 +483,12 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
 
        prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
        if (!prp_list) {
-               cmd->prp2 = cpu_to_le64(dma_addr);
+               iod->first_dma = dma_addr;
                iod->npages = -1;
                return (total_len - length) + PAGE_SIZE;
        }
        list[0] = prp_list;
        iod->first_dma = prp_dma;
-       cmd->prp2 = cpu_to_le64(prp_dma);
        i = 0;
        for (;;) {
                if (i == PAGE_SIZE / 8) {
@@ -514,10 +527,11 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
 
        bio_chain(split, bio);
 
-       if (bio_list_empty(&nvmeq->sq_cong))
+       if (!waitqueue_active(&nvmeq->sq_full))
                add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
        bio_list_add(&nvmeq->sq_cong, split);
        bio_list_add(&nvmeq->sq_cong, bio);
+       wake_up(&nvmeq->sq_full);
 
        return 0;
 }
@@ -570,25 +584,13 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
        return length;
 }
 
-/*
- * We reuse the small pool to allocate the 16-byte range here as it is not
- * worth having a special pool for these or additional cases to handle freeing
- * the iod.
- */
 static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                struct bio *bio, struct nvme_iod *iod, int cmdid)
 {
-       struct nvme_dsm_range *range;
+       struct nvme_dsm_range *range =
+                               (struct nvme_dsm_range *)iod_list(iod)[0];
        struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
 
-       range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
-                                                       &iod->first_dma);
-       if (!range)
-               return -ENOMEM;
-
-       iod_list(iod)[0] = (__le64 *)range;
-       iod->npages = 0;
-
        range->cattr = cpu_to_le32(0);
        range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
        range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
@@ -635,44 +637,22 @@ int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
        return nvme_submit_flush(nvmeq, ns, cmdid);
 }
 
-/*
- * Called with local interrupts disabled and the q_lock held.  May not sleep.
- */
-static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-                                                               struct bio *bio)
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
 {
+       struct bio *bio = iod->private;
+       struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
        struct nvme_command *cmnd;
-       struct nvme_iod *iod;
-       enum dma_data_direction dma_dir;
-       int cmdid, length, result;
+       int cmdid;
        u16 control;
        u32 dsmgmt;
-       int psegs = bio_phys_segments(ns->queue, bio);
-
-       if ((bio->bi_rw & REQ_FLUSH) && psegs) {
-               result = nvme_submit_flush_data(nvmeq, ns);
-               if (result)
-                       return result;
-       }
-
-       result = -ENOMEM;
-       iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
-       if (!iod)
-               goto nomem;
-       iod->private = bio;
 
-       result = -EBUSY;
        cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
        if (unlikely(cmdid < 0))
-               goto free_iod;
+               return cmdid;
 
-       if (bio->bi_rw & REQ_DISCARD) {
-               result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
-               if (result)
-                       goto free_cmdid;
-               return result;
-       }
-       if ((bio->bi_rw & REQ_FLUSH) && !psegs)
+       if (bio->bi_rw & REQ_DISCARD)
+               return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
+       if ((bio->bi_rw & REQ_FLUSH) && !iod->nents)
                return nvme_submit_flush(nvmeq, ns, cmdid);
 
        control = 0;
@@ -686,42 +666,85 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
        cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
-
        memset(cmnd, 0, sizeof(*cmnd));
-       if (bio_data_dir(bio)) {
-               cmnd->rw.opcode = nvme_cmd_write;
-               dma_dir = DMA_TO_DEVICE;
-       } else {
-               cmnd->rw.opcode = nvme_cmd_read;
-               dma_dir = DMA_FROM_DEVICE;
-       }
-
-       result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
-       if (result <= 0)
-               goto free_cmdid;
-       length = result;
 
+       cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read;
        cmnd->rw.command_id = cmdid;
        cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
-       length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
-                                                               GFP_ATOMIC);
+       cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+       cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
        cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
-       cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
+       cmnd->rw.length =
+               cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
        cmnd->rw.control = cpu_to_le16(control);
        cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 
-       nvme_start_io_acct(bio);
        if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
        writel(nvmeq->sq_tail, nvmeq->q_db);
 
        return 0;
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ */
+static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+                                                               struct bio *bio)
+{
+       struct nvme_iod *iod;
+       int psegs = bio_phys_segments(ns->queue, bio);
+       int result;
+
+       if ((bio->bi_rw & REQ_FLUSH) && psegs) {
+               result = nvme_submit_flush_data(nvmeq, ns);
+               if (result)
+                       return result;
+       }
+
+       iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+       if (!iod)
+               return -ENOMEM;
+
+       iod->private = bio;
+       if (bio->bi_rw & REQ_DISCARD) {
+               void *range;
+               /*
+                * We reuse the small pool to allocate the 16-byte range here
+                * as it is not worth having a special pool for these or
+                * additional cases to handle freeing the iod.
+                */
+               range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
+                                               GFP_ATOMIC,
+                                               &iod->first_dma);
+               if (!range) {
+                       result = -ENOMEM;
+                       goto free_iod;
+               }
+               iod_list(iod)[0] = (__le64 *)range;
+               iod->npages = 0;
+       } else if (psegs) {
+               result = nvme_map_bio(nvmeq, iod, bio,
+                       bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+                       psegs);
+               if (result <= 0)
+                       goto free_iod;
+               if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) !=
+                                                               result) {
+                       result = -ENOMEM;
+                       goto free_iod;
+               }
+               nvme_start_io_acct(bio);
+       }
+       if (unlikely(nvme_submit_iod(nvmeq, iod))) {
+               if (!waitqueue_active(&nvmeq->sq_full))
+                       add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+               list_add_tail(&iod->node, &nvmeq->iod_bio);
+       }
+       return 0;
 
- free_cmdid:
-       free_cmdid(nvmeq, cmdid, NULL);
  free_iod:
        nvme_free_iod(nvmeq->dev, iod);
- nomem:
        return result;
 }
 
@@ -745,7 +768,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
                }
 
                ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
-               fn(nvmeq->dev, ctx, &cqe);
+               fn(nvmeq, ctx, &cqe);
        }
 
        /* If the controller ignores the cq head doorbell and continuously
@@ -781,7 +804,7 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
        if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
                result = nvme_submit_bio_queue(nvmeq, ns, bio);
        if (unlikely(result)) {
-               if (bio_list_empty(&nvmeq->sq_cong))
+               if (!waitqueue_active(&nvmeq->sq_full))
                        add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
                bio_list_add(&nvmeq->sq_cong, bio);
        }
@@ -825,7 +848,7 @@ struct sync_cmd_info {
        int status;
 };
 
-static void sync_completion(struct nvme_dev *dev, void *ctx,
+static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        struct sync_cmd_info *cmdinfo = ctx;
@@ -1112,7 +1135,7 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
                dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
                                                                nvmeq->qid);
                ctx = cancel_cmdid(nvmeq, cmdid, &fn);
-               fn(nvmeq->dev, ctx, &cqe);
+               fn(nvmeq, ctx, &cqe);
        }
 }
 
@@ -1125,6 +1148,17 @@ static void nvme_free_queue(struct rcu_head *r)
                struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
                bio_endio(bio, -EIO);
        }
+       while (!list_empty(&nvmeq->iod_bio)) {
+               static struct nvme_completion cqe = {
+                       .status = cpu_to_le16(
+                               (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
+               };
+               struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
+                                                       struct nvme_iod,
+                                                       node);
+               list_del(&iod->node);
+               bio_completion(nvmeq, iod, &cqe);
+       }
        spin_unlock_irq(&nvmeq->q_lock);
 
        dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
@@ -1232,6 +1266,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
        init_waitqueue_head(&nvmeq->sq_full);
        init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
        bio_list_init(&nvmeq->sq_cong);
+       INIT_LIST_HEAD(&nvmeq->iod_bio);
        nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
        nvmeq->q_depth = depth;
        nvmeq->cq_vector = vector;
@@ -1565,7 +1600,9 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
                c.rw.metadata = cpu_to_le64(meta_dma_addr);
        }
 
-       length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
+       length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+       c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+       c.rw.prp2 = cpu_to_le64(iod->first_dma);
 
        if (length != (io.nblocks + 1) << ns->lba_shift)
                status = -ENOMEM;
@@ -1635,8 +1672,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
                                                                length);
                if (IS_ERR(iod))
                        return PTR_ERR(iod);
-               length = nvme_setup_prps(dev, &c.common, iod, length,
-                                                               GFP_KERNEL);
+               length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+               c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+               c.common.prp2 = cpu_to_le64(iod->first_dma);
        }
 
        timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
@@ -1733,17 +1771,33 @@ static const struct block_device_operations nvme_fops = {
        .getgeo         = nvme_getgeo,
 };
 
+static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
+{
+       struct nvme_iod *iod, *next;
+
+       list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
+               if (unlikely(nvme_submit_iod(nvmeq, iod)))
+                       break;
+               list_del(&iod->node);
+               if (bio_list_empty(&nvmeq->sq_cong) &&
+                                               list_empty(&nvmeq->iod_bio))
+                       remove_wait_queue(&nvmeq->sq_full,
+                                               &nvmeq->sq_cong_wait);
+       }
+}
+
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
 {
        while (bio_list_peek(&nvmeq->sq_cong)) {
                struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
                struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
 
-               if (bio_list_empty(&nvmeq->sq_cong))
+               if (bio_list_empty(&nvmeq->sq_cong) &&
+                                               list_empty(&nvmeq->iod_bio))
                        remove_wait_queue(&nvmeq->sq_full,
                                                        &nvmeq->sq_cong_wait);
                if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
-                       if (bio_list_empty(&nvmeq->sq_cong))
+                       if (!waitqueue_active(&nvmeq->sq_full))
                                add_wait_queue(&nvmeq->sq_full,
                                                        &nvmeq->sq_cong_wait);
                        bio_list_add_head(&nvmeq->sq_cong, bio);
@@ -1785,6 +1839,7 @@ static int nvme_kthread(void *data)
                                nvme_process_cq(nvmeq);
                                nvme_cancel_ios(nvmeq, true);
                                nvme_resubmit_bios(nvmeq);
+                               nvme_resubmit_iods(nvmeq);
  unlock:
                                spin_unlock_irq(&nvmeq->q_lock);
                        }
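
End to end, a condensed sketch of the retry lifecycle this patch introduces
(call flow only, not code from the diff):

        /*
         * nvme_submit_bio_queue()   maps the bio and builds the iod; if no
         *   -> nvme_submit_iod()    cmdid is free, the iod is parked on
         *                           nvmeq->iod_bio instead of being dropped
         * bio_completion()          on a retryable error, requeues the iod
         *                           on nvmeq->iod_bio and wakes sq_full
         *                           rather than ending the bio with -EIO
         * nvme_kthread()
         *   -> nvme_resubmit_iods() drains nvmeq->iod_bio under q_lock by
         *                           calling nvme_submit_iod() again
         * nvme_free_queue()         completes anything still parked with
         *                           NVME_SC_ABORT_REQ | NVME_SC_DNR so the
         *                           bios finally fail when a queue goes away
         */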