NVMe: Retry failed commands with non-fatal errors
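
Rather than failing an I/O back to the block layer as soon as the controller
reports an error, requeue it and retry, provided the status does not carry the
Do Not Retry bit, the bio did not ask for fail-fast semantics, and the command
has not already been outstanding longer than IOD_TIMEOUT (four times the I/O
timeout). Retried iods sit on a new per-queue iod_bio list and are resubmitted
from the polling kthread via nvme_resubmit_iods(). To make requeueing possible,
PRP setup no longer writes into the command (nvme_setup_prps() saves prp2 in
iod->first_dma), command construction moves into nvme_submit_iod(), and
completion callbacks take the nvme_queue instead of the nvme_dev. The retry
test amounts to the check below; the helper is a hypothetical illustration
only, the patch open-codes it in bio_completion():

        /*
         * Hypothetical helper, for illustration only: the patch open-codes
         * this test inside bio_completion().
         */
        static bool nvme_should_retry(struct nvme_iod *iod, struct bio *bio,
                                                                u16 status)
        {
                if (!status)                            /* command succeeded */
                        return false;
                if (status & NVME_SC_DNR)               /* controller: Do Not Retry */
                        return false;
                if (bio->bi_rw & REQ_FAILFAST_MASK)     /* caller wants fail-fast */
                        return false;
                /* give up once the I/O has been pending for IOD_TIMEOUT */
                return (jiffies - iod->start_time) < IOD_TIMEOUT;
        }

When the check holds, bio_completion() parks the iod on nvmeq->iod_bio, rearms
the sq_full waitqueue if necessary, and wakes the kthread instead of ending
the bio.
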
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 596e2abd7971e127ca638fa4c0e13ff418a07ab5..efa9c8f4a7a76b320840250c6f73b3a0bb9705c9 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -50,6 +50,7 @@
 #define SQ_SIZE(depth)         (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)         (depth * sizeof(struct nvme_completion))
 #define ADMIN_TIMEOUT  (60 * HZ)
+#define IOD_TIMEOUT    (4 * NVME_IO_TIMEOUT)
 
 unsigned char io_timeout = 30;
 module_param(io_timeout, byte, 0644);
@@ -94,6 +95,7 @@ struct nvme_queue {
        wait_queue_head_t sq_full;
        wait_queue_t sq_cong_wait;
        struct bio_list sq_cong;
+       struct list_head iod_bio;
        u32 __iomem *q_db;
        u16 q_depth;
        u16 cq_vector;
@@ -128,7 +130,7 @@ static inline void _nvme_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 }
 
-typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
+typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
                                                struct nvme_completion *);
 
 struct nvme_cmd_info {
@@ -200,7 +202,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
 #define CMD_CTX_FLUSH          (0x318 + CMD_CTX_BASE)
 #define CMD_CTX_ABORT          (0x31C + CMD_CTX_BASE)
 
-static void special_completion(struct nvme_dev *dev, void *ctx,
+static void special_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        if (ctx == CMD_CTX_CANCELLED)
@@ -208,26 +210,26 @@ static void special_completion(struct nvme_dev *dev, void *ctx,
        if (ctx == CMD_CTX_FLUSH)
                return;
        if (ctx == CMD_CTX_ABORT) {
-               ++dev->abort_limit;
+               ++nvmeq->dev->abort_limit;
                return;
        }
        if (ctx == CMD_CTX_COMPLETED) {
-               dev_warn(&dev->pci_dev->dev,
+               dev_warn(nvmeq->q_dmadev,
                                "completed id %d twice on queue %d\n",
                                cqe->command_id, le16_to_cpup(&cqe->sq_id));
                return;
        }
        if (ctx == CMD_CTX_INVALID) {
-               dev_warn(&dev->pci_dev->dev,
+               dev_warn(nvmeq->q_dmadev,
                                "invalid id %d completed on queue %d\n",
                                cqe->command_id, le16_to_cpup(&cqe->sq_id));
                return;
        }
 
-       dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
+       dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
 }
 
-static void async_completion(struct nvme_dev *dev, void *ctx,
+static void async_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        struct async_cmd_info *cmdinfo = ctx;
@@ -357,6 +359,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
                iod->npages = -1;
                iod->length = nbytes;
                iod->nents = 0;
+               iod->first_dma = 0ULL;
                iod->start_time = jiffies;
        }
 
@@ -405,19 +408,31 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
        part_stat_unlock();
 }
 
-static void bio_completion(struct nvme_dev *dev, void *ctx,
+static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        struct nvme_iod *iod = ctx;
        struct bio *bio = iod->private;
        u16 status = le16_to_cpup(&cqe->status) >> 1;
 
+       if (unlikely(status)) {
+               if (!(status & NVME_SC_DNR ||
+                               bio->bi_rw & REQ_FAILFAST_MASK) &&
+                               (jiffies - iod->start_time) < IOD_TIMEOUT) {
+                       if (!waitqueue_active(&nvmeq->sq_full))
+                               add_wait_queue(&nvmeq->sq_full,
+                                                       &nvmeq->sq_cong_wait);
+                       list_add_tail(&iod->node, &nvmeq->iod_bio);
+                       wake_up(&nvmeq->sq_full);
+                       return;
+               }
+       }
        if (iod->nents) {
-               dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+               dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
                        bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
                nvme_end_io_acct(bio, iod->start_time);
        }
-       nvme_free_iod(dev, iod);
+       nvme_free_iod(nvmeq->dev, iod);
        if (status)
                bio_endio(bio, -EIO);
        else
@@ -425,8 +440,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx,
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
-int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
-                       struct nvme_iod *iod, int total_len, gfp_t gfp)
+int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
+                                                               gfp_t gfp)
 {
        struct dma_pool *pool;
        int length = total_len;
@@ -439,7 +454,6 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
        dma_addr_t prp_dma;
        int nprps, i;
 
-       cmd->prp1 = cpu_to_le64(dma_addr);
        length -= (PAGE_SIZE - offset);
        if (length <= 0)
                return total_len;
@@ -454,7 +468,7 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
        }
 
        if (length <= PAGE_SIZE) {
-               cmd->prp2 = cpu_to_le64(dma_addr);
+               iod->first_dma = dma_addr;
                return total_len;
        }
 
@@ -469,13 +483,12 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
 
        prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
        if (!prp_list) {
-               cmd->prp2 = cpu_to_le64(dma_addr);
+               iod->first_dma = dma_addr;
                iod->npages = -1;
                return (total_len - length) + PAGE_SIZE;
        }
        list[0] = prp_list;
        iod->first_dma = prp_dma;
-       cmd->prp2 = cpu_to_le64(prp_dma);
        i = 0;
        for (;;) {
                if (i == PAGE_SIZE / 8) {
@@ -514,10 +527,11 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
 
        bio_chain(split, bio);
 
-       if (bio_list_empty(&nvmeq->sq_cong))
+       if (!waitqueue_active(&nvmeq->sq_full))
                add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
        bio_list_add(&nvmeq->sq_cong, split);
        bio_list_add(&nvmeq->sq_cong, bio);
+       wake_up(&nvmeq->sq_full);
 
        return 0;
 }
@@ -570,25 +584,13 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
        return length;
 }
 
-/*
- * We reuse the small pool to allocate the 16-byte range here as it is not
- * worth having a special pool for these or additional cases to handle freeing
- * the iod.
- */
 static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                struct bio *bio, struct nvme_iod *iod, int cmdid)
 {
-       struct nvme_dsm_range *range;
+       struct nvme_dsm_range *range =
+                               (struct nvme_dsm_range *)iod_list(iod)[0];
        struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
 
-       range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
-                                                       &iod->first_dma);
-       if (!range)
-               return -ENOMEM;
-
-       iod_list(iod)[0] = (__le64 *)range;
-       iod->npages = 0;
-
        range->cattr = cpu_to_le32(0);
        range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
        range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
@@ -635,44 +637,22 @@ int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
        return nvme_submit_flush(nvmeq, ns, cmdid);
 }
 
-/*
- * Called with local interrupts disabled and the q_lock held.  May not sleep.
- */
-static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-                                                               struct bio *bio)
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
 {
+       struct bio *bio = iod->private;
+       struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
        struct nvme_command *cmnd;
-       struct nvme_iod *iod;
-       enum dma_data_direction dma_dir;
-       int cmdid, length, result;
+       int cmdid;
        u16 control;
        u32 dsmgmt;
-       int psegs = bio_phys_segments(ns->queue, bio);
-
-       if ((bio->bi_rw & REQ_FLUSH) && psegs) {
-               result = nvme_submit_flush_data(nvmeq, ns);
-               if (result)
-                       return result;
-       }
-
-       result = -ENOMEM;
-       iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
-       if (!iod)
-               goto nomem;
-       iod->private = bio;
 
-       result = -EBUSY;
        cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
        if (unlikely(cmdid < 0))
-               goto free_iod;
+               return cmdid;
 
-       if (bio->bi_rw & REQ_DISCARD) {
-               result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
-               if (result)
-                       goto free_cmdid;
-               return result;
-       }
-       if ((bio->bi_rw & REQ_FLUSH) && !psegs)
+       if (bio->bi_rw & REQ_DISCARD)
+               return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
+       if ((bio->bi_rw & REQ_FLUSH) && !iod->nents)
                return nvme_submit_flush(nvmeq, ns, cmdid);
 
        control = 0;
@@ -686,42 +666,85 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
        cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
-
        memset(cmnd, 0, sizeof(*cmnd));
-       if (bio_data_dir(bio)) {
-               cmnd->rw.opcode = nvme_cmd_write;
-               dma_dir = DMA_TO_DEVICE;
-       } else {
-               cmnd->rw.opcode = nvme_cmd_read;
-               dma_dir = DMA_FROM_DEVICE;
-       }
-
-       result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
-       if (result <= 0)
-               goto free_cmdid;
-       length = result;
 
+       cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read;
        cmnd->rw.command_id = cmdid;
        cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
-       length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
-                                                               GFP_ATOMIC);
+       cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+       cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
        cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
-       cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
+       cmnd->rw.length =
+               cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
        cmnd->rw.control = cpu_to_le16(control);
        cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 
-       nvme_start_io_acct(bio);
        if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
        writel(nvmeq->sq_tail, nvmeq->q_db);
 
        return 0;
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ */
+static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+                                                               struct bio *bio)
+{
+       struct nvme_iod *iod;
+       int psegs = bio_phys_segments(ns->queue, bio);
+       int result;
+
+       if ((bio->bi_rw & REQ_FLUSH) && psegs) {
+               result = nvme_submit_flush_data(nvmeq, ns);
+               if (result)
+                       return result;
+       }
+
+       iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+       if (!iod)
+               return -ENOMEM;
+
+       iod->private = bio;
+       if (bio->bi_rw & REQ_DISCARD) {
+               void *range;
+               /*
+                * We reuse the small pool to allocate the 16-byte range here
+                * as it is not worth having a special pool for these or
+                * additional cases to handle freeing the iod.
+                */
+               range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
+                                               GFP_ATOMIC,
+                                               &iod->first_dma);
+               if (!range) {
+                       result = -ENOMEM;
+                       goto free_iod;
+               }
+               iod_list(iod)[0] = (__le64 *)range;
+               iod->npages = 0;
+       } else if (psegs) {
+               result = nvme_map_bio(nvmeq, iod, bio,
+                       bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+                       psegs);
+               if (result <= 0)
+                       goto free_iod;
+               if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) !=
+                                                               result) {
+                       result = -ENOMEM;
+                       goto free_iod;
+               }
+               nvme_start_io_acct(bio);
+       }
+       if (unlikely(nvme_submit_iod(nvmeq, iod))) {
+               if (!waitqueue_active(&nvmeq->sq_full))
+                       add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+               list_add_tail(&iod->node, &nvmeq->iod_bio);
+       }
+       return 0;
 
- free_cmdid:
-       free_cmdid(nvmeq, cmdid, NULL);
  free_iod:
        nvme_free_iod(nvmeq->dev, iod);
- nomem:
        return result;
 }
 
@@ -745,7 +768,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
                }
 
                ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
-               fn(nvmeq->dev, ctx, &cqe);
+               fn(nvmeq, ctx, &cqe);
        }
 
        /* If the controller ignores the cq head doorbell and continuously
@@ -781,7 +804,7 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
        if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
                result = nvme_submit_bio_queue(nvmeq, ns, bio);
        if (unlikely(result)) {
-               if (bio_list_empty(&nvmeq->sq_cong))
+               if (!waitqueue_active(&nvmeq->sq_full))
                        add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
                bio_list_add(&nvmeq->sq_cong, bio);
        }
@@ -825,7 +848,7 @@ struct sync_cmd_info {
        int status;
 };
 
-static void sync_completion(struct nvme_dev *dev, void *ctx,
+static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
 {
        struct sync_cmd_info *cmdinfo = ctx;
@@ -1112,7 +1135,7 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
                dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
                                                                nvmeq->qid);
                ctx = cancel_cmdid(nvmeq, cmdid, &fn);
-               fn(nvmeq->dev, ctx, &cqe);
+               fn(nvmeq, ctx, &cqe);
        }
 }
 
@@ -1125,6 +1148,17 @@ static void nvme_free_queue(struct rcu_head *r)
                struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
                bio_endio(bio, -EIO);
        }
+       while (!list_empty(&nvmeq->iod_bio)) {
+               static struct nvme_completion cqe = {
+                       .status = cpu_to_le16(
+                               (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
+               };
+               struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
+                                                       struct nvme_iod,
+                                                       node);
+               list_del(&iod->node);
+               bio_completion(nvmeq, iod, &cqe);
+       }
        spin_unlock_irq(&nvmeq->q_lock);
 
        dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
@@ -1232,6 +1266,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
        init_waitqueue_head(&nvmeq->sq_full);
        init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
        bio_list_init(&nvmeq->sq_cong);
+       INIT_LIST_HEAD(&nvmeq->iod_bio);
        nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
        nvmeq->q_depth = depth;
        nvmeq->cq_vector = vector;
@@ -1565,7 +1600,9 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
                c.rw.metadata = cpu_to_le64(meta_dma_addr);
        }
 
-       length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
+       length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+       c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+       c.rw.prp2 = cpu_to_le64(iod->first_dma);
 
        if (length != (io.nblocks + 1) << ns->lba_shift)
                status = -ENOMEM;
@@ -1635,8 +1672,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
                                                                length);
                if (IS_ERR(iod))
                        return PTR_ERR(iod);
-               length = nvme_setup_prps(dev, &c.common, iod, length,
-                                                               GFP_KERNEL);
+               length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+               c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+               c.common.prp2 = cpu_to_le64(iod->first_dma);
        }
 
        timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
@@ -1733,17 +1771,33 @@ static const struct block_device_operations nvme_fops = {
        .getgeo         = nvme_getgeo,
 };
 
+static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
+{
+       struct nvme_iod *iod, *next;
+
+       list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
+               if (unlikely(nvme_submit_iod(nvmeq, iod)))
+                       break;
+               list_del(&iod->node);
+               if (bio_list_empty(&nvmeq->sq_cong) &&
+                                               list_empty(&nvmeq->iod_bio))
+                       remove_wait_queue(&nvmeq->sq_full,
+                                               &nvmeq->sq_cong_wait);
+       }
+}
+
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
 {
        while (bio_list_peek(&nvmeq->sq_cong)) {
                struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
                struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
 
-               if (bio_list_empty(&nvmeq->sq_cong))
+               if (bio_list_empty(&nvmeq->sq_cong) &&
+                                               list_empty(&nvmeq->iod_bio))
                        remove_wait_queue(&nvmeq->sq_full,
                                                        &nvmeq->sq_cong_wait);
                if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
-                       if (bio_list_empty(&nvmeq->sq_cong))
+                       if (!waitqueue_active(&nvmeq->sq_full))
                                add_wait_queue(&nvmeq->sq_full,
                                                        &nvmeq->sq_cong_wait);
                        bio_list_add_head(&nvmeq->sq_cong, bio);
@@ -1785,6 +1839,7 @@ static int nvme_kthread(void *data)
                                nvme_process_cq(nvmeq);
                                nvme_cancel_ios(nvmeq, true);
                                nvme_resubmit_bios(nvmeq);
+                               nvme_resubmit_iods(nvmeq);
  unlock:
                                spin_unlock_irq(&nvmeq->q_lock);
                        }
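
End to end, a condensed sketch of the retry lifecycle this patch introduces
(call flow only, not code from the diff):

        /*
         * nvme_submit_bio_queue()   maps the bio and builds the iod; if no
         *   -> nvme_submit_iod()    cmdid is free, the iod is parked on
         *                           nvmeq->iod_bio instead of being dropped
         * bio_completion()          on a retryable error, requeues the iod
         *                           on nvmeq->iod_bio and wakes sq_full
         *                           rather than ending the bio with -EIO
         * nvme_kthread()
         *   -> nvme_resubmit_iods() drains nvmeq->iod_bio under q_lock by
         *                           calling nvme_submit_iod() again
         * nvme_free_queue()         completes anything still parked with
         *                           NVME_SC_ABORT_REQ | NVME_SC_DNR so the
         *                           bios finally fail when a queue goes away
         */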