Merge branch 'for-4.7/drivers' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 May 2016 23:03:32 +0000 (16:03 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 17 May 2016 23:03:32 +0000 (16:03 -0700)
Pull block driver updates from Jens Axboe:
 "On top of the core pull request, this is the drivers pull request for
  this merge window.  This contains:

   - Switch drivers to the new write back cache API, and kill off the
     flush flags.  From me.

   - Kill the discard support for the STEC pci-e flash driver.  It's
     trivially broken, and apparently unmaintained, so it's safer to
     just remove it.  From Jeff Moyer.

   - A set of lightnvm updates from the usual suspects (Matias/Javier,
     and Simon), and fixes from Arnd, Jeff Mahoney, Sagi, and Wenwei
     Tao.

   - A set of updates for NVMe:

        - Turn the controller state management into a proper state
          machine.  From Christoph.

        - Shuffling of code in preparation for NVMe-over-fabrics, also
          from Christoph.

        - Cleanup of the command prep part from Ming Lin.

        - Rewrite of the discard support from Ming Lin.

        - Deadlock fix for namespace removal from Ming Lin.

        - Use the now exported blk-mq tag helper for IO termination.
          From Sagi.

        - Various little fixes from Christoph, Guilherme, Keith, Ming
          Lin, Wang Sheng-Hui.

   - Convert mtip32xx to use the now exported blk-mq tag iter function,
     from Keith"
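
The write back cache switch called out at the top of this message amounts to
replacing blk_queue_flush() with blk_queue_write_cache() in each driver; the
loop, md and mmc hunks below show the real call sites.  A minimal sketch of
the conversion, assuming a driver that already knows whether its device
honours FUA:

	#include <linux/blkdev.h>

	/* Illustrative helper, not part of this series: advertise a volatile
	 * write cache (and optionally FUA) with the new API, where the old
	 * code would have called blk_queue_flush(q, REQ_FLUSH | REQ_FUA). */
	static void example_advertise_cache(struct request_queue *q, bool fua)
	{
		blk_queue_write_cache(q, true, fua);
	}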

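The controller state machine item shows up in the pci.c hunk as the removal of
the ad-hoc NVME_CTRL_RESETTING/NVME_CTRL_REMOVING bits in dev->flags in favour
of dev->ctrl.state plus nvme_change_ctrl_state(), which refuses illegal
transitions.  A rough sketch of the resulting reset pattern, condensed from
nvme_reset_work() below (bring-up and error handling elided):

	/* Not a complete implementation: nvme_change_ctrl_state() returns
	 * false when the requested transition is not allowed from the
	 * controller's current state. */
	static void example_reset(struct nvme_dev *dev)
	{
		if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
			return;

		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
			return;

		/* ... disable, re-enable and re-initialize the controller ... */

		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE))
			dev_warn(dev->ctrl.device,
				 "failed to mark controller live\n");
	}
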
* 'for-4.7/drivers' of git://git.kernel.dk/linux-block: (74 commits)
  lightnvm: reserved space calculation incorrect
  lightnvm: rename nr_pages to nr_ppas on nvm_rq
  lightnvm: add is_cached entry to struct ppa_addr
  lightnvm: expose gennvm_mark_blk to targets
  lightnvm: remove mgt targets on mgt removal
  lightnvm: pass dma address to hardware rather than pointer
  lightnvm: do not assume sequential lun alloc.
  nvme/lightnvm: Log using the ctrl named device
  lightnvm: rename dma helper functions
  lightnvm: enable metadata to be sent to device
  lightnvm: do not free unused metadata on rrpc
  lightnvm: fix out of bound ppa lun id on bb tbl
  lightnvm: refactor set_bb_tbl for accepting ppa list
  lightnvm: move responsibility for bad blk mgmt to target
  lightnvm: make nvm_set_rqd_ppalist() aware of vblks
  lightnvm: remove struct factory_blks
  lightnvm: refactor device ops->get_bb_tbl()
  lightnvm: introduce nvm_for_each_lun_ppa() macro
  lightnvm: refactor dev->online_target to global nvm_targets
  lightnvm: rename nvm_targets to nvm_tgt_type
  ...

drivers/block/loop.c
drivers/md/md.c
drivers/mmc/card/block.c
drivers/nvme/host/pci.c
include/linux/blkdev.h

diff --combined drivers/block/loop.c
index 80cf8add46ff3667d896fca88aaea3fbf338ad27,7e5e27ac45bbea7c946e7f0ee8da777c11fa447c..1fa8cc235977f404bc995d73659714fbccad7066
@@@ -488,12 -488,6 +488,12 @@@ static int lo_rw_aio(struct loop_devic
        bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
        iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
                      bio_segments(bio), blk_rq_bytes(cmd->rq));
 +      /*
 +       * This bio may be started from the middle of the 'bvec'
 +       * because of bio splitting, so offset from the bvec must
 +       * be passed to iov iterator
 +       */
 +      iter.iov_offset = bio->bi_iter.bi_bvec_done;
  
        cmd->iocb.ki_pos = pos;
        cmd->iocb.ki_filp = file;
@@@ -943,7 -937,7 +943,7 @@@ static int loop_set_fd(struct loop_devi
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
  
        if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-               blk_queue_flush(lo->lo_queue, REQ_FLUSH);
+               blk_queue_write_cache(lo->lo_queue, true, false);
  
        loop_update_dio(lo);
        set_capacity(lo->lo_disk, size);
diff --combined drivers/md/md.c
index 14d3b37944df031214c2c6951ed15c46da104842,5d61e76cec343d1895c870aa95776e7a69e4d0a8..c9a475c33cc7401dc067716e6cf3aaf648c46d93
@@@ -284,8 -284,6 +284,8 @@@ static blk_qc_t md_make_request(struct 
         * go away inside make_request
         */
        sectors = bio_sectors(bio);
 +      /* bio could be mergeable after passing to underlayer */
 +      bio->bi_rw &= ~REQ_NOMERGE;
        mddev->pers->make_request(mddev, bio);
  
        cpu = part_stat_lock();
@@@ -5039,7 -5037,7 +5039,7 @@@ static int md_alloc(dev_t dev, char *na
        disk->fops = &md_fops;
        disk->private_data = mddev;
        disk->queue = mddev->queue;
-       blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+       blk_queue_write_cache(mddev->queue, true, true);
        /* Allow extended partitions.  This makes the
         * 'mdp' device redundant, but we can't really
         * remove it now.
diff --combined drivers/mmc/card/block.c
index 5f2a3d69344f2b8652635afffa7ab566d1bd6a44,32daf433a9fb2622f28b97d8348d3e17c1f93c62..ddc96206288a1f281fa8b0663ef033b3fed774bb
@@@ -35,7 -35,6 +35,7 @@@
  #include <linux/capability.h>
  #include <linux/compat.h>
  #include <linux/pm_runtime.h>
 +#include <linux/idr.h>
  
  #include <linux/mmc/ioctl.h>
  #include <linux/mmc/card.h>
@@@ -79,14 -78,15 +79,14 @@@ static int perdev_minors = CONFIG_MMC_B
  /*
   * We've only got one major, so number of mmcblk devices is
   * limited to (1 << 20) / number of minors per device.  It is also
 - * currently limited by the size of the static bitmaps below.
 + * limited by the MAX_DEVICES below.
   */
  static int max_devices;
  
  #define MAX_DEVICES 256
  
 -/* TODO: Replace these with struct ida */
 -static DECLARE_BITMAP(dev_use, MAX_DEVICES);
 -static DECLARE_BITMAP(name_use, MAX_DEVICES);
 +static DEFINE_IDA(mmc_blk_ida);
 +static DEFINE_SPINLOCK(mmc_blk_lock);
  
  /*
   * There is one mmc_blk_data per slot.
@@@ -105,6 -105,7 +105,6 @@@ struct mmc_blk_data 
        unsigned int    usage;
        unsigned int    read_only;
        unsigned int    part_type;
 -      unsigned int    name_idx;
        unsigned int    reset_done;
  #define MMC_BLK_READ          BIT(0)
  #define MMC_BLK_WRITE         BIT(1)
@@@ -179,9 -180,7 +179,9 @@@ static void mmc_blk_put(struct mmc_blk_
                int devidx = mmc_get_devidx(md->disk);
                blk_cleanup_queue(md->queue.queue);
  
 -              __clear_bit(devidx, dev_use);
 +              spin_lock(&mmc_blk_lock);
 +              ida_remove(&mmc_blk_ida, devidx);
 +              spin_unlock(&mmc_blk_lock);
  
                put_disk(md->disk);
                kfree(md);
@@@ -948,22 -947,16 +948,22 @@@ static int mmc_blk_cmd_error(struct req
                        req->rq_disk->disk_name, "timed out", name, status);
  
                /* If the status cmd initially failed, retry the r/w cmd */
 -              if (!status_valid)
 +              if (!status_valid) {
 +                      pr_err("%s: status not valid, retrying timeout\n",
 +                              req->rq_disk->disk_name);
                        return ERR_RETRY;
 +              }
  
                /*
                 * If it was a r/w cmd crc error, or illegal command
                 * (eg, issued in wrong state) then retry - we should
                 * have corrected the state problem above.
                 */
 -              if (status & (R1_COM_CRC_ERROR | R1_ILLEGAL_COMMAND))
 +              if (status & (R1_COM_CRC_ERROR | R1_ILLEGAL_COMMAND)) {
 +                      pr_err("%s: command error, retrying timeout\n",
 +                              req->rq_disk->disk_name);
                        return ERR_RETRY;
 +              }
  
                /* Otherwise abort the command */
                return ERR_ABORT;
@@@ -2198,23 -2191,10 +2198,23 @@@ static struct mmc_blk_data *mmc_blk_all
        struct mmc_blk_data *md;
        int devidx, ret;
  
 -      devidx = find_first_zero_bit(dev_use, max_devices);
 -      if (devidx >= max_devices)
 -              return ERR_PTR(-ENOSPC);
 -      __set_bit(devidx, dev_use);
 +again:
 +      if (!ida_pre_get(&mmc_blk_ida, GFP_KERNEL))
 +              return ERR_PTR(-ENOMEM);
 +
 +      spin_lock(&mmc_blk_lock);
 +      ret = ida_get_new(&mmc_blk_ida, &devidx);
 +      spin_unlock(&mmc_blk_lock);
 +
 +      if (ret == -EAGAIN)
 +              goto again;
 +      else if (ret)
 +              return ERR_PTR(ret);
 +
 +      if (devidx >= max_devices) {
 +              ret = -ENOSPC;
 +              goto out;
 +      }
  
        md = kzalloc(sizeof(struct mmc_blk_data), GFP_KERNEL);
        if (!md) {
                goto out;
        }
  
 -      /*
 -       * !subname implies we are creating main mmc_blk_data that will be
 -       * associated with mmc_card with dev_set_drvdata. Due to device
 -       * partitions, devidx will not coincide with a per-physical card
 -       * index anymore so we keep track of a name index.
 -       */
 -      if (!subname) {
 -              md->name_idx = find_first_zero_bit(name_use, max_devices);
 -              __set_bit(md->name_idx, name_use);
 -      } else
 -              md->name_idx = ((struct mmc_blk_data *)
 -                              dev_to_disk(parent)->private_data)->name_idx;
 -
        md->area_type = area_type;
  
        /*
         */
  
        snprintf(md->disk->disk_name, sizeof(md->disk->disk_name),
 -               "mmcblk%u%s", md->name_idx, subname ? subname : "");
 +               "mmcblk%u%s", card->host->index, subname ? subname : "");
  
        if (mmc_card_mmc(card))
                blk_queue_logical_block_size(md->queue.queue,
            ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) ||
             card->ext_csd.rel_sectors)) {
                md->flags |= MMC_BLK_REL_WR;
-               blk_queue_flush(md->queue.queue, REQ_FLUSH | REQ_FUA);
+               blk_queue_write_cache(md->queue.queue, true, true);
        }
  
        if (mmc_card_mmc(card) &&
   err_kfree:
        kfree(md);
   out:
 +      spin_lock(&mmc_blk_lock);
 +      ida_remove(&mmc_blk_ida, devidx);
 +      spin_unlock(&mmc_blk_lock);
        return ERR_PTR(ret);
  }
  
@@@ -2428,6 -2418,7 +2428,6 @@@ static void mmc_blk_remove_parts(struc
        struct list_head *pos, *q;
        struct mmc_blk_data *part_md;
  
 -      __clear_bit(md->name_idx, name_use);
        list_for_each_safe(pos, q, &md->part) {
                part_md = list_entry(pos, struct mmc_blk_data, part);
                list_del(pos);
diff --combined drivers/nvme/host/pci.c
index 4fd733ff72b1cb7cf38023701d640a059b4e4670,fb741d09831aaa6536e3ccdb4e69b7de486d448d..0f093f14d3482394630b66c8586da5beb455d8c4
@@@ -54,8 -54,7 +54,7 @@@
   * We handle AEN commands ourselves and don't even let the
   * block layer know about them.
   */
- #define NVME_NR_AEN_COMMANDS  1
- #define NVME_AQ_BLKMQ_DEPTH   (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
+ #define NVME_AQ_BLKMQ_DEPTH   (NVME_AQ_DEPTH - NVME_NR_AERS)
  
  static int use_threaded_interrupts;
  module_param(use_threaded_interrupts, int, 0);
@@@ -92,9 -91,7 +91,7 @@@ struct nvme_dev 
        struct msix_entry *entry;
        void __iomem *bar;
        struct work_struct reset_work;
-       struct work_struct scan_work;
        struct work_struct remove_work;
-       struct work_struct async_work;
        struct timer_list watchdog_timer;
        struct mutex shutdown_lock;
        bool subsystem;
        dma_addr_t cmb_dma_addr;
        u64 cmb_size;
        u32 cmbsz;
-       unsigned long flags;
- #define NVME_CTRL_RESETTING    0
- #define NVME_CTRL_REMOVING     1
        struct nvme_ctrl ctrl;
        struct completion ioq_wait;
  };
@@@ -271,40 -263,6 +263,6 @@@ static int nvme_init_request(void *data
        return 0;
  }
  
- static void nvme_queue_scan(struct nvme_dev *dev)
- {
-       /*
-        * Do not queue new scan work when a controller is reset during
-        * removal.
-        */
-       if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
-               return;
-       queue_work(nvme_workq, &dev->scan_work);
- }
- 
- static void nvme_complete_async_event(struct nvme_dev *dev,
-               struct nvme_completion *cqe)
- {
-       u16 status = le16_to_cpu(cqe->status) >> 1;
-       u32 result = le32_to_cpu(cqe->result);
-       if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) {
-               ++dev->ctrl.event_limit;
-               queue_work(nvme_workq, &dev->async_work);
-       }
-       if (status != NVME_SC_SUCCESS)
-               return;
-       switch (result & 0xff07) {
-       case NVME_AER_NOTICE_NS_CHANGED:
-               dev_info(dev->ctrl.device, "rescanning\n");
-               nvme_queue_scan(dev);
-       default:
-               dev_warn(dev->ctrl.device, "async event result %08x\n", result);
-       }
- }
- 
  /**
   * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
   * @nvmeq: The queue to use
@@@ -334,16 -292,11 +292,11 @@@ static __le64 **iod_list(struct reques
        return (__le64 **)(iod->sg + req->nr_phys_segments);
  }
  
- static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
+ static int nvme_init_iod(struct request *rq, unsigned size,
+               struct nvme_dev *dev)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
        int nseg = rq->nr_phys_segments;
-       unsigned size;
-       if (rq->cmd_flags & REQ_DISCARD)
-               size = sizeof(struct nvme_dsm_range);
-       else
-               size = blk_rq_bytes(rq);
  
        if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
                iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
@@@ -368,6 -321,8 +321,8 @@@ static void nvme_free_iod(struct nvme_d
        __le64 **list = iod_list(req);
        dma_addr_t prp_dma = iod->first_dma;
  
+       nvme_cleanup_cmd(req);
        if (iod->npages == 0)
                dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
        for (i = 0; i < iod->npages; i++) {
@@@ -529,7 -484,7 +484,7 @@@ static bool nvme_setup_prps(struct nvme
  }
  
  static int nvme_map_data(struct nvme_dev *dev, struct request *req,
-               struct nvme_command *cmnd)
+               unsigned size, struct nvme_command *cmnd)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct request_queue *q = req->q;
        if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
                goto out;
  
-       if (!nvme_setup_prps(dev, req, blk_rq_bytes(req)))
+       if (!nvme_setup_prps(dev, req, size))
                goto out_unmap;
  
        ret = BLK_MQ_RQ_QUEUE_ERROR;
@@@ -595,37 -550,6 +550,6 @@@ static void nvme_unmap_data(struct nvme
        nvme_free_iod(dev, req);
  }
  
- /*
-  * We reuse the small pool to allocate the 16-byte range here as it is not
-  * worth having a special pool for these or additional cases to handle freeing
-  * the iod.
-  */
- static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-               struct request *req, struct nvme_command *cmnd)
- {
-       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       struct nvme_dsm_range *range;
-       range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
-                                               &iod->first_dma);
-       if (!range)
-               return BLK_MQ_RQ_QUEUE_BUSY;
-       iod_list(req)[0] = (__le64 *)range;
-       iod->npages = 0;
-       range->cattr = cpu_to_le32(0);
-       range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
-       range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
-       memset(cmnd, 0, sizeof(*cmnd));
-       cmnd->dsm.opcode = nvme_cmd_dsm;
-       cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
-       cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
-       cmnd->dsm.nr = 0;
-       cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
-       return BLK_MQ_RQ_QUEUE_OK;
- }
- 
  /*
   * NOTE: ns is NULL when called on the admin queue.
   */
@@@ -637,6 -561,7 +561,7 @@@ static int nvme_queue_rq(struct blk_mq_
        struct nvme_dev *dev = nvmeq->dev;
        struct request *req = bd->rq;
        struct nvme_command cmnd;
+       unsigned map_len;
        int ret = BLK_MQ_RQ_QUEUE_OK;
  
        /*
                }
        }
  
-       ret = nvme_init_iod(req, dev);
+       map_len = nvme_map_len(req);
+       ret = nvme_init_iod(req, map_len, dev);
        if (ret)
                return ret;
  
-       if (req->cmd_flags & REQ_DISCARD) {
-               ret = nvme_setup_discard(nvmeq, ns, req, &cmnd);
-       } else {
-               if (req->cmd_type == REQ_TYPE_DRV_PRIV)
-                       memcpy(&cmnd, req->cmd, sizeof(cmnd));
-               else if (req->cmd_flags & REQ_FLUSH)
-                       nvme_setup_flush(ns, &cmnd);
-               else
-                       nvme_setup_rw(ns, req, &cmnd);
+       ret = nvme_setup_cmd(ns, req, &cmnd);
+       if (ret)
+               goto out;
  
-               if (req->nr_phys_segments)
-                       ret = nvme_map_data(dev, req, &cmnd);
-       }
+       if (req->nr_phys_segments)
+               ret = nvme_map_data(dev, req, map_len, &cmnd);
  
        if (ret)
                goto out;
@@@ -764,7 -683,7 +683,7 @@@ static void __nvme_process_cq(struct nv
                 */
                if (unlikely(nvmeq->qid == 0 &&
                                cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
-                       nvme_complete_async_event(nvmeq->dev, &cqe);
+                       nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe);
                        continue;
                }
  
@@@ -833,21 -752,18 +752,18 @@@ static int nvme_poll(struct blk_mq_hw_c
        return 0;
  }
  
- static void nvme_async_event_work(struct work_struct *work)
+ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
  {
-       struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work);
+       struct nvme_dev *dev = to_nvme_dev(ctrl);
        struct nvme_queue *nvmeq = dev->queues[0];
        struct nvme_command c;
  
        memset(&c, 0, sizeof(c));
        c.common.opcode = nvme_admin_async_event;
+       c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx;
  
        spin_lock_irq(&nvmeq->q_lock);
-       while (dev->ctrl.event_limit > 0) {
-               c.common.command_id = NVME_AQ_BLKMQ_DEPTH +
-                       --dev->ctrl.event_limit;
-               __nvme_submit_cmd(nvmeq, &c);
-       }
+       __nvme_submit_cmd(nvmeq, &c);
        spin_unlock_irq(&nvmeq->q_lock);
  }
  
@@@ -939,7 -855,7 +855,7 @@@ static enum blk_eh_timer_return nvme_ti
         * cancellation error. All outstanding requests are completed on
         * shutdown, so we return BLK_EH_HANDLED.
         */
-       if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
+       if (dev->ctrl.state == NVME_CTRL_RESETTING) {
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, disable controller\n",
                         req->tag, nvmeq->qid);
        return BLK_EH_RESET_TIMER;
  }
  
- static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
+ static void nvme_cancel_io(struct request *req, void *data, bool reserved)
  {
-       struct nvme_queue *nvmeq = data;
        int status;
  
        if (!blk_mq_request_started(req))
                return;
  
-       dev_dbg_ratelimited(nvmeq->dev->ctrl.device,
-                "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);
+       dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device,
+                               "Cancelling I/O %d", req->tag);
  
        status = NVME_SC_ABORT_REQ;
        if (blk_queue_dying(req->q))
@@@ -1069,14 -984,6 +984,6 @@@ static int nvme_suspend_queue(struct nv
        return 0;
  }
  
- static void nvme_clear_queue(struct nvme_queue *nvmeq)
- {
-       spin_lock_irq(&nvmeq->q_lock);
-       if (nvmeq->tags && *nvmeq->tags)
-               blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
-       spin_unlock_irq(&nvmeq->q_lock);
- }
- 
  static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
  {
        struct nvme_queue *nvmeq = dev->queues[0];
@@@ -1350,22 -1257,44 +1257,44 @@@ static int nvme_configure_admin_queue(s
        return result;
  }
  
+ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+ {
+       /* If true, indicates loss of adapter communication, possibly by a
+        * NVMe Subsystem reset.
+        */
+       bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+       /* If there is a reset ongoing, we shouldn't reset again. */
+       if (work_busy(&dev->reset_work))
+               return false;
+       /* We shouldn't reset unless the controller is on fatal error state
+        * _or_ if we lost the communication with it.
+        */
+       if (!(csts & NVME_CSTS_CFS) && !nssro)
+               return false;
+       /* If PCI error recovery process is happening, we cannot reset or
+        * the recovery mechanism will surely fail.
+        */
+       if (pci_channel_offline(to_pci_dev(dev->dev)))
+               return false;
+       return true;
+ }
+ 
  static void nvme_watchdog_timer(unsigned long data)
  {
        struct nvme_dev *dev = (struct nvme_dev *)data;
        u32 csts = readl(dev->bar + NVME_REG_CSTS);
  
-       /*
-        * Skip controllers currently under reset.
-        */
-       if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) &&
-           ((csts & NVME_CSTS_CFS) ||
-            (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) {
-               if (queue_work(nvme_workq, &dev->reset_work)) {
+       /* Skip controllers under certain specific conditions. */
+       if (nvme_should_reset(dev, csts)) {
+               if (queue_work(nvme_workq, &dev->reset_work))
                        dev_warn(dev->dev,
                                "Failed status: 0x%x, reset controller.\n",
                                csts);
-               }
                return;
        }
  
@@@ -1551,8 -1480,9 +1480,9 @@@ static int nvme_setup_io_queues(struct 
        return result;
  }
  
- static void nvme_set_irq_hints(struct nvme_dev *dev)
+ static void nvme_pci_post_scan(struct nvme_ctrl *ctrl)
  {
+       struct nvme_dev *dev = to_nvme_dev(ctrl);
        struct nvme_queue *nvmeq;
        int i;
  
        }
  }
  
- static void nvme_dev_scan(struct work_struct *work)
- {
-       struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
-       if (!dev->tagset.tags)
-               return;
-       nvme_scan_namespaces(&dev->ctrl);
-       nvme_set_irq_hints(dev);
- }
- 
  static void nvme_del_queue_end(struct request *req, int error)
  {
        struct nvme_queue *nvmeq = req->end_io_data;
@@@ -1592,7 -1512,13 +1512,13 @@@ static void nvme_del_cq_end(struct requ
        if (!error) {
                unsigned long flags;
  
-               spin_lock_irqsave(&nvmeq->q_lock, flags);
+               /*
+                * We might be called with the AQ q_lock held
+                * and the I/O queue q_lock should always
+                * nest inside the AQ one.
+                */
+               spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
+                                       SINGLE_DEPTH_NESTING);
                nvme_process_cq(nvmeq);
                spin_unlock_irqrestore(&nvmeq->q_lock, flags);
        }
@@@ -1684,7 -1610,6 +1610,6 @@@ static int nvme_dev_add(struct nvme_de
                nvme_free_queues(dev, dev->online_queues);
        }
  
-       nvme_queue_scan(dev);
        return 0;
  }
  
@@@ -1797,8 -1722,8 +1722,8 @@@ static void nvme_dev_disable(struct nvm
        }
        nvme_pci_disable(dev);
  
-       for (i = dev->queue_count - 1; i >= 0; i--)
-               nvme_clear_queue(dev->queues[i]);
+       blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev);
+       blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev);
        mutex_unlock(&dev->shutdown_lock);
  }
  
@@@ -1854,7 -1779,7 +1779,7 @@@ static void nvme_reset_work(struct work
        struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
        int result = -ENODEV;
  
-       if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
+       if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
                goto out;
  
        /*
        if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
                nvme_dev_disable(dev, false);
  
-       if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
                goto out;
  
-       set_bit(NVME_CTRL_RESETTING, &dev->flags);
        result = nvme_pci_enable(dev);
        if (result)
                goto out;
        if (result)
                goto out;
  
-       dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
-       queue_work(nvme_workq, &dev->async_work);
+       /*
+        * A controller that can not execute IO typically requires user
+        * intervention to correct. For such degraded controllers, the driver
+        * should not submit commands the user did not request, so skip
+        * registering for asynchronous event notification on this condition.
+        */
+       if (dev->online_queues > 1)
+               nvme_queue_async_events(&dev->ctrl);
  
        mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
  
         */
        if (dev->online_queues < 2) {
                dev_warn(dev->ctrl.device, "IO queues not created\n");
+               nvme_kill_queues(&dev->ctrl);
                nvme_remove_namespaces(&dev->ctrl);
        } else {
                nvme_start_queues(&dev->ctrl);
                nvme_dev_add(dev);
        }
  
-       clear_bit(NVME_CTRL_RESETTING, &dev->flags);
+       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+               dev_warn(dev->ctrl.device, "failed to mark controller live\n");
+               goto out;
+       }
+       if (dev->online_queues > 1)
+               nvme_queue_scan(&dev->ctrl);
        return;
  
   out:
@@@ -1955,13 -1891,6 +1891,6 @@@ static int nvme_pci_reg_read64(struct n
        return 0;
  }
  
- static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl)
- {
-       struct nvme_dev *dev = to_nvme_dev(ctrl);
-       return !dev->bar || dev->online_queues < 2;
- }
- 
  static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
  {
        return nvme_reset(to_nvme_dev(ctrl));
@@@ -1972,9 -1901,10 +1901,10 @@@ static const struct nvme_ctrl_ops nvme_
        .reg_read32             = nvme_pci_reg_read32,
        .reg_write32            = nvme_pci_reg_write32,
        .reg_read64             = nvme_pci_reg_read64,
-       .io_incapable           = nvme_pci_io_incapable,
        .reset_ctrl             = nvme_pci_reset_ctrl,
        .free_ctrl              = nvme_pci_free_ctrl,
+       .post_scan              = nvme_pci_post_scan,
+       .submit_async_event     = nvme_pci_submit_async_event,
  };
  
  static int nvme_dev_map(struct nvme_dev *dev)
@@@ -2026,10 -1956,8 +1956,8 @@@ static int nvme_probe(struct pci_dev *p
        if (result)
                goto free;
  
-       INIT_WORK(&dev->scan_work, nvme_dev_scan);
        INIT_WORK(&dev->reset_work, nvme_reset_work);
        INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
-       INIT_WORK(&dev->async_work, nvme_async_event_work);
        setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
                (unsigned long)dev);
        mutex_init(&dev->shutdown_lock);
@@@ -2086,15 -2014,13 +2014,12 @@@ static void nvme_remove(struct pci_dev 
  {
        struct nvme_dev *dev = pci_get_drvdata(pdev);
  
-       set_bit(NVME_CTRL_REMOVING, &dev->flags);
+       nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
        pci_set_drvdata(pdev, NULL);
-       flush_work(&dev->async_work);
        flush_work(&dev->reset_work);
-       flush_work(&dev->scan_work);
-       nvme_remove_namespaces(&dev->ctrl);
        nvme_uninit_ctrl(&dev->ctrl);
        nvme_dev_disable(dev, true);
--      flush_work(&dev->reset_work);
        nvme_dev_remove_admin(dev);
        nvme_free_queues(dev, 0);
        nvme_release_cmb(dev);
diff --combined include/linux/blkdev.h
index b79131acf6c0cf76cb096a4d2721779c061d0277,57c085917da69c87d79faa0c6ad70d669fabcfcf..1fd8fdff2f813305fd7d4adb37a6d716b59aecc5
@@@ -433,8 -433,6 +433,6 @@@ struct request_queue 
        /*
         * for flush operations
         */
-       unsigned int            flush_flags;
-       unsigned int            flush_not_queueable:1;
        struct blk_flush_queue  *fq;
  
        struct list_head        requeue_list;
  #define QUEUE_FLAG_POLL              22       /* IO polling enabled if set */
  #define QUEUE_FLAG_WC        23       /* Write back caching */
  #define QUEUE_FLAG_FUA               24       /* device supports FUA writes */
+ #define QUEUE_FLAG_FLUSH_NQ    25     /* flush not queueuable */
  
  #define QUEUE_FLAG_DEFAULT    ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \
@@@ -1009,7 -1008,6 +1008,6 @@@ extern void blk_queue_update_dma_alignm
  extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
  extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
  extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
- extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
  extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
  extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
@@@ -1131,8 -1129,6 +1129,8 @@@ static inline struct request *blk_map_q
  extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
  extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 +extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 +              sector_t nr_sects, gfp_t gfp_mask, int type, struct bio **biop);
  extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct page *page);
  extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
@@@ -1368,7 -1364,7 +1366,7 @@@ static inline unsigned int block_size(s
  
  static inline bool queue_flush_queueable(struct request_queue *q)
  {
-       return !q->flush_not_queueable;
+       return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
  }
  
  typedef struct {struct page *v;} Sector;