*/
#include <linux/nvme.h>
-#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
-#include <linux/percpu.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>
-#include <trace/events/block.h>
-
-#define NVME_Q_DEPTH 1024
+#define NVME_Q_DEPTH 1024
+#define NVME_AQ_DEPTH 64
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
-#define ADMIN_TIMEOUT (60 * HZ)
-#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT)
+#define ADMIN_TIMEOUT (60 * HZ)
unsigned char io_timeout = 30;
module_param(io_timeout, byte, 0644);
static wait_queue_head_t nvme_kthread_wait;
static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
struct async_cmd_info {
struct kthread_work work;
struct kthread_worker *worker;
+ struct request *req;
u32 result;
int status;
void *ctx;
volatile struct nvme_completion *cqes;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
- wait_queue_head_t sq_full;
- wait_queue_t sq_cong_wait;
- struct bio_list sq_cong;
- struct list_head iod_bio;
u32 __iomem *q_db;
u16 q_depth;
u16 cq_vector;
u16 cq_head;
u16 qid;
u8 cq_phase;
- u8 cqe_seen;
u8 q_suspended;
cpumask_var_t cpu_mask;
struct async_cmd_info cmdinfo;
- unsigned long cmdid_data[];
+ struct blk_mq_hw_ctx *hctx;
};
/*
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}
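+
+/*
+ * Per-device queue pointers are RCU protected.  raw_nvmeq() is for callers
+ * that know the queue cannot go away; get_nvmeq() and lock_nvmeq() enter an
+ * RCU read-side section and must be paired with put_nvmeq()/unlock_nvmeq().
+ */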
+static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
+{
+ return rcu_dereference_raw(dev->queues[qid]);
+}
+
+static struct nvme_queue *get_nvmeq(struct blk_mq_hw_ctx *hctx) __acquires(RCU)
+{
+ rcu_read_lock();
+ return rcu_dereference(hctx->driver_data);
+}
+
+static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return rcu_dereference(dev->queues[q_idx]);
+}
+
+static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
struct nvme_completion *);
-struct nvme_cmd_info {
+struct nvme_cmd_cb {
nvme_completion_fn fn;
void *ctx;
- unsigned long timeout;
int aborted;
+ struct nvme_queue *nvmeq;
};
-static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
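+
+/* Bind an I/O hardware context to its nvme_queue (qids 1..queue_count - 1). */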
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int i)
{
- return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+ struct nvme_dev *dev = data;
+ struct nvme_queue *nvmeq = raw_nvmeq(dev, (i % dev->queue_count) + 1);
+ BUG_ON(!nvmeq);
+ WARN_ON(nvmeq->hctx);
+ nvmeq->hctx = hctx;
+ hctx->driver_data = nvmeq;
+ return 0;
}
-static unsigned nvme_queue_extra(int depth)
+static int nvme_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int i)
{
- return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
+ struct nvme_dev *dev = data;
+ struct nvme_queue *nvmeq = raw_nvmeq(dev, 0);
+ BUG_ON(!nvmeq);
+ WARN_ON(nvmeq->hctx);
+ nvmeq->hctx = hctx;
+ hctx->driver_data = nvmeq;
+ return 0;
}
-/**
- * alloc_cmdid() - Allocate a Command ID
- * @nvmeq: The queue that will be used for this command
- * @ctx: A pointer that will be passed to the handler
- * @handler: The function to call on completion
- *
- * Allocate a Command ID for a queue. The data passed in will
- * be passed to the completion handler. This is implemented by using
- * the bottom two bits of the ctx pointer to store the handler ID.
- * Passing in a pointer that's not 4-byte aligned will cause a BUG.
- * We can change this if it becomes a problem.
- *
- * May be called with local interrupts disabled and the q_lock held,
- * or with interrupts enabled and no locks held.
- */
-static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
- nvme_completion_fn handler, unsigned timeout)
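+
+/*
+ * Stash the owning nvme_queue in each request's PDU at init time so that
+ * submission, completion and timeout handling can find it without a lookup.
+ */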
+static int nvme_init_admin_request(void *data, struct request *req,
+ unsigned int hctx_idx, unsigned int rq_idx,
+ unsigned int numa_node)
{
- int depth = nvmeq->q_depth - 1;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
- int cmdid;
-
- do {
- cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
- if (cmdid >= depth)
- return -EBUSY;
- } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
+ struct nvme_dev *dev = data;
+ struct nvme_cmd_cb *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = raw_nvmeq(dev, 0);
+ WARN_ON(!nvmeq);
+ WARN_ON(!cmd);
+ cmd->nvmeq = nvmeq;
+ return 0;
+}
- info[cmdid].fn = handler;
- info[cmdid].ctx = ctx;
- info[cmdid].timeout = jiffies + timeout;
- info[cmdid].aborted = 0;
- return cmdid;
+static int nvme_init_request(void *data, struct request *req,
+ unsigned int hctx_idx, unsigned int rq_idx,
+ unsigned int numa_node)
+{
+ struct nvme_dev *dev = data;
+ struct nvme_cmd_cb *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = raw_nvmeq(dev, hctx_idx + 1);
+ WARN_ON(!nvmeq);
+ WARN_ON(!cmd);
+ cmd->nvmeq = nvmeq;
+ return 0;
}
-static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
- nvme_completion_fn handler, unsigned timeout)
+static void nvme_set_cb(struct nvme_cmd_cb *cmd, void *ctx,
+ nvme_completion_fn handler)
{
- int cmdid;
- wait_event_killable(nvmeq->sq_full,
- (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
- return (cmdid < 0) ? -EINTR : cmdid;
+ cmd->fn = handler;
+ cmd->ctx = ctx;
+ cmd->aborted = 0;
}
/* Special values must be less than 0x1000 */
#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
-#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE)
+#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE)
+#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE)
static void special_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
if (ctx == CMD_CTX_CANCELLED)
return;
+	if (ctx == CMD_CTX_FLUSH) {
+		/*
+		 * ctx is only the CMD_CTX_FLUSH sentinel, never a real iod,
+		 * so recover the flush request from its tag before freeing it.
+		 */
+		struct request *req = blk_mq_tag_to_rq(nvmeq->hctx->tags,
+							cqe->command_id);
+		blk_put_request(req);
+		return;
+	}
if (ctx == CMD_CTX_ABORT) {
++nvmeq->dev->abort_limit;
return;
cmdinfo->result = le32_to_cpup(&cqe->result);
cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+ blk_put_request(cmdinfo->req);
+}
+
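+/* Map a completion's command id (the blk-mq tag) back to its request PDU. */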
+static inline struct nvme_cmd_cb *get_cmd_from_tag(struct nvme_queue *nvmeq,
+ unsigned int tag)
+{
+ struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+ struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
+ return blk_mq_rq_to_pdu(req);
}
/*
* Called with local interrupts disabled and the q_lock held. May not sleep.
*/
-static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
+static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
nvme_completion_fn *fn)
{
+ struct nvme_cmd_cb *cmd = get_cmd_from_tag(nvmeq, tag);
void *ctx;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-
- if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
- if (fn)
- *fn = special_completion;
+ if (tag >= nvmeq->q_depth) {
+		if (fn)
+			*fn = special_completion;
return CMD_CTX_INVALID;
}
if (fn)
- *fn = info[cmdid].fn;
- ctx = info[cmdid].ctx;
- info[cmdid].fn = special_completion;
- info[cmdid].ctx = CMD_CTX_COMPLETED;
- clear_bit(cmdid, nvmeq->cmdid_data);
- wake_up(&nvmeq->sq_full);
+ *fn = cmd->fn;
+ ctx = cmd->ctx;
+ cmd->fn = special_completion;
+ cmd->ctx = CMD_CTX_COMPLETED;
return ctx;
}
-static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
- nvme_completion_fn *fn)
+static void *cancel_cmd_cb(struct nvme_cmd_cb *cmd, nvme_completion_fn *fn)
{
void *ctx;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
if (fn)
- *fn = info[cmdid].fn;
- ctx = info[cmdid].ctx;
- info[cmdid].fn = special_completion;
- info[cmdid].ctx = CMD_CTX_CANCELLED;
+ *fn = cmd->fn;
+ ctx = cmd->ctx;
+ cmd->fn = special_completion;
+ cmd->ctx = CMD_CTX_CANCELLED;
return ctx;
}
-static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
-{
- return rcu_dereference_raw(dev->queues[qid]);
-}
-
-static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
-{
- unsigned queue_id = get_cpu_var(*dev->io_queue);
- rcu_read_lock();
- return rcu_dereference(dev->queues[queue_id]);
-}
-
-static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
-{
- rcu_read_unlock();
- put_cpu_var(nvmeq->dev->io_queue);
-}
-
-static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
- __acquires(RCU)
-{
- rcu_read_lock();
- return rcu_dereference(dev->queues[q_idx]);
-}
-
-static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
-{
- rcu_read_unlock();
-}
-
/**
* nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
iod->length = nbytes;
iod->nents = 0;
iod->first_dma = 0ULL;
- iod->start_time = jiffies;
}
return iod;
kfree(iod);
}
-static void nvme_start_io_acct(struct bio *bio)
-{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
- const int rw = bio_data_dir(bio);
- int cpu = part_stat_lock();
- part_round_stats(cpu, &disk->part0);
- part_stat_inc(cpu, &disk->part0, ios[rw]);
- part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
- part_inc_in_flight(&disk->part0, rw);
- part_stat_unlock();
-}
-
-static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
-{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
- const int rw = bio_data_dir(bio);
- unsigned long duration = jiffies - start_time;
- int cpu = part_stat_lock();
- part_stat_add(cpu, &disk->part0, ticks[rw], duration);
- part_round_stats(cpu, &disk->part0);
- part_dec_in_flight(&disk->part0, rw);
- part_stat_unlock();
-}
-
-static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
+static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
- struct bio *bio = iod->private;
+ struct request *req = iod->private;
+
u16 status = le16_to_cpup(&cqe->status) >> 1;
- int error = 0;
-
- if (unlikely(status)) {
- if (!(status & NVME_SC_DNR ||
- bio->bi_rw & REQ_FAILFAST_MASK) &&
- (jiffies - iod->start_time) < IOD_TIMEOUT) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- list_add_tail(&iod->node, &nvmeq->iod_bio);
- wake_up(&nvmeq->sq_full);
- return;
- }
- error = -EIO;
- }
+
if (iod->nents) {
- dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
- bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- nvme_end_io_acct(bio, iod->start_time);
+ dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+ rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
}
nvme_free_iod(nvmeq->dev, iod);
- trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error);
- bio_endio(bio, error);
+ if (unlikely(status))
+ req->errors = -EIO;
+ else
+ req->errors = 0;
+
+ blk_mq_complete_request(req);
}
/* length is in bytes. gfp flags indicates whether we may sleep. */
return total_len;
}
-static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
- int len)
-{
- struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
- if (!split)
- return -ENOMEM;
-
- trace_block_split(bdev_get_queue(bio->bi_bdev), bio,
- split->bi_iter.bi_sector);
- bio_chain(split, bio);
-
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- bio_list_add(&nvmeq->sq_cong, split);
- bio_list_add(&nvmeq->sq_cong, bio);
- wake_up(&nvmeq->sq_full);
-
- return 0;
-}
-
-/* NVMe scatterlists require no holes in the virtual address */
-#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
- (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
-
-static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
- struct bio *bio, enum dma_data_direction dma_dir, int psegs)
+static int nvme_map_rq(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+ struct request *req, enum dma_data_direction dma_dir,
+ int psegs)
{
- struct bio_vec bvec, bvprv;
- struct bvec_iter iter;
- struct scatterlist *sg = NULL;
- int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
- int first = 1;
-
- if (nvmeq->dev->stripe_size)
- split_len = nvmeq->dev->stripe_size -
- ((bio->bi_iter.bi_sector << 9) &
- (nvmeq->dev->stripe_size - 1));
-
sg_init_table(iod->sg, psegs);
- bio_for_each_segment(bvec, bio, iter) {
- if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
- sg->length += bvec.bv_len;
- } else {
- if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
- return nvme_split_and_submit(bio, nvmeq,
- length);
-
- sg = sg ? sg + 1 : iod->sg;
- sg_set_page(sg, bvec.bv_page,
- bvec.bv_len, bvec.bv_offset);
- nsegs++;
- }
+	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
- if (split_len - length < bvec.bv_len)
- return nvme_split_and_submit(bio, nvmeq, split_len);
- length += bvec.bv_len;
- bvprv = bvec;
- first = 0;
- }
- iod->nents = nsegs;
- sg_mark_end(sg);
- if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
+ if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
return -ENOMEM;
- BUG_ON(length != bio->bi_iter.bi_size);
- return length;
+ return iod->nents;
}
-static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- struct bio *bio, struct nvme_iod *iod, int cmdid)
+/*
+ * We reuse the small pool to allocate the 16-byte range here as it is not
+ * worth having a special pool for these or additional cases to handle freeing
+ * the iod.
+ */
+static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ struct request *req, struct nvme_iod *iod)
{
struct nvme_dsm_range *range =
(struct nvme_dsm_range *)iod_list(iod)[0];
struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
range->cattr = cpu_to_le32(0);
- range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
- range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
+ range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
+ range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
- cmnd->dsm.command_id = cmdid;
+ cmnd->dsm.command_id = req->tag;
cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
cmnd->dsm.nr = 0;
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
-
- return 0;
}
-static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
int cmdid)
{
struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
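+/*
+ * A flush that also carries data is issued as two commands: a stand-alone
+ * flush on a request allocated here, followed by the data transfer itself.
+ */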
+static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
+{
+ struct request *req;
+ struct nvme_cmd_cb *cmd;
+
+	/* called on the ->queue_rq() path, which must not sleep */
+	req = blk_mq_alloc_request(ns->queue, WRITE, GFP_ATOMIC, false);
+	if (!req)
+		return -ENOMEM;
+
+	cmd = blk_mq_rq_to_pdu(req);
+	nvme_set_cb(cmd, (void *)CMD_CTX_FLUSH, special_completion);
+
+	/* nvme_submit_flush() writes the SQ directly, so hold the queue lock */
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_submit_flush(nvmeq, ns, req->tag);
+	spin_unlock_irq(&nvmeq->q_lock);
return 0;
}
-static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+ struct nvme_ns *ns)
{
- struct bio *bio = iod->private;
- struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+ struct request *req = iod->private;
struct nvme_command *cmnd;
- int cmdid;
- u16 control;
- u32 dsmgmt;
+ u16 control = 0;
+ u32 dsmgmt = 0;
- cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
- if (unlikely(cmdid < 0))
- return cmdid;
+ spin_lock_irq(&nvmeq->q_lock);
+ if (nvmeq->q_suspended) {
+ spin_unlock_irq(&nvmeq->q_lock);
+ return -EBUSY;
+ }
- if (bio->bi_rw & REQ_DISCARD)
- return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
- if (bio->bi_rw & REQ_FLUSH)
- return nvme_submit_flush(nvmeq, ns, cmdid);
+ if (req->cmd_flags & REQ_DISCARD) {
+ nvme_submit_discard(nvmeq, ns, req, iod);
+ goto end_submit;
+ }
+ if (req->cmd_flags & REQ_FLUSH) {
+ nvme_submit_flush(nvmeq, ns, req->tag);
+ goto end_submit;
+ }
- control = 0;
- if (bio->bi_rw & REQ_FUA)
+ if (req->cmd_flags & REQ_FUA)
control |= NVME_RW_FUA;
- if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+ if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
control |= NVME_RW_LR;
- dsmgmt = 0;
- if (bio->bi_rw & REQ_RAHEAD)
+ if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
memset(cmnd, 0, sizeof(*cmnd));
- cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read;
- cmnd->rw.command_id = cmdid;
+ cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+ cmnd->rw.command_id = req->tag;
cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
- cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
- cmnd->rw.length =
- cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
+ cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
cmnd->rw.control = cpu_to_le16(control);
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
- return 0;
-}
-
-static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio)
-{
- struct bio *split = bio_clone(bio, GFP_ATOMIC);
- if (!split)
- return -ENOMEM;
-
- split->bi_iter.bi_size = 0;
- split->bi_phys_segments = 0;
- bio->bi_rw &= ~REQ_FLUSH;
- bio_chain(split, bio);
-
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- bio_list_add(&nvmeq->sq_cong, split);
- bio_list_add(&nvmeq->sq_cong, bio);
- wake_up_process(nvme_thread);
-
+ end_submit:
+ nvme_process_cq(nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
return 0;
}
/*
- * Called with local interrupts disabled and the q_lock held. May not sleep.
+ * Called with preemption disabled, may not sleep.
*/
-static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- struct bio *bio)
+static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ struct request *req)
{
+ struct nvme_cmd_cb *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
- int psegs = bio_phys_segments(ns->queue, bio);
- int result;
+ enum dma_data_direction dma_dir;
+ int psegs = req->nr_phys_segments;
- if ((bio->bi_rw & REQ_FLUSH) && psegs)
- return nvme_split_flush_data(nvmeq, bio);
-
- iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+ iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
if (!iod)
- return -ENOMEM;
+ return BLK_MQ_RQ_QUEUE_BUSY;
- iod->private = bio;
- if (bio->bi_rw & REQ_DISCARD) {
+	if ((req->cmd_flags & REQ_FLUSH) && psegs) {
+		if (nvme_submit_flush_data(nvmeq, ns))
+			goto free_iod;
+		/* the flush has been sent; submit the data as a plain write */
+		req->cmd_flags &= ~REQ_FLUSH;
+	}
+
+ iod->private = req;
+ nvme_set_cb(cmd, iod, req_completion);
+
+ if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
* We reuse the small pool to allocate the 16-byte range here
range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
GFP_ATOMIC,
&iod->first_dma);
- if (!range) {
- result = -ENOMEM;
- goto free_iod;
- }
+ if (!range)
+ goto finish_cmd;
iod_list(iod)[0] = (__le64 *)range;
iod->npages = 0;
} else if (psegs) {
- result = nvme_map_bio(nvmeq, iod, bio,
- bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
- psegs);
- if (result <= 0)
- goto free_iod;
- if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) !=
- result) {
- result = -ENOMEM;
- goto free_iod;
- }
- nvme_start_io_acct(bio);
- }
- if (unlikely(nvme_submit_iod(nvmeq, iod))) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- list_add_tail(&iod->node, &nvmeq->iod_bio);
+ dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+ if (nvme_map_rq(nvmeq, iod, req, dma_dir, psegs) <= 0)
+ goto finish_cmd;
+
+ if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
+ blk_rq_bytes(req), GFP_ATOMIC))
+ goto finish_cmd;
}
- return 0;
+ if (!nvme_submit_iod(nvmeq, iod, ns))
+ return 0;
+
+ finish_cmd:
+ nvme_finish_cmd(nvmeq, req->tag, NULL);
free_iod:
nvme_free_iod(nvmeq->dev, iod);
- return result;
+ return BLK_MQ_RQ_QUEUE_ERROR;
}
static int nvme_process_cq(struct nvme_queue *nvmeq)
head = 0;
phase = !phase;
}
-
- ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
+ ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
fn(nvmeq, ctx, &cqe);
}
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
- nvmeq->cqe_seen = 1;
return 1;
}
-static void nvme_make_request(struct request_queue *q, struct bio *bio)
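+
+/*
+ * blk-mq ->queue_rq handler: look up the nvme_queue bound to this hardware
+ * context under RCU and hand the request to nvme_submit_req_queue().
+ */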
+static int nvme_queue_request(struct blk_mq_hw_ctx *hctx, struct request *req)
{
- struct nvme_ns *ns = q->queuedata;
- struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
- int result = -EBUSY;
+ struct nvme_ns *ns = hctx->queue->queuedata;
+ struct nvme_queue *nvmeq = get_nvmeq(hctx);
+ int result;
if (!nvmeq) {
put_nvmeq(NULL);
- bio_endio(bio, -EIO);
- return;
+ return BLK_MQ_RQ_QUEUE_ERROR;
}
- spin_lock_irq(&nvmeq->q_lock);
- if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
- result = nvme_submit_bio_queue(nvmeq, ns, bio);
- if (unlikely(result)) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- bio_list_add(&nvmeq->sq_cong, bio);
- }
-
- nvme_process_cq(nvmeq);
- spin_unlock_irq(&nvmeq->q_lock);
+ result = nvme_submit_req_queue(nvmeq, ns, req);
put_nvmeq(nvmeq);
+ return result;
}
static irqreturn_t nvme_irq(int irq, void *data)
irqreturn_t result;
struct nvme_queue *nvmeq = data;
spin_lock(&nvmeq->q_lock);
- nvme_process_cq(nvmeq);
- result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
- nvmeq->cqe_seen = 0;
+ result = nvme_process_cq(nvmeq) ? IRQ_HANDLED : IRQ_NONE;
spin_unlock(&nvmeq->q_lock);
return result;
}
static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
- spin_lock_irq(&nvmeq->q_lock);
- cancel_cmdid(nvmeq, cmdid, NULL);
- spin_unlock_irq(&nvmeq->q_lock);
+ /*
+ * spin_lock_irq(&nvmeq->q_lock);
+	 * cancel_cmd_cb(get_cmd_from_tag(nvmeq, cmdid), NULL);
+ * spin_unlock_irq(&nvmeq->q_lock);
+ */
}
struct sync_cmd_info {
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
*/
-static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
- struct nvme_command *cmd,
+int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
u32 *result, unsigned timeout)
{
- int cmdid, ret;
+ int ret;
struct sync_cmd_info cmdinfo;
- struct nvme_queue *nvmeq;
-
- nvmeq = lock_nvmeq(dev, q_idx);
- if (!nvmeq) {
- unlock_nvmeq(nvmeq);
- return -ENODEV;
- }
+ struct nvme_cmd_cb *cmd_rq = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd_rq->nvmeq;
cmdinfo.task = current;
cmdinfo.status = -EINTR;
- cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
- if (cmdid < 0) {
- unlock_nvmeq(nvmeq);
- return cmdid;
- }
- cmd->common.command_id = cmdid;
+ cmd->common.command_id = req->tag;
+
+ nvme_set_cb(cmd_rq, &cmdinfo, sync_completion);
set_current_state(TASK_KILLABLE);
ret = nvme_submit_cmd(nvmeq, cmd);
if (ret) {
- free_cmdid(nvmeq, cmdid, NULL);
- unlock_nvmeq(nvmeq);
+ nvme_finish_cmd(nvmeq, req->tag, NULL);
set_current_state(TASK_RUNNING);
		return ret;
}
- unlock_nvmeq(nvmeq);
schedule_timeout(timeout);
if (cmdinfo.status == -EINTR) {
- nvmeq = lock_nvmeq(dev, q_idx);
- if (nvmeq)
- nvme_abort_command(nvmeq, cmdid);
- unlock_nvmeq(nvmeq);
+ nvme_abort_command(nvmeq, req->tag);
return -EINTR;
}
struct nvme_command *cmd,
struct async_cmd_info *cmdinfo, unsigned timeout)
{
- int cmdid;
+ struct request *req;
+ struct nvme_cmd_cb *cmd_rq;
- cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
- if (cmdid < 0)
- return cmdid;
+ req = blk_mq_alloc_request(nvmeq->hctx->queue, WRITE, GFP_KERNEL,
+ false);
+ if (!req)
+ return -ENOMEM;
+
+ req->timeout = timeout;
+ cmd_rq = blk_mq_rq_to_pdu(req);
+ cmdinfo->req = req;
+ nvme_set_cb(cmd_rq, cmdinfo, async_completion);
cmdinfo->status = -EINTR;
- cmd->common.command_id = cmdid;
+
+ cmd->common.command_id = req->tag;
+
return nvme_submit_cmd(nvmeq, cmd);
}
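+
+/*
+ * Allocate a request from the admin tag set and run a synchronous admin
+ * command on it; the caller supplies the timeout.
+ */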
+int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+ u32 *result, unsigned timeout)
+{
+ int res;
+ struct request *req;
+
+ req = blk_mq_alloc_request(dev->admin_rq, WRITE, GFP_KERNEL, false);
+ if (!req)
+ return -ENOMEM;
+ res = nvme_submit_sync_cmd(req, cmd, result, timeout);
+ blk_put_request(req);
+ return res;
+}
+
int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
u32 *result)
{
- return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
+ return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
}
int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
u32 *result)
{
- return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
- NVME_IO_TIMEOUT);
+ int res;
+ struct request *req;
+ struct nvme_queue *nvmeq;
+
+	/* map onto an I/O queue when one is online; queue 0 is the admin queue */
+	nvmeq = lock_nvmeq(dev, dev->online_queues > 1 ?
+		(smp_processor_id() % (dev->online_queues - 1)) + 1 : 0);
+ if (!nvmeq) {
+ unlock_nvmeq(nvmeq);
+ return -ENODEV;
+ }
+
+	/* the RCU read lock taken by lock_nvmeq() is held, so do not sleep */
+	req = blk_mq_alloc_request(nvmeq->hctx->queue, WRITE, GFP_ATOMIC,
+									false);
+ if (!req) {
+ unlock_nvmeq(nvmeq);
+ return -ENOMEM;
+ }
+
+	if (nvmeq->q_suspended) {
+		unlock_nvmeq(nvmeq);
+		res = -EBUSY;
+		goto end_sync_io;
+	}
+
+	/* drop the RCU read lock before the sync command, which sleeps */
+	unlock_nvmeq(nvmeq);
+	res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
+ end_sync_io:
+	blk_put_request(req);
+	return res;
}
static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
- int status;
struct nvme_command c;
memset(&c, 0, sizeof(c));
c.delete_queue.opcode = opcode;
c.delete_queue.qid = cpu_to_le16(id);
- status = nvme_submit_admin_cmd(dev, &c, NULL);
- if (status)
- return -EIO;
- return 0;
+ return nvme_submit_admin_cmd(dev, &c, NULL);
}
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq)
{
- int status;
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
c.create_cq.cq_flags = cpu_to_le16(flags);
c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
- status = nvme_submit_admin_cmd(dev, &c, NULL);
- if (status)
- return -EIO;
- return 0;
+ return nvme_submit_admin_cmd(dev, &c, NULL);
}
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq)
{
- int status;
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
c.create_sq.sq_flags = cpu_to_le16(flags);
c.create_sq.cqid = cpu_to_le16(qid);
- status = nvme_submit_admin_cmd(dev, &c, NULL);
- if (status)
- return -EIO;
- return 0;
+ return nvme_submit_admin_cmd(dev, &c, NULL);
}
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
}
/**
- * nvme_abort_cmd - Attempt aborting a command
- * @cmdid: Command id of a timed out IO
- * @queue: The queue with timed out IO
+ * nvme_abort_req - Attempt aborting a timed out request
+ * @req: The request to abort
*
* Schedule controller reset if the command was already aborted once before and
* still hasn't been returned to the driver, or if this is the admin queue.
*/
-static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
+static void nvme_abort_req(struct request *req)
{
- int a_cmdid;
- struct nvme_command cmd;
+ struct nvme_cmd_cb *cmd_rq = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd_rq->nvmeq;
struct nvme_dev *dev = nvmeq->dev;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
- struct nvme_queue *adminq;
+ struct nvme_command cmd;
+ struct request *abort_req;
+ struct nvme_cmd_cb *abort_cmd;
- if (!nvmeq->qid || info[cmdid].aborted) {
+ if (!nvmeq->qid || cmd_rq->aborted) {
if (work_busy(&dev->reset_work))
return;
list_del_init(&dev->node);
dev_warn(&dev->pci_dev->dev,
- "I/O %d QID %d timeout, reset controller\n", cmdid,
- nvmeq->qid);
+ "I/O %d QID %d timeout, reset controller\n",
+ req->tag, nvmeq->qid);
dev->reset_workfn = nvme_reset_failed_dev;
queue_work(nvme_workq, &dev->reset_work);
return;
if (!dev->abort_limit)
return;
- adminq = rcu_dereference(dev->queues[0]);
- a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
- ADMIN_TIMEOUT);
- if (a_cmdid < 0)
- return;
+	/* runs from timeout handling, so the allocation must not sleep */
+	abort_req = blk_mq_alloc_request(dev->admin_rq, WRITE, GFP_ATOMIC,
+									true);
+	if (!abort_req)
+		return;
+	abort_cmd = blk_mq_rq_to_pdu(abort_req);
+ nvme_set_cb(abort_cmd, CMD_CTX_ABORT, special_completion);
memset(&cmd, 0, sizeof(cmd));
cmd.abort.opcode = nvme_admin_abort_cmd;
- cmd.abort.cid = cmdid;
+ cmd.abort.cid = req->tag;
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
- cmd.abort.command_id = a_cmdid;
+ cmd.abort.command_id = abort_req->tag;
--dev->abort_limit;
- info[cmdid].aborted = 1;
- info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
+ cmd_rq->aborted = 1;
- dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
+ dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
nvmeq->qid);
- nvme_submit_cmd(adminq, &cmd);
+ nvme_submit_cmd(raw_nvmeq(dev, 0), &cmd);
}
-/**
- * nvme_cancel_ios - Cancel outstanding I/Os
- * @queue: The queue to cancel I/Os on
- * @timeout: True to only cancel I/Os which have timed out
- */
-static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
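+
+/*
+ * blk_mq_tag_busy_iter() callback: walk the queue's tag map and force a
+ * cancelled completion for every request still outstanding on the hardware.
+ */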
+static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
{
- int depth = nvmeq->q_depth - 1;
- struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
- unsigned long now = jiffies;
- int cmdid;
+	struct nvme_queue *nvmeq = (struct nvme_queue *) data;
+	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+	unsigned int tag = 0;
+	int qdepth = nvmeq->qid ? nvmeq->dev->tagset.queue_depth :
+				nvmeq->dev->admin_tagset.queue_depth;
-	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+	do {
+ struct request *req;
void *ctx;
nvme_completion_fn fn;
+ struct nvme_cmd_cb *cmd;
static struct nvme_completion cqe = {
.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
};
- if (timeout && !time_after(now, info[cmdid].timeout))
- continue;
- if (info[cmdid].ctx == CMD_CTX_CANCELLED)
+		/* the iterator marks free tags as set; zeroed bits are in flight */
+ tag = find_next_zero_bit(tag_map, qdepth, tag);
+ if (tag >= qdepth)
+ break;
+
+ req = blk_mq_tag_to_rq(hctx->tags, tag++);
+ cmd = blk_mq_rq_to_pdu(req);
+
+ /* TODO: is this test necessary? */
+ if (!test_bit(1 /* REQ_ATOM_STARTED */, &req->atomic_flags))
continue;
- if (timeout && nvmeq->dev->initialized) {
- nvme_abort_cmd(cmdid, nvmeq);
+
+ if (cmd->ctx == CMD_CTX_CANCELLED)
continue;
- }
- dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
- nvmeq->qid);
- ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+
+ /* TODO: should we send abort request to hw? */
+
+ dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
+ req->tag, nvmeq->qid);
+ ctx = cancel_cmd_cb(cmd, &fn);
fn(nvmeq, ctx, &cqe);
- }
+
+ } while (1);
+}
+
+/**
+ * nvme_cancel_ios - Cancel outstanding I/Os
+ * @nvmeq: The queue to cancel I/Os on
+ */
+static void nvme_cancel_ios(struct nvme_queue *nvmeq)
+{
+ struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+
+ if (nvmeq->dev->initialized)
+ blk_mq_tag_busy_iter(hctx->tags, nvme_cancel_queue_ios, nvmeq);
+}
+
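+/*
+ * blk-mq timeout handler: try to abort the command (or reset the controller),
+ * then complete the request locally as cancelled.
+ */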
+static enum blk_eh_timer_return nvme_timeout(struct request *req)
+{
+ void *ctx;
+ struct nvme_cmd_cb *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd->nvmeq;
+ static struct nvme_completion cqe = {
+ .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
+ };
+ nvme_completion_fn fn;
+
+ dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", req->tag,
+ nvmeq->qid);
+
+ if (nvmeq->dev->initialized)
+ nvme_abort_req(req);
+
+ ctx = cancel_cmd_cb(cmd, &fn);
+ fn(nvmeq, ctx, &cqe);
+
+ return BLK_EH_HANDLED;
}
static void nvme_free_queue(struct rcu_head *r)
{
struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
- spin_lock_irq(&nvmeq->q_lock);
- while (bio_list_peek(&nvmeq->sq_cong)) {
- struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
- bio_endio(bio, -EIO);
- }
- while (!list_empty(&nvmeq->iod_bio)) {
- static struct nvme_completion cqe = {
- .status = cpu_to_le16(
- (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
- };
- struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
- struct nvme_iod,
- node);
- list_del(&iod->node);
- bio_completion(nvmeq, iod, &cqe);
- }
- spin_unlock_irq(&nvmeq->q_lock);
-
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
{
spin_lock_irq(&nvmeq->q_lock);
nvme_process_cq(nvmeq);
- nvme_cancel_ios(nvmeq, false);
+ nvme_cancel_ios(nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
}
int depth, int vector)
{
struct device *dmadev = &dev->pci_dev->dev;
- unsigned extra = nvme_queue_extra(depth);
- struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
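+	/* place the queue and its DMA rings on the node that will service it */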
+ int orig_node = dev_to_node(dmadev);
+ int node = qid ? cpu_to_node((qid - 1) % num_online_cpus()) :
+ cpu_to_node(0);
+ struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
+ node);
if (!nvmeq)
return NULL;
+ set_dev_node(dmadev, node);
nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
spin_lock_init(&nvmeq->q_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
- init_waitqueue_head(&nvmeq->sq_full);
- init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
- bio_list_init(&nvmeq->sq_cong);
- INIT_LIST_HEAD(&nvmeq->iod_bio);
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->cq_vector = vector;
nvmeq->qid = qid;
nvmeq->q_suspended = 1;
dev->queue_count++;
+ set_dev_node(dmadev, orig_node);
rcu_assign_pointer(dev->queues[qid], nvmeq);
return nvmeq;
nvmeq->cq_dma_addr);
free_nvmeq:
kfree(nvmeq);
+ set_dev_node(dmadev, orig_node);
return NULL;
}
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
struct nvme_dev *dev = nvmeq->dev;
- unsigned extra = nvme_queue_extra(nvmeq->q_depth);
nvmeq->sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- memset(nvmeq->cmdid_data, 0, extra);
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
- nvme_cancel_ios(nvmeq, false);
nvmeq->q_suspended = 0;
dev->online_queues++;
}
return 0;
}
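+
+/* Software completion callback shared by the admin and I/O tag sets. */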
+static void nvme_req_complete(struct request *req)
+{
+ blk_mq_end_io(req, req->errors);
+}
+
+static struct blk_mq_ops nvme_mq_admin_ops = {
+ .queue_rq = nvme_queue_request,
+ .map_queue = blk_mq_map_queue,
+ .complete = nvme_req_complete,
+ .init_hctx = nvme_init_admin_hctx,
+ .init_request = nvme_init_admin_request,
+ .timeout = nvme_timeout,
+};
+
+static struct blk_mq_ops nvme_mq_ops = {
+ .queue_rq = nvme_queue_request,
+ .map_queue = blk_mq_map_queue,
+ .complete = nvme_req_complete,
+ .init_hctx = nvme_init_hctx,
+ .init_request = nvme_init_request,
+ .timeout = nvme_timeout,
+};
+
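+/*
+ * Set up the admin tag set and its request queue on first use; subsequent
+ * calls reuse the existing ones.
+ */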
+static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+{
+ if (!dev->admin_rq) {
+ dev->admin_tagset.ops = &nvme_mq_admin_ops;
+ dev->admin_tagset.nr_hw_queues = 1;
+ dev->admin_tagset.queue_depth = NVME_AQ_DEPTH;
+ dev->admin_tagset.reserved_tags = 1;
+ dev->admin_tagset.timeout = ADMIN_TIMEOUT;
+ dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+ dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_cb);
+ dev->admin_tagset.driver_data = dev;
+
+ if (blk_mq_alloc_tag_set(&dev->admin_tagset))
+ return -ENOMEM;
+
+		dev->admin_rq = blk_mq_init_queue(&dev->admin_tagset);
+		if (IS_ERR_OR_NULL(dev->admin_rq)) {
+			dev->admin_rq = NULL;
+			blk_mq_free_tag_set(&dev->admin_tagset);
+			memset(&dev->admin_tagset, 0,
+				sizeof(dev->admin_tagset));
+			return -ENOMEM;
+		}
+ }
+
+ return 0;
+}
+
+static void nvme_free_admin_tags(struct nvme_dev *dev)
+{
+ if (dev->admin_rq)
+ blk_mq_free_tag_set(&dev->admin_tagset);
+}
+
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
int result;
nvmeq = raw_nvmeq(dev, 0);
if (!nvmeq) {
- nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+ nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0);
if (!nvmeq)
return -ENOMEM;
}
result = nvme_enable_ctrl(dev, cap);
if (result)
- return result;
+ goto free_nvmeq;
+
+ result = nvme_alloc_admin_tags(dev);
+ if (result)
+ goto free_nvmeq;
result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
if (result)
- return result;
+ goto free_tags;
spin_lock_irq(&nvmeq->q_lock);
nvme_init_queue(nvmeq, 0);
spin_unlock_irq(&nvmeq->q_lock);
return result;
+
+ free_tags:
+ nvme_free_admin_tags(dev);
+ free_nvmeq:
+ nvme_free_queues(dev, 0);
+ return result;
}
struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
ADMIN_TIMEOUT;
+
if (length != cmd.data_len)
status = -ENOMEM;
else
- status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
+ status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
if (cmd.data_len) {
nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
.getgeo = nvme_getgeo,
};
-static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
-{
- struct nvme_iod *iod, *next;
-
- list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
- if (unlikely(nvme_submit_iod(nvmeq, iod)))
- break;
- list_del(&iod->node);
- if (bio_list_empty(&nvmeq->sq_cong) &&
- list_empty(&nvmeq->iod_bio))
- remove_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- }
-}
-
-static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
-{
- while (bio_list_peek(&nvmeq->sq_cong)) {
- struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
- struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
-
- if (bio_list_empty(&nvmeq->sq_cong) &&
- list_empty(&nvmeq->iod_bio))
- remove_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
- if (!waitqueue_active(&nvmeq->sq_full))
- add_wait_queue(&nvmeq->sq_full,
- &nvmeq->sq_cong_wait);
- bio_list_add_head(&nvmeq->sq_cong, bio);
- break;
- }
- }
-}
-
static int nvme_kthread(void *data)
{
struct nvme_dev *dev, *next;
queue_work(nvme_workq, &dev->reset_work);
continue;
}
		rcu_read_lock();
for (i = 0; i < dev->queue_count; i++) {
struct nvme_queue *nvmeq =
rcu_dereference(dev->queues[i]);
if (nvmeq->q_suspended)
goto unlock;
nvme_process_cq(nvmeq);
- nvme_cancel_ios(nvmeq, true);
- nvme_resubmit_bios(nvmeq);
- nvme_resubmit_iods(nvmeq);
unlock:
spin_unlock_irq(&nvmeq->q_lock);
}
		rcu_read_unlock();
}
spin_unlock(&dev_list_lock);
schedule_timeout(round_jiffies_relative(HZ));
{
struct nvme_ns *ns;
struct gendisk *disk;
+ int node = dev_to_node(&dev->pci_dev->dev);
int lbaf;
if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
return NULL;
- ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+ ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
return NULL;
- ns->queue = blk_alloc_queue(GFP_KERNEL);
+ ns->queue = blk_mq_init_queue(&dev->tagset);
if (!ns->queue)
goto out_free_ns;
- ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+ queue_flag_set_unlocked(QUEUE_FLAG_DEFAULT, ns->queue);
queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
- blk_queue_make_request(ns->queue, nvme_make_request);
+ queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue);
ns->dev = dev;
ns->queue->queuedata = ns;
- disk = alloc_disk(0);
+ disk = alloc_disk_node(0, node);
if (!disk)
goto out_free_queue;
+
ns->ns_id = nsid;
ns->disk = disk;
lbaf = id->flbas & 0xf;
(pdev->device == 0x0953) && ctrl->vs[3])
dev->stripe_size = 1 << (ctrl->vs[3] + shift);
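+	/* one shared I/O tag set; namespace queues created below attach to it */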
+ dev->tagset.ops = &nvme_mq_ops;
+ dev->tagset.nr_hw_queues = dev->queue_count - 1;
+ dev->tagset.timeout = NVME_IO_TIMEOUT;
+ dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+ dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH);
+ dev->tagset.cmd_size = sizeof(struct nvme_cmd_cb);
+ dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tagset.driver_data = dev;
+
+ if (blk_mq_alloc_tag_set(&dev->tagset))
+ goto out;
+
id_ns = mem;
for (i = 1; i <= nn; i++) {
res = nvme_identify(dev, i, 0, dma_addr);
struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
nvme_free_namespaces(dev);
+ blk_mq_free_tag_set(&dev->tagset);
free_percpu(dev->io_queue);
kfree(dev->queues);
kfree(dev->entry);
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
- int result = -ENOMEM;
+ int node, result = -ENOMEM;
struct nvme_dev *dev;
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ node = dev_to_node(&pdev->dev);
+ if (node == NUMA_NO_NODE)
+ set_dev_node(&pdev->dev, 0);
+
+ dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
- dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
- GFP_KERNEL);
+ dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
+ GFP_KERNEL, node);
if (!dev->entry)
goto free;
- dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
- GFP_KERNEL);
+ dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
+ GFP_KERNEL, node);
if (!dev->queues)
goto free;
dev->io_queue = alloc_percpu(unsigned short);
nvme_dev_shutdown(dev);
nvme_free_queues(dev, 0);
rcu_barrier();
+ nvme_free_admin_tags(dev);
nvme_release_instance(dev);
nvme_release_prp_pools(dev);
kref_put(&dev->kref, nvme_free_dev);