null_blk: restart request processing on completion handler
[linux-2.6-block.git] / drivers / block / nvme-core.c
CommitLineData
b60503ba
MW
1/*
2 * NVM Express device driver
6eb0d698 3 * Copyright (c) 2011-2014, Intel Corporation.
b60503ba
MW
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
b60503ba
MW
13 */
14
15#include <linux/nvme.h>
8de05535 16#include <linux/bitops.h>
b60503ba 17#include <linux/blkdev.h>
a4aea562 18#include <linux/blk-mq.h>
42f61420 19#include <linux/cpu.h>
fd63e9ce 20#include <linux/delay.h>
b60503ba
MW
21#include <linux/errno.h>
22#include <linux/fs.h>
23#include <linux/genhd.h>
4cc09e2d 24#include <linux/hdreg.h>
5aff9382 25#include <linux/idr.h>
b60503ba
MW
26#include <linux/init.h>
27#include <linux/interrupt.h>
28#include <linux/io.h>
29#include <linux/kdev_t.h>
1fa6aead 30#include <linux/kthread.h>
b60503ba
MW
31#include <linux/kernel.h>
32#include <linux/mm.h>
33#include <linux/module.h>
34#include <linux/moduleparam.h>
35#include <linux/pci.h>
be7b6275 36#include <linux/poison.h>
c3bfe717 37#include <linux/ptrace.h>
b60503ba
MW
38#include <linux/sched.h>
39#include <linux/slab.h>
e1e5e564 40#include <linux/t10-pi.h>
b60503ba 41#include <linux/types.h>
5d0f6131 42#include <scsi/sg.h>
797a796a
HM
43#include <asm-generic/io-64-nonatomic-lo-hi.h>
44
b3fffdef 45#define NVME_MINORS (1U << MINORBITS)
9d43cf64 46#define NVME_Q_DEPTH 1024
d31af0a3 47#define NVME_AQ_DEPTH 256
b60503ba
MW
48#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
49#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
9d43cf64 50#define ADMIN_TIMEOUT (admin_timeout * HZ)
2484f407 51#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ)
9d43cf64
KB
52
53static unsigned char admin_timeout = 60;
54module_param(admin_timeout, byte, 0644);
55MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
b60503ba 56
bd67608a
MW
57unsigned char nvme_io_timeout = 30;
58module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
b355084a 59MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
b60503ba 60
2484f407
DM
61static unsigned char shutdown_timeout = 5;
62module_param(shutdown_timeout, byte, 0644);
63MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
64
b60503ba
MW
65static int nvme_major;
66module_param(nvme_major, int, 0);
67
b3fffdef
KB
68static int nvme_char_major;
69module_param(nvme_char_major, int, 0);
70
58ffacb5
MW
71static int use_threaded_interrupts;
72module_param(use_threaded_interrupts, int, 0);
73
1fa6aead
MW
74static DEFINE_SPINLOCK(dev_list_lock);
75static LIST_HEAD(dev_list);
76static struct task_struct *nvme_thread;
9a6b9458 77static struct workqueue_struct *nvme_workq;
b9afca3e 78static wait_queue_head_t nvme_kthread_wait;
1fa6aead 79
b3fffdef
KB
80static struct class *nvme_class;
81
d4b4ff8e 82static void nvme_reset_failed_dev(struct work_struct *ws);
a4aea562 83static int nvme_process_cq(struct nvme_queue *nvmeq);
d4b4ff8e 84
4d115420
KB
85struct async_cmd_info {
86 struct kthread_work work;
87 struct kthread_worker *worker;
a4aea562 88 struct request *req;
4d115420
KB
89 u32 result;
90 int status;
91 void *ctx;
92};
1fa6aead 93
b60503ba
MW
94/*
95 * An NVM Express queue. Each device has at least two (one for admin
96 * commands and one for I/O commands).
97 */
98struct nvme_queue {
99 struct device *q_dmadev;
091b6092 100 struct nvme_dev *dev;
3193f07b 101 char irqname[24]; /* nvme4294967295-65535\0 */
b60503ba
MW
102 spinlock_t q_lock;
103 struct nvme_command *sq_cmds;
104 volatile struct nvme_completion *cqes;
42483228 105 struct blk_mq_tags **tags;
b60503ba
MW
106 dma_addr_t sq_dma_addr;
107 dma_addr_t cq_dma_addr;
b60503ba
MW
108 u32 __iomem *q_db;
109 u16 q_depth;
6222d172 110 s16 cq_vector;
b60503ba
MW
111 u16 sq_head;
112 u16 sq_tail;
113 u16 cq_head;
c30341dc 114 u16 qid;
e9539f47
MW
115 u8 cq_phase;
116 u8 cqe_seen;
4d115420 117 struct async_cmd_info cmdinfo;
b60503ba
MW
118};
119
120/*
121 * Check we didin't inadvertently grow the command struct
122 */
123static inline void _nvme_check_size(void)
124{
125 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
126 BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
127 BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
128 BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
129 BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
f8ebf840 130 BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
c30341dc 131 BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
b60503ba
MW
132 BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
133 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
134 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
135 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
6ecec745 136 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
b60503ba
MW
137}
138
edd10d33 139typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
c2f5b650
MW
140 struct nvme_completion *);
141
e85248e5 142struct nvme_cmd_info {
c2f5b650
MW
143 nvme_completion_fn fn;
144 void *ctx;
c30341dc 145 int aborted;
a4aea562 146 struct nvme_queue *nvmeq;
ac3dd5bd 147 struct nvme_iod iod[0];
e85248e5
MW
148};
149
ac3dd5bd
JA
150/*
151 * Max size of iod being embedded in the request payload
152 */
153#define NVME_INT_PAGES 2
154#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
fda631ff 155#define NVME_INT_MASK 0x01
ac3dd5bd
JA
156
157/*
158 * Will slightly overestimate the number of pages needed. This is OK
159 * as it only leads to a small amount of wasted memory for the lifetime of
160 * the I/O.
161 */
162static int nvme_npages(unsigned size, struct nvme_dev *dev)
163{
164 unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
165 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
166}
167
168static unsigned int nvme_cmd_size(struct nvme_dev *dev)
169{
170 unsigned int ret = sizeof(struct nvme_cmd_info);
171
172 ret += sizeof(struct nvme_iod);
173 ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
174 ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
175
176 return ret;
177}
178
a4aea562
MB
179static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
180 unsigned int hctx_idx)
e85248e5 181{
a4aea562
MB
182 struct nvme_dev *dev = data;
183 struct nvme_queue *nvmeq = dev->queues[0];
184
42483228
KB
185 WARN_ON(hctx_idx != 0);
186 WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
187 WARN_ON(nvmeq->tags);
188
a4aea562 189 hctx->driver_data = nvmeq;
42483228 190 nvmeq->tags = &dev->admin_tagset.tags[0];
a4aea562 191 return 0;
e85248e5
MW
192}
193
a4aea562
MB
194static int nvme_admin_init_request(void *data, struct request *req,
195 unsigned int hctx_idx, unsigned int rq_idx,
196 unsigned int numa_node)
22404274 197{
a4aea562
MB
198 struct nvme_dev *dev = data;
199 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
200 struct nvme_queue *nvmeq = dev->queues[0];
201
202 BUG_ON(!nvmeq);
203 cmd->nvmeq = nvmeq;
204 return 0;
22404274
KB
205}
206
a4aea562
MB
207static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
208 unsigned int hctx_idx)
b60503ba 209{
a4aea562 210 struct nvme_dev *dev = data;
42483228 211 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
a4aea562 212
42483228
KB
213 if (!nvmeq->tags)
214 nvmeq->tags = &dev->tagset.tags[hctx_idx];
b60503ba 215
42483228 216 WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
a4aea562
MB
217 hctx->driver_data = nvmeq;
218 return 0;
b60503ba
MW
219}
220
a4aea562
MB
221static int nvme_init_request(void *data, struct request *req,
222 unsigned int hctx_idx, unsigned int rq_idx,
223 unsigned int numa_node)
b60503ba 224{
a4aea562
MB
225 struct nvme_dev *dev = data;
226 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
227 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
228
229 BUG_ON(!nvmeq);
230 cmd->nvmeq = nvmeq;
231 return 0;
232}
233
234static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
235 nvme_completion_fn handler)
236{
237 cmd->fn = handler;
238 cmd->ctx = ctx;
239 cmd->aborted = 0;
c917dfe5 240 blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
b60503ba
MW
241}
242
ac3dd5bd
JA
243static void *iod_get_private(struct nvme_iod *iod)
244{
245 return (void *) (iod->private & ~0x1UL);
246}
247
248/*
249 * If bit 0 is set, the iod is embedded in the request payload.
250 */
251static bool iod_should_kfree(struct nvme_iod *iod)
252{
fda631ff 253 return (iod->private & NVME_INT_MASK) == 0;
ac3dd5bd
JA
254}
255
c2f5b650
MW
256/* Special values must be less than 0x1000 */
257#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
d2d87034
MW
258#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
259#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
260#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
be7b6275 261
edd10d33 262static void special_completion(struct nvme_queue *nvmeq, void *ctx,
c2f5b650
MW
263 struct nvme_completion *cqe)
264{
265 if (ctx == CMD_CTX_CANCELLED)
266 return;
c2f5b650 267 if (ctx == CMD_CTX_COMPLETED) {
edd10d33 268 dev_warn(nvmeq->q_dmadev,
c2f5b650
MW
269 "completed id %d twice on queue %d\n",
270 cqe->command_id, le16_to_cpup(&cqe->sq_id));
271 return;
272 }
273 if (ctx == CMD_CTX_INVALID) {
edd10d33 274 dev_warn(nvmeq->q_dmadev,
c2f5b650
MW
275 "invalid id %d completed on queue %d\n",
276 cqe->command_id, le16_to_cpup(&cqe->sq_id));
277 return;
278 }
edd10d33 279 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
c2f5b650
MW
280}
281
a4aea562 282static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
b60503ba 283{
c2f5b650 284 void *ctx;
b60503ba 285
859361a2 286 if (fn)
a4aea562
MB
287 *fn = cmd->fn;
288 ctx = cmd->ctx;
289 cmd->fn = special_completion;
290 cmd->ctx = CMD_CTX_CANCELLED;
c2f5b650 291 return ctx;
b60503ba
MW
292}
293
a4aea562
MB
294static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
295 struct nvme_completion *cqe)
3c0cf138 296{
a4aea562
MB
297 u32 result = le32_to_cpup(&cqe->result);
298 u16 status = le16_to_cpup(&cqe->status) >> 1;
299
300 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
301 ++nvmeq->dev->event_limit;
302 if (status == NVME_SC_SUCCESS)
303 dev_warn(nvmeq->q_dmadev,
304 "async event result %08x\n", result);
b60503ba
MW
305}
306
a4aea562
MB
307static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
308 struct nvme_completion *cqe)
5a92e700 309{
a4aea562
MB
310 struct request *req = ctx;
311
312 u16 status = le16_to_cpup(&cqe->status) >> 1;
313 u32 result = le32_to_cpup(&cqe->result);
a51afb54 314
42483228 315 blk_mq_free_request(req);
a51afb54 316
a4aea562
MB
317 dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
318 ++nvmeq->dev->abort_limit;
5a92e700
KB
319}
320
a4aea562
MB
321static void async_completion(struct nvme_queue *nvmeq, void *ctx,
322 struct nvme_completion *cqe)
b60503ba 323{
a4aea562
MB
324 struct async_cmd_info *cmdinfo = ctx;
325 cmdinfo->result = le32_to_cpup(&cqe->result);
326 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
327 queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
42483228 328 blk_mq_free_request(cmdinfo->req);
b60503ba
MW
329}
330
a4aea562
MB
331static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
332 unsigned int tag)
b60503ba 333{
42483228 334 struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
a51afb54 335
a4aea562 336 return blk_mq_rq_to_pdu(req);
4f5099af
KB
337}
338
a4aea562
MB
339/*
340 * Called with local interrupts disabled and the q_lock held. May not sleep.
341 */
342static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
343 nvme_completion_fn *fn)
4f5099af 344{
a4aea562
MB
345 struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
346 void *ctx;
347 if (tag >= nvmeq->q_depth) {
348 *fn = special_completion;
349 return CMD_CTX_INVALID;
350 }
351 if (fn)
352 *fn = cmd->fn;
353 ctx = cmd->ctx;
354 cmd->fn = special_completion;
355 cmd->ctx = CMD_CTX_COMPLETED;
356 return ctx;
b60503ba
MW
357}
358
359/**
714a7a22 360 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
b60503ba
MW
361 * @nvmeq: The queue to use
362 * @cmd: The command to send
363 *
364 * Safe to use from interrupt context
365 */
a4aea562 366static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
b60503ba 367{
a4aea562
MB
368 u16 tail = nvmeq->sq_tail;
369
b60503ba 370 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
b60503ba
MW
371 if (++tail == nvmeq->q_depth)
372 tail = 0;
7547881d 373 writel(tail, nvmeq->q_db);
b60503ba 374 nvmeq->sq_tail = tail;
b60503ba
MW
375
376 return 0;
377}
378
a4aea562
MB
379static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
380{
381 unsigned long flags;
382 int ret;
383 spin_lock_irqsave(&nvmeq->q_lock, flags);
384 ret = __nvme_submit_cmd(nvmeq, cmd);
385 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
386 return ret;
387}
388
eca18b23 389static __le64 **iod_list(struct nvme_iod *iod)
e025344c 390{
eca18b23 391 return ((void *)iod) + iod->offset;
e025344c
SMM
392}
393
ac3dd5bd
JA
394static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
395 unsigned nseg, unsigned long private)
eca18b23 396{
ac3dd5bd
JA
397 iod->private = private;
398 iod->offset = offsetof(struct nvme_iod, sg[nseg]);
399 iod->npages = -1;
400 iod->length = nbytes;
401 iod->nents = 0;
eca18b23 402}
b60503ba 403
eca18b23 404static struct nvme_iod *
ac3dd5bd
JA
405__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
406 unsigned long priv, gfp_t gfp)
b60503ba 407{
eca18b23 408 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
ac3dd5bd 409 sizeof(__le64 *) * nvme_npages(bytes, dev) +
eca18b23
MW
410 sizeof(struct scatterlist) * nseg, gfp);
411
ac3dd5bd
JA
412 if (iod)
413 iod_init(iod, bytes, nseg, priv);
eca18b23
MW
414
415 return iod;
b60503ba
MW
416}
417
ac3dd5bd
JA
418static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
419 gfp_t gfp)
420{
421 unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
422 sizeof(struct nvme_dsm_range);
ac3dd5bd
JA
423 struct nvme_iod *iod;
424
425 if (rq->nr_phys_segments <= NVME_INT_PAGES &&
426 size <= NVME_INT_BYTES(dev)) {
427 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
428
429 iod = cmd->iod;
ac3dd5bd 430 iod_init(iod, size, rq->nr_phys_segments,
fda631ff 431 (unsigned long) rq | NVME_INT_MASK);
ac3dd5bd
JA
432 return iod;
433 }
434
435 return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
436 (unsigned long) rq, gfp);
437}
438
d29ec824 439static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
b60503ba 440{
1d090624 441 const int last_prp = dev->page_size / 8 - 1;
eca18b23
MW
442 int i;
443 __le64 **list = iod_list(iod);
444 dma_addr_t prp_dma = iod->first_dma;
445
446 if (iod->npages == 0)
447 dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
448 for (i = 0; i < iod->npages; i++) {
449 __le64 *prp_list = list[i];
450 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
451 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
452 prp_dma = next_prp_dma;
453 }
ac3dd5bd
JA
454
455 if (iod_should_kfree(iod))
456 kfree(iod);
b60503ba
MW
457}
458
b4ff9c8d
KB
459static int nvme_error_status(u16 status)
460{
461 switch (status & 0x7ff) {
462 case NVME_SC_SUCCESS:
463 return 0;
464 case NVME_SC_CAP_EXCEEDED:
465 return -ENOSPC;
466 default:
467 return -EIO;
468 }
469}
470
52b68d7e 471#ifdef CONFIG_BLK_DEV_INTEGRITY
e1e5e564
KB
472static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
473{
474 if (be32_to_cpu(pi->ref_tag) == v)
475 pi->ref_tag = cpu_to_be32(p);
476}
477
478static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
479{
480 if (be32_to_cpu(pi->ref_tag) == p)
481 pi->ref_tag = cpu_to_be32(v);
482}
483
484/**
485 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
486 *
487 * The virtual start sector is the one that was originally submitted by the
488 * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
489 * start sector may be different. Remap protection information to match the
490 * physical LBA on writes, and back to the original seed on reads.
491 *
492 * Type 0 and 3 do not have a ref tag, so no remapping required.
493 */
494static void nvme_dif_remap(struct request *req,
495 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
496{
497 struct nvme_ns *ns = req->rq_disk->private_data;
498 struct bio_integrity_payload *bip;
499 struct t10_pi_tuple *pi;
500 void *p, *pmap;
501 u32 i, nlb, ts, phys, virt;
502
503 if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
504 return;
505
506 bip = bio_integrity(req->bio);
507 if (!bip)
508 return;
509
510 pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
e1e5e564
KB
511
512 p = pmap;
513 virt = bip_get_seed(bip);
514 phys = nvme_block_nr(ns, blk_rq_pos(req));
515 nlb = (blk_rq_bytes(req) >> ns->lba_shift);
516 ts = ns->disk->integrity->tuple_size;
517
518 for (i = 0; i < nlb; i++, virt++, phys++) {
519 pi = (struct t10_pi_tuple *)p;
520 dif_swap(phys, virt, pi);
521 p += ts;
522 }
523 kunmap_atomic(pmap);
524}
525
52b68d7e
KB
526static int nvme_noop_verify(struct blk_integrity_iter *iter)
527{
528 return 0;
529}
530
531static int nvme_noop_generate(struct blk_integrity_iter *iter)
532{
533 return 0;
534}
535
536struct blk_integrity nvme_meta_noop = {
537 .name = "NVME_META_NOOP",
538 .generate_fn = nvme_noop_generate,
539 .verify_fn = nvme_noop_verify,
540};
541
542static void nvme_init_integrity(struct nvme_ns *ns)
543{
544 struct blk_integrity integrity;
545
546 switch (ns->pi_type) {
547 case NVME_NS_DPS_PI_TYPE3:
548 integrity = t10_pi_type3_crc;
549 break;
550 case NVME_NS_DPS_PI_TYPE1:
551 case NVME_NS_DPS_PI_TYPE2:
552 integrity = t10_pi_type1_crc;
553 break;
554 default:
555 integrity = nvme_meta_noop;
556 break;
557 }
558 integrity.tuple_size = ns->ms;
559 blk_integrity_register(ns->disk, &integrity);
560 blk_queue_max_integrity_segments(ns->queue, 1);
561}
562#else /* CONFIG_BLK_DEV_INTEGRITY */
563static void nvme_dif_remap(struct request *req,
564 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
565{
566}
567static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
568{
569}
570static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
571{
572}
573static void nvme_init_integrity(struct nvme_ns *ns)
574{
575}
576#endif
577
a4aea562 578static void req_completion(struct nvme_queue *nvmeq, void *ctx,
b60503ba
MW
579 struct nvme_completion *cqe)
580{
eca18b23 581 struct nvme_iod *iod = ctx;
ac3dd5bd 582 struct request *req = iod_get_private(iod);
a4aea562
MB
583 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
584
b60503ba
MW
585 u16 status = le16_to_cpup(&cqe->status) >> 1;
586
edd10d33 587 if (unlikely(status)) {
a4aea562
MB
588 if (!(status & NVME_SC_DNR || blk_noretry_request(req))
589 && (jiffies - req->start_time) < req->timeout) {
c9d3bf88
KB
590 unsigned long flags;
591
a4aea562 592 blk_mq_requeue_request(req);
c9d3bf88
KB
593 spin_lock_irqsave(req->q->queue_lock, flags);
594 if (!blk_queue_stopped(req->q))
595 blk_mq_kick_requeue_list(req->q);
596 spin_unlock_irqrestore(req->q->queue_lock, flags);
edd10d33
KB
597 return;
598 }
d29ec824 599 if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
d29ec824
CH
600 req->errors = status;
601 } else {
602 req->errors = nvme_error_status(status);
603 }
a4aea562
MB
604 } else
605 req->errors = 0;
a0a931d6
KB
606 if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
607 u32 result = le32_to_cpup(&cqe->result);
608 req->special = (void *)(uintptr_t)result;
609 }
a4aea562
MB
610
611 if (cmd_rq->aborted)
e75ec752 612 dev_warn(nvmeq->dev->dev,
a4aea562
MB
613 "completing aborted command with status:%04x\n",
614 status);
615
e1e5e564 616 if (iod->nents) {
e75ec752 617 dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
a4aea562 618 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
e1e5e564
KB
619 if (blk_integrity_rq(req)) {
620 if (!rq_data_dir(req))
621 nvme_dif_remap(req, nvme_dif_complete);
e75ec752 622 dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
e1e5e564
KB
623 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
624 }
625 }
edd10d33 626 nvme_free_iod(nvmeq->dev, iod);
3291fa57 627
a4aea562 628 blk_mq_complete_request(req);
b60503ba
MW
629}
630
184d2944 631/* length is in bytes. gfp flags indicates whether we may sleep. */
d29ec824
CH
632static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
633 int total_len, gfp_t gfp)
ff22b54f 634{
99802a7a 635 struct dma_pool *pool;
eca18b23
MW
636 int length = total_len;
637 struct scatterlist *sg = iod->sg;
ff22b54f
MW
638 int dma_len = sg_dma_len(sg);
639 u64 dma_addr = sg_dma_address(sg);
f137e0f1
MI
640 u32 page_size = dev->page_size;
641 int offset = dma_addr & (page_size - 1);
e025344c 642 __le64 *prp_list;
eca18b23 643 __le64 **list = iod_list(iod);
e025344c 644 dma_addr_t prp_dma;
eca18b23 645 int nprps, i;
ff22b54f 646
1d090624 647 length -= (page_size - offset);
ff22b54f 648 if (length <= 0)
eca18b23 649 return total_len;
ff22b54f 650
1d090624 651 dma_len -= (page_size - offset);
ff22b54f 652 if (dma_len) {
1d090624 653 dma_addr += (page_size - offset);
ff22b54f
MW
654 } else {
655 sg = sg_next(sg);
656 dma_addr = sg_dma_address(sg);
657 dma_len = sg_dma_len(sg);
658 }
659
1d090624 660 if (length <= page_size) {
edd10d33 661 iod->first_dma = dma_addr;
eca18b23 662 return total_len;
e025344c
SMM
663 }
664
1d090624 665 nprps = DIV_ROUND_UP(length, page_size);
99802a7a
MW
666 if (nprps <= (256 / 8)) {
667 pool = dev->prp_small_pool;
eca18b23 668 iod->npages = 0;
99802a7a
MW
669 } else {
670 pool = dev->prp_page_pool;
eca18b23 671 iod->npages = 1;
99802a7a
MW
672 }
673
b77954cb
MW
674 prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
675 if (!prp_list) {
edd10d33 676 iod->first_dma = dma_addr;
eca18b23 677 iod->npages = -1;
1d090624 678 return (total_len - length) + page_size;
b77954cb 679 }
eca18b23
MW
680 list[0] = prp_list;
681 iod->first_dma = prp_dma;
e025344c
SMM
682 i = 0;
683 for (;;) {
1d090624 684 if (i == page_size >> 3) {
e025344c 685 __le64 *old_prp_list = prp_list;
b77954cb 686 prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
eca18b23
MW
687 if (!prp_list)
688 return total_len - length;
689 list[iod->npages++] = prp_list;
7523d834
MW
690 prp_list[0] = old_prp_list[i - 1];
691 old_prp_list[i - 1] = cpu_to_le64(prp_dma);
692 i = 1;
e025344c
SMM
693 }
694 prp_list[i++] = cpu_to_le64(dma_addr);
1d090624
KB
695 dma_len -= page_size;
696 dma_addr += page_size;
697 length -= page_size;
e025344c
SMM
698 if (length <= 0)
699 break;
700 if (dma_len > 0)
701 continue;
702 BUG_ON(dma_len < 0);
703 sg = sg_next(sg);
704 dma_addr = sg_dma_address(sg);
705 dma_len = sg_dma_len(sg);
ff22b54f
MW
706 }
707
eca18b23 708 return total_len;
ff22b54f
MW
709}
710
d29ec824
CH
711static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
712 struct nvme_iod *iod)
713{
714 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
715
716 memcpy(cmnd, req->cmd, sizeof(struct nvme_command));
717 cmnd->rw.command_id = req->tag;
718 if (req->nr_phys_segments) {
719 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
720 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
721 }
722
723 if (++nvmeq->sq_tail == nvmeq->q_depth)
724 nvmeq->sq_tail = 0;
725 writel(nvmeq->sq_tail, nvmeq->q_db);
726}
727
a4aea562
MB
728/*
729 * We reuse the small pool to allocate the 16-byte range here as it is not
730 * worth having a special pool for these or additional cases to handle freeing
731 * the iod.
732 */
733static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
734 struct request *req, struct nvme_iod *iod)
0e5e4f0e 735{
edd10d33
KB
736 struct nvme_dsm_range *range =
737 (struct nvme_dsm_range *)iod_list(iod)[0];
0e5e4f0e
KB
738 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
739
0e5e4f0e 740 range->cattr = cpu_to_le32(0);
a4aea562
MB
741 range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
742 range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
0e5e4f0e
KB
743
744 memset(cmnd, 0, sizeof(*cmnd));
745 cmnd->dsm.opcode = nvme_cmd_dsm;
a4aea562 746 cmnd->dsm.command_id = req->tag;
0e5e4f0e
KB
747 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
748 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
749 cmnd->dsm.nr = 0;
750 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
751
752 if (++nvmeq->sq_tail == nvmeq->q_depth)
753 nvmeq->sq_tail = 0;
754 writel(nvmeq->sq_tail, nvmeq->q_db);
0e5e4f0e
KB
755}
756
a4aea562 757static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
00df5cb4
MW
758 int cmdid)
759{
760 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
761
762 memset(cmnd, 0, sizeof(*cmnd));
763 cmnd->common.opcode = nvme_cmd_flush;
764 cmnd->common.command_id = cmdid;
765 cmnd->common.nsid = cpu_to_le32(ns->ns_id);
766
767 if (++nvmeq->sq_tail == nvmeq->q_depth)
768 nvmeq->sq_tail = 0;
769 writel(nvmeq->sq_tail, nvmeq->q_db);
00df5cb4
MW
770}
771
a4aea562
MB
772static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
773 struct nvme_ns *ns)
b60503ba 774{
ac3dd5bd 775 struct request *req = iod_get_private(iod);
ff22b54f 776 struct nvme_command *cmnd;
a4aea562
MB
777 u16 control = 0;
778 u32 dsmgmt = 0;
00df5cb4 779
a4aea562 780 if (req->cmd_flags & REQ_FUA)
b60503ba 781 control |= NVME_RW_FUA;
a4aea562 782 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
b60503ba
MW
783 control |= NVME_RW_LR;
784
a4aea562 785 if (req->cmd_flags & REQ_RAHEAD)
b60503ba
MW
786 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
787
ff22b54f 788 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
b8deb62c 789 memset(cmnd, 0, sizeof(*cmnd));
b60503ba 790
a4aea562
MB
791 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
792 cmnd->rw.command_id = req->tag;
ff22b54f 793 cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
edd10d33
KB
794 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
795 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
a4aea562
MB
796 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
797 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
e1e5e564
KB
798
799 if (blk_integrity_rq(req)) {
800 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
801 switch (ns->pi_type) {
802 case NVME_NS_DPS_PI_TYPE3:
803 control |= NVME_RW_PRINFO_PRCHK_GUARD;
804 break;
805 case NVME_NS_DPS_PI_TYPE1:
806 case NVME_NS_DPS_PI_TYPE2:
807 control |= NVME_RW_PRINFO_PRCHK_GUARD |
808 NVME_RW_PRINFO_PRCHK_REF;
809 cmnd->rw.reftag = cpu_to_le32(
810 nvme_block_nr(ns, blk_rq_pos(req)));
811 break;
812 }
813 } else if (ns->ms)
814 control |= NVME_RW_PRINFO_PRACT;
815
ff22b54f
MW
816 cmnd->rw.control = cpu_to_le16(control);
817 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
b60503ba 818
b60503ba
MW
819 if (++nvmeq->sq_tail == nvmeq->q_depth)
820 nvmeq->sq_tail = 0;
7547881d 821 writel(nvmeq->sq_tail, nvmeq->q_db);
b60503ba 822
1974b1ae 823 return 0;
edd10d33
KB
824}
825
d29ec824
CH
826/*
827 * NOTE: ns is NULL when called on the admin queue.
828 */
a4aea562
MB
829static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
830 const struct blk_mq_queue_data *bd)
edd10d33 831{
a4aea562
MB
832 struct nvme_ns *ns = hctx->queue->queuedata;
833 struct nvme_queue *nvmeq = hctx->driver_data;
d29ec824 834 struct nvme_dev *dev = nvmeq->dev;
a4aea562
MB
835 struct request *req = bd->rq;
836 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
edd10d33 837 struct nvme_iod *iod;
a4aea562 838 enum dma_data_direction dma_dir;
edd10d33 839
e1e5e564
KB
840 /*
841 * If formated with metadata, require the block layer provide a buffer
842 * unless this namespace is formated such that the metadata can be
843 * stripped/generated by the controller with PRACT=1.
844 */
d29ec824 845 if (ns && ns->ms && !blk_integrity_rq(req)) {
e1e5e564
KB
846 if (!(ns->pi_type && ns->ms == 8)) {
847 req->errors = -EFAULT;
848 blk_mq_complete_request(req);
849 return BLK_MQ_RQ_QUEUE_OK;
850 }
851 }
852
d29ec824 853 iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
edd10d33 854 if (!iod)
fe54303e 855 return BLK_MQ_RQ_QUEUE_BUSY;
a4aea562 856
a4aea562 857 if (req->cmd_flags & REQ_DISCARD) {
edd10d33
KB
858 void *range;
859 /*
860 * We reuse the small pool to allocate the 16-byte range here
861 * as it is not worth having a special pool for these or
862 * additional cases to handle freeing the iod.
863 */
d29ec824 864 range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
edd10d33 865 &iod->first_dma);
a4aea562 866 if (!range)
fe54303e 867 goto retry_cmd;
edd10d33
KB
868 iod_list(iod)[0] = (__le64 *)range;
869 iod->npages = 0;
ac3dd5bd 870 } else if (req->nr_phys_segments) {
a4aea562
MB
871 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
872
ac3dd5bd 873 sg_init_table(iod->sg, req->nr_phys_segments);
a4aea562 874 iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
fe54303e
JA
875 if (!iod->nents)
876 goto error_cmd;
a4aea562
MB
877
878 if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
fe54303e 879 goto retry_cmd;
a4aea562 880
fe54303e 881 if (blk_rq_bytes(req) !=
d29ec824
CH
882 nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
883 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
fe54303e
JA
884 goto retry_cmd;
885 }
e1e5e564
KB
886 if (blk_integrity_rq(req)) {
887 if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
888 goto error_cmd;
889
890 sg_init_table(iod->meta_sg, 1);
891 if (blk_rq_map_integrity_sg(
892 req->q, req->bio, iod->meta_sg) != 1)
893 goto error_cmd;
894
895 if (rq_data_dir(req))
896 nvme_dif_remap(req, nvme_dif_prep);
897
898 if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
899 goto error_cmd;
900 }
edd10d33 901 }
1974b1ae 902
9af8785a 903 nvme_set_info(cmd, iod, req_completion);
a4aea562 904 spin_lock_irq(&nvmeq->q_lock);
d29ec824
CH
905 if (req->cmd_type == REQ_TYPE_DRV_PRIV)
906 nvme_submit_priv(nvmeq, req, iod);
907 else if (req->cmd_flags & REQ_DISCARD)
a4aea562
MB
908 nvme_submit_discard(nvmeq, ns, req, iod);
909 else if (req->cmd_flags & REQ_FLUSH)
910 nvme_submit_flush(nvmeq, ns, req->tag);
911 else
912 nvme_submit_iod(nvmeq, iod, ns);
913
914 nvme_process_cq(nvmeq);
915 spin_unlock_irq(&nvmeq->q_lock);
916 return BLK_MQ_RQ_QUEUE_OK;
917
fe54303e 918 error_cmd:
d29ec824 919 nvme_free_iod(dev, iod);
fe54303e
JA
920 return BLK_MQ_RQ_QUEUE_ERROR;
921 retry_cmd:
d29ec824 922 nvme_free_iod(dev, iod);
fe54303e 923 return BLK_MQ_RQ_QUEUE_BUSY;
b60503ba
MW
924}
925
e9539f47 926static int nvme_process_cq(struct nvme_queue *nvmeq)
b60503ba 927{
82123460 928 u16 head, phase;
b60503ba 929
b60503ba 930 head = nvmeq->cq_head;
82123460 931 phase = nvmeq->cq_phase;
b60503ba
MW
932
933 for (;;) {
c2f5b650
MW
934 void *ctx;
935 nvme_completion_fn fn;
b60503ba 936 struct nvme_completion cqe = nvmeq->cqes[head];
82123460 937 if ((le16_to_cpu(cqe.status) & 1) != phase)
b60503ba
MW
938 break;
939 nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
940 if (++head == nvmeq->q_depth) {
941 head = 0;
82123460 942 phase = !phase;
b60503ba 943 }
a4aea562 944 ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
edd10d33 945 fn(nvmeq, ctx, &cqe);
b60503ba
MW
946 }
947
948 /* If the controller ignores the cq head doorbell and continuously
949 * writes to the queue, it is theoretically possible to wrap around
950 * the queue twice and mistakenly return IRQ_NONE. Linux only
951 * requires that 0.1% of your interrupts are handled, so this isn't
952 * a big problem.
953 */
82123460 954 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
e9539f47 955 return 0;
b60503ba 956
b80d5ccc 957 writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
b60503ba 958 nvmeq->cq_head = head;
82123460 959 nvmeq->cq_phase = phase;
b60503ba 960
e9539f47
MW
961 nvmeq->cqe_seen = 1;
962 return 1;
b60503ba
MW
963}
964
965static irqreturn_t nvme_irq(int irq, void *data)
58ffacb5
MW
966{
967 irqreturn_t result;
968 struct nvme_queue *nvmeq = data;
969 spin_lock(&nvmeq->q_lock);
e9539f47
MW
970 nvme_process_cq(nvmeq);
971 result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
972 nvmeq->cqe_seen = 0;
58ffacb5
MW
973 spin_unlock(&nvmeq->q_lock);
974 return result;
975}
976
977static irqreturn_t nvme_irq_check(int irq, void *data)
978{
979 struct nvme_queue *nvmeq = data;
980 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
981 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
982 return IRQ_NONE;
983 return IRQ_WAKE_THREAD;
984}
985
b60503ba
MW
986/*
987 * Returns 0 on success. If the result is negative, it's a Linux error code;
988 * if the result is positive, it's an NVM Express status code
989 */
d29ec824
CH
990int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
991 void *buffer, void __user *ubuffer, unsigned bufflen,
992 u32 *result, unsigned timeout)
b60503ba 993{
d29ec824
CH
994 bool write = cmd->common.opcode & 1;
995 struct bio *bio = NULL;
f705f837 996 struct request *req;
d29ec824 997 int ret;
f705f837 998
d29ec824 999 req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
f705f837
CH
1000 if (IS_ERR(req))
1001 return PTR_ERR(req);
b60503ba 1002
d29ec824 1003 req->cmd_type = REQ_TYPE_DRV_PRIV;
75619bfa 1004 req->cmd_flags = REQ_FAILFAST_DRIVER;
d29ec824
CH
1005 req->__data_len = 0;
1006 req->__sector = (sector_t) -1;
1007 req->bio = req->biotail = NULL;
b60503ba 1008
f4ff414a 1009 req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
a4aea562 1010
d29ec824
CH
1011 req->cmd = (unsigned char *)cmd;
1012 req->cmd_len = sizeof(struct nvme_command);
a0a931d6 1013 req->special = (void *)0;
b60503ba 1014
d29ec824
CH
1015 if (buffer && bufflen) {
1016 ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
1017 if (ret)
1018 goto out;
1019 } else if (ubuffer && bufflen) {
1020 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
1021 if (ret)
1022 goto out;
1023 bio = req->bio;
1024 }
3c0cf138 1025
d29ec824
CH
1026 blk_execute_rq(req->q, NULL, req, 0);
1027 if (bio)
1028 blk_rq_unmap_user(bio);
b60503ba 1029 if (result)
a0a931d6 1030 *result = (u32)(uintptr_t)req->special;
d29ec824
CH
1031 ret = req->errors;
1032 out:
f705f837 1033 blk_mq_free_request(req);
d29ec824 1034 return ret;
f705f837
CH
1035}
1036
d29ec824
CH
1037int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1038 void *buffer, unsigned bufflen)
f705f837 1039{
d29ec824 1040 return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
b60503ba
MW
1041}
1042
a4aea562
MB
1043static int nvme_submit_async_admin_req(struct nvme_dev *dev)
1044{
1045 struct nvme_queue *nvmeq = dev->queues[0];
1046 struct nvme_command c;
1047 struct nvme_cmd_info *cmd_info;
1048 struct request *req;
1049
1efccc9d 1050 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
9f173b33
DC
1051 if (IS_ERR(req))
1052 return PTR_ERR(req);
a4aea562 1053
c917dfe5 1054 req->cmd_flags |= REQ_NO_TIMEOUT;
a4aea562 1055 cmd_info = blk_mq_rq_to_pdu(req);
1efccc9d 1056 nvme_set_info(cmd_info, NULL, async_req_completion);
a4aea562
MB
1057
1058 memset(&c, 0, sizeof(c));
1059 c.common.opcode = nvme_admin_async_event;
1060 c.common.command_id = req->tag;
1061
42483228 1062 blk_mq_free_request(req);
a4aea562
MB
1063 return __nvme_submit_cmd(nvmeq, &c);
1064}
1065
1066static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
4d115420
KB
1067 struct nvme_command *cmd,
1068 struct async_cmd_info *cmdinfo, unsigned timeout)
1069{
a4aea562
MB
1070 struct nvme_queue *nvmeq = dev->queues[0];
1071 struct request *req;
1072 struct nvme_cmd_info *cmd_rq;
4d115420 1073
a4aea562 1074 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
9f173b33
DC
1075 if (IS_ERR(req))
1076 return PTR_ERR(req);
a4aea562
MB
1077
1078 req->timeout = timeout;
1079 cmd_rq = blk_mq_rq_to_pdu(req);
1080 cmdinfo->req = req;
1081 nvme_set_info(cmd_rq, cmdinfo, async_completion);
4d115420 1082 cmdinfo->status = -EINTR;
a4aea562
MB
1083
1084 cmd->common.command_id = req->tag;
1085
4f5099af 1086 return nvme_submit_cmd(nvmeq, cmd);
4d115420
KB
1087}
1088
b60503ba
MW
1089static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1090{
b60503ba
MW
1091 struct nvme_command c;
1092
1093 memset(&c, 0, sizeof(c));
1094 c.delete_queue.opcode = opcode;
1095 c.delete_queue.qid = cpu_to_le16(id);
1096
d29ec824 1097 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
b60503ba
MW
1098}
1099
1100static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1101 struct nvme_queue *nvmeq)
1102{
b60503ba
MW
1103 struct nvme_command c;
1104 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
1105
d29ec824
CH
1106 /*
1107 * Note: we (ab)use the fact the the prp fields survive if no data
1108 * is attached to the request.
1109 */
b60503ba
MW
1110 memset(&c, 0, sizeof(c));
1111 c.create_cq.opcode = nvme_admin_create_cq;
1112 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
1113 c.create_cq.cqid = cpu_to_le16(qid);
1114 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1115 c.create_cq.cq_flags = cpu_to_le16(flags);
1116 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
1117
d29ec824 1118 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
b60503ba
MW
1119}
1120
1121static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1122 struct nvme_queue *nvmeq)
1123{
b60503ba
MW
1124 struct nvme_command c;
1125 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
1126
d29ec824
CH
1127 /*
1128 * Note: we (ab)use the fact the the prp fields survive if no data
1129 * is attached to the request.
1130 */
b60503ba
MW
1131 memset(&c, 0, sizeof(c));
1132 c.create_sq.opcode = nvme_admin_create_sq;
1133 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
1134 c.create_sq.sqid = cpu_to_le16(qid);
1135 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1136 c.create_sq.sq_flags = cpu_to_le16(flags);
1137 c.create_sq.cqid = cpu_to_le16(qid);
1138
d29ec824 1139 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
b60503ba
MW
1140}
1141
1142static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
1143{
1144 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
1145}
1146
1147static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
1148{
1149 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
1150}
1151
d29ec824 1152int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
bc5fc7e4 1153{
d29ec824
CH
1154 struct nvme_command c = {
1155 .identify.opcode = nvme_admin_identify,
1156 .identify.cns = cpu_to_le32(1),
1157 };
1158 int error;
bc5fc7e4 1159
d29ec824
CH
1160 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1161 if (!*id)
1162 return -ENOMEM;
1163
1164 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1165 sizeof(struct nvme_id_ctrl));
1166 if (error)
1167 kfree(*id);
1168 return error;
1169}
1170
1171int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
1172 struct nvme_id_ns **id)
1173{
1174 struct nvme_command c = {
1175 .identify.opcode = nvme_admin_identify,
1176 .identify.nsid = cpu_to_le32(nsid),
1177 };
1178 int error;
bc5fc7e4 1179
d29ec824
CH
1180 *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
1181 if (!*id)
1182 return -ENOMEM;
1183
1184 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1185 sizeof(struct nvme_id_ns));
1186 if (error)
1187 kfree(*id);
1188 return error;
bc5fc7e4
MW
1189}
1190
5d0f6131 1191int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
08df1e05 1192 dma_addr_t dma_addr, u32 *result)
bc5fc7e4
MW
1193{
1194 struct nvme_command c;
1195
1196 memset(&c, 0, sizeof(c));
1197 c.features.opcode = nvme_admin_get_features;
a42cecce 1198 c.features.nsid = cpu_to_le32(nsid);
bc5fc7e4
MW
1199 c.features.prp1 = cpu_to_le64(dma_addr);
1200 c.features.fid = cpu_to_le32(fid);
bc5fc7e4 1201
d29ec824
CH
1202 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
1203 result, 0);
df348139
MW
1204}
1205
5d0f6131
VV
1206int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
1207 dma_addr_t dma_addr, u32 *result)
df348139
MW
1208{
1209 struct nvme_command c;
1210
1211 memset(&c, 0, sizeof(c));
1212 c.features.opcode = nvme_admin_set_features;
1213 c.features.prp1 = cpu_to_le64(dma_addr);
1214 c.features.fid = cpu_to_le32(fid);
1215 c.features.dword11 = cpu_to_le32(dword11);
1216
d29ec824
CH
1217 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
1218 result, 0);
1219}
1220
1221int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
1222{
1223 struct nvme_command c = {
1224 .common.opcode = nvme_admin_get_log_page,
1225 .common.nsid = cpu_to_le32(0xFFFFFFFF),
1226 .common.cdw10[0] = cpu_to_le32(
1227 (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
1228 NVME_LOG_SMART),
1229 };
1230 int error;
1231
1232 *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
1233 if (!*log)
1234 return -ENOMEM;
1235
1236 error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
1237 sizeof(struct nvme_smart_log));
1238 if (error)
1239 kfree(*log);
1240 return error;
bc5fc7e4
MW
1241}
1242
c30341dc 1243/**
a4aea562 1244 * nvme_abort_req - Attempt aborting a request
c30341dc
KB
1245 *
1246 * Schedule controller reset if the command was already aborted once before and
1247 * still hasn't been returned to the driver, or if this is the admin queue.
1248 */
a4aea562 1249static void nvme_abort_req(struct request *req)
c30341dc 1250{
a4aea562
MB
1251 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
1252 struct nvme_queue *nvmeq = cmd_rq->nvmeq;
c30341dc 1253 struct nvme_dev *dev = nvmeq->dev;
a4aea562
MB
1254 struct request *abort_req;
1255 struct nvme_cmd_info *abort_cmd;
1256 struct nvme_command cmd;
c30341dc 1257
a4aea562 1258 if (!nvmeq->qid || cmd_rq->aborted) {
7a509a6b
KB
1259 unsigned long flags;
1260
1261 spin_lock_irqsave(&dev_list_lock, flags);
c30341dc 1262 if (work_busy(&dev->reset_work))
7a509a6b 1263 goto out;
c30341dc 1264 list_del_init(&dev->node);
e75ec752 1265 dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
a4aea562 1266 req->tag, nvmeq->qid);
9ca97374 1267 dev->reset_workfn = nvme_reset_failed_dev;
c30341dc 1268 queue_work(nvme_workq, &dev->reset_work);
7a509a6b
KB
1269 out:
1270 spin_unlock_irqrestore(&dev_list_lock, flags);
c30341dc
KB
1271 return;
1272 }
1273
1274 if (!dev->abort_limit)
1275 return;
1276
a4aea562
MB
1277 abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
1278 false);
9f173b33 1279 if (IS_ERR(abort_req))
c30341dc
KB
1280 return;
1281
a4aea562
MB
1282 abort_cmd = blk_mq_rq_to_pdu(abort_req);
1283 nvme_set_info(abort_cmd, abort_req, abort_completion);
1284
c30341dc
KB
1285 memset(&cmd, 0, sizeof(cmd));
1286 cmd.abort.opcode = nvme_admin_abort_cmd;
a4aea562 1287 cmd.abort.cid = req->tag;
c30341dc 1288 cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
a4aea562 1289 cmd.abort.command_id = abort_req->tag;
c30341dc
KB
1290
1291 --dev->abort_limit;
a4aea562 1292 cmd_rq->aborted = 1;
c30341dc 1293
a4aea562 1294 dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
c30341dc 1295 nvmeq->qid);
a4aea562
MB
1296 if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) {
1297 dev_warn(nvmeq->q_dmadev,
1298 "Could not abort I/O %d QID %d",
1299 req->tag, nvmeq->qid);
c87fd540 1300 blk_mq_free_request(abort_req);
a4aea562 1301 }
c30341dc
KB
1302}
1303
42483228 1304static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
a09115b2 1305{
a4aea562
MB
1306 struct nvme_queue *nvmeq = data;
1307 void *ctx;
1308 nvme_completion_fn fn;
1309 struct nvme_cmd_info *cmd;
cef6a948
KB
1310 struct nvme_completion cqe;
1311
1312 if (!blk_mq_request_started(req))
1313 return;
a09115b2 1314
a4aea562 1315 cmd = blk_mq_rq_to_pdu(req);
a09115b2 1316
a4aea562
MB
1317 if (cmd->ctx == CMD_CTX_CANCELLED)
1318 return;
1319
cef6a948
KB
1320 if (blk_queue_dying(req->q))
1321 cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
1322 else
1323 cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
1324
1325
a4aea562
MB
1326 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
1327 req->tag, nvmeq->qid);
1328 ctx = cancel_cmd_info(cmd, &fn);
1329 fn(nvmeq, ctx, &cqe);
a09115b2
MW
1330}
1331
a4aea562 1332static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
9e866774 1333{
a4aea562
MB
1334 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
1335 struct nvme_queue *nvmeq = cmd->nvmeq;
1336
1337 dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
1338 nvmeq->qid);
7a509a6b 1339 spin_lock_irq(&nvmeq->q_lock);
07836e65 1340 nvme_abort_req(req);
7a509a6b 1341 spin_unlock_irq(&nvmeq->q_lock);
a4aea562 1342
07836e65
KB
1343 /*
1344 * The aborted req will be completed on receiving the abort req.
1345 * We enable the timer again. If hit twice, it'll cause a device reset,
1346 * as the device then is in a faulty state.
1347 */
1348 return BLK_EH_RESET_TIMER;
a4aea562 1349}
22404274 1350
a4aea562
MB
1351static void nvme_free_queue(struct nvme_queue *nvmeq)
1352{
9e866774
MW
1353 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1354 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1355 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1356 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1357 kfree(nvmeq);
1358}
1359
a1a5ef99 1360static void nvme_free_queues(struct nvme_dev *dev, int lowest)
22404274
KB
1361{
1362 int i;
1363
a1a5ef99 1364 for (i = dev->queue_count - 1; i >= lowest; i--) {
a4aea562 1365 struct nvme_queue *nvmeq = dev->queues[i];
22404274 1366 dev->queue_count--;
a4aea562 1367 dev->queues[i] = NULL;
f435c282 1368 nvme_free_queue(nvmeq);
121c7ad4 1369 }
22404274
KB
1370}
1371
4d115420
KB
1372/**
1373 * nvme_suspend_queue - put queue into suspended state
1374 * @nvmeq - queue to suspend
4d115420
KB
1375 */
1376static int nvme_suspend_queue(struct nvme_queue *nvmeq)
b60503ba 1377{
2b25d981 1378 int vector;
b60503ba 1379
a09115b2 1380 spin_lock_irq(&nvmeq->q_lock);
2b25d981
KB
1381 if (nvmeq->cq_vector == -1) {
1382 spin_unlock_irq(&nvmeq->q_lock);
1383 return 1;
1384 }
1385 vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
42f61420 1386 nvmeq->dev->online_queues--;
2b25d981 1387 nvmeq->cq_vector = -1;
a09115b2
MW
1388 spin_unlock_irq(&nvmeq->q_lock);
1389
6df3dbc8
KB
1390 if (!nvmeq->qid && nvmeq->dev->admin_q)
1391 blk_mq_freeze_queue_start(nvmeq->dev->admin_q);
1392
aba2080f
MW
1393 irq_set_affinity_hint(vector, NULL);
1394 free_irq(vector, nvmeq);
b60503ba 1395
4d115420
KB
1396 return 0;
1397}
b60503ba 1398
4d115420
KB
1399static void nvme_clear_queue(struct nvme_queue *nvmeq)
1400{
22404274 1401 spin_lock_irq(&nvmeq->q_lock);
42483228
KB
1402 if (nvmeq->tags && *nvmeq->tags)
1403 blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
22404274 1404 spin_unlock_irq(&nvmeq->q_lock);
b60503ba
MW
1405}
1406
4d115420
KB
1407static void nvme_disable_queue(struct nvme_dev *dev, int qid)
1408{
a4aea562 1409 struct nvme_queue *nvmeq = dev->queues[qid];
4d115420
KB
1410
1411 if (!nvmeq)
1412 return;
1413 if (nvme_suspend_queue(nvmeq))
1414 return;
1415
0e53d180
KB
1416 /* Don't tell the adapter to delete the admin queue.
1417 * Don't tell a removed adapter to delete IO queues. */
1418 if (qid && readl(&dev->bar->csts) != -1) {
b60503ba
MW
1419 adapter_delete_sq(dev, qid);
1420 adapter_delete_cq(dev, qid);
1421 }
07836e65
KB
1422
1423 spin_lock_irq(&nvmeq->q_lock);
1424 nvme_process_cq(nvmeq);
1425 spin_unlock_irq(&nvmeq->q_lock);
b60503ba
MW
1426}
1427
1428static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
2b25d981 1429 int depth)
b60503ba 1430{
a4aea562 1431 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
b60503ba
MW
1432 if (!nvmeq)
1433 return NULL;
1434
e75ec752 1435 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
4d51abf9 1436 &nvmeq->cq_dma_addr, GFP_KERNEL);
b60503ba
MW
1437 if (!nvmeq->cqes)
1438 goto free_nvmeq;
b60503ba 1439
e75ec752 1440 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
b60503ba
MW
1441 &nvmeq->sq_dma_addr, GFP_KERNEL);
1442 if (!nvmeq->sq_cmds)
1443 goto free_cqdma;
1444
e75ec752 1445 nvmeq->q_dmadev = dev->dev;
091b6092 1446 nvmeq->dev = dev;
3193f07b
MW
1447 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
1448 dev->instance, qid);
b60503ba
MW
1449 spin_lock_init(&nvmeq->q_lock);
1450 nvmeq->cq_head = 0;
82123460 1451 nvmeq->cq_phase = 1;
b80d5ccc 1452 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
b60503ba 1453 nvmeq->q_depth = depth;
c30341dc 1454 nvmeq->qid = qid;
22404274 1455 dev->queue_count++;
a4aea562 1456 dev->queues[qid] = nvmeq;
b60503ba
MW
1457
1458 return nvmeq;
1459
1460 free_cqdma:
e75ec752 1461 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
b60503ba
MW
1462 nvmeq->cq_dma_addr);
1463 free_nvmeq:
1464 kfree(nvmeq);
1465 return NULL;
1466}
1467
3001082c
MW
1468static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1469 const char *name)
1470{
58ffacb5
MW
1471 if (use_threaded_interrupts)
1472 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
481e5bad 1473 nvme_irq_check, nvme_irq, IRQF_SHARED,
58ffacb5 1474 name, nvmeq);
3001082c 1475 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
481e5bad 1476 IRQF_SHARED, name, nvmeq);
3001082c
MW
1477}
1478
22404274 1479static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
b60503ba 1480{
22404274 1481 struct nvme_dev *dev = nvmeq->dev;
b60503ba 1482
7be50e93 1483 spin_lock_irq(&nvmeq->q_lock);
22404274
KB
1484 nvmeq->sq_tail = 0;
1485 nvmeq->cq_head = 0;
1486 nvmeq->cq_phase = 1;
b80d5ccc 1487 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
22404274 1488 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
42f61420 1489 dev->online_queues++;
7be50e93 1490 spin_unlock_irq(&nvmeq->q_lock);
22404274
KB
1491}
1492
1493static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
1494{
1495 struct nvme_dev *dev = nvmeq->dev;
1496 int result;
3f85d50b 1497
2b25d981 1498 nvmeq->cq_vector = qid - 1;
b60503ba
MW
1499 result = adapter_alloc_cq(dev, qid, nvmeq);
1500 if (result < 0)
22404274 1501 return result;
b60503ba
MW
1502
1503 result = adapter_alloc_sq(dev, qid, nvmeq);
1504 if (result < 0)
1505 goto release_cq;
1506
3193f07b 1507 result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
b60503ba
MW
1508 if (result < 0)
1509 goto release_sq;
1510
22404274 1511 nvme_init_queue(nvmeq, qid);
22404274 1512 return result;
b60503ba
MW
1513
1514 release_sq:
1515 adapter_delete_sq(dev, qid);
1516 release_cq:
1517 adapter_delete_cq(dev, qid);
22404274 1518 return result;
b60503ba
MW
1519}
1520
ba47e386
MW
1521static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
1522{
1523 unsigned long timeout;
1524 u32 bit = enabled ? NVME_CSTS_RDY : 0;
1525
1526 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1527
1528 while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
1529 msleep(100);
1530 if (fatal_signal_pending(current))
1531 return -EINTR;
1532 if (time_after(jiffies, timeout)) {
e75ec752 1533 dev_err(dev->dev,
27e8166c
MW
1534 "Device not ready; aborting %s\n", enabled ?
1535 "initialisation" : "reset");
ba47e386
MW
1536 return -ENODEV;
1537 }
1538 }
1539
1540 return 0;
1541}
1542
1543/*
1544 * If the device has been passed off to us in an enabled state, just clear
1545 * the enabled bit. The spec says we should set the 'shutdown notification
1546 * bits', but doing so may cause the device to complete commands to the
1547 * admin queue ... and we don't know what memory that might be pointing at!
1548 */
1549static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
1550{
01079522
DM
1551 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1552 dev->ctrl_config &= ~NVME_CC_ENABLE;
1553 writel(dev->ctrl_config, &dev->bar->cc);
44af146a 1554
ba47e386
MW
1555 return nvme_wait_ready(dev, cap, false);
1556}
1557
1558static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
1559{
01079522
DM
1560 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1561 dev->ctrl_config |= NVME_CC_ENABLE;
1562 writel(dev->ctrl_config, &dev->bar->cc);
1563
ba47e386
MW
1564 return nvme_wait_ready(dev, cap, true);
1565}
1566
1894d8f1
KB
1567static int nvme_shutdown_ctrl(struct nvme_dev *dev)
1568{
1569 unsigned long timeout;
1894d8f1 1570
01079522
DM
1571 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1572 dev->ctrl_config |= NVME_CC_SHN_NORMAL;
1573
1574 writel(dev->ctrl_config, &dev->bar->cc);
1894d8f1 1575
2484f407 1576 timeout = SHUTDOWN_TIMEOUT + jiffies;
1894d8f1
KB
1577 while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
1578 NVME_CSTS_SHST_CMPLT) {
1579 msleep(100);
1580 if (fatal_signal_pending(current))
1581 return -EINTR;
1582 if (time_after(jiffies, timeout)) {
e75ec752 1583 dev_err(dev->dev,
1894d8f1
KB
1584 "Device shutdown incomplete; abort shutdown\n");
1585 return -ENODEV;
1586 }
1587 }
1588
1589 return 0;
1590}
1591
a4aea562 1592static struct blk_mq_ops nvme_mq_admin_ops = {
d29ec824 1593 .queue_rq = nvme_queue_rq,
a4aea562
MB
1594 .map_queue = blk_mq_map_queue,
1595 .init_hctx = nvme_admin_init_hctx,
1596 .init_request = nvme_admin_init_request,
1597 .timeout = nvme_timeout,
1598};
1599
1600static struct blk_mq_ops nvme_mq_ops = {
1601 .queue_rq = nvme_queue_rq,
1602 .map_queue = blk_mq_map_queue,
1603 .init_hctx = nvme_init_hctx,
1604 .init_request = nvme_init_request,
1605 .timeout = nvme_timeout,
1606};
1607
ea191d2f
KB
1608static void nvme_dev_remove_admin(struct nvme_dev *dev)
1609{
1610 if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
1611 blk_cleanup_queue(dev->admin_q);
1612 blk_mq_free_tag_set(&dev->admin_tagset);
1613 }
1614}
1615
a4aea562
MB
1616static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1617{
1618 if (!dev->admin_q) {
1619 dev->admin_tagset.ops = &nvme_mq_admin_ops;
1620 dev->admin_tagset.nr_hw_queues = 1;
1621 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
1efccc9d 1622 dev->admin_tagset.reserved_tags = 1;
a4aea562 1623 dev->admin_tagset.timeout = ADMIN_TIMEOUT;
e75ec752 1624 dev->admin_tagset.numa_node = dev_to_node(dev->dev);
ac3dd5bd 1625 dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
a4aea562
MB
1626 dev->admin_tagset.driver_data = dev;
1627
1628 if (blk_mq_alloc_tag_set(&dev->admin_tagset))
1629 return -ENOMEM;
1630
1631 dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
35b489d3 1632 if (IS_ERR(dev->admin_q)) {
a4aea562
MB
1633 blk_mq_free_tag_set(&dev->admin_tagset);
1634 return -ENOMEM;
1635 }
ea191d2f
KB
1636 if (!blk_get_queue(dev->admin_q)) {
1637 nvme_dev_remove_admin(dev);
1638 return -ENODEV;
1639 }
0fb59cbc
KB
1640 } else
1641 blk_mq_unfreeze_queue(dev->admin_q);
a4aea562
MB
1642
1643 return 0;
1644}
1645
8d85fce7 1646static int nvme_configure_admin_queue(struct nvme_dev *dev)
b60503ba 1647{
ba47e386 1648 int result;
b60503ba 1649 u32 aqa;
ba47e386 1650 u64 cap = readq(&dev->bar->cap);
b60503ba 1651 struct nvme_queue *nvmeq;
1d090624
KB
1652 unsigned page_shift = PAGE_SHIFT;
1653 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
1654 unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
1655
1656 if (page_shift < dev_page_min) {
e75ec752 1657 dev_err(dev->dev,
1d090624
KB
1658 "Minimum device page size (%u) too large for "
1659 "host (%u)\n", 1 << dev_page_min,
1660 1 << page_shift);
1661 return -ENODEV;
1662 }
1663 if (page_shift > dev_page_max) {
e75ec752 1664 dev_info(dev->dev,
1d090624
KB
1665 "Device maximum page size (%u) smaller than "
1666 "host (%u); enabling work-around\n",
1667 1 << dev_page_max, 1 << page_shift);
1668 page_shift = dev_page_max;
1669 }
b60503ba 1670
ba47e386
MW
1671 result = nvme_disable_ctrl(dev, cap);
1672 if (result < 0)
1673 return result;
b60503ba 1674
a4aea562 1675 nvmeq = dev->queues[0];
cd638946 1676 if (!nvmeq) {
2b25d981 1677 nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
cd638946
KB
1678 if (!nvmeq)
1679 return -ENOMEM;
cd638946 1680 }
b60503ba
MW
1681
1682 aqa = nvmeq->q_depth - 1;
1683 aqa |= aqa << 16;
1684
1d090624
KB
1685 dev->page_size = 1 << page_shift;
1686
01079522 1687 dev->ctrl_config = NVME_CC_CSS_NVM;
1d090624 1688 dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
b60503ba 1689 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
7f53f9d2 1690 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
b60503ba
MW
1691
1692 writel(aqa, &dev->bar->aqa);
1693 writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
1694 writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
b60503ba 1695
ba47e386 1696 result = nvme_enable_ctrl(dev, cap);
025c557a 1697 if (result)
a4aea562
MB
1698 goto free_nvmeq;
1699
2b25d981 1700 nvmeq->cq_vector = 0;
3193f07b 1701 result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
025c557a 1702 if (result)
0fb59cbc 1703 goto free_nvmeq;
025c557a 1704
b60503ba 1705 return result;
a4aea562 1706
a4aea562
MB
1707 free_nvmeq:
1708 nvme_free_queues(dev, 0);
1709 return result;
b60503ba
MW
1710}
1711
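/*
 * NVME_IOCTL_SUBMIT_IO: copy a struct nvme_user_io from userspace, accept
 * only read, write and compare opcodes, and issue the command synchronously.
 * The data buffer is taken straight from user memory; separate (non-extended)
 * metadata is bounced through a small coherent DMA buffer and copied back to
 * the user after a successful read.
 */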
a53295b6
MW
1712static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1713{
1714 struct nvme_dev *dev = ns->dev;
a53295b6
MW
1715 struct nvme_user_io io;
1716 struct nvme_command c;
d29ec824 1717 unsigned length, meta_len;
a67a9513 1718 int status, write;
a67a9513
KB
1719 dma_addr_t meta_dma = 0;
1720 void *meta = NULL;
a53295b6
MW
1721
1722 if (copy_from_user(&io, uio, sizeof(io)))
1723 return -EFAULT;
6c7d4945
MW
1724
1725 switch (io.opcode) {
1726 case nvme_cmd_write:
1727 case nvme_cmd_read:
6bbf1acd 1728 case nvme_cmd_compare:
6413214c 1729 break;
6c7d4945 1730 default:
6bbf1acd 1731 return -EINVAL;
6c7d4945
MW
1732 }
1733
d29ec824
CH
1734 length = (io.nblocks + 1) << ns->lba_shift;
1735 meta_len = (io.nblocks + 1) * ns->ms;
1736 write = io.opcode & 1;
a53295b6 1737
a67a9513 1738 if (meta_len) {
d29ec824
CH
1739 if (((io.metadata & 3) || !io.metadata) && !ns->ext)
1740 return -EINVAL;
1741
1742 if (ns->ext) {
1743 length += meta_len;
1744 meta_len = 0;
1745 }
1746
e75ec752 1747 meta = dma_alloc_coherent(dev->dev, meta_len,
a67a9513
KB
1748 &meta_dma, GFP_KERNEL);
1749 if (!meta) {
1750 status = -ENOMEM;
1751 goto unmap;
1752 }
1753 if (write) {
1754 if (copy_from_user(meta, (void __user *)io.metadata,
1755 meta_len)) {
1756 status = -EFAULT;
1757 goto unmap;
1758 }
1759 }
1760 }
1761
a53295b6
MW
1762 memset(&c, 0, sizeof(c));
1763 c.rw.opcode = io.opcode;
1764 c.rw.flags = io.flags;
6c7d4945 1765 c.rw.nsid = cpu_to_le32(ns->ns_id);
a53295b6 1766 c.rw.slba = cpu_to_le64(io.slba);
6c7d4945 1767 c.rw.length = cpu_to_le16(io.nblocks);
a53295b6 1768 c.rw.control = cpu_to_le16(io.control);
1c9b5265
MW
1769 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1770 c.rw.reftag = cpu_to_le32(io.reftag);
1771 c.rw.apptag = cpu_to_le16(io.apptag);
1772 c.rw.appmask = cpu_to_le16(io.appmask);
a67a9513 1773 c.rw.metadata = cpu_to_le64(meta_dma);
d29ec824
CH
1774
1775 status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
1776 (void __user *)io.addr, length, NULL, 0);
f410c680 1777 unmap:
a67a9513
KB
1778 if (meta) {
1779 if (status == NVME_SC_SUCCESS && !write) {
1780 if (copy_to_user((void __user *)io.metadata, meta,
1781 meta_len))
1782 status = -EFAULT;
1783 }
e75ec752 1784 dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
f410c680 1785 }
a53295b6
MW
1786 return status;
1787}
1788
a4aea562
MB
1789static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
1790 struct nvme_passthru_cmd __user *ucmd)
6ee44cdc 1791{
7963e521 1792 struct nvme_passthru_cmd cmd;
6ee44cdc 1793 struct nvme_command c;
d29ec824
CH
1794 unsigned timeout = 0;
1795 int status;
6ee44cdc 1796
6bbf1acd
MW
1797 if (!capable(CAP_SYS_ADMIN))
1798 return -EACCES;
1799 if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
6ee44cdc 1800 return -EFAULT;
6ee44cdc
MW
1801
1802 memset(&c, 0, sizeof(c));
6bbf1acd
MW
1803 c.common.opcode = cmd.opcode;
1804 c.common.flags = cmd.flags;
1805 c.common.nsid = cpu_to_le32(cmd.nsid);
1806 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1807 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1808 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1809 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1810 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1811 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1812 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1813 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1814
d29ec824
CH
1815 if (cmd.timeout_ms)
1816 timeout = msecs_to_jiffies(cmd.timeout_ms);
f705f837
CH
1817
1818 status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
d29ec824
CH
1819 NULL, (void __user *)cmd.addr, cmd.data_len,
1820 &cmd.result, timeout);
1821 if (status >= 0) {
1822 if (put_user(cmd.result, &ucmd->result))
1823 return -EFAULT;
6bbf1acd 1824 }
f4f117f6 1825
6ee44cdc
MW
1826 return status;
1827}
1828
b60503ba
MW
1829static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1830 unsigned long arg)
1831{
1832 struct nvme_ns *ns = bdev->bd_disk->private_data;
1833
1834 switch (cmd) {
6bbf1acd 1835 case NVME_IOCTL_ID:
c3bfe717 1836 force_successful_syscall_return();
6bbf1acd
MW
1837 return ns->ns_id;
1838 case NVME_IOCTL_ADMIN_CMD:
a4aea562 1839 return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
7963e521 1840 case NVME_IOCTL_IO_CMD:
a4aea562 1841 return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
a53295b6
MW
1842 case NVME_IOCTL_SUBMIT_IO:
1843 return nvme_submit_io(ns, (void __user *)arg);
5d0f6131
VV
1844 case SG_GET_VERSION_NUM:
1845 return nvme_sg_get_version_num((void __user *)arg);
1846 case SG_IO:
1847 return nvme_sg_io(ns, (void __user *)arg);
b60503ba
MW
1848 default:
1849 return -ENOTTY;
1850 }
1851}
1852
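/*
 * Illustrative only (not part of the driver): userspace can reach the
 * passthrough path above with something like
 *
 *	struct nvme_passthru_cmd cmd = {
 *		.opcode   = 0x06,                      (Identify)
 *		.addr     = (__u64)(uintptr_t)buf,     (4k buffer)
 *		.data_len = 4096,
 *		.cdw10    = 1,                         (CNS 1: controller data)
 *	};
 *	ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 *
 * which ends up in nvme_user_cmd() with ns == NULL.
 */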
320a3827
KB
1853#ifdef CONFIG_COMPAT
1854static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1855 unsigned int cmd, unsigned long arg)
1856{
320a3827
KB
1857 switch (cmd) {
1858 case SG_IO:
e179729a 1859 return -ENOIOCTLCMD;
320a3827
KB
1860 }
1861 return nvme_ioctl(bdev, mode, cmd, arg);
1862}
1863#else
1864#define nvme_compat_ioctl NULL
1865#endif
1866
9ac27090
KB
1867static int nvme_open(struct block_device *bdev, fmode_t mode)
1868{
9e60352c
KB
1869 int ret = 0;
1870 struct nvme_ns *ns;
9ac27090 1871
9e60352c
KB
1872 spin_lock(&dev_list_lock);
1873 ns = bdev->bd_disk->private_data;
1874 if (!ns)
1875 ret = -ENXIO;
1876 else if (!kref_get_unless_zero(&ns->dev->kref))
1877 ret = -ENXIO;
1878 spin_unlock(&dev_list_lock);
1879
1880 return ret;
9ac27090
KB
1881}
1882
1883static void nvme_free_dev(struct kref *kref);
1884
1885static void nvme_release(struct gendisk *disk, fmode_t mode)
1886{
1887 struct nvme_ns *ns = disk->private_data;
1888 struct nvme_dev *dev = ns->dev;
1889
1890 kref_put(&dev->kref, nvme_free_dev);
1891}
1892
4cc09e2d
KB
1893static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
1894{
1895 /* some standard values */
1896 geo->heads = 1 << 6;
1897 geo->sectors = 1 << 5;
1898 geo->cylinders = get_capacity(bd->bd_disk) >> 11;
1899 return 0;
1900}
1901
e1e5e564
KB
1902static void nvme_config_discard(struct nvme_ns *ns)
1903{
1904 u32 logical_block_size = queue_logical_block_size(ns->queue);
1905 ns->queue->limits.discard_zeroes_data = 0;
1906 ns->queue->limits.discard_alignment = logical_block_size;
1907 ns->queue->limits.discard_granularity = logical_block_size;
1908 ns->queue->limits.max_discard_sectors = 0xffffffff;
1909 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
1910}
1911
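/*
 * Re-read the namespace's Identify data and bring the gendisk in line with
 * it: pick up the current LBA format and metadata size, drop or re-register
 * the block integrity profile when the protection information type or
 * format changed, and update the capacity (zero if the namespace is empty
 * or its metadata layout cannot be handled).
 */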
1b9dbf7f
KB
1912static int nvme_revalidate_disk(struct gendisk *disk)
1913{
1914 struct nvme_ns *ns = disk->private_data;
1915 struct nvme_dev *dev = ns->dev;
1916 struct nvme_id_ns *id;
a67a9513
KB
1917 u8 lbaf, pi_type;
1918 u16 old_ms;
e1e5e564 1919 unsigned short bs;
1b9dbf7f 1920
d29ec824
CH
1921 if (nvme_identify_ns(dev, ns->ns_id, &id)) {
1922 dev_warn(dev->dev, "%s: Identify failure\n", __func__);
1b9dbf7f
KB
1923 return 0;
1924 }
1925
e1e5e564
KB
1926 old_ms = ns->ms;
1927 lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
1b9dbf7f 1928 ns->lba_shift = id->lbaf[lbaf].ds;
e1e5e564 1929 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
a67a9513 1930 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
e1e5e564
KB
1931
1932 /*
1933	 * If the identify data did not yield a block size, fall back to 512 bytes
1934	 * so the block layer can still operate (and fail I/O) on the 0-capacity disk.
1935 */
1936 if (ns->lba_shift == 0)
1937 ns->lba_shift = 9;
1938 bs = 1 << ns->lba_shift;
1939
1940	/* XXX: PI is only supported when the metadata size equals the T10 PI tuple size */
1941 pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
1942 id->dps & NVME_NS_DPS_PI_MASK : 0;
1943
52b68d7e
KB
1944 if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
1945 ns->ms != old_ms ||
e1e5e564 1946 bs != queue_logical_block_size(disk->queue) ||
a67a9513 1947 (ns->ms && ns->ext)))
e1e5e564
KB
1948 blk_integrity_unregister(disk);
1949
1950 ns->pi_type = pi_type;
1951 blk_queue_logical_block_size(ns->queue, bs);
1952
52b68d7e 1953 if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
a67a9513 1954 !ns->ext)
e1e5e564
KB
1955 nvme_init_integrity(ns);
1956
52b68d7e 1957 if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk)))
e1e5e564
KB
1958 set_capacity(disk, 0);
1959 else
1960 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1961
1962 if (dev->oncs & NVME_CTRL_ONCS_DSM)
1963 nvme_config_discard(ns);
1b9dbf7f 1964
d29ec824 1965 kfree(id);
1b9dbf7f
KB
1966 return 0;
1967}
1968
b60503ba
MW
1969static const struct block_device_operations nvme_fops = {
1970 .owner = THIS_MODULE,
1971 .ioctl = nvme_ioctl,
320a3827 1972 .compat_ioctl = nvme_compat_ioctl,
9ac27090
KB
1973 .open = nvme_open,
1974 .release = nvme_release,
4cc09e2d 1975 .getgeo = nvme_getgeo,
1b9dbf7f 1976 .revalidate_disk= nvme_revalidate_disk,
b60503ba
MW
1977};
1978
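/*
 * Polling thread shared by all controllers: roughly once a second it walks
 * the device list, schedules a reset for any controller reporting a fatal
 * status (CSTS.CFS), drains each queue's completion ring, and replenishes
 * the asynchronous event requests on the admin queue.
 */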
1fa6aead
MW
1979static int nvme_kthread(void *data)
1980{
d4b4ff8e 1981 struct nvme_dev *dev, *next;
1fa6aead
MW
1982
1983 while (!kthread_should_stop()) {
564a232c 1984 set_current_state(TASK_INTERRUPTIBLE);
1fa6aead 1985 spin_lock(&dev_list_lock);
d4b4ff8e 1986 list_for_each_entry_safe(dev, next, &dev_list, node) {
1fa6aead 1987 int i;
07836e65 1988 if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
d4b4ff8e
KB
1989 if (work_busy(&dev->reset_work))
1990 continue;
1991 list_del_init(&dev->node);
e75ec752 1992 dev_warn(dev->dev,
a4aea562
MB
1993 "Failed status: %x, reset controller\n",
1994 readl(&dev->bar->csts));
9ca97374 1995 dev->reset_workfn = nvme_reset_failed_dev;
d4b4ff8e
KB
1996 queue_work(nvme_workq, &dev->reset_work);
1997 continue;
1998 }
1fa6aead 1999 for (i = 0; i < dev->queue_count; i++) {
a4aea562 2000 struct nvme_queue *nvmeq = dev->queues[i];
740216fc
MW
2001 if (!nvmeq)
2002 continue;
1fa6aead 2003 spin_lock_irq(&nvmeq->q_lock);
bc57a0f7 2004 nvme_process_cq(nvmeq);
6fccf938
KB
2005
2006 while ((i == 0) && (dev->event_limit > 0)) {
a4aea562 2007 if (nvme_submit_async_admin_req(dev))
6fccf938
KB
2008 break;
2009 dev->event_limit--;
2010 }
1fa6aead
MW
2011 spin_unlock_irq(&nvmeq->q_lock);
2012 }
2013 }
2014 spin_unlock(&dev_list_lock);
acb7aa0d 2015 schedule_timeout(round_jiffies_relative(HZ));
1fa6aead
MW
2016 }
2017 return 0;
2018}
2019
e1e5e564 2020static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
b60503ba
MW
2021{
2022 struct nvme_ns *ns;
2023 struct gendisk *disk;
e75ec752 2024 int node = dev_to_node(dev->dev);
b60503ba 2025
a4aea562 2026 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
b60503ba 2027 if (!ns)
e1e5e564
KB
2028 return;
2029
a4aea562 2030 ns->queue = blk_mq_init_queue(&dev->tagset);
9f173b33 2031 if (IS_ERR(ns->queue))
b60503ba 2032 goto out_free_ns;
4eeb9215
MW
2033 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
2034 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
a4aea562 2035 queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
b60503ba
MW
2036 ns->dev = dev;
2037 ns->queue->queuedata = ns;
2038
a4aea562 2039 disk = alloc_disk_node(0, node);
b60503ba
MW
2040 if (!disk)
2041 goto out_free_queue;
a4aea562 2042
5aff9382 2043 ns->ns_id = nsid;
b60503ba 2044 ns->disk = disk;
e1e5e564
KB
2045	ns->lba_shift = 9; /* default to 512 byte sectors until the disk is validated */
2046 list_add_tail(&ns->list, &dev->namespaces);
2047
e9ef4636 2048 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
8fc23e03
KB
2049 if (dev->max_hw_sectors)
2050 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
a4aea562
MB
2051 if (dev->stripe_size)
2052 blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
a7d2ce28
KB
2053 if (dev->vwc & NVME_CTRL_VWC_PRESENT)
2054 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
b60503ba
MW
2055
2056 disk->major = nvme_major;
469071a3 2057 disk->first_minor = 0;
b60503ba
MW
2058 disk->fops = &nvme_fops;
2059 disk->private_data = ns;
2060 disk->queue = ns->queue;
b3fffdef 2061 disk->driverfs_dev = dev->device;
469071a3 2062 disk->flags = GENHD_FL_EXT_DEVT;
5aff9382 2063 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
b60503ba 2064
e1e5e564
KB
2065 /*
2066 * Initialize capacity to 0 until we establish the namespace format and
2067	 * set up integrity extensions if necessary. The revalidate_disk after
2068 * add_disk allows the driver to register with integrity if the format
2069 * requires it.
2070 */
2071 set_capacity(disk, 0);
2072 nvme_revalidate_disk(ns->disk);
2073 add_disk(ns->disk);
2074 if (ns->ms)
2075 revalidate_disk(ns->disk);
2076 return;
b60503ba
MW
2077 out_free_queue:
2078 blk_cleanup_queue(ns->queue);
2079 out_free_ns:
2080 kfree(ns);
b60503ba
MW
2081}
2082
42f61420
KB
2083static void nvme_create_io_queues(struct nvme_dev *dev)
2084{
a4aea562 2085 unsigned i;
42f61420 2086
a4aea562 2087 for (i = dev->queue_count; i <= dev->max_qid; i++)
2b25d981 2088 if (!nvme_alloc_queue(dev, i, dev->q_depth))
42f61420
KB
2089 break;
2090
a4aea562
MB
2091 for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
2092 if (nvme_create_queue(dev->queues[i], i))
42f61420
KB
2093 break;
2094}
2095
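/*
 * Ask the controller for 'count' I/O submission and completion queues via
 * Set Features (Number of Queues). Both halves of the feature dword are
 * 0's based; the controller answers with how many of each it actually
 * allocated, and the caller gets the smaller of the two converted back to
 * a 1's based count.
 */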
b3b06812 2096static int set_queue_count(struct nvme_dev *dev, int count)
b60503ba
MW
2097{
2098 int status;
2099 u32 result;
b3b06812 2100 u32 q_count = (count - 1) | ((count - 1) << 16);
b60503ba 2101
df348139 2102 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
bc5fc7e4 2103 &result);
27e8166c
MW
2104 if (status < 0)
2105 return status;
2106 if (status > 0) {
e75ec752 2107 dev_err(dev->dev, "Could not set queue count (%d)\n", status);
badc34d4 2108 return 0;
27e8166c 2109 }
b60503ba
MW
2110 return min(result & 0xffff, result >> 16) + 1;
2111}
2112
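/*
 * Size of BAR 0 needed to cover the doorbell registers: they start at
 * offset 4096, and each queue (admin plus nr_io_queues) has a submission
 * tail and a completion head doorbell (2 * 4 bytes), spaced apart by the
 * stride advertised in CAP.DSTRD.
 */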
9d713c2b
KB
2113static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
2114{
b80d5ccc 2115 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
9d713c2b
KB
2116}
2117
8d85fce7 2118static int nvme_setup_io_queues(struct nvme_dev *dev)
b60503ba 2119{
a4aea562 2120 struct nvme_queue *adminq = dev->queues[0];
e75ec752 2121 struct pci_dev *pdev = to_pci_dev(dev->dev);
42f61420 2122 int result, i, vecs, nr_io_queues, size;
b60503ba 2123
42f61420 2124 nr_io_queues = num_possible_cpus();
b348b7d5 2125 result = set_queue_count(dev, nr_io_queues);
badc34d4 2126 if (result <= 0)
1b23484b 2127 return result;
b348b7d5
MW
2128 if (result < nr_io_queues)
2129 nr_io_queues = result;
b60503ba 2130
9d713c2b
KB
2131 size = db_bar_size(dev, nr_io_queues);
2132 if (size > 8192) {
f1938f6e 2133 iounmap(dev->bar);
9d713c2b
KB
2134 do {
2135 dev->bar = ioremap(pci_resource_start(pdev, 0), size);
2136 if (dev->bar)
2137 break;
2138 if (!--nr_io_queues)
2139 return -ENOMEM;
2140 size = db_bar_size(dev, nr_io_queues);
2141 } while (1);
f1938f6e 2142 dev->dbs = ((void __iomem *)dev->bar) + 4096;
5a92e700 2143 adminq->q_db = dev->dbs;
f1938f6e
MW
2144 }
2145
9d713c2b 2146 /* Deregister the admin queue's interrupt */
3193f07b 2147 free_irq(dev->entry[0].vector, adminq);
9d713c2b 2148
e32efbfc
JA
2149 /*
2150	 * If we enabled MSI-X early because the device has no INTx, disable it
2151	 * again before setting up the full range of vectors we need.
2152 */
2153 if (!pdev->irq)
2154 pci_disable_msix(pdev);
2155
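	/*
	 * Interrupt vector allocation, best effort: try MSI-X with up to one
	 * vector per I/O queue, fall back to plain MSI (at most 32 vectors),
	 * and as a last resort run everything off a single vector. The number
	 * of I/O queues is then capped at the number of vectors obtained.
	 */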
be577fab 2156 for (i = 0; i < nr_io_queues; i++)
1b23484b 2157 dev->entry[i].entry = i;
be577fab
AG
2158 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
2159 if (vecs < 0) {
2160 vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
2161 if (vecs < 0) {
2162 vecs = 1;
2163 } else {
2164 for (i = 0; i < vecs; i++)
2165 dev->entry[i].vector = i + pdev->irq;
fa08a396
RRG
2166 }
2167 }
2168
063a8096
MW
2169 /*
2170 * Should investigate if there's a performance win from allocating
2171 * more queues than interrupt vectors; it might allow the submission
2172 * path to scale better, even if the receive path is limited by the
2173 * number of interrupts.
2174 */
2175 nr_io_queues = vecs;
42f61420 2176 dev->max_qid = nr_io_queues;
063a8096 2177
3193f07b 2178 result = queue_request_irq(dev, adminq, adminq->irqname);
a4aea562 2179 if (result)
22404274 2180 goto free_queues;
1b23484b 2181
cd638946 2182 /* Free previously allocated queues that are no longer usable */
42f61420 2183 nvme_free_queues(dev, nr_io_queues + 1);
a4aea562 2184 nvme_create_io_queues(dev);
9ecdc946 2185
22404274 2186 return 0;
b60503ba 2187
22404274 2188 free_queues:
a1a5ef99 2189 nvme_free_queues(dev, 1);
22404274 2190 return result;
b60503ba
MW
2191}
2192
422ef0c7
MW
2193/*
2194 * Return: error value if an error occurred setting up the queues or calling
2195 * Identify Controller. 0 if these succeeded, even if adding some of the
2196 * namespaces failed. At the moment, these failures are silent. TBD which
2197 * failures should be reported.
2198 */
8d85fce7 2199static int nvme_dev_add(struct nvme_dev *dev)
b60503ba 2200{
e75ec752 2201 struct pci_dev *pdev = to_pci_dev(dev->dev);
c3bfe717
MW
2202 int res;
2203 unsigned nn, i;
51814232 2204 struct nvme_id_ctrl *ctrl;
159b67d7 2205 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
b60503ba 2206
d29ec824 2207 res = nvme_identify_ctrl(dev, &ctrl);
b60503ba 2208 if (res) {
e75ec752 2209 dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
e1e5e564 2210 return -EIO;
b60503ba
MW
2211 }
2212
51814232 2213 nn = le32_to_cpup(&ctrl->nn);
0e5e4f0e 2214 dev->oncs = le16_to_cpup(&ctrl->oncs);
c30341dc 2215 dev->abort_limit = ctrl->acl + 1;
a7d2ce28 2216 dev->vwc = ctrl->vwc;
51814232
MW
2217 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
2218 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
2219 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
159b67d7 2220 if (ctrl->mdts)
8fc23e03 2221 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
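	/*
	 * Vendor quirk: Intel controllers with device ID 0x0953 report a
	 * stripe size in vendor-specific identify byte vs[3]; max_hw_sectors
	 * is clamped to it, and blk_queue_chunk_sectors() in nvme_alloc_ns()
	 * later keeps requests from straddling a stripe boundary.
	 */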
68608c26 2222 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
a4aea562
MB
2223 (pdev->device == 0x0953) && ctrl->vs[3]) {
2224 unsigned int max_hw_sectors;
2225
159b67d7 2226 dev->stripe_size = 1 << (ctrl->vs[3] + shift);
a4aea562
MB
2227 max_hw_sectors = dev->stripe_size >> (shift - 9);
2228 if (dev->max_hw_sectors) {
2229 dev->max_hw_sectors = min(max_hw_sectors,
2230 dev->max_hw_sectors);
2231 } else
2232 dev->max_hw_sectors = max_hw_sectors;
2233 }
d29ec824 2234 kfree(ctrl);
a4aea562
MB
2235
2236 dev->tagset.ops = &nvme_mq_ops;
2237 dev->tagset.nr_hw_queues = dev->online_queues - 1;
2238 dev->tagset.timeout = NVME_IO_TIMEOUT;
e75ec752 2239 dev->tagset.numa_node = dev_to_node(dev->dev);
a4aea562
MB
2240 dev->tagset.queue_depth =
2241 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
ac3dd5bd 2242 dev->tagset.cmd_size = nvme_cmd_size(dev);
a4aea562
MB
2243 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2244 dev->tagset.driver_data = dev;
2245
2246 if (blk_mq_alloc_tag_set(&dev->tagset))
e1e5e564 2247 return 0;
b60503ba 2248
e1e5e564
KB
2249 for (i = 1; i <= nn; i++)
2250 nvme_alloc_ns(dev, i);
b60503ba 2251
e1e5e564 2252 return 0;
b60503ba
MW
2253}
2254
0877cb0d
KB
2255static int nvme_dev_map(struct nvme_dev *dev)
2256{
42f61420 2257 u64 cap;
0877cb0d 2258 int bars, result = -ENOMEM;
e75ec752 2259 struct pci_dev *pdev = to_pci_dev(dev->dev);
0877cb0d
KB
2260
2261 if (pci_enable_device_mem(pdev))
2262 return result;
2263
2264 dev->entry[0].vector = pdev->irq;
2265 pci_set_master(pdev);
2266 bars = pci_select_bars(pdev, IORESOURCE_MEM);
be7837e8
JA
2267 if (!bars)
2268 goto disable_pci;
2269
0877cb0d
KB
2270 if (pci_request_selected_regions(pdev, bars, "nvme"))
2271 goto disable_pci;
2272
e75ec752
CH
2273 if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
2274 dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
052d0efa 2275 goto disable;
0877cb0d 2276
0877cb0d
KB
2277 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
2278 if (!dev->bar)
2279 goto disable;
e32efbfc 2280
0e53d180
KB
2281 if (readl(&dev->bar->csts) == -1) {
2282 result = -ENODEV;
2283 goto unmap;
2284 }
e32efbfc
JA
2285
2286 /*
2287	 * Some devices don't advertise INTx interrupts; pre-enable a single
2288	 * MSI-X vector for setup. We'll adjust this later.
2289 */
2290 if (!pdev->irq) {
2291 result = pci_enable_msix(pdev, dev->entry, 1);
2292 if (result < 0)
2293 goto unmap;
2294 }
2295
42f61420
KB
2296 cap = readq(&dev->bar->cap);
2297 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
2298 dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
0877cb0d
KB
2299 dev->dbs = ((void __iomem *)dev->bar) + 4096;
2300
2301 return 0;
2302
0e53d180
KB
2303 unmap:
2304 iounmap(dev->bar);
2305 dev->bar = NULL;
0877cb0d
KB
2306 disable:
2307 pci_release_regions(pdev);
2308 disable_pci:
2309 pci_disable_device(pdev);
2310 return result;
2311}
2312
2313static void nvme_dev_unmap(struct nvme_dev *dev)
2314{
e75ec752
CH
2315 struct pci_dev *pdev = to_pci_dev(dev->dev);
2316
2317 if (pdev->msi_enabled)
2318 pci_disable_msi(pdev);
2319 else if (pdev->msix_enabled)
2320 pci_disable_msix(pdev);
0877cb0d
KB
2321
2322 if (dev->bar) {
2323 iounmap(dev->bar);
2324 dev->bar = NULL;
e75ec752 2325 pci_release_regions(pdev);
0877cb0d
KB
2326 }
2327
e75ec752
CH
2328 if (pci_is_enabled(pdev))
2329 pci_disable_device(pdev);
0877cb0d
KB
2330}
2331
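/*
 * I/O queue teardown machinery: each queue gets an asynchronous delete-SQ
 * command whose completion, run on a kthread worker, chains the matching
 * delete-CQ command. The refcount below tracks queues still being deleted,
 * and nvme_wait_dq() sleeps until it drops to zero (or gives up on timeout
 * or a fatal signal and hard-disables the controller instead).
 */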
4d115420
KB
2332struct nvme_delq_ctx {
2333 struct task_struct *waiter;
2334 struct kthread_worker *worker;
2335 atomic_t refcount;
2336};
2337
2338static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
2339{
2340 dq->waiter = current;
2341 mb();
2342
2343 for (;;) {
2344 set_current_state(TASK_KILLABLE);
2345 if (!atomic_read(&dq->refcount))
2346 break;
2347 if (!schedule_timeout(ADMIN_TIMEOUT) ||
2348 fatal_signal_pending(current)) {
0fb59cbc
KB
2349 /*
2350 * Disable the controller first since we can't trust it
2351 * at this point, but leave the admin queue enabled
2352 * until all queue deletion requests are flushed.
2353 * FIXME: This may take a while if there are more h/w
2354 * queues than admin tags.
2355 */
4d115420 2356 set_current_state(TASK_RUNNING);
4d115420 2357 nvme_disable_ctrl(dev, readq(&dev->bar->cap));
0fb59cbc 2358 nvme_clear_queue(dev->queues[0]);
4d115420 2359 flush_kthread_worker(dq->worker);
0fb59cbc 2360 nvme_disable_queue(dev, 0);
4d115420
KB
2361 return;
2362 }
2363 }
2364 set_current_state(TASK_RUNNING);
2365}
2366
2367static void nvme_put_dq(struct nvme_delq_ctx *dq)
2368{
2369 atomic_dec(&dq->refcount);
2370 if (dq->waiter)
2371 wake_up_process(dq->waiter);
2372}
2373
2374static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
2375{
2376 atomic_inc(&dq->refcount);
2377 return dq;
2378}
2379
2380static void nvme_del_queue_end(struct nvme_queue *nvmeq)
2381{
2382 struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
4d115420
KB
2383 nvme_put_dq(dq);
2384}
2385
2386static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
2387 kthread_work_func_t fn)
2388{
2389 struct nvme_command c;
2390
2391 memset(&c, 0, sizeof(c));
2392 c.delete_queue.opcode = opcode;
2393 c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2394
2395 init_kthread_work(&nvmeq->cmdinfo.work, fn);
a4aea562
MB
2396 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
2397 ADMIN_TIMEOUT);
4d115420
KB
2398}
2399
2400static void nvme_del_cq_work_handler(struct kthread_work *work)
2401{
2402 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2403 cmdinfo.work);
2404 nvme_del_queue_end(nvmeq);
2405}
2406
2407static int nvme_delete_cq(struct nvme_queue *nvmeq)
2408{
2409 return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
2410 nvme_del_cq_work_handler);
2411}
2412
2413static void nvme_del_sq_work_handler(struct kthread_work *work)
2414{
2415 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2416 cmdinfo.work);
2417 int status = nvmeq->cmdinfo.status;
2418
2419 if (!status)
2420 status = nvme_delete_cq(nvmeq);
2421 if (status)
2422 nvme_del_queue_end(nvmeq);
2423}
2424
2425static int nvme_delete_sq(struct nvme_queue *nvmeq)
2426{
2427 return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
2428 nvme_del_sq_work_handler);
2429}
2430
2431static void nvme_del_queue_start(struct kthread_work *work)
2432{
2433 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2434 cmdinfo.work);
4d115420
KB
2435 if (nvme_delete_sq(nvmeq))
2436 nvme_del_queue_end(nvmeq);
2437}
2438
2439static void nvme_disable_io_queues(struct nvme_dev *dev)
2440{
2441 int i;
2442 DEFINE_KTHREAD_WORKER_ONSTACK(worker);
2443 struct nvme_delq_ctx dq;
2444 struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
2445 &worker, "nvme%d", dev->instance);
2446
2447 if (IS_ERR(kworker_task)) {
e75ec752 2448 dev_err(dev->dev,
4d115420
KB
2449 "Failed to create queue del task\n");
2450 for (i = dev->queue_count - 1; i > 0; i--)
2451 nvme_disable_queue(dev, i);
2452 return;
2453 }
2454
2455 dq.waiter = NULL;
2456 atomic_set(&dq.refcount, 0);
2457 dq.worker = &worker;
2458 for (i = dev->queue_count - 1; i > 0; i--) {
a4aea562 2459 struct nvme_queue *nvmeq = dev->queues[i];
4d115420
KB
2460
2461 if (nvme_suspend_queue(nvmeq))
2462 continue;
2463 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
2464 nvmeq->cmdinfo.worker = dq.worker;
2465 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
2466 queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
2467 }
2468 nvme_wait_dq(&dq, dev);
2469 kthread_stop(kworker_task);
2470}
2471
b9afca3e
DM
2472/*
2473* Remove the node from the device list and check
2474* whether we need to stop the nvme_thread.
2475*/
2476static void nvme_dev_list_remove(struct nvme_dev *dev)
2477{
2478 struct task_struct *tmp = NULL;
2479
2480 spin_lock(&dev_list_lock);
2481 list_del_init(&dev->node);
2482 if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
2483 tmp = nvme_thread;
2484 nvme_thread = NULL;
2485 }
2486 spin_unlock(&dev_list_lock);
2487
2488 if (tmp)
2489 kthread_stop(tmp);
2490}
2491
c9d3bf88
KB
2492static void nvme_freeze_queues(struct nvme_dev *dev)
2493{
2494 struct nvme_ns *ns;
2495
2496 list_for_each_entry(ns, &dev->namespaces, list) {
2497 blk_mq_freeze_queue_start(ns->queue);
2498
cddcd72b 2499 spin_lock_irq(ns->queue->queue_lock);
c9d3bf88 2500 queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
cddcd72b 2501 spin_unlock_irq(ns->queue->queue_lock);
c9d3bf88
KB
2502
2503 blk_mq_cancel_requeue_work(ns->queue);
2504 blk_mq_stop_hw_queues(ns->queue);
2505 }
2506}
2507
2508static void nvme_unfreeze_queues(struct nvme_dev *dev)
2509{
2510 struct nvme_ns *ns;
2511
2512 list_for_each_entry(ns, &dev->namespaces, list) {
2513 queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
2514 blk_mq_unfreeze_queue(ns->queue);
2515 blk_mq_start_stopped_hw_queues(ns->queue, true);
2516 blk_mq_kick_requeue_list(ns->queue);
2517 }
2518}
2519
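/*
 * Bring the controller down: drop it from the polling list, then either
 * delete the I/O queues and perform a controlled shutdown if the device is
 * still responding, or simply suspend the queues if it is dead or failed.
 * Finally unmap the BAR and clear each queue of any commands still
 * outstanding so upper layers can make progress.
 */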
f0b50732 2520static void nvme_dev_shutdown(struct nvme_dev *dev)
b60503ba 2521{
22404274 2522 int i;
7c1b2450 2523 u32 csts = -1;
22404274 2524
b9afca3e 2525 nvme_dev_list_remove(dev);
1fa6aead 2526
c9d3bf88
KB
2527 if (dev->bar) {
2528 nvme_freeze_queues(dev);
7c1b2450 2529 csts = readl(&dev->bar->csts);
c9d3bf88 2530 }
7c1b2450 2531 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
4d115420 2532 for (i = dev->queue_count - 1; i >= 0; i--) {
a4aea562 2533 struct nvme_queue *nvmeq = dev->queues[i];
4d115420 2534 nvme_suspend_queue(nvmeq);
4d115420
KB
2535 }
2536 } else {
2537 nvme_disable_io_queues(dev);
1894d8f1 2538 nvme_shutdown_ctrl(dev);
4d115420
KB
2539 nvme_disable_queue(dev, 0);
2540 }
f0b50732 2541 nvme_dev_unmap(dev);
07836e65
KB
2542
2543 for (i = dev->queue_count - 1; i >= 0; i--)
2544 nvme_clear_queue(dev->queues[i]);
f0b50732
KB
2545}
2546
2547static void nvme_dev_remove(struct nvme_dev *dev)
2548{
9ac27090 2549 struct nvme_ns *ns;
f0b50732 2550
9ac27090 2551 list_for_each_entry(ns, &dev->namespaces, list) {
e1e5e564 2552 if (ns->disk->flags & GENHD_FL_UP) {
52b68d7e 2553 if (blk_get_integrity(ns->disk))
e1e5e564 2554 blk_integrity_unregister(ns->disk);
9ac27090 2555 del_gendisk(ns->disk);
e1e5e564 2556 }
cef6a948
KB
2557 if (!blk_queue_dying(ns->queue)) {
2558 blk_mq_abort_requeue_list(ns->queue);
9ac27090 2559 blk_cleanup_queue(ns->queue);
cef6a948 2560 }
b60503ba 2561 }
b60503ba
MW
2562}
2563
091b6092
MW
2564static int nvme_setup_prp_pools(struct nvme_dev *dev)
2565{
e75ec752 2566 dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
091b6092
MW
2567 PAGE_SIZE, PAGE_SIZE, 0);
2568 if (!dev->prp_page_pool)
2569 return -ENOMEM;
2570
99802a7a 2571 /* Optimisation for I/Os between 4k and 128k */
e75ec752 2572 dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
99802a7a
MW
2573 256, 256, 0);
2574 if (!dev->prp_small_pool) {
2575 dma_pool_destroy(dev->prp_page_pool);
2576 return -ENOMEM;
2577 }
091b6092
MW
2578 return 0;
2579}
2580
2581static void nvme_release_prp_pools(struct nvme_dev *dev)
2582{
2583 dma_pool_destroy(dev->prp_page_pool);
99802a7a 2584 dma_pool_destroy(dev->prp_small_pool);
091b6092
MW
2585}
2586
cd58ad7d
QSA
2587static DEFINE_IDA(nvme_instance_ida);
2588
2589static int nvme_set_instance(struct nvme_dev *dev)
b60503ba 2590{
cd58ad7d
QSA
2591 int instance, error;
2592
2593 do {
2594 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
2595 return -ENODEV;
2596
2597 spin_lock(&dev_list_lock);
2598 error = ida_get_new(&nvme_instance_ida, &instance);
2599 spin_unlock(&dev_list_lock);
2600 } while (error == -EAGAIN);
2601
2602 if (error)
2603 return -ENODEV;
2604
2605 dev->instance = instance;
2606 return 0;
b60503ba
MW
2607}
2608
2609static void nvme_release_instance(struct nvme_dev *dev)
2610{
cd58ad7d
QSA
2611 spin_lock(&dev_list_lock);
2612 ida_remove(&nvme_instance_ida, dev->instance);
2613 spin_unlock(&dev_list_lock);
b60503ba
MW
2614}
2615
9ac27090
KB
2616static void nvme_free_namespaces(struct nvme_dev *dev)
2617{
2618 struct nvme_ns *ns, *next;
2619
2620 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
2621 list_del(&ns->list);
9e60352c
KB
2622
2623 spin_lock(&dev_list_lock);
2624 ns->disk->private_data = NULL;
2625 spin_unlock(&dev_list_lock);
2626
9ac27090
KB
2627 put_disk(ns->disk);
2628 kfree(ns);
2629 }
2630}
2631
5e82e952
KB
2632static void nvme_free_dev(struct kref *kref)
2633{
2634 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
9ac27090 2635
e75ec752 2636 put_device(dev->dev);
b3fffdef 2637 put_device(dev->device);
9ac27090 2638 nvme_free_namespaces(dev);
285dffc9 2639 nvme_release_instance(dev);
a4aea562 2640 blk_mq_free_tag_set(&dev->tagset);
ea191d2f 2641 blk_put_queue(dev->admin_q);
5e82e952
KB
2642 kfree(dev->queues);
2643 kfree(dev->entry);
2644 kfree(dev);
2645}
2646
2647static int nvme_dev_open(struct inode *inode, struct file *f)
2648{
b3fffdef
KB
2649 struct nvme_dev *dev;
2650 int instance = iminor(inode);
2651 int ret = -ENODEV;
2652
2653 spin_lock(&dev_list_lock);
2654 list_for_each_entry(dev, &dev_list, node) {
2655 if (dev->instance == instance) {
2e1d8448
KB
2656 if (!dev->admin_q) {
2657 ret = -EWOULDBLOCK;
2658 break;
2659 }
b3fffdef
KB
2660 if (!kref_get_unless_zero(&dev->kref))
2661 break;
2662 f->private_data = dev;
2663 ret = 0;
2664 break;
2665 }
2666 }
2667 spin_unlock(&dev_list_lock);
2668
2669 return ret;
5e82e952
KB
2670}
2671
2672static int nvme_dev_release(struct inode *inode, struct file *f)
2673{
2674 struct nvme_dev *dev = f->private_data;
2675 kref_put(&dev->kref, nvme_free_dev);
2676 return 0;
2677}
2678
2679static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
2680{
2681 struct nvme_dev *dev = f->private_data;
a4aea562
MB
2682 struct nvme_ns *ns;
2683
5e82e952
KB
2684 switch (cmd) {
2685 case NVME_IOCTL_ADMIN_CMD:
a4aea562 2686 return nvme_user_cmd(dev, NULL, (void __user *)arg);
7963e521 2687 case NVME_IOCTL_IO_CMD:
a4aea562
MB
2688 if (list_empty(&dev->namespaces))
2689 return -ENOTTY;
2690 ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
2691 return nvme_user_cmd(dev, ns, (void __user *)arg);
5e82e952
KB
2692 default:
2693 return -ENOTTY;
2694 }
2695}
2696
2697static const struct file_operations nvme_dev_fops = {
2698 .owner = THIS_MODULE,
2699 .open = nvme_dev_open,
2700 .release = nvme_dev_release,
2701 .unlocked_ioctl = nvme_dev_ioctl,
2702 .compat_ioctl = nvme_dev_ioctl,
2703};
2704
a4aea562
MB
2705static void nvme_set_irq_hints(struct nvme_dev *dev)
2706{
2707 struct nvme_queue *nvmeq;
2708 int i;
2709
2710 for (i = 0; i < dev->online_queues; i++) {
2711 nvmeq = dev->queues[i];
2712
42483228 2713 if (!nvmeq->tags || !(*nvmeq->tags))
a4aea562
MB
2714 continue;
2715
2716 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
42483228 2717 blk_mq_tags_cpumask(*nvmeq->tags));
a4aea562
MB
2718 }
2719}
2720
f0b50732
KB
2721static int nvme_dev_start(struct nvme_dev *dev)
2722{
2723 int result;
b9afca3e 2724 bool start_thread = false;
f0b50732
KB
2725
2726 result = nvme_dev_map(dev);
2727 if (result)
2728 return result;
2729
2730 result = nvme_configure_admin_queue(dev);
2731 if (result)
2732 goto unmap;
2733
2734 spin_lock(&dev_list_lock);
b9afca3e
DM
2735 if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
2736 start_thread = true;
2737 nvme_thread = NULL;
2738 }
f0b50732
KB
2739 list_add(&dev->node, &dev_list);
2740 spin_unlock(&dev_list_lock);
2741
b9afca3e
DM
2742 if (start_thread) {
2743 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
387caa5a 2744 wake_up_all(&nvme_kthread_wait);
b9afca3e
DM
2745 } else
2746 wait_event_killable(nvme_kthread_wait, nvme_thread);
2747
2748 if (IS_ERR_OR_NULL(nvme_thread)) {
2749 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
2750 goto disable;
2751 }
a4aea562
MB
2752
2753 nvme_init_queue(dev->queues[0], 0);
0fb59cbc
KB
2754 result = nvme_alloc_admin_tags(dev);
2755 if (result)
2756 goto disable;
b9afca3e 2757
f0b50732 2758 result = nvme_setup_io_queues(dev);
badc34d4 2759 if (result)
0fb59cbc 2760 goto free_tags;
f0b50732 2761
a4aea562
MB
2762 nvme_set_irq_hints(dev);
2763
1efccc9d 2764 dev->event_limit = 1;
d82e8bfd 2765 return result;
f0b50732 2766
0fb59cbc
KB
2767 free_tags:
2768 nvme_dev_remove_admin(dev);
f0b50732 2769 disable:
a1a5ef99 2770 nvme_disable_queue(dev, 0);
b9afca3e 2771 nvme_dev_list_remove(dev);
f0b50732
KB
2772 unmap:
2773 nvme_dev_unmap(dev);
2774 return result;
2775}
2776
9a6b9458
KB
2777static int nvme_remove_dead_ctrl(void *arg)
2778{
2779 struct nvme_dev *dev = (struct nvme_dev *)arg;
e75ec752 2780 struct pci_dev *pdev = to_pci_dev(dev->dev);
9a6b9458
KB
2781
2782 if (pci_get_drvdata(pdev))
c81f4975 2783 pci_stop_and_remove_bus_device_locked(pdev);
9a6b9458
KB
2784 kref_put(&dev->kref, nvme_free_dev);
2785 return 0;
2786}
2787
2788static void nvme_remove_disks(struct work_struct *ws)
2789{
9a6b9458
KB
2790 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
2791
5a92e700 2792 nvme_free_queues(dev, 1);
302c6727 2793 nvme_dev_remove(dev);
9a6b9458
KB
2794}
2795
2796static int nvme_dev_resume(struct nvme_dev *dev)
2797{
2798 int ret;
2799
2800 ret = nvme_dev_start(dev);
badc34d4 2801 if (ret)
9a6b9458 2802 return ret;
badc34d4 2803 if (dev->online_queues < 2) {
9a6b9458 2804 spin_lock(&dev_list_lock);
9ca97374 2805 dev->reset_workfn = nvme_remove_disks;
9a6b9458
KB
2806 queue_work(nvme_workq, &dev->reset_work);
2807 spin_unlock(&dev_list_lock);
c9d3bf88
KB
2808 } else {
2809 nvme_unfreeze_queues(dev);
2810 nvme_set_irq_hints(dev);
9a6b9458
KB
2811 }
2812 return 0;
2813}
2814
2815static void nvme_dev_reset(struct nvme_dev *dev)
2816{
2817 nvme_dev_shutdown(dev);
2818 if (nvme_dev_resume(dev)) {
e75ec752 2819 dev_warn(dev->dev, "Device failed to resume\n");
9a6b9458
KB
2820 kref_get(&dev->kref);
2821 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
2822 dev->instance))) {
e75ec752 2823 dev_err(dev->dev,
9a6b9458
KB
2824 "Failed to start controller remove task\n");
2825 kref_put(&dev->kref, nvme_free_dev);
2826 }
2827 }
2828}
2829
2830static void nvme_reset_failed_dev(struct work_struct *ws)
2831{
2832 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
2833 nvme_dev_reset(dev);
2834}
2835
9ca97374
TH
2836static void nvme_reset_workfn(struct work_struct *work)
2837{
2838 struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
2839 dev->reset_workfn(work);
2840}
2841
2e1d8448 2842static void nvme_async_probe(struct work_struct *work);
8d85fce7 2843static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
b60503ba 2844{
a4aea562 2845 int node, result = -ENOMEM;
b60503ba
MW
2846 struct nvme_dev *dev;
2847
a4aea562
MB
2848 node = dev_to_node(&pdev->dev);
2849 if (node == NUMA_NO_NODE)
2850 set_dev_node(&pdev->dev, 0);
2851
2852 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
b60503ba
MW
2853 if (!dev)
2854 return -ENOMEM;
a4aea562
MB
2855 dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
2856 GFP_KERNEL, node);
b60503ba
MW
2857 if (!dev->entry)
2858 goto free;
a4aea562
MB
2859 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
2860 GFP_KERNEL, node);
b60503ba
MW
2861 if (!dev->queues)
2862 goto free;
2863
2864 INIT_LIST_HEAD(&dev->namespaces);
9ca97374
TH
2865 dev->reset_workfn = nvme_reset_failed_dev;
2866 INIT_WORK(&dev->reset_work, nvme_reset_workfn);
e75ec752 2867 dev->dev = get_device(&pdev->dev);
9a6b9458 2868 pci_set_drvdata(pdev, dev);
cd58ad7d
QSA
2869 result = nvme_set_instance(dev);
2870 if (result)
a96d4f5c 2871 goto put_pci;
b60503ba 2872
091b6092
MW
2873 result = nvme_setup_prp_pools(dev);
2874 if (result)
0877cb0d 2875 goto release;
091b6092 2876
fb35e914 2877 kref_init(&dev->kref);
b3fffdef
KB
2878 dev->device = device_create(nvme_class, &pdev->dev,
2879 MKDEV(nvme_char_major, dev->instance),
2880 dev, "nvme%d", dev->instance);
2881 if (IS_ERR(dev->device)) {
2882 result = PTR_ERR(dev->device);
2e1d8448 2883 goto release_pools;
b3fffdef
KB
2884 }
2885 get_device(dev->device);
740216fc 2886
e6e96d73 2887 INIT_LIST_HEAD(&dev->node);
2e1d8448
KB
2888 INIT_WORK(&dev->probe_work, nvme_async_probe);
2889 schedule_work(&dev->probe_work);
b60503ba
MW
2890 return 0;
2891
0877cb0d 2892 release_pools:
091b6092 2893 nvme_release_prp_pools(dev);
0877cb0d
KB
2894 release:
2895 nvme_release_instance(dev);
a96d4f5c 2896 put_pci:
e75ec752 2897 put_device(dev->dev);
b60503ba
MW
2898 free:
2899 kfree(dev->queues);
2900 kfree(dev->entry);
2901 kfree(dev);
2902 return result;
2903}
2904
2e1d8448
KB
2905static void nvme_async_probe(struct work_struct *work)
2906{
2907 struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
2908 int result;
2909
2910 result = nvme_dev_start(dev);
2911 if (result)
2912 goto reset;
2913
2914 if (dev->online_queues > 1)
2915 result = nvme_dev_add(dev);
2916 if (result)
2917 goto reset;
2918
2919 nvme_set_irq_hints(dev);
2e1d8448
KB
2920 return;
2921 reset:
07836e65
KB
2922 if (!work_busy(&dev->reset_work)) {
2923 dev->reset_workfn = nvme_reset_failed_dev;
2924 queue_work(nvme_workq, &dev->reset_work);
2925 }
2e1d8448
KB
2926}
2927
f0d54a54
KB
2928static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
2929{
a6739479 2930 struct nvme_dev *dev = pci_get_drvdata(pdev);
f0d54a54 2931
a6739479
KB
2932 if (prepare)
2933 nvme_dev_shutdown(dev);
2934 else
2935 nvme_dev_resume(dev);
f0d54a54
KB
2936}
2937
09ece142
KB
2938static void nvme_shutdown(struct pci_dev *pdev)
2939{
2940 struct nvme_dev *dev = pci_get_drvdata(pdev);
2941 nvme_dev_shutdown(dev);
2942}
2943
8d85fce7 2944static void nvme_remove(struct pci_dev *pdev)
b60503ba
MW
2945{
2946 struct nvme_dev *dev = pci_get_drvdata(pdev);
9a6b9458
KB
2947
2948 spin_lock(&dev_list_lock);
2949 list_del_init(&dev->node);
2950 spin_unlock(&dev_list_lock);
2951
2952 pci_set_drvdata(pdev, NULL);
2e1d8448 2953 flush_work(&dev->probe_work);
9a6b9458 2954 flush_work(&dev->reset_work);
9a6b9458 2955 nvme_dev_shutdown(dev);
c9d3bf88 2956 nvme_dev_remove(dev);
a4aea562 2957 nvme_dev_remove_admin(dev);
b3fffdef 2958 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
a1a5ef99 2959 nvme_free_queues(dev, 0);
9a6b9458 2960 nvme_release_prp_pools(dev);
5e82e952 2961 kref_put(&dev->kref, nvme_free_dev);
b60503ba
MW
2962}
2963
2964/* These functions are yet to be implemented */
2965#define nvme_error_detected NULL
2966#define nvme_dump_registers NULL
2967#define nvme_link_reset NULL
2968#define nvme_slot_reset NULL
2969#define nvme_error_resume NULL
cd638946 2970
671a6018 2971#ifdef CONFIG_PM_SLEEP
cd638946
KB
2972static int nvme_suspend(struct device *dev)
2973{
2974 struct pci_dev *pdev = to_pci_dev(dev);
2975 struct nvme_dev *ndev = pci_get_drvdata(pdev);
2976
2977 nvme_dev_shutdown(ndev);
2978 return 0;
2979}
2980
2981static int nvme_resume(struct device *dev)
2982{
2983 struct pci_dev *pdev = to_pci_dev(dev);
2984 struct nvme_dev *ndev = pci_get_drvdata(pdev);
cd638946 2985
9a6b9458 2986 if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
9ca97374 2987 ndev->reset_workfn = nvme_reset_failed_dev;
9a6b9458
KB
2988 queue_work(nvme_workq, &ndev->reset_work);
2989 }
2990 return 0;
cd638946 2991}
671a6018 2992#endif
cd638946
KB
2993
2994static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
b60503ba 2995
1d352035 2996static const struct pci_error_handlers nvme_err_handler = {
b60503ba
MW
2997 .error_detected = nvme_error_detected,
2998 .mmio_enabled = nvme_dump_registers,
2999 .link_reset = nvme_link_reset,
3000 .slot_reset = nvme_slot_reset,
3001 .resume = nvme_error_resume,
f0d54a54 3002 .reset_notify = nvme_reset_notify,
b60503ba
MW
3003};
3004
3005/* Move to pci_ids.h later */
3006#define PCI_CLASS_STORAGE_EXPRESS 0x010802
3007
6eb0d698 3008static const struct pci_device_id nvme_id_table[] = {
b60503ba
MW
3009 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
3010 { 0, }
3011};
3012MODULE_DEVICE_TABLE(pci, nvme_id_table);
3013
3014static struct pci_driver nvme_driver = {
3015 .name = "nvme",
3016 .id_table = nvme_id_table,
3017 .probe = nvme_probe,
8d85fce7 3018 .remove = nvme_remove,
09ece142 3019 .shutdown = nvme_shutdown,
cd638946
KB
3020 .driver = {
3021 .pm = &nvme_dev_pm_ops,
3022 },
b60503ba
MW
3023 .err_handler = &nvme_err_handler,
3024};
3025
3026static int __init nvme_init(void)
3027{
0ac13140 3028 int result;
1fa6aead 3029
b9afca3e 3030 init_waitqueue_head(&nvme_kthread_wait);
b60503ba 3031
9a6b9458
KB
3032 nvme_workq = create_singlethread_workqueue("nvme");
3033 if (!nvme_workq)
b9afca3e 3034 return -ENOMEM;
9a6b9458 3035
5c42ea16
KB
3036 result = register_blkdev(nvme_major, "nvme");
3037 if (result < 0)
9a6b9458 3038 goto kill_workq;
5c42ea16 3039 else if (result > 0)
0ac13140 3040 nvme_major = result;
b60503ba 3041
b3fffdef
KB
3042 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
3043 &nvme_dev_fops);
3044 if (result < 0)
3045 goto unregister_blkdev;
3046 else if (result > 0)
3047 nvme_char_major = result;
3048
3049 nvme_class = class_create(THIS_MODULE, "nvme");
c727040b
AK
3050 if (IS_ERR(nvme_class)) {
3051 result = PTR_ERR(nvme_class);
b3fffdef 3052 goto unregister_chrdev;
c727040b 3053 }
b3fffdef 3054
f3db22fe
KB
3055 result = pci_register_driver(&nvme_driver);
3056 if (result)
b3fffdef 3057 goto destroy_class;
1fa6aead 3058 return 0;
b60503ba 3059
b3fffdef
KB
3060 destroy_class:
3061 class_destroy(nvme_class);
3062 unregister_chrdev:
3063 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1fa6aead 3064 unregister_blkdev:
b60503ba 3065 unregister_blkdev(nvme_major, "nvme");
9a6b9458
KB
3066 kill_workq:
3067 destroy_workqueue(nvme_workq);
b60503ba
MW
3068 return result;
3069}
3070
3071static void __exit nvme_exit(void)
3072{
3073 pci_unregister_driver(&nvme_driver);
3074 unregister_blkdev(nvme_major, "nvme");
9a6b9458 3075 destroy_workqueue(nvme_workq);
b3fffdef
KB
3076 class_destroy(nvme_class);
3077 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
b9afca3e 3078 BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
21bd78bc 3079 _nvme_check_size();
b60503ba
MW
3080}
3081
3082MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
3083MODULE_LICENSE("GPL");
c78b4713 3084MODULE_VERSION("1.0");
b60503ba
MW
3085module_init(nvme_init);
3086module_exit(nvme_exit);