nvme: fix another 32-bit build warning
[linux-2.6-block.git] drivers/nvme/host/core.c
1 /*
2  * NVM Express device driver
3  * Copyright (c) 2011-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14
15 #include <linux/blkdev.h>
16 #include <linux/blk-mq.h>
17 #include <linux/delay.h>
18 #include <linux/errno.h>
19 #include <linux/hdreg.h>
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list_sort.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/pr.h>
26 #include <linux/ptrace.h>
27 #include <linux/nvme_ioctl.h>
28 #include <linux/t10-pi.h>
29 #include <scsi/sg.h>
30 #include <asm/unaligned.h>
31
32 #include "nvme.h"
33
34 #define NVME_MINORS             (1U << MINORBITS)
35
36 static int nvme_major;
37 module_param(nvme_major, int, 0);
38
39 static int nvme_char_major;
40 module_param(nvme_char_major, int, 0);
41
42 static LIST_HEAD(nvme_ctrl_list);
43 DEFINE_SPINLOCK(dev_list_lock);
44
45 static struct class *nvme_class;
46
47 static void nvme_free_ns(struct kref *kref)
48 {
49         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
50
51         if (ns->type == NVME_NS_LIGHTNVM)
52                 nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
53
54         spin_lock(&dev_list_lock);
55         ns->disk->private_data = NULL;
56         spin_unlock(&dev_list_lock);
57
58         nvme_put_ctrl(ns->ctrl);
59         put_disk(ns->disk);
60         kfree(ns);
61 }
62
63 static void nvme_put_ns(struct nvme_ns *ns)
64 {
65         kref_put(&ns->kref, nvme_free_ns);
66 }
67
68 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
69 {
70         struct nvme_ns *ns;
71
72         spin_lock(&dev_list_lock);
73         ns = disk->private_data;
74         if (ns && !kref_get_unless_zero(&ns->kref))
75                 ns = NULL;
76         spin_unlock(&dev_list_lock);
77
78         return ns;
79 }
80
81 struct request *nvme_alloc_request(struct request_queue *q,
82                 struct nvme_command *cmd, unsigned int flags)
83 {
84         bool write = cmd->common.opcode & 1;
85         struct request *req;
86
87         req = blk_mq_alloc_request(q, write, flags);
88         if (IS_ERR(req))
89                 return req;
90
91         req->cmd_type = REQ_TYPE_DRV_PRIV;
92         req->cmd_flags |= REQ_FAILFAST_DRIVER;
93         req->__data_len = 0;
94         req->__sector = (sector_t) -1;
95         req->bio = req->biotail = NULL;
96
97         req->cmd = (unsigned char *)cmd;
98         req->cmd_len = sizeof(struct nvme_command);
99         req->special = (void *)0;
100
101         return req;
102 }
103
104 /*
105  * Returns 0 on success.  If the result is negative, it's a Linux error code;
106  * if the result is positive, it's an NVM Express status code.
107  */
108 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
109                 void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
110 {
111         struct request *req;
112         int ret;
113
114         req = nvme_alloc_request(q, cmd, 0);
115         if (IS_ERR(req))
116                 return PTR_ERR(req);
117
118         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
119
120         if (buffer && bufflen) {
121                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
122                 if (ret)
123                         goto out;
124         }
125
126         blk_execute_rq(req->q, NULL, req, 0);
127         if (result)
128                 *result = (u32)(uintptr_t)req->special;
129         ret = req->errors;
130  out:
131         blk_mq_free_request(req);
132         return ret;
133 }
134
135 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
136                 void *buffer, unsigned bufflen)
137 {
138         return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
139 }
140
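/*
 * Like __nvme_submit_sync_cmd(), but the data comes from a user space buffer
 * and an optional user space metadata buffer may be attached to the request
 * as a bio integrity payload.
 */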
141 int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
142                 void __user *ubuffer, unsigned bufflen,
143                 void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
144                 u32 *result, unsigned timeout)
145 {
146         bool write = cmd->common.opcode & 1;
147         struct nvme_ns *ns = q->queuedata;
148         struct gendisk *disk = ns ? ns->disk : NULL;
149         struct request *req;
150         struct bio *bio = NULL;
151         void *meta = NULL;
152         int ret;
153
154         req = nvme_alloc_request(q, cmd, 0);
155         if (IS_ERR(req))
156                 return PTR_ERR(req);
157
158         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
159
160         if (ubuffer && bufflen) {
161                 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
162                                 GFP_KERNEL);
163                 if (ret)
164                         goto out;
165                 bio = req->bio;
166
167                 if (!disk)
168                         goto submit;
169                 bio->bi_bdev = bdget_disk(disk, 0);
170                 if (!bio->bi_bdev) {
171                         ret = -ENODEV;
172                         goto out_unmap;
173                 }
174
175                 if (meta_buffer) {
176                         struct bio_integrity_payload *bip;
177
178                         meta = kmalloc(meta_len, GFP_KERNEL);
179                         if (!meta) {
180                                 ret = -ENOMEM;
181                                 goto out_unmap;
182                         }
183
184                         if (write) {
185                                 if (copy_from_user(meta, meta_buffer,
186                                                 meta_len)) {
187                                         ret = -EFAULT;
188                                         goto out_free_meta;
189                                 }
190                         }
191
192                         bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
193                         if (IS_ERR(bip)) {
194                                 ret = PTR_ERR(bip);
195                                 goto out_free_meta;
196                         }
197
198                         bip->bip_iter.bi_size = meta_len;
199                         bip->bip_iter.bi_sector = meta_seed;
200
201                         ret = bio_integrity_add_page(bio, virt_to_page(meta),
202                                         meta_len, offset_in_page(meta));
203                         if (ret != meta_len) {
204                                 ret = -ENOMEM;
205                                 goto out_free_meta;
206                         }
207                 }
208         }
209  submit:
210         blk_execute_rq(req->q, disk, req, 0);
211         ret = req->errors;
212         if (result)
213                 *result = (u32)(uintptr_t)req->special;
214         if (meta && !ret && !write) {
215                 if (copy_to_user(meta_buffer, meta, meta_len))
216                         ret = -EFAULT;
217         }
218  out_free_meta:
219         kfree(meta);
220  out_unmap:
221         if (bio) {
222                 if (disk && bio->bi_bdev)
223                         bdput(bio->bi_bdev);
224                 blk_rq_unmap_user(bio);
225         }
226  out:
227         blk_mq_free_request(req);
228         return ret;
229 }
230
231 int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
232                 void __user *ubuffer, unsigned bufflen, u32 *result,
233                 unsigned timeout)
234 {
235         return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
236                         result, timeout);
237 }
238
239 int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
240 {
241         struct nvme_command c = { };
242         int error;
243
244         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
245         c.identify.opcode = nvme_admin_identify;
246         c.identify.cns = cpu_to_le32(1);
247
248         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
249         if (!*id)
250                 return -ENOMEM;
251
252         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
253                         sizeof(struct nvme_id_ctrl));
254         if (error)
255                 kfree(*id);
256         return error;
257 }
258
259 int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
260                 struct nvme_id_ns **id)
261 {
262         struct nvme_command c = { };
263         int error;
264
265         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
266         c.identify.opcode = nvme_admin_identify;
267         c.identify.nsid = cpu_to_le32(nsid);
268
269         *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
270         if (!*id)
271                 return -ENOMEM;
272
273         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
274                         sizeof(struct nvme_id_ns));
275         if (error)
276                 kfree(*id);
277         return error;
278 }
279
280 int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
281                                         dma_addr_t dma_addr, u32 *result)
282 {
283         struct nvme_command c;
284
285         memset(&c, 0, sizeof(c));
286         c.features.opcode = nvme_admin_get_features;
287         c.features.nsid = cpu_to_le32(nsid);
288         c.features.prp1 = cpu_to_le64(dma_addr);
289         c.features.fid = cpu_to_le32(fid);
290
291         return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
292 }
293
294 int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
295                                         dma_addr_t dma_addr, u32 *result)
296 {
297         struct nvme_command c;
298
299         memset(&c, 0, sizeof(c));
300         c.features.opcode = nvme_admin_set_features;
301         c.features.prp1 = cpu_to_le64(dma_addr);
302         c.features.fid = cpu_to_le32(fid);
303         c.features.dword11 = cpu_to_le32(dword11);
304
305         return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
306 }
307
308 int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
309 {
310         struct nvme_command c = { };
311         int error;
312
313         c.common.opcode = nvme_admin_get_log_page;
314         c.common.nsid = cpu_to_le32(0xFFFFFFFF);
315         c.common.cdw10[0] = cpu_to_le32(
316                         (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
317                          NVME_LOG_SMART);
318
319         *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
320         if (!*log)
321                 return -ENOMEM;
322
323         error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
324                         sizeof(struct nvme_smart_log));
325         if (error)
326                 kfree(*log);
327         return error;
328 }
329
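/*
 * Set the Number of Queues feature: the requested submission and completion
 * queue counts are passed 0's based in the lower and upper halves of the
 * feature value, and the controller reports back how many of each it
 * actually allocated.
 */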
330 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
331 {
332         u32 q_count = (*count - 1) | ((*count - 1) << 16);
333         u32 result;
334         int status, nr_io_queues;
335
336         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
337                         &result);
338         if (status)
339                 return status;
340
341         nr_io_queues = min(result & 0xffff, result >> 16) + 1;
342         *count = min(*count, nr_io_queues);
343         return 0;
344 }
345
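/*
 * Handle the NVME_IOCTL_SUBMIT_IO ioctl: translate the user space
 * nvme_user_io into an NVMe read/write/compare command and map the user
 * data and metadata buffers for the transfer.
 */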
346 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
347 {
348         struct nvme_user_io io;
349         struct nvme_command c;
350         unsigned length, meta_len;
351         void __user *metadata;
352
353         if (copy_from_user(&io, uio, sizeof(io)))
354                 return -EFAULT;
355
356         switch (io.opcode) {
357         case nvme_cmd_write:
358         case nvme_cmd_read:
359         case nvme_cmd_compare:
360                 break;
361         default:
362                 return -EINVAL;
363         }
364
365         length = (io.nblocks + 1) << ns->lba_shift;
366         meta_len = (io.nblocks + 1) * ns->ms;
367         metadata = (void __user *)(uintptr_t)io.metadata;
368
369         if (ns->ext) {
370                 length += meta_len;
371                 meta_len = 0;
372         } else if (meta_len) {
373                 if ((io.metadata & 3) || !io.metadata)
374                         return -EINVAL;
375         }
376
377         memset(&c, 0, sizeof(c));
378         c.rw.opcode = io.opcode;
379         c.rw.flags = io.flags;
380         c.rw.nsid = cpu_to_le32(ns->ns_id);
381         c.rw.slba = cpu_to_le64(io.slba);
382         c.rw.length = cpu_to_le16(io.nblocks);
383         c.rw.control = cpu_to_le16(io.control);
384         c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
385         c.rw.reftag = cpu_to_le32(io.reftag);
386         c.rw.apptag = cpu_to_le16(io.apptag);
387         c.rw.appmask = cpu_to_le16(io.appmask);
388
389         return __nvme_submit_user_cmd(ns->queue, &c,
390                         (void __user *)(uintptr_t)io.addr, length,
391                         metadata, meta_len, io.slba, NULL, 0);
392 }
393
394 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
395                         struct nvme_passthru_cmd __user *ucmd)
396 {
397         struct nvme_passthru_cmd cmd;
398         struct nvme_command c;
399         unsigned timeout = 0;
400         int status;
401
402         if (!capable(CAP_SYS_ADMIN))
403                 return -EACCES;
404         if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
405                 return -EFAULT;
406
407         memset(&c, 0, sizeof(c));
408         c.common.opcode = cmd.opcode;
409         c.common.flags = cmd.flags;
410         c.common.nsid = cpu_to_le32(cmd.nsid);
411         c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
412         c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
413         c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
414         c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
415         c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
416         c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
417         c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
418         c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
419
420         if (cmd.timeout_ms)
421                 timeout = msecs_to_jiffies(cmd.timeout_ms);
422
423         status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
424                         (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
425                         &cmd.result, timeout);
426         if (status >= 0) {
427                 if (put_user(cmd.result, &ucmd->result))
428                         return -EFAULT;
429         }
430
431         return status;
432 }
433
434 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
435                 unsigned int cmd, unsigned long arg)
436 {
437         struct nvme_ns *ns = bdev->bd_disk->private_data;
438
439         switch (cmd) {
440         case NVME_IOCTL_ID:
441                 force_successful_syscall_return();
442                 return ns->ns_id;
443         case NVME_IOCTL_ADMIN_CMD:
444                 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
445         case NVME_IOCTL_IO_CMD:
446                 return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
447         case NVME_IOCTL_SUBMIT_IO:
448                 return nvme_submit_io(ns, (void __user *)arg);
449         case SG_GET_VERSION_NUM:
450                 return nvme_sg_get_version_num((void __user *)arg);
451         case SG_IO:
452                 return nvme_sg_io(ns, (void __user *)arg);
453         default:
454                 return -ENOTTY;
455         }
456 }
457
458 #ifdef CONFIG_COMPAT
459 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
460                         unsigned int cmd, unsigned long arg)
461 {
462         switch (cmd) {
463         case SG_IO:
464                 return -ENOIOCTLCMD;
465         }
466         return nvme_ioctl(bdev, mode, cmd, arg);
467 }
468 #else
469 #define nvme_compat_ioctl       NULL
470 #endif
471
472 static int nvme_open(struct block_device *bdev, fmode_t mode)
473 {
474         return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
475 }
476
477 static void nvme_release(struct gendisk *disk, fmode_t mode)
478 {
479         nvme_put_ns(disk->private_data);
480 }
481
482 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
483 {
484         /* some standard values */
485         geo->heads = 1 << 6;
486         geo->sectors = 1 << 5;
487         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
488         return 0;
489 }
490
491 #ifdef CONFIG_BLK_DEV_INTEGRITY
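/*
 * Register a blk-integrity profile for the namespace, using the T10 PI CRC
 * profile that matches its end-to-end protection type (Type 1/2 vs Type 3).
 */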
492 static void nvme_init_integrity(struct nvme_ns *ns)
493 {
494         struct blk_integrity integrity;
495
496         switch (ns->pi_type) {
497         case NVME_NS_DPS_PI_TYPE3:
498                 integrity.profile = &t10_pi_type3_crc;
499                 break;
500         case NVME_NS_DPS_PI_TYPE1:
501         case NVME_NS_DPS_PI_TYPE2:
502                 integrity.profile = &t10_pi_type1_crc;
503                 break;
504         default:
505                 integrity.profile = NULL;
506                 break;
507         }
508         integrity.tuple_size = ns->ms;
509         blk_integrity_register(ns->disk, &integrity);
510         blk_queue_max_integrity_segments(ns->queue, 1);
511 }
512 #else
513 static void nvme_init_integrity(struct nvme_ns *ns)
514 {
515 }
516 #endif /* CONFIG_BLK_DEV_INTEGRITY */
517
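/*
 * Advertise discard (DSM deallocate) support to the block layer, with both
 * discard alignment and granularity set to the logical block size.
 */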
518 static void nvme_config_discard(struct nvme_ns *ns)
519 {
520         u32 logical_block_size = queue_logical_block_size(ns->queue);
521         ns->queue->limits.discard_zeroes_data = 0;
522         ns->queue->limits.discard_alignment = logical_block_size;
523         ns->queue->limits.discard_granularity = logical_block_size;
524         blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
525         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
526 }
527
528 static int nvme_revalidate_disk(struct gendisk *disk)
529 {
530         struct nvme_ns *ns = disk->private_data;
531         struct nvme_id_ns *id;
532         u8 lbaf, pi_type;
533         u16 old_ms;
534         unsigned short bs;
535
536         if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
537                 dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
538                                 __func__, ns->ctrl->instance, ns->ns_id);
539                 return -ENODEV;
540         }
541         if (id->ncap == 0) {
542                 kfree(id);
543                 return -ENODEV;
544         }
545
546         if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
547                 if (nvme_nvm_register(ns->queue, disk->disk_name)) {
548                         dev_warn(ns->ctrl->dev,
549                                 "%s: LightNVM init failure\n", __func__);
550                         kfree(id);
551                         return -ENODEV;
552                 }
553                 ns->type = NVME_NS_LIGHTNVM;
554         }
555
556         old_ms = ns->ms;
557         lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
558         ns->lba_shift = id->lbaf[lbaf].ds;
559         ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
560         ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
561
562         /*
563          * If identify namespace failed, use a default 512 byte block size so
564          * the block layer can use the disk before failing reads/writes for 0 capacity.
565          */
566         if (ns->lba_shift == 0)
567                 ns->lba_shift = 9;
568         bs = 1 << ns->lba_shift;
569
570         /* XXX: PI implementation requires metadata equal to the t10 pi tuple size */
571         pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
572                                         id->dps & NVME_NS_DPS_PI_MASK : 0;
573
574         blk_mq_freeze_queue(disk->queue);
575         if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
576                                 ns->ms != old_ms ||
577                                 bs != queue_logical_block_size(disk->queue) ||
578                                 (ns->ms && ns->ext)))
579                 blk_integrity_unregister(disk);
580
581         ns->pi_type = pi_type;
582         blk_queue_logical_block_size(ns->queue, bs);
583
584         if (ns->ms && !ns->ext)
585                 nvme_init_integrity(ns);
586
587         if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
588                 set_capacity(disk, 0);
589         else
590                 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
591
592         if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
593                 nvme_config_discard(ns);
594         blk_mq_unfreeze_queue(disk->queue);
595
596         kfree(id);
597         return 0;
598 }
599
600 static char nvme_pr_type(enum pr_type type)
601 {
602         switch (type) {
603         case PR_WRITE_EXCLUSIVE:
604                 return 1;
605         case PR_EXCLUSIVE_ACCESS:
606                 return 2;
607         case PR_WRITE_EXCLUSIVE_REG_ONLY:
608                 return 3;
609         case PR_EXCLUSIVE_ACCESS_REG_ONLY:
610                 return 4;
611         case PR_WRITE_EXCLUSIVE_ALL_REGS:
612                 return 5;
613         case PR_EXCLUSIVE_ACCESS_ALL_REGS:
614                 return 6;
615         default:
616                 return 0;
617         }
618 }
619
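/*
 * Build and submit an NVMe reservation command: the reservation action and
 * type are carried in cdw10, and the current and new keys are passed as a
 * 16-byte little-endian data payload.
 */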
620 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
621                                 u64 key, u64 sa_key, u8 op)
622 {
623         struct nvme_ns *ns = bdev->bd_disk->private_data;
624         struct nvme_command c;
625         u8 data[16] = { 0, };
626
627         put_unaligned_le64(key, &data[0]);
628         put_unaligned_le64(sa_key, &data[8]);
629
630         memset(&c, 0, sizeof(c));
631         c.common.opcode = op;
632         c.common.nsid = cpu_to_le32(ns->ns_id);
633         c.common.cdw10[0] = cpu_to_le32(cdw10);
634
635         return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
636 }
637
638 static int nvme_pr_register(struct block_device *bdev, u64 old,
639                 u64 new, unsigned flags)
640 {
641         u32 cdw10;
642
643         if (flags & ~PR_FL_IGNORE_KEY)
644                 return -EOPNOTSUPP;
645
646         cdw10 = old ? 2 : 0;
647         cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
648         cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
649         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
650 }
651
652 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
653                 enum pr_type type, unsigned flags)
654 {
655         u32 cdw10;
656
657         if (flags & ~PR_FL_IGNORE_KEY)
658                 return -EOPNOTSUPP;
659
660         cdw10 = nvme_pr_type(type) << 8;
661         cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
662         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
663 }
664
665 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
666                 enum pr_type type, bool abort)
667 {
668         u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
669         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
670 }
671
672 static int nvme_pr_clear(struct block_device *bdev, u64 key)
673 {
674         u32 cdw10 = 1 | (key ? 1 << 3 : 0);
675         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
676 }
677
678 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
679 {
680         u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
681         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
682 }
683
684 static const struct pr_ops nvme_pr_ops = {
685         .pr_register    = nvme_pr_register,
686         .pr_reserve     = nvme_pr_reserve,
687         .pr_release     = nvme_pr_release,
688         .pr_preempt     = nvme_pr_preempt,
689         .pr_clear       = nvme_pr_clear,
690 };
691
692 static const struct block_device_operations nvme_fops = {
693         .owner          = THIS_MODULE,
694         .ioctl          = nvme_ioctl,
695         .compat_ioctl   = nvme_compat_ioctl,
696         .open           = nvme_open,
697         .release        = nvme_release,
698         .getgeo         = nvme_getgeo,
699         .revalidate_disk= nvme_revalidate_disk,
700         .pr_ops         = &nvme_pr_ops,
701 };
702
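/*
 * Poll CSTS.RDY until it reaches the expected state, giving up once the
 * timeout advertised in CAP.TO (in 500 millisecond units) has elapsed.
 */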
703 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
704 {
705         unsigned long timeout =
706                 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
707         u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
708         int ret;
709
710         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
711                 if ((csts & NVME_CSTS_RDY) == bit)
712                         break;
713
714                 msleep(100);
715                 if (fatal_signal_pending(current))
716                         return -EINTR;
717                 if (time_after(jiffies, timeout)) {
718                         dev_err(ctrl->dev,
719                                 "Device not ready; aborting %s\n", enabled ?
720                                                 "initialisation" : "reset");
721                         return -ENODEV;
722                 }
723         }
724
725         return ret;
726 }
727
728 /*
729  * If the device has been passed off to us in an enabled state, just clear
730  * the enabled bit.  The spec says we should set the 'shutdown notification
731  * bits', but doing so may cause the device to complete commands to the
732  * admin queue ... and we don't know what memory that might be pointing at!
733  */
734 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
735 {
736         int ret;
737
738         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
739         ctrl->ctrl_config &= ~NVME_CC_ENABLE;
740
741         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
742         if (ret)
743                 return ret;
744         return nvme_wait_ready(ctrl, cap, false);
745 }
746
747 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
748 {
749         /*
750          * Default to a 4K page size, with the intention to update this
751          * path in the future to accommodate architectures with differing
752          * kernel and IO page sizes.
753          */
754         unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
755         int ret;
756
757         if (page_shift < dev_page_min) {
758                 dev_err(ctrl->dev,
759                         "Minimum device page size %u too large for host (%u)\n",
760                         1 << dev_page_min, 1 << page_shift);
761                 return -ENODEV;
762         }
763
764         ctrl->page_size = 1 << page_shift;
765
766         ctrl->ctrl_config = NVME_CC_CSS_NVM;
767         ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
768         ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
769         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
770         ctrl->ctrl_config |= NVME_CC_ENABLE;
771
772         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
773         if (ret)
774                 return ret;
775         return nvme_wait_ready(ctrl, cap, true);
776 }
777
778 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
779 {
780         unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
781         u32 csts;
782         int ret;
783
784         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
785         ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
786
787         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
788         if (ret)
789                 return ret;
790
791         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
792                 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
793                         break;
794
795                 msleep(100);
796                 if (fatal_signal_pending(current))
797                         return -EINTR;
798                 if (time_after(jiffies, timeout)) {
799                         dev_err(ctrl->dev,
800                                 "Device shutdown incomplete; abort shutdown\n");
801                         return -ENODEV;
802                 }
803         }
804
805         return ret;
806 }
807
808 /*
809  * Initialize the cached copies of the Identify data and various controller
810  * registers in our nvme_ctrl structure.  This should be called as soon as
811  * the admin queue is fully up and running.
812  */
813 int nvme_init_identify(struct nvme_ctrl *ctrl)
814 {
815         struct nvme_id_ctrl *id;
816         u64 cap;
817         int ret, page_shift;
818
819         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
820         if (ret) {
821                 dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
822                 return ret;
823         }
824
825         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
826         if (ret) {
827                 dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
828                 return ret;
829         }
830         page_shift = NVME_CAP_MPSMIN(cap) + 12;
831
832         if (ctrl->vs >= NVME_VS(1, 1))
833                 ctrl->subsystem = NVME_CAP_NSSRC(cap);
834
835         ret = nvme_identify_ctrl(ctrl, &id);
836         if (ret) {
837                 dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
838                 return -EIO;
839         }
840
841         ctrl->oncs = le16_to_cpup(&id->oncs);
842         ctrl->abort_limit = id->acl + 1;
843         ctrl->vwc = id->vwc;
844         memcpy(ctrl->serial, id->sn, sizeof(id->sn));
845         memcpy(ctrl->model, id->mn, sizeof(id->mn));
846         memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
847         if (id->mdts)
848                 ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
849         else
850                 ctrl->max_hw_sectors = UINT_MAX;
851
852         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
853                 unsigned int max_hw_sectors;
854
855                 ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
856                 max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
857                 if (ctrl->max_hw_sectors) {
858                         ctrl->max_hw_sectors = min(max_hw_sectors,
859                                                         ctrl->max_hw_sectors);
860                 } else {
861                         ctrl->max_hw_sectors = max_hw_sectors;
862                 }
863         }
864
865         kfree(id);
866         return 0;
867 }
868
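/*
 * Open of the per-controller character device: look the controller up by
 * the minor number (its instance) and take a reference on it for this file.
 */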
869 static int nvme_dev_open(struct inode *inode, struct file *file)
870 {
871         struct nvme_ctrl *ctrl;
872         int instance = iminor(inode);
873         int ret = -ENODEV;
874
875         spin_lock(&dev_list_lock);
876         list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
877                 if (ctrl->instance != instance)
878                         continue;
879
880                 if (!ctrl->admin_q) {
881                         ret = -EWOULDBLOCK;
882                         break;
883                 }
884                 if (!kref_get_unless_zero(&ctrl->kref))
885                         break;
886                 file->private_data = ctrl;
887                 ret = 0;
888                 break;
889         }
890         spin_unlock(&dev_list_lock);
891
892         return ret;
893 }
894
895 static int nvme_dev_release(struct inode *inode, struct file *file)
896 {
897         nvme_put_ctrl(file->private_data);
898         return 0;
899 }
900
901 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
902                 unsigned long arg)
903 {
904         struct nvme_ctrl *ctrl = file->private_data;
905         void __user *argp = (void __user *)arg;
906         struct nvme_ns *ns;
907
908         switch (cmd) {
909         case NVME_IOCTL_ADMIN_CMD:
910                 return nvme_user_cmd(ctrl, NULL, argp);
911         case NVME_IOCTL_IO_CMD:
912                 if (list_empty(&ctrl->namespaces))
913                         return -ENOTTY;
914                 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
915                 return nvme_user_cmd(ctrl, ns, argp);
916         case NVME_IOCTL_RESET:
917                 dev_warn(ctrl->dev, "resetting controller\n");
918                 return ctrl->ops->reset_ctrl(ctrl);
919         case NVME_IOCTL_SUBSYS_RESET:
920                 return nvme_reset_subsystem(ctrl);
921         default:
922                 return -ENOTTY;
923         }
924 }
925
926 static const struct file_operations nvme_dev_fops = {
927         .owner          = THIS_MODULE,
928         .open           = nvme_dev_open,
929         .release        = nvme_dev_release,
930         .unlocked_ioctl = nvme_dev_ioctl,
931         .compat_ioctl   = nvme_dev_ioctl,
932 };
933
934 static ssize_t nvme_sysfs_reset(struct device *dev,
935                                 struct device_attribute *attr, const char *buf,
936                                 size_t count)
937 {
938         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
939         int ret;
940
941         ret = ctrl->ops->reset_ctrl(ctrl);
942         if (ret < 0)
943                 return ret;
944         return count;
945 }
946 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
947
948 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
949 {
950         struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
951         struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
952
953         return nsa->ns_id - nsb->ns_id;
954 }
955
956 static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
957 {
958         struct nvme_ns *ns;
959
960         list_for_each_entry(ns, &ctrl->namespaces, list) {
961                 if (ns->ns_id == nsid)
962                         return ns;
963                 if (ns->ns_id > nsid)
964                         break;
965         }
966         return NULL;
967 }
968
969 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
970 {
971         struct nvme_ns *ns;
972         struct gendisk *disk;
973         int node = dev_to_node(ctrl->dev);
974
975         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
976         if (!ns)
977                 return;
978
979         ns->queue = blk_mq_init_queue(ctrl->tagset);
980         if (IS_ERR(ns->queue))
981                 goto out_free_ns;
982         queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
983         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
984         ns->queue->queuedata = ns;
985         ns->ctrl = ctrl;
986
987         disk = alloc_disk_node(0, node);
988         if (!disk)
989                 goto out_free_queue;
990
991         kref_init(&ns->kref);
992         ns->ns_id = nsid;
993         ns->disk = disk;
994         ns->lba_shift = 9; /* set to a default value of 512 bytes until the disk is validated */
995         list_add_tail(&ns->list, &ctrl->namespaces);
996
997         blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
998         if (ctrl->max_hw_sectors) {
999                 blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
1000                 blk_queue_max_segments(ns->queue,
1001                         (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
1002         }
1003         if (ctrl->stripe_size)
1004                 blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
1005         if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1006                 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
1007         blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
1008
1009         disk->major = nvme_major;
1010         disk->first_minor = 0;
1011         disk->fops = &nvme_fops;
1012         disk->private_data = ns;
1013         disk->queue = ns->queue;
1014         disk->driverfs_dev = ctrl->device;
1015         disk->flags = GENHD_FL_EXT_DEVT;
1016         sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
1017
1018         /*
1019          * Initialize capacity to 0 until we establish the namespace format and
1020          * set up integrity extensions if necessary. The revalidate_disk after
1021          * add_disk allows the driver to register with integrity if the format
1022          * requires it.
1023          */
1024         set_capacity(disk, 0);
1025         if (nvme_revalidate_disk(ns->disk))
1026                 goto out_free_disk;
1027
1028         kref_get(&ctrl->kref);
1029         if (ns->type != NVME_NS_LIGHTNVM) {
1030                 add_disk(ns->disk);
1031                 if (ns->ms) {
1032                         struct block_device *bd = bdget_disk(ns->disk, 0);
1033                         if (!bd)
1034                                 return;
1035                         if (blkdev_get(bd, FMODE_READ, NULL)) {
1036                                 bdput(bd);
1037                                 return;
1038                         }
1039                         blkdev_reread_part(bd);
1040                         blkdev_put(bd, FMODE_READ);
1041                 }
1042         }
1043
1044         return;
1045  out_free_disk:
1046         kfree(disk);
1047         list_del(&ns->list);
1048  out_free_queue:
1049         blk_cleanup_queue(ns->queue);
1050  out_free_ns:
1051         kfree(ns);
1052 }
1053
1054 static void nvme_ns_remove(struct nvme_ns *ns)
1055 {
1056         bool kill = nvme_io_incapable(ns->ctrl) &&
1057                         !blk_queue_dying(ns->queue);
1058
1059         if (kill)
1060                 blk_set_queue_dying(ns->queue);
1061         if (ns->disk->flags & GENHD_FL_UP) {
1062                 if (blk_get_integrity(ns->disk))
1063                         blk_integrity_unregister(ns->disk);
1064                 del_gendisk(ns->disk);
1065         }
1066         if (kill || !blk_queue_dying(ns->queue)) {
1067                 blk_mq_abort_requeue_list(ns->queue);
1068                 blk_cleanup_queue(ns->queue);
1069         }
1070         list_del_init(&ns->list);
1071         nvme_put_ns(ns);
1072 }
1073
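/*
 * Walk namespace IDs 1..nn: revalidate namespaces we already know about
 * (removing any that fail), allocate newly appearing ones, drop namespaces
 * with an ID above nn, and keep the list sorted by namespace ID.
 */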
1074 static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
1075 {
1076         struct nvme_ns *ns, *next;
1077         unsigned i;
1078
1079         for (i = 1; i <= nn; i++) {
1080                 ns = nvme_find_ns(ctrl, i);
1081                 if (ns) {
1082                         if (revalidate_disk(ns->disk))
1083                                 nvme_ns_remove(ns);
1084                 } else
1085                         nvme_alloc_ns(ctrl, i);
1086         }
1087         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
1088                 if (ns->ns_id > nn)
1089                         nvme_ns_remove(ns);
1090         }
1091         list_sort(NULL, &ctrl->namespaces, ns_cmp);
1092 }
1093
1094 void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
1095 {
1096         struct nvme_id_ctrl *id;
1097
1098         if (nvme_identify_ctrl(ctrl, &id))
1099                 return;
1100         __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
1101         kfree(id);
1102 }
1103
1104 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
1105 {
1106         struct nvme_ns *ns, *next;
1107
1108         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
1109                 nvme_ns_remove(ns);
1110 }
1111
1112 static DEFINE_IDA(nvme_instance_ida);
1113
1114 static int nvme_set_instance(struct nvme_ctrl *ctrl)
1115 {
1116         int instance, error;
1117
1118         do {
1119                 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
1120                         return -ENODEV;
1121
1122                 spin_lock(&dev_list_lock);
1123                 error = ida_get_new(&nvme_instance_ida, &instance);
1124                 spin_unlock(&dev_list_lock);
1125         } while (error == -EAGAIN);
1126
1127         if (error)
1128                 return -ENODEV;
1129
1130         ctrl->instance = instance;
1131         return 0;
1132 }
1133
1134 static void nvme_release_instance(struct nvme_ctrl *ctrl)
1135 {
1136         spin_lock(&dev_list_lock);
1137         ida_remove(&nvme_instance_ida, ctrl->instance);
1138         spin_unlock(&dev_list_lock);
1139 }
1140
1141 static void nvme_free_ctrl(struct kref *kref)
1142 {
1143         struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
1144
1145         spin_lock(&dev_list_lock);
1146         list_del(&ctrl->node);
1147         spin_unlock(&dev_list_lock);
1148
1149         put_device(ctrl->device);
1150         nvme_release_instance(ctrl);
1151         device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
1152
1153         ctrl->ops->free_ctrl(ctrl);
1154 }
1155
1156 void nvme_put_ctrl(struct nvme_ctrl *ctrl)
1157 {
1158         kref_put(&ctrl->kref, nvme_free_ctrl);
1159 }
1160
1161 /*
1162  * Initialize an NVMe controller structure.  This needs to be called during
1163  * the earliest initialization so that we have the initialized structure around
1164  * during probing.
1165  */
1166 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
1167                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
1168 {
1169         int ret;
1170
1171         INIT_LIST_HEAD(&ctrl->namespaces);
1172         kref_init(&ctrl->kref);
1173         ctrl->dev = dev;
1174         ctrl->ops = ops;
1175         ctrl->quirks = quirks;
1176
1177         ret = nvme_set_instance(ctrl);
1178         if (ret)
1179                 goto out;
1180
1181         ctrl->device = device_create(nvme_class, ctrl->dev,
1182                                 MKDEV(nvme_char_major, ctrl->instance),
1183                                 dev, "nvme%d", ctrl->instance);
1184         if (IS_ERR(ctrl->device)) {
1185                 ret = PTR_ERR(ctrl->device);
1186                 goto out_release_instance;
1187         }
1188         get_device(ctrl->device);
1189         dev_set_drvdata(ctrl->device, ctrl);
1190
1191         ret = device_create_file(ctrl->device, &dev_attr_reset_controller);
1192         if (ret)
1193                 goto out_put_device;
1194
1195         spin_lock(&dev_list_lock);
1196         list_add_tail(&ctrl->node, &nvme_ctrl_list);
1197         spin_unlock(&dev_list_lock);
1198
1199         return 0;
1200
1201 out_put_device:
1202         put_device(ctrl->device);
1203         device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
1204 out_release_instance:
1205         nvme_release_instance(ctrl);
1206 out:
1207         return ret;
1208 }
1209
1210 int __init nvme_core_init(void)
1211 {
1212         int result;
1213
1214         result = register_blkdev(nvme_major, "nvme");
1215         if (result < 0)
1216                 return result;
1217         else if (result > 0)
1218                 nvme_major = result;
1219
1220         result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
1221                                                         &nvme_dev_fops);
1222         if (result < 0)
1223                 goto unregister_blkdev;
1224         else if (result > 0)
1225                 nvme_char_major = result;
1226
1227         nvme_class = class_create(THIS_MODULE, "nvme");
1228         if (IS_ERR(nvme_class)) {
1229                 result = PTR_ERR(nvme_class);
1230                 goto unregister_chrdev;
1231         }
1232
1233         return 0;
1234
1235  unregister_chrdev:
1236         __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1237  unregister_blkdev:
1238         unregister_blkdev(nvme_major, "nvme");
1239         return result;
1240 }
1241
1242 void nvme_core_exit(void)
1243 {
1244         unregister_blkdev(nvme_major, "nvme");
1245         class_destroy(nvme_class);
1246         __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1247 }