drivers/nvme/host/core.c (linux-block.git, merge tag 'block-6.2-2023-02-03')
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVM Express device driver
4  * Copyright (c) 2011-2014, Intel Corporation.
5  */
6
7 #include <linux/blkdev.h>
8 #include <linux/blk-mq.h>
9 #include <linux/blk-integrity.h>
10 #include <linux/compat.h>
11 #include <linux/delay.h>
12 #include <linux/errno.h>
13 #include <linux/hdreg.h>
14 #include <linux/kernel.h>
15 #include <linux/module.h>
16 #include <linux/backing-dev.h>
17 #include <linux/slab.h>
18 #include <linux/types.h>
19 #include <linux/pr.h>
20 #include <linux/ptrace.h>
21 #include <linux/nvme_ioctl.h>
22 #include <linux/pm_qos.h>
23 #include <asm/unaligned.h>
24
25 #include "nvme.h"
26 #include "fabrics.h"
27 #include <linux/nvme-auth.h>
28
29 #define CREATE_TRACE_POINTS
30 #include "trace.h"
31
32 #define NVME_MINORS             (1U << MINORBITS)
33
34 struct nvme_ns_info {
35         struct nvme_ns_ids ids;
36         u32 nsid;
37         __le32 anagrpid;
38         bool is_shared;
39         bool is_readonly;
40         bool is_ready;
41 };
42
43 unsigned int admin_timeout = 60;
44 module_param(admin_timeout, uint, 0644);
45 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
46 EXPORT_SYMBOL_GPL(admin_timeout);
47
48 unsigned int nvme_io_timeout = 30;
49 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
50 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
51 EXPORT_SYMBOL_GPL(nvme_io_timeout);
52
53 static unsigned char shutdown_timeout = 5;
54 module_param(shutdown_timeout, byte, 0644);
55 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
56
57 static u8 nvme_max_retries = 5;
58 module_param_named(max_retries, nvme_max_retries, byte, 0644);
59 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
60
61 static unsigned long default_ps_max_latency_us = 100000;
62 module_param(default_ps_max_latency_us, ulong, 0644);
63 MODULE_PARM_DESC(default_ps_max_latency_us,
64                  "max power saving latency for new devices; use PM QOS to change per device");
65
66 static bool force_apst;
67 module_param(force_apst, bool, 0644);
68 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
69
70 static unsigned long apst_primary_timeout_ms = 100;
71 module_param(apst_primary_timeout_ms, ulong, 0644);
72 MODULE_PARM_DESC(apst_primary_timeout_ms,
73         "primary APST timeout in ms");
74
75 static unsigned long apst_secondary_timeout_ms = 2000;
76 module_param(apst_secondary_timeout_ms, ulong, 0644);
77 MODULE_PARM_DESC(apst_secondary_timeout_ms,
78         "secondary APST timeout in ms");
79
80 static unsigned long apst_primary_latency_tol_us = 15000;
81 module_param(apst_primary_latency_tol_us, ulong, 0644);
82 MODULE_PARM_DESC(apst_primary_latency_tol_us,
83         "primary APST latency tolerance in us");
84
85 static unsigned long apst_secondary_latency_tol_us = 100000;
86 module_param(apst_secondary_latency_tol_us, ulong, 0644);
87 MODULE_PARM_DESC(apst_secondary_latency_tol_us,
88         "secondary APST latency tolerance in us");
89
90 /*
91  * nvme_wq - hosts nvme related works that are not reset or delete
92  * nvme_reset_wq - hosts nvme reset works
93  * nvme_delete_wq - hosts nvme delete works
94  *
95  * nvme_wq will host works such as scan, aen handling, fw activation,
96  * keep-alive, periodic reconnects etc. nvme_reset_wq
97  * runs reset works which also flush works hosted on nvme_wq for
98  * serialization purposes. nvme_delete_wq hosts controller deletion
99  * works which flush reset works for serialization.
100  */
101 struct workqueue_struct *nvme_wq;
102 EXPORT_SYMBOL_GPL(nvme_wq);
103
104 struct workqueue_struct *nvme_reset_wq;
105 EXPORT_SYMBOL_GPL(nvme_reset_wq);
106
107 struct workqueue_struct *nvme_delete_wq;
108 EXPORT_SYMBOL_GPL(nvme_delete_wq);
109
110 static LIST_HEAD(nvme_subsystems);
111 static DEFINE_MUTEX(nvme_subsystems_lock);
112
113 static DEFINE_IDA(nvme_instance_ida);
114 static dev_t nvme_ctrl_base_chr_devt;
115 static struct class *nvme_class;
116 static struct class *nvme_subsys_class;
117
118 static DEFINE_IDA(nvme_ns_chr_minor_ida);
119 static dev_t nvme_ns_chr_devt;
120 static struct class *nvme_ns_chr_class;
121
122 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
123 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
124                                            unsigned nsid);
125 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
126                                    struct nvme_command *cmd);
127
128 void nvme_queue_scan(struct nvme_ctrl *ctrl)
129 {
130         /*
131          * Only queue new scan work when admin and IO queues are both alive
132          */
133         if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
134                 queue_work(nvme_wq, &ctrl->scan_work);
135 }
136
137 /*
138  * Use this function to proceed with scheduling reset_work for a controller
139  * that had previously been set to the resetting state. This is intended for
140  * code paths that can't be interrupted by other reset attempts. A hot removal
141  * may prevent this from succeeding.
142  */
143 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
144 {
145         if (ctrl->state != NVME_CTRL_RESETTING)
146                 return -EBUSY;
147         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
148                 return -EBUSY;
149         return 0;
150 }
151 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
152
153 static void nvme_failfast_work(struct work_struct *work)
154 {
155         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
156                         struct nvme_ctrl, failfast_work);
157
158         if (ctrl->state != NVME_CTRL_CONNECTING)
159                 return;
160
161         set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
162         dev_info(ctrl->device, "failfast expired\n");
163         nvme_kick_requeue_lists(ctrl);
164 }
165
166 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
167 {
168         if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
169                 return;
170
171         schedule_delayed_work(&ctrl->failfast_work,
172                               ctrl->opts->fast_io_fail_tmo * HZ);
173 }
174
175 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
176 {
177         if (!ctrl->opts)
178                 return;
179
180         cancel_delayed_work_sync(&ctrl->failfast_work);
181         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
182 }
183
184
185 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
186 {
187         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
188                 return -EBUSY;
189         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
190                 return -EBUSY;
191         return 0;
192 }
193 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
194
195 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
196 {
197         int ret;
198
199         ret = nvme_reset_ctrl(ctrl);
200         if (!ret) {
201                 flush_work(&ctrl->reset_work);
202                 if (ctrl->state != NVME_CTRL_LIVE)
203                         ret = -ENETRESET;
204         }
205
206         return ret;
207 }
208
209 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
210 {
211         dev_info(ctrl->device,
212                  "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
213
214         flush_work(&ctrl->reset_work);
215         nvme_stop_ctrl(ctrl);
216         nvme_remove_namespaces(ctrl);
217         ctrl->ops->delete_ctrl(ctrl);
218         nvme_uninit_ctrl(ctrl);
219 }
220
221 static void nvme_delete_ctrl_work(struct work_struct *work)
222 {
223         struct nvme_ctrl *ctrl =
224                 container_of(work, struct nvme_ctrl, delete_work);
225
226         nvme_do_delete_ctrl(ctrl);
227 }
228
229 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
230 {
231         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
232                 return -EBUSY;
233         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
234                 return -EBUSY;
235         return 0;
236 }
237 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
238
239 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
240 {
241         /*
242          * Keep a reference until nvme_do_delete_ctrl() completes,
243          * since ->delete_ctrl can free the controller.
244          */
245         nvme_get_ctrl(ctrl);
246         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
247                 nvme_do_delete_ctrl(ctrl);
248         nvme_put_ctrl(ctrl);
249 }
250
251 static blk_status_t nvme_error_status(u16 status)
252 {
253         switch (status & 0x7ff) {
254         case NVME_SC_SUCCESS:
255                 return BLK_STS_OK;
256         case NVME_SC_CAP_EXCEEDED:
257                 return BLK_STS_NOSPC;
258         case NVME_SC_LBA_RANGE:
259         case NVME_SC_CMD_INTERRUPTED:
260         case NVME_SC_NS_NOT_READY:
261                 return BLK_STS_TARGET;
262         case NVME_SC_BAD_ATTRIBUTES:
263         case NVME_SC_ONCS_NOT_SUPPORTED:
264         case NVME_SC_INVALID_OPCODE:
265         case NVME_SC_INVALID_FIELD:
266         case NVME_SC_INVALID_NS:
267                 return BLK_STS_NOTSUPP;
268         case NVME_SC_WRITE_FAULT:
269         case NVME_SC_READ_ERROR:
270         case NVME_SC_UNWRITTEN_BLOCK:
271         case NVME_SC_ACCESS_DENIED:
272         case NVME_SC_READ_ONLY:
273         case NVME_SC_COMPARE_FAILED:
274                 return BLK_STS_MEDIUM;
275         case NVME_SC_GUARD_CHECK:
276         case NVME_SC_APPTAG_CHECK:
277         case NVME_SC_REFTAG_CHECK:
278         case NVME_SC_INVALID_PI:
279                 return BLK_STS_PROTECTION;
280         case NVME_SC_RESERVATION_CONFLICT:
281                 return BLK_STS_NEXUS;
282         case NVME_SC_HOST_PATH_ERROR:
283                 return BLK_STS_TRANSPORT;
284         case NVME_SC_ZONE_TOO_MANY_ACTIVE:
285                 return BLK_STS_ZONE_ACTIVE_RESOURCE;
286         case NVME_SC_ZONE_TOO_MANY_OPEN:
287                 return BLK_STS_ZONE_OPEN_RESOURCE;
288         default:
289                 return BLK_STS_IOERR;
290         }
291 }
292
293 static void nvme_retry_req(struct request *req)
294 {
295         unsigned long delay = 0;
296         u16 crd;
297
298         /* The mask and shift result must be <= 3 */
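        /*
         * Per the NVMe specification, CRD (status bits 12:11) selects one of
         * the three Command Retry Delay Times reported by Identify Controller;
         * ctrl->crdt[] stores those raw values in units of 100 milliseconds,
         * so the multiplication below yields a delay in milliseconds for
         * blk_mq_delay_kick_requeue_list().
         */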
299         crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
300         if (crd)
301                 delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
302
303         nvme_req(req)->retries++;
304         blk_mq_requeue_request(req, false);
305         blk_mq_delay_kick_requeue_list(req->q, delay);
306 }
307
308 static void nvme_log_error(struct request *req)
309 {
310         struct nvme_ns *ns = req->q->queuedata;
311         struct nvme_request *nr = nvme_req(req);
312
313         if (ns) {
314                 pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
315                        ns->disk ? ns->disk->disk_name : "?",
316                        nvme_get_opcode_str(nr->cmd->common.opcode),
317                        nr->cmd->common.opcode,
318                        (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
319                        (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
320                        nvme_get_error_status_str(nr->status),
321                        nr->status >> 8 & 7,     /* Status Code Type */
322                        nr->status & 0xff,       /* Status Code */
323                        nr->status & NVME_SC_MORE ? "MORE " : "",
324                        nr->status & NVME_SC_DNR  ? "DNR "  : "");
325                 return;
326         }
327
328         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
329                            dev_name(nr->ctrl->device),
330                            nvme_get_admin_opcode_str(nr->cmd->common.opcode),
331                            nr->cmd->common.opcode,
332                            nvme_get_error_status_str(nr->status),
333                            nr->status >> 8 & 7, /* Status Code Type */
334                            nr->status & 0xff,   /* Status Code */
335                            nr->status & NVME_SC_MORE ? "MORE " : "",
336                            nr->status & NVME_SC_DNR  ? "DNR "  : "");
337 }
338
339 enum nvme_disposition {
340         COMPLETE,
341         RETRY,
342         FAILOVER,
343         AUTHENTICATE,
344 };
345
346 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
347 {
348         if (likely(nvme_req(req)->status == 0))
349                 return COMPLETE;
350
351         if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
352                 return AUTHENTICATE;
353
354         if (blk_noretry_request(req) ||
355             (nvme_req(req)->status & NVME_SC_DNR) ||
356             nvme_req(req)->retries >= nvme_max_retries)
357                 return COMPLETE;
358
359         if (req->cmd_flags & REQ_NVME_MPATH) {
360                 if (nvme_is_path_error(nvme_req(req)->status) ||
361                     blk_queue_dying(req->q))
362                         return FAILOVER;
363         } else {
364                 if (blk_queue_dying(req->q))
365                         return COMPLETE;
366         }
367
368         return RETRY;
369 }
370
371 static inline void nvme_end_req_zoned(struct request *req)
372 {
373         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
374             req_op(req) == REQ_OP_ZONE_APPEND)
375                 req->__sector = nvme_lba_to_sect(req->q->queuedata,
376                         le64_to_cpu(nvme_req(req)->result.u64));
377 }
378
379 static inline void nvme_end_req(struct request *req)
380 {
381         blk_status_t status = nvme_error_status(nvme_req(req)->status);
382
383         if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET)))
384                 nvme_log_error(req);
385         nvme_end_req_zoned(req);
386         nvme_trace_bio_complete(req);
387         if (req->cmd_flags & REQ_NVME_MPATH)
388                 nvme_mpath_end_request(req);
389         blk_mq_end_request(req, status);
390 }
391
392 void nvme_complete_rq(struct request *req)
393 {
394         struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
395
396         trace_nvme_complete_rq(req);
397         nvme_cleanup_cmd(req);
398
399         if (ctrl->kas)
400                 ctrl->comp_seen = true;
401
402         switch (nvme_decide_disposition(req)) {
403         case COMPLETE:
404                 nvme_end_req(req);
405                 return;
406         case RETRY:
407                 nvme_retry_req(req);
408                 return;
409         case FAILOVER:
410                 nvme_failover_req(req);
411                 return;
412         case AUTHENTICATE:
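                /*
                 * Queue DH-HMAC-CHAP re-authentication on nvme_wq and requeue
                 * the command for another attempt; without CONFIG_NVME_AUTH the
                 * authentication-required failure is completed as an error.
                 */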
413 #ifdef CONFIG_NVME_AUTH
414                 queue_work(nvme_wq, &ctrl->dhchap_auth_work);
415                 nvme_retry_req(req);
416 #else
417                 nvme_end_req(req);
418 #endif
419                 return;
420         }
421 }
422 EXPORT_SYMBOL_GPL(nvme_complete_rq);
423
424 void nvme_complete_batch_req(struct request *req)
425 {
426         trace_nvme_complete_rq(req);
427         nvme_cleanup_cmd(req);
428         nvme_end_req_zoned(req);
429 }
430 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
431
432 /*
433  * Called to unwind from ->queue_rq on a failed command submission so that the
434  * multipathing code gets called to potentially failover to another path.
435  * The caller needs to unwind all transport specific resource allocations and
436  * must propagate the return value.
437  */
438 blk_status_t nvme_host_path_error(struct request *req)
439 {
440         nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
441         blk_mq_set_request_complete(req);
442         nvme_complete_rq(req);
443         return BLK_STS_OK;
444 }
445 EXPORT_SYMBOL_GPL(nvme_host_path_error);
446
447 bool nvme_cancel_request(struct request *req, void *data)
448 {
449         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
450                                 "Cancelling I/O %d", req->tag);
451
452         /* don't abort an already completed request */
453         if (blk_mq_request_completed(req))
454                 return true;
455
456         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
457         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
458         blk_mq_complete_request(req);
459         return true;
460 }
461 EXPORT_SYMBOL_GPL(nvme_cancel_request);
462
463 void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
464 {
465         if (ctrl->tagset) {
466                 blk_mq_tagset_busy_iter(ctrl->tagset,
467                                 nvme_cancel_request, ctrl);
468                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
469         }
470 }
471 EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
472
473 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
474 {
475         if (ctrl->admin_tagset) {
476                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
477                                 nvme_cancel_request, ctrl);
478                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
479         }
480 }
481 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
482
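/*
 * Controller state machine, as encoded in the switch below (old -> new,
 * NVME_CTRL_ prefix dropped):
 *   NEW        -> LIVE, RESETTING, CONNECTING
 *   LIVE       -> RESETTING, DELETING
 *   RESETTING  -> LIVE, CONNECTING, DELETING
 *   CONNECTING -> LIVE, DELETING
 *   DELETING   -> DELETING_NOIO, DEAD
 *   DEAD       -> DELETING_NOIO
 * Any other requested transition leaves the state unchanged and returns false.
 */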
483 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
484                 enum nvme_ctrl_state new_state)
485 {
486         enum nvme_ctrl_state old_state;
487         unsigned long flags;
488         bool changed = false;
489
490         spin_lock_irqsave(&ctrl->lock, flags);
491
492         old_state = ctrl->state;
493         switch (new_state) {
494         case NVME_CTRL_LIVE:
495                 switch (old_state) {
496                 case NVME_CTRL_NEW:
497                 case NVME_CTRL_RESETTING:
498                 case NVME_CTRL_CONNECTING:
499                         changed = true;
500                         fallthrough;
501                 default:
502                         break;
503                 }
504                 break;
505         case NVME_CTRL_RESETTING:
506                 switch (old_state) {
507                 case NVME_CTRL_NEW:
508                 case NVME_CTRL_LIVE:
509                         changed = true;
510                         fallthrough;
511                 default:
512                         break;
513                 }
514                 break;
515         case NVME_CTRL_CONNECTING:
516                 switch (old_state) {
517                 case NVME_CTRL_NEW:
518                 case NVME_CTRL_RESETTING:
519                         changed = true;
520                         fallthrough;
521                 default:
522                         break;
523                 }
524                 break;
525         case NVME_CTRL_DELETING:
526                 switch (old_state) {
527                 case NVME_CTRL_LIVE:
528                 case NVME_CTRL_RESETTING:
529                 case NVME_CTRL_CONNECTING:
530                         changed = true;
531                         fallthrough;
532                 default:
533                         break;
534                 }
535                 break;
536         case NVME_CTRL_DELETING_NOIO:
537                 switch (old_state) {
538                 case NVME_CTRL_DELETING:
539                 case NVME_CTRL_DEAD:
540                         changed = true;
541                         fallthrough;
542                 default:
543                         break;
544                 }
545                 break;
546         case NVME_CTRL_DEAD:
547                 switch (old_state) {
548                 case NVME_CTRL_DELETING:
549                         changed = true;
550                         fallthrough;
551                 default:
552                         break;
553                 }
554                 break;
555         default:
556                 break;
557         }
558
559         if (changed) {
560                 ctrl->state = new_state;
561                 wake_up_all(&ctrl->state_wq);
562         }
563
564         spin_unlock_irqrestore(&ctrl->lock, flags);
565         if (!changed)
566                 return false;
567
568         if (ctrl->state == NVME_CTRL_LIVE) {
569                 if (old_state == NVME_CTRL_CONNECTING)
570                         nvme_stop_failfast_work(ctrl);
571                 nvme_kick_requeue_lists(ctrl);
572         } else if (ctrl->state == NVME_CTRL_CONNECTING &&
573                 old_state == NVME_CTRL_RESETTING) {
574                 nvme_start_failfast_work(ctrl);
575         }
576         return changed;
577 }
578 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
579
580 /*
581  * Returns true for sink states that can't ever transition back to live.
582  */
583 static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
584 {
585         switch (ctrl->state) {
586         case NVME_CTRL_NEW:
587         case NVME_CTRL_LIVE:
588         case NVME_CTRL_RESETTING:
589         case NVME_CTRL_CONNECTING:
590                 return false;
591         case NVME_CTRL_DELETING:
592         case NVME_CTRL_DELETING_NOIO:
593         case NVME_CTRL_DEAD:
594                 return true;
595         default:
596                 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
597                 return true;
598         }
599 }
600
601 /*
602  * Waits for the controller state to be resetting, or returns false if it is
603  * not possible to ever transition to that state.
604  */
605 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
606 {
607         wait_event(ctrl->state_wq,
608                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
609                    nvme_state_terminal(ctrl));
610         return ctrl->state == NVME_CTRL_RESETTING;
611 }
612 EXPORT_SYMBOL_GPL(nvme_wait_reset);
613
614 static void nvme_free_ns_head(struct kref *ref)
615 {
616         struct nvme_ns_head *head =
617                 container_of(ref, struct nvme_ns_head, ref);
618
619         nvme_mpath_remove_disk(head);
620         ida_free(&head->subsys->ns_ida, head->instance);
621         cleanup_srcu_struct(&head->srcu);
622         nvme_put_subsystem(head->subsys);
623         kfree(head);
624 }
625
626 bool nvme_tryget_ns_head(struct nvme_ns_head *head)
627 {
628         return kref_get_unless_zero(&head->ref);
629 }
630
631 void nvme_put_ns_head(struct nvme_ns_head *head)
632 {
633         kref_put(&head->ref, nvme_free_ns_head);
634 }
635
636 static void nvme_free_ns(struct kref *kref)
637 {
638         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
639
640         put_disk(ns->disk);
641         nvme_put_ns_head(ns->head);
642         nvme_put_ctrl(ns->ctrl);
643         kfree(ns);
644 }
645
646 static inline bool nvme_get_ns(struct nvme_ns *ns)
647 {
648         return kref_get_unless_zero(&ns->kref);
649 }
650
651 void nvme_put_ns(struct nvme_ns *ns)
652 {
653         kref_put(&ns->kref, nvme_free_ns);
654 }
655 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
656
657 static inline void nvme_clear_nvme_request(struct request *req)
658 {
659         nvme_req(req)->status = 0;
660         nvme_req(req)->retries = 0;
661         nvme_req(req)->flags = 0;
662         req->rq_flags |= RQF_DONTPREP;
663 }
664
665 /* initialize a passthrough request */
666 void nvme_init_request(struct request *req, struct nvme_command *cmd)
667 {
668         if (req->q->queuedata)
669                 req->timeout = NVME_IO_TIMEOUT;
670         else /* no queuedata implies admin queue */
671                 req->timeout = NVME_ADMIN_TIMEOUT;
672
673         /* passthru commands should let the driver set the SGL flags */
674         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
675
676         req->cmd_flags |= REQ_FAILFAST_DRIVER;
677         if (req->mq_hctx->type == HCTX_TYPE_POLL)
678                 req->cmd_flags |= REQ_POLLED;
679         nvme_clear_nvme_request(req);
680         req->rq_flags |= RQF_QUIET;
681         memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
682 }
683 EXPORT_SYMBOL_GPL(nvme_init_request);
684
685 /*
686  * For a command we're not in a state to send to the device, the default action
687  * is to busy it and retry it after the controller state is recovered.  However,
688  * if the controller is deleting, or if the request is marked for failfast or
689  * is an nvme multipath request, it is failed immediately.
690  *
691  * Note: commands used to initialize the controller will be marked for failfast.
692  * Note: nvme cli/ioctl commands are marked for failfast.
693  */
694 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
695                 struct request *rq)
696 {
697         if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
698             ctrl->state != NVME_CTRL_DELETING &&
699             ctrl->state != NVME_CTRL_DEAD &&
700             !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
701             !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
702                 return BLK_STS_RESOURCE;
703         return nvme_host_path_error(rq);
704 }
705 EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
706
707 bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
708                 bool queue_live)
709 {
710         struct nvme_request *req = nvme_req(rq);
711
712         /*
713          * Currently we have a problem sending passthru commands
714          * on the admin_q if the controller is not LIVE because we can't
715          * make sure that they are going out after the admin connect,
716          * controller enable and/or other commands in the initialization
717          * sequence. until the controller will be LIVE, fail with
718          * BLK_STS_RESOURCE so that they will be rescheduled.
719          */
720         if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
721                 return false;
722
723         if (ctrl->ops->flags & NVME_F_FABRICS) {
724                 /*
725                  * Only allow commands on a live queue, except for the connect
726                  * command, which is required to set the queue live in the
727                  * appropriate states.
728                  */
729                 switch (ctrl->state) {
730                 case NVME_CTRL_CONNECTING:
731                         if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
732                             (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
733                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
734                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
735                                 return true;
736                         break;
737                 default:
738                         break;
739                 case NVME_CTRL_DEAD:
740                         return false;
741                 }
742         }
743
744         return queue_live;
745 }
746 EXPORT_SYMBOL_GPL(__nvme_check_ready);
747
748 static inline void nvme_setup_flush(struct nvme_ns *ns,
749                 struct nvme_command *cmnd)
750 {
751         memset(cmnd, 0, sizeof(*cmnd));
752         cmnd->common.opcode = nvme_cmd_flush;
753         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
754 }
755
756 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
757                 struct nvme_command *cmnd)
758 {
759         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
760         struct nvme_dsm_range *range;
761         struct bio *bio;
762
763         /*
764          * Some devices do not consider the DSM 'Number of Ranges' field when
765          * determining how much data to DMA. Always allocate memory for the
766          * maximum number of segments to prevent the device reading beyond the
767          * end of the buffer.
767          */
768         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
769
770         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
771         if (!range) {
772                 /*
773                  * If we fail to allocate our range, fall back to the controller
774                  * discard page. If that's also busy, it's safe to return
775                  * busy, as we know we can make progress once that's freed.
776                  */
777                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
778                         return BLK_STS_RESOURCE;
779
780                 range = page_address(ns->ctrl->discard_page);
781         }
782
783         __rq_for_each_bio(bio, req) {
784                 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
785                 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
786
787                 if (n < segments) {
788                         range[n].cattr = cpu_to_le32(0);
789                         range[n].nlb = cpu_to_le32(nlb);
790                         range[n].slba = cpu_to_le64(slba);
791                 }
792                 n++;
793         }
794
795         if (WARN_ON_ONCE(n != segments)) {
796                 if (virt_to_page(range) == ns->ctrl->discard_page)
797                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
798                 else
799                         kfree(range);
800                 return BLK_STS_IOERR;
801         }
802
803         memset(cmnd, 0, sizeof(*cmnd));
804         cmnd->dsm.opcode = nvme_cmd_dsm;
805         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
806         cmnd->dsm.nr = cpu_to_le32(segments - 1);
807         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
808
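        /*
         * The range buffer rides along as a special payload bvec; once the
         * command completes, nvme_cleanup_cmd() either releases the shared
         * discard page or kfree()s the allocation.
         */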
809         req->special_vec.bv_page = virt_to_page(range);
810         req->special_vec.bv_offset = offset_in_page(range);
811         req->special_vec.bv_len = alloc_size;
812         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
813
814         return BLK_STS_OK;
815 }
816
817 static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
818                               struct request *req)
819 {
820         u32 upper, lower;
821         u64 ref48;
822
823         /* both rw and write zeroes share the same reftag format */
824         switch (ns->guard_type) {
825         case NVME_NVM_NS_16B_GUARD:
826                 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
827                 break;
828         case NVME_NVM_NS_64B_GUARD:
829                 ref48 = ext_pi_ref_tag(req);
830                 lower = lower_32_bits(ref48);
831                 upper = upper_32_bits(ref48);
832
833                 cmnd->rw.reftag = cpu_to_le32(lower);
834                 cmnd->rw.cdw3 = cpu_to_le32(upper);
835                 break;
836         default:
837                 break;
838         }
839 }
840
841 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
842                 struct request *req, struct nvme_command *cmnd)
843 {
844         memset(cmnd, 0, sizeof(*cmnd));
845
846         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
847                 return nvme_setup_discard(ns, req, cmnd);
848
849         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
850         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
851         cmnd->write_zeroes.slba =
852                 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
853         cmnd->write_zeroes.length =
854                 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
855
856         if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
857                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
858
859         if (nvme_ns_has_pi(ns)) {
860                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
861
862                 switch (ns->pi_type) {
863                 case NVME_NS_DPS_PI_TYPE1:
864                 case NVME_NS_DPS_PI_TYPE2:
865                         nvme_set_ref_tag(ns, cmnd, req);
866                         break;
867                 }
868         }
869
870         return BLK_STS_OK;
871 }
872
873 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
874                 struct request *req, struct nvme_command *cmnd,
875                 enum nvme_opcode op)
876 {
877         u16 control = 0;
878         u32 dsmgmt = 0;
879
880         if (req->cmd_flags & REQ_FUA)
881                 control |= NVME_RW_FUA;
882         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
883                 control |= NVME_RW_LR;
884
885         if (req->cmd_flags & REQ_RAHEAD)
886                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
887
888         cmnd->rw.opcode = op;
889         cmnd->rw.flags = 0;
890         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
891         cmnd->rw.cdw2 = 0;
892         cmnd->rw.cdw3 = 0;
893         cmnd->rw.metadata = 0;
894         cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
895         cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
896         cmnd->rw.reftag = 0;
897         cmnd->rw.apptag = 0;
898         cmnd->rw.appmask = 0;
899
900         if (ns->ms) {
901                 /*
902                  * If formatted with metadata, the block layer always provides a
903                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
904                  * we enable the PRACT bit for protection information or set the
905                  * namespace capacity to zero to prevent any I/O.
906                  */
907                 if (!blk_integrity_rq(req)) {
908                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
909                                 return BLK_STS_NOTSUPP;
910                         control |= NVME_RW_PRINFO_PRACT;
911                 }
912
913                 switch (ns->pi_type) {
914                 case NVME_NS_DPS_PI_TYPE3:
915                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
916                         break;
917                 case NVME_NS_DPS_PI_TYPE1:
918                 case NVME_NS_DPS_PI_TYPE2:
919                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
920                                         NVME_RW_PRINFO_PRCHK_REF;
921                         if (op == nvme_cmd_zone_append)
922                                 control |= NVME_RW_APPEND_PIREMAP;
923                         nvme_set_ref_tag(ns, cmnd, req);
924                         break;
925                 }
926         }
927
928         cmnd->rw.control = cpu_to_le16(control);
929         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
930         return 0;
931 }
932
933 void nvme_cleanup_cmd(struct request *req)
934 {
935         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
936                 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
937
938                 if (req->special_vec.bv_page == ctrl->discard_page)
939                         clear_bit_unlock(0, &ctrl->discard_page_busy);
940                 else
941                         kfree(bvec_virt(&req->special_vec));
942         }
943 }
944 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
945
946 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
947 {
948         struct nvme_command *cmd = nvme_req(req)->cmd;
949         blk_status_t ret = BLK_STS_OK;
950
951         if (!(req->rq_flags & RQF_DONTPREP))
952                 nvme_clear_nvme_request(req);
953
954         switch (req_op(req)) {
955         case REQ_OP_DRV_IN:
956         case REQ_OP_DRV_OUT:
957                 /* these are setup prior to execution in nvme_init_request() */
958                 break;
959         case REQ_OP_FLUSH:
960                 nvme_setup_flush(ns, cmd);
961                 break;
962         case REQ_OP_ZONE_RESET_ALL:
963         case REQ_OP_ZONE_RESET:
964                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
965                 break;
966         case REQ_OP_ZONE_OPEN:
967                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
968                 break;
969         case REQ_OP_ZONE_CLOSE:
970                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
971                 break;
972         case REQ_OP_ZONE_FINISH:
973                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
974                 break;
975         case REQ_OP_WRITE_ZEROES:
976                 ret = nvme_setup_write_zeroes(ns, req, cmd);
977                 break;
978         case REQ_OP_DISCARD:
979                 ret = nvme_setup_discard(ns, req, cmd);
980                 break;
981         case REQ_OP_READ:
982                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
983                 break;
984         case REQ_OP_WRITE:
985                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
986                 break;
987         case REQ_OP_ZONE_APPEND:
988                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
989                 break;
990         default:
991                 WARN_ON_ONCE(1);
992                 return BLK_STS_IOERR;
993         }
994
995         cmd->common.command_id = nvme_cid(req);
996         trace_nvme_setup_cmd(req, cmd);
997         return ret;
998 }
999 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
1000
1001 /*
1002  * Return values:
1003  * 0:  success
1004  * >0: nvme controller's cqe status response
1005  * <0: kernel error in lieu of controller response
1006  */
1007 static int nvme_execute_rq(struct request *rq, bool at_head)
1008 {
1009         blk_status_t status;
1010
1011         status = blk_execute_rq(rq, at_head);
1012         if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
1013                 return -EINTR;
1014         if (nvme_req(rq)->status)
1015                 return nvme_req(rq)->status;
1016         return blk_status_to_errno(status);
1017 }
1018
1019 /*
1020  * Returns 0 on success.  If the result is negative, it's a Linux error code;
1021  * if the result is positive, it's an NVM Express status code
1022  */
1023 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1024                 union nvme_result *result, void *buffer, unsigned bufflen,
1025                 int qid, int at_head, blk_mq_req_flags_t flags)
1026 {
1027         struct request *req;
1028         int ret;
1029
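        /*
         * NVME_QID_ANY lets blk-mq pick any hardware context; a specific qid
         * pins the request to hctx qid - 1, which fabrics transports use to
         * issue per-queue commands such as connect on the intended queue.
         */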
1030         if (qid == NVME_QID_ANY)
1031                 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
1032         else
1033                 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
1034                                                 qid - 1);
1035
1036         if (IS_ERR(req))
1037                 return PTR_ERR(req);
1038         nvme_init_request(req, cmd);
1039
1040         if (buffer && bufflen) {
1041                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
1042                 if (ret)
1043                         goto out;
1044         }
1045
1046         ret = nvme_execute_rq(req, at_head);
1047         if (result && ret >= 0)
1048                 *result = nvme_req(req)->result;
1049  out:
1050         blk_mq_free_request(req);
1051         return ret;
1052 }
1053 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1054
1055 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1056                 void *buffer, unsigned bufflen)
1057 {
1058         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
1059                         NVME_QID_ANY, 0, 0);
1060 }
1061 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
1062
1063 static u32 nvme_known_admin_effects(u8 opcode)
1064 {
1065         switch (opcode) {
1066         case nvme_admin_format_nvm:
1067                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
1068                         NVME_CMD_EFFECTS_CSE_MASK;
1069         case nvme_admin_sanitize_nvm:
1070                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
1071         default:
1072                 break;
1073         }
1074         return 0;
1075 }
1076
1077 static u32 nvme_known_nvm_effects(u8 opcode)
1078 {
1079         switch (opcode) {
1080         case nvme_cmd_write:
1081         case nvme_cmd_write_zeroes:
1082         case nvme_cmd_write_uncor:
1083                  return NVME_CMD_EFFECTS_LBCC;
1084         default:
1085                 return 0;
1086         }
1087 }
1088
1089 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1090 {
1091         u32 effects = 0;
1092
1093         if (ns) {
1094                 if (ns->head->effects)
1095                         effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1096                 if (ns->head->ids.csi == NVME_CSI_NVM)
1097                         effects |= nvme_known_nvm_effects(opcode);
1098                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1099                         dev_warn_once(ctrl->device,
1100                                 "IO command:%02x has unusual effects:%08x\n",
1101                                 opcode, effects);
1102
1103                 /*
1104                  * NVME_CMD_EFFECTS_CSE_MASK causes a freeze of all I/O queues,
1105                  * which would deadlock when done on an I/O command.  Note that
1106                  * we already warn about an unusual effect above.
1107                  */
1108                 effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1109         } else {
1110                 if (ctrl->effects)
1111                         effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1112                 effects |= nvme_known_admin_effects(opcode);
1113         }
1114
1115         return effects;
1116 }
1117 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1118
1119 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1120                                u8 opcode)
1121 {
1122         u32 effects = nvme_command_effects(ctrl, ns, opcode);
1123
1124         /*
1125          * For simplicity, IO to all namespaces is quiesced even if the command
1126          * effects say only one namespace is affected.
1127          */
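        /*
         * Note the lock order: scan_lock, then subsys->lock; both are dropped
         * again in reverse order by nvme_passthru_end() once the command has
         * completed and the queues have been unfrozen.
         */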
1128         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1129                 mutex_lock(&ctrl->scan_lock);
1130                 mutex_lock(&ctrl->subsys->lock);
1131                 nvme_mpath_start_freeze(ctrl->subsys);
1132                 nvme_mpath_wait_freeze(ctrl->subsys);
1133                 nvme_start_freeze(ctrl);
1134                 nvme_wait_freeze(ctrl);
1135         }
1136         return effects;
1137 }
1138
1139 void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
1140                        struct nvme_command *cmd, int status)
1141 {
1142         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1143                 nvme_unfreeze(ctrl);
1144                 nvme_mpath_unfreeze(ctrl->subsys);
1145                 mutex_unlock(&ctrl->subsys->lock);
1146                 mutex_unlock(&ctrl->scan_lock);
1147         }
1148         if (effects & NVME_CMD_EFFECTS_CCC) {
1149                 dev_info(ctrl->device,
1150 "controller capabilities changed, reset may be required to take effect.\n");
1151         }
1152         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1153                 nvme_queue_scan(ctrl);
1154                 flush_work(&ctrl->scan_work);
1155         }
1156
1157         switch (cmd->common.opcode) {
1158         case nvme_admin_set_features:
1159                 switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
1160                 case NVME_FEAT_KATO:
1161                         /*
1162                          * The keep alive command interval on the host should
1163                          * be updated when KATO is modified by a Set Features
1164                          * command.
1165                          */
1166                         if (!status)
1167                                 nvme_update_keep_alive(ctrl, cmd);
1168                         break;
1169                 default:
1170                         break;
1171                 }
1172                 break;
1173         default:
1174                 break;
1175         }
1176 }
1177 EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
1178
1179 int nvme_execute_passthru_rq(struct request *rq, u32 *effects)
1180 {
1181         struct nvme_command *cmd = nvme_req(rq)->cmd;
1182         struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1183         struct nvme_ns *ns = rq->q->queuedata;
1184
1185         *effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1186         return nvme_execute_rq(rq, false);
1187 }
1188 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1189
1190 /*
1191  * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1192  * 
1193  *   The host should send Keep Alive commands at half of the Keep Alive Timeout
1194  *   accounting for transport roundtrip times [..].
1195  */
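/*
 * For example, with a KATO of 10 seconds the keep-alive work is re-queued
 * roughly every 5 seconds (ctrl->kato * HZ / 2 jiffies).
 */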
1196 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1197 {
1198         queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
1199 }
1200
1201 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
1202                                                  blk_status_t status)
1203 {
1204         struct nvme_ctrl *ctrl = rq->end_io_data;
1205         unsigned long flags;
1206         bool startka = false;
1207
1208         blk_mq_free_request(rq);
1209
1210         if (status) {
1211                 dev_err(ctrl->device,
1212                         "failed nvme_keep_alive_end_io error=%d\n",
1213                                 status);
1214                 return RQ_END_IO_NONE;
1215         }
1216
1217         ctrl->comp_seen = false;
1218         spin_lock_irqsave(&ctrl->lock, flags);
1219         if (ctrl->state == NVME_CTRL_LIVE ||
1220             ctrl->state == NVME_CTRL_CONNECTING)
1221                 startka = true;
1222         spin_unlock_irqrestore(&ctrl->lock, flags);
1223         if (startka)
1224                 nvme_queue_keep_alive_work(ctrl);
1225         return RQ_END_IO_NONE;
1226 }
1227
1228 static void nvme_keep_alive_work(struct work_struct *work)
1229 {
1230         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1231                         struct nvme_ctrl, ka_work);
1232         bool comp_seen = ctrl->comp_seen;
1233         struct request *rq;
1234
1235         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1236                 dev_dbg(ctrl->device,
1237                         "reschedule traffic based keep-alive timer\n");
1238                 ctrl->comp_seen = false;
1239                 nvme_queue_keep_alive_work(ctrl);
1240                 return;
1241         }
1242
1243         rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
1244                                   BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
1245         if (IS_ERR(rq)) {
1246                 /* allocation failure, reset the controller */
1247                 dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1248                 nvme_reset_ctrl(ctrl);
1249                 return;
1250         }
1251         nvme_init_request(rq, &ctrl->ka_cmd);
1252
1253         rq->timeout = ctrl->kato * HZ;
1254         rq->end_io = nvme_keep_alive_end_io;
1255         rq->end_io_data = ctrl;
1256         blk_execute_rq_nowait(rq, false);
1257 }
1258
1259 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1260 {
1261         if (unlikely(ctrl->kato == 0))
1262                 return;
1263
1264         nvme_queue_keep_alive_work(ctrl);
1265 }
1266
1267 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1268 {
1269         if (unlikely(ctrl->kato == 0))
1270                 return;
1271
1272         cancel_delayed_work_sync(&ctrl->ka_work);
1273 }
1274 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1275
1276 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1277                                    struct nvme_command *cmd)
1278 {
1279         unsigned int new_kato =
1280                 DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
1281
1282         dev_info(ctrl->device,
1283                  "keep alive interval updated from %u ms to %u ms\n",
1284                  ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
1285
1286         nvme_stop_keep_alive(ctrl);
1287         ctrl->kato = new_kato;
1288         nvme_start_keep_alive(ctrl);
1289 }
1290
1291 /*
1292  * In NVMe 1.0 the CNS field was just a binary controller or namespace
1293  * flag, thus sending any newer CNS values has a big chance of not working.
1294  * Qemu unfortunately had that bug while reporting 1.1 version compliance
1295  * (but not for any later version).
1296  */
1297 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
1298 {
1299         if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
1300                 return ctrl->vs < NVME_VS(1, 2, 0);
1301         return ctrl->vs < NVME_VS(1, 1, 0);
1302 }
1303
1304 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1305 {
1306         struct nvme_command c = { };
1307         int error;
1308
1309         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1310         c.identify.opcode = nvme_admin_identify;
1311         c.identify.cns = NVME_ID_CNS_CTRL;
1312
1313         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1314         if (!*id)
1315                 return -ENOMEM;
1316
1317         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1318                         sizeof(struct nvme_id_ctrl));
1319         if (error)
1320                 kfree(*id);
1321         return error;
1322 }
1323
1324 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1325                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1326 {
1327         const char *warn_str = "ctrl returned bogus length:";
1328         void *data = cur;
1329
1330         switch (cur->nidt) {
1331         case NVME_NIDT_EUI64:
1332                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1333                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1334                                  warn_str, cur->nidl);
1335                         return -1;
1336                 }
1337                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1338                         return NVME_NIDT_EUI64_LEN;
1339                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1340                 return NVME_NIDT_EUI64_LEN;
1341         case NVME_NIDT_NGUID:
1342                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1343                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1344                                  warn_str, cur->nidl);
1345                         return -1;
1346                 }
1347                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1348                         return NVME_NIDT_NGUID_LEN;
1349                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1350                 return NVME_NIDT_NGUID_LEN;
1351         case NVME_NIDT_UUID:
1352                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1353                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1354                                  warn_str, cur->nidl);
1355                         return -1;
1356                 }
1357                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1358                         return NVME_NIDT_UUID_LEN;
1359                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1360                 return NVME_NIDT_UUID_LEN;
1361         case NVME_NIDT_CSI:
1362                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1363                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1364                                  warn_str, cur->nidl);
1365                         return -1;
1366                 }
1367                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1368                 *csi_seen = true;
1369                 return NVME_NIDT_CSI_LEN;
1370         default:
1371                 /* Skip unknown types */
1372                 return cur->nidl;
1373         }
1374 }
1375
1376 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
1377                 struct nvme_ns_info *info)
1378 {
1379         struct nvme_command c = { };
1380         bool csi_seen = false;
1381         int status, pos, len;
1382         void *data;
1383
1384         if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1385                 return 0;
1386         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1387                 return 0;
1388
1389         c.identify.opcode = nvme_admin_identify;
1390         c.identify.nsid = cpu_to_le32(info->nsid);
1391         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1392
1393         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1394         if (!data)
1395                 return -ENOMEM;
1396
1397         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1398                                       NVME_IDENTIFY_DATA_SIZE);
1399         if (status) {
1400                 dev_warn(ctrl->device,
1401                         "Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1402                         info->nsid, status);
1403                 goto free_data;
1404         }
1405
1406         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1407                 struct nvme_ns_id_desc *cur = data + pos;
1408
1409                 if (cur->nidl == 0)
1410                         break;
1411
1412                 len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
1413                 if (len < 0)
1414                         break;
1415
1416                 len += sizeof(*cur);
1417         }
1418
1419         if (nvme_multi_css(ctrl) && !csi_seen) {
1420                 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1421                          info->nsid);
1422                 status = -EINVAL;
1423         }
1424
1425 free_data:
1426         kfree(data);
1427         return status;
1428 }
1429
1430 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1431                         struct nvme_id_ns **id)
1432 {
1433         struct nvme_command c = { };
1434         int error;
1435
1436         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1437         c.identify.opcode = nvme_admin_identify;
1438         c.identify.nsid = cpu_to_le32(nsid);
1439         c.identify.cns = NVME_ID_CNS_NS;
1440
1441         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1442         if (!*id)
1443                 return -ENOMEM;
1444
1445         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1446         if (error) {
1447                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1448                 goto out_free_id;
1449         }
1450
1451         error = NVME_SC_INVALID_NS | NVME_SC_DNR;
1452         if ((*id)->ncap == 0) /* namespace not allocated or attached */
1453                 goto out_free_id;
1454         return 0;
1455
1456 out_free_id:
1457         kfree(*id);
1458         return error;
1459 }
1460
1461 static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
1462                 struct nvme_ns_info *info)
1463 {
1464         struct nvme_ns_ids *ids = &info->ids;
1465         struct nvme_id_ns *id;
1466         int ret;
1467
1468         ret = nvme_identify_ns(ctrl, info->nsid, &id);
1469         if (ret)
1470                 return ret;
1471         info->anagrpid = id->anagrpid;
1472         info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1473         info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1474         info->is_ready = true;
1475         if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1476                 dev_info(ctrl->device,
1477                          "Ignoring bogus Namespace Identifiers\n");
1478         } else {
1479                 if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1480                     !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1481                         memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
1482                 if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1483                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1484                         memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
1485         }
1486         kfree(id);
1487         return 0;
1488 }
1489
1490 static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
1491                 struct nvme_ns_info *info)
1492 {
1493         struct nvme_id_ns_cs_indep *id;
1494         struct nvme_command c = {
1495                 .identify.opcode        = nvme_admin_identify,
1496                 .identify.nsid          = cpu_to_le32(info->nsid),
1497                 .identify.cns           = NVME_ID_CNS_NS_CS_INDEP,
1498         };
1499         int ret;
1500
1501         id = kmalloc(sizeof(*id), GFP_KERNEL);
1502         if (!id)
1503                 return -ENOMEM;
1504
1505         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1506         if (!ret) {
1507                 info->anagrpid = id->anagrpid;
1508                 info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1509                 info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1510                 info->is_ready = id->nstat & NVME_NSTAT_NRDY;
1511         }
1512         kfree(id);
1513         return ret;
1514 }
1515
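/*
 * Common helper for the Get/Set Features admin commands: dword11 carries the
 * feature specific value, an optional data buffer may be attached, and the
 * completion dword0 is passed back through *result when requested.
 */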
1516 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1517                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1518 {
1519         union nvme_result res = { 0 };
1520         struct nvme_command c = { };
1521         int ret;
1522
1523         c.features.opcode = op;
1524         c.features.fid = cpu_to_le32(fid);
1525         c.features.dword11 = cpu_to_le32(dword11);
1526
1527         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1528                         buffer, buflen, NVME_QID_ANY, 0, 0);
1529         if (ret >= 0 && result)
1530                 *result = le32_to_cpu(res.u32);
1531         return ret;
1532 }
1533
1534 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1535                       unsigned int dword11, void *buffer, size_t buflen,
1536                       u32 *result)
1537 {
1538         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1539                              buflen, result);
1540 }
1541 EXPORT_SYMBOL_GPL(nvme_set_features);
1542
1543 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1544                       unsigned int dword11, void *buffer, size_t buflen,
1545                       u32 *result)
1546 {
1547         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1548                              buflen, result);
1549 }
1550 EXPORT_SYMBOL_GPL(nvme_get_features);
1551
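/*
 * Request the number of I/O queues via Set Features (Number of Queues).
 * Dword11 takes 0's based counts, submission queues in bits 15:0 and
 * completion queues in bits 31:16, hence the (*count - 1) encoding below.
 * The completion result reports the counts actually allocated in the same
 * format, and the smaller of the two bounds what is handed back to the
 * caller.
 */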
1552 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1553 {
1554         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1555         u32 result;
1556         int status, nr_io_queues;
1557
1558         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1559                         &result);
1560         if (status < 0)
1561                 return status;
1562
1563         /*
1564          * Degraded controllers might return an error when setting the queue
1565          * count.  We still want to be able to bring them online and offer
1566          * access to the admin queue, as that might be the only way to fix them up.
1567          */
1568         if (status > 0) {
1569                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1570                 *count = 0;
1571         } else {
1572                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1573                 *count = min(*count, nr_io_queues);
1574         }
1575
1576         return 0;
1577 }
1578 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1579
1580 #define NVME_AEN_SUPPORTED \
1581         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1582          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1583
1584 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1585 {
1586         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1587         int status;
1588
1589         if (!supported_aens)
1590                 return;
1591
1592         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1593                         NULL, 0, &result);
1594         if (status)
1595                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1596                          supported_aens);
1597
1598         queue_work(nvme_wq, &ctrl->async_event_work);
1599 }
1600
1601 static int nvme_ns_open(struct nvme_ns *ns)
1602 {
1604         /* should never be called due to GENHD_FL_HIDDEN */
1605         if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1606                 goto fail;
1607         if (!nvme_get_ns(ns))
1608                 goto fail;
1609         if (!try_module_get(ns->ctrl->ops->module))
1610                 goto fail_put_ns;
1611
1612         return 0;
1613
1614 fail_put_ns:
1615         nvme_put_ns(ns);
1616 fail:
1617         return -ENXIO;
1618 }
1619
1620 static void nvme_ns_release(struct nvme_ns *ns)
1621 {
1623         module_put(ns->ctrl->ops->module);
1624         nvme_put_ns(ns);
1625 }
1626
1627 static int nvme_open(struct block_device *bdev, fmode_t mode)
1628 {
1629         return nvme_ns_open(bdev->bd_disk->private_data);
1630 }
1631
1632 static void nvme_release(struct gendisk *disk, fmode_t mode)
1633 {
1634         nvme_ns_release(disk->private_data);
1635 }
1636
1637 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1638 {
1639         /* some standard values */
1640         geo->heads = 1 << 6;
1641         geo->sectors = 1 << 5;
1642         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1643         return 0;
1644 }
1645
1646 #ifdef CONFIG_BLK_DEV_INTEGRITY
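/*
 * Register a block layer integrity profile matching the namespace's
 * protection information type and guard size: Type 1/2/3 with either the
 * 16-bit CRC guard (t10_pi) or the 64-bit CRC guard (ext_pi).  For Type 3
 * the reference tag space is exposed as part of the application tag.
 * Unknown combinations leave the profile NULL, so no PI is generated or
 * verified on the host side.
 */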
1647 static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
1648                                 u32 max_integrity_segments)
1649 {
1650         struct blk_integrity integrity = { };
1651
1652         switch (ns->pi_type) {
1653         case NVME_NS_DPS_PI_TYPE3:
1654                 switch (ns->guard_type) {
1655                 case NVME_NVM_NS_16B_GUARD:
1656                         integrity.profile = &t10_pi_type3_crc;
1657                         integrity.tag_size = sizeof(u16) + sizeof(u32);
1658                         integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1659                         break;
1660                 case NVME_NVM_NS_64B_GUARD:
1661                         integrity.profile = &ext_pi_type3_crc64;
1662                         integrity.tag_size = sizeof(u16) + 6;
1663                         integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1664                         break;
1665                 default:
1666                         integrity.profile = NULL;
1667                         break;
1668                 }
1669                 break;
1670         case NVME_NS_DPS_PI_TYPE1:
1671         case NVME_NS_DPS_PI_TYPE2:
1672                 switch (ns->guard_type) {
1673                 case NVME_NVM_NS_16B_GUARD:
1674                         integrity.profile = &t10_pi_type1_crc;
1675                         integrity.tag_size = sizeof(u16);
1676                         integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1677                         break;
1678                 case NVME_NVM_NS_64B_GUARD:
1679                         integrity.profile = &ext_pi_type1_crc64;
1680                         integrity.tag_size = sizeof(u16);
1681                         integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1682                         break;
1683                 default:
1684                         integrity.profile = NULL;
1685                         break;
1686                 }
1687                 break;
1688         default:
1689                 integrity.profile = NULL;
1690                 break;
1691         }
1692
1693         integrity.tuple_size = ns->ms;
1694         blk_integrity_register(disk, &integrity);
1695         blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
1696 }
1697 #else
1698 static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
1699                                 u32 max_integrity_segments)
1700 {
1701 }
1702 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1703
1704 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
1705 {
1706         struct nvme_ctrl *ctrl = ns->ctrl;
1707         struct request_queue *queue = disk->queue;
1708         u32 size = queue_logical_block_size(queue);
1709
1710         if (ctrl->max_discard_sectors == 0) {
1711                 blk_queue_max_discard_sectors(queue, 0);
1712                 return;
1713         }
1714
1715         BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1716                         NVME_DSM_MAX_RANGES);
1717
1718         queue->limits.discard_granularity = size;
1719
1720         /* If discard is already enabled, don't reset queue limits */
1721         if (queue->limits.max_discard_sectors)
1722                 return;
1723
1724         if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
1725                 ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
1726
1727         blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
1728         blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
1729
1730         if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1731                 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1732 }
1733
1734 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1735 {
1736         return uuid_equal(&a->uuid, &b->uuid) &&
1737                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1738                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1739                 a->csi == b->csi;
1740 }
1741
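/*
 * Work out the metadata size, protection information size and guard type
 * for the namespace.  Controllers without extended LBA format support
 * (ELBAS) always use the 8-byte T10-DIF tuple with the 16-bit guard;
 * otherwise the NVM command set specific Identify Namespace data (CNS 05h)
 * is queried and the extended LBA format field decoded to pick between the
 * 16-bit and 64-bit CRC guards.  PI is only kept enabled if it is placed
 * first in the metadata or the metadata consists solely of PI.
 */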
1742 static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
1743 {
1744         bool first = id->dps & NVME_NS_DPS_PI_FIRST;
1745         unsigned lbaf = nvme_lbaf_index(id->flbas);
1746         struct nvme_ctrl *ctrl = ns->ctrl;
1747         struct nvme_command c = { };
1748         struct nvme_id_ns_nvm *nvm;
1749         int ret = 0;
1750         u32 elbaf;
1751
1752         ns->pi_size = 0;
1753         ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
1754         if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
1755                 ns->pi_size = sizeof(struct t10_pi_tuple);
1756                 ns->guard_type = NVME_NVM_NS_16B_GUARD;
1757                 goto set_pi;
1758         }
1759
1760         nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
1761         if (!nvm)
1762                 return -ENOMEM;
1763
1764         c.identify.opcode = nvme_admin_identify;
1765         c.identify.nsid = cpu_to_le32(ns->head->ns_id);
1766         c.identify.cns = NVME_ID_CNS_CS_NS;
1767         c.identify.csi = NVME_CSI_NVM;
1768
1769         ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
1770         if (ret)
1771                 goto free_data;
1772
1773         elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
1774
1775         /* no support for storage tag formats right now */
1776         if (nvme_elbaf_sts(elbaf))
1777                 goto free_data;
1778
1779         ns->guard_type = nvme_elbaf_guard_type(elbaf);
1780         switch (ns->guard_type) {
1781         case NVME_NVM_NS_64B_GUARD:
1782                 ns->pi_size = sizeof(struct crc64_pi_tuple);
1783                 break;
1784         case NVME_NVM_NS_16B_GUARD:
1785                 ns->pi_size = sizeof(struct t10_pi_tuple);
1786                 break;
1787         default:
1788                 break;
1789         }
1790
1791 free_data:
1792         kfree(nvm);
1793 set_pi:
1794         if (ns->pi_size && (first || ns->ms == ns->pi_size))
1795                 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1796         else
1797                 ns->pi_type = 0;
1798
1799         return ret;
1800 }
1801
1802 static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
1803 {
1804         struct nvme_ctrl *ctrl = ns->ctrl;
1805
1806         if (nvme_init_ms(ns, id))
1807                 return;
1808
1809         ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1810         if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1811                 return;
1812
1813         if (ctrl->ops->flags & NVME_F_FABRICS) {
1814                 /*
1815                  * The NVMe over Fabrics specification only supports metadata as
1816                  * part of the extended data LBA.  We rely on HCA/HBA support to
1817                  * remap the separate metadata buffer from the block layer.
1818                  */
1819                 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1820                         return;
1821
1822                 ns->features |= NVME_NS_EXT_LBAS;
1823
1824                 /*
1825                  * The current fabrics transport drivers support namespace
1826                  * metadata formats only if nvme_ns_has_pi() returns true.
1827                  * Suppress support for all other formats so the namespace will
1828                  * have a 0 capacity and not be usable through the block stack.
1829                  *
1830                  * Note, this check will need to be modified if any drivers
1831                  * gain the ability to use other metadata formats.
1832                  */
1833                 if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
1834                         ns->features |= NVME_NS_METADATA_SUPPORTED;
1835         } else {
1836                 /*
1837                  * For PCIe controllers, we can't easily remap the separate
1838                  * metadata buffer from the block layer and thus require a
1839                  * separate metadata buffer for block layer metadata/PI support.
1840                  * We allow extended LBAs for the passthrough interface, though.
1841                  */
1842                 if (id->flbas & NVME_NS_FLBAS_META_EXT)
1843                         ns->features |= NVME_NS_EXT_LBAS;
1844                 else
1845                         ns->features |= NVME_NS_METADATA_SUPPORTED;
1846         }
1847 }
1848
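/*
 * Apply the controller-wide transfer limits to a request queue: cap the
 * transfer size and segment count derived from the controller page size,
 * set the virt boundary so each segment maps onto a PRP entry, require
 * dword-aligned DMA buffers, and advertise the volatile write cache.
 */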
1849 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1850                 struct request_queue *q)
1851 {
1852         bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
1853
1854         if (ctrl->max_hw_sectors) {
1855                 u32 max_segments =
1856                         (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
1857
1858                 max_segments = min_not_zero(max_segments, ctrl->max_segments);
1859                 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1860                 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1861         }
1862         blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
1863         blk_queue_dma_alignment(q, 3);
1864         blk_queue_write_cache(q, vwc, vwc);
1865 }
1866
1867 static void nvme_update_disk_info(struct gendisk *disk,
1868                 struct nvme_ns *ns, struct nvme_id_ns *id)
1869 {
1870         sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
1871         unsigned short bs = 1 << ns->lba_shift;
1872         u32 atomic_bs, phys_bs, io_opt = 0;
1873
1874         /*
1875          * The block layer can't support LBA sizes larger than the page size
1876          * yet, so catch this early and don't allow block I/O.
1877          */
1878         if (ns->lba_shift > PAGE_SHIFT) {
1879                 capacity = 0;
1880                 bs = (1 << 9);
1881         }
1882
1883         blk_integrity_unregister(disk);
1884
1885         atomic_bs = phys_bs = bs;
1886         if (id->nabo == 0) {
1887                 /*
1888                  * Bit 1 indicates whether NAWUPF is defined for this namespace
1889                  * and whether it should be used instead of AWUPF. If NAWUPF ==
1890                  * 0 then AWUPF must be used instead.
1891                  */
1892                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
1893                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
1894                 else
1895                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
1896         }
1897
1898         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
1899                 /* NPWG = Namespace Preferred Write Granularity */
1900                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
1901                 /* NOWS = Namespace Optimal Write Size */
1902                 io_opt = bs * (1 + le16_to_cpu(id->nows));
1903         }
1904
1905         blk_queue_logical_block_size(disk->queue, bs);
1906         /*
1907          * Linux filesystems assume writing a single physical block is
1908          * an atomic operation. Hence limit the physical block size to the
1909          * value of the Atomic Write Unit Power Fail parameter.
1910          */
1911         blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
1912         blk_queue_io_min(disk->queue, phys_bs);
1913         blk_queue_io_opt(disk->queue, io_opt);
1914
1915         /*
1916          * Register a metadata profile for PI, or the plain non-integrity NVMe
1917          * metadata masquerading as Type 0 if supported, otherwise reject block
1918          * I/O to namespaces with metadata except when the namespace supports
1919          * PI, as it can strip/insert in that case.
1920          */
1921         if (ns->ms) {
1922                 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
1923                     (ns->features & NVME_NS_METADATA_SUPPORTED))
1924                         nvme_init_integrity(disk, ns,
1925                                             ns->ctrl->max_integrity_segments);
1926                 else if (!nvme_ns_has_pi(ns))
1927                         capacity = 0;
1928         }
1929
1930         set_capacity_and_notify(disk, capacity);
1931
1932         nvme_config_discard(disk, ns);
1933         blk_queue_max_write_zeroes_sectors(disk->queue,
1934                                            ns->ctrl->max_zeroes_sectors);
1935 }
1936
1937 static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
1938 {
1939         return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
1940 }
1941
1942 static inline bool nvme_first_scan(struct gendisk *disk)
1943 {
1944         /* nvme_alloc_ns() scans the disk prior to adding it */
1945         return !disk_live(disk);
1946 }
1947
1948 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
1949 {
1950         struct nvme_ctrl *ctrl = ns->ctrl;
1951         u32 iob;
1952
1953         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1954             is_power_of_2(ctrl->max_hw_sectors))
1955                 iob = ctrl->max_hw_sectors;
1956         else
1957                 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
1958
1959         if (!iob)
1960                 return;
1961
1962         if (!is_power_of_2(iob)) {
1963                 if (nvme_first_scan(ns->disk))
1964                         pr_warn("%s: ignoring unaligned IO boundary:%u\n",
1965                                 ns->disk->disk_name, iob);
1966                 return;
1967         }
1968
1969         if (blk_queue_is_zoned(ns->disk->queue)) {
1970                 if (nvme_first_scan(ns->disk))
1971                         pr_warn("%s: ignoring zoned namespace IO boundary\n",
1972                                 ns->disk->disk_name);
1973                 return;
1974         }
1975
1976         blk_queue_chunk_sectors(ns->queue, iob);
1977 }
1978
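/*
 * Used for namespaces whose command set cannot be driven through the block
 * layer (see nvme_update_ns_info()): apply the controller limits and the
 * read-only state, but keep the gendisk hidden so I/O is only possible
 * through the passthrough character device.
 */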
1979 static int nvme_update_ns_info_generic(struct nvme_ns *ns,
1980                 struct nvme_ns_info *info)
1981 {
1982         blk_mq_freeze_queue(ns->disk->queue);
1983         nvme_set_queue_limits(ns->ctrl, ns->queue);
1984         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
1985         blk_mq_unfreeze_queue(ns->disk->queue);
1986
1987         if (nvme_ns_head_multipath(ns->head)) {
1988                 blk_mq_freeze_queue(ns->head->disk->queue);
1989                 set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
1990                 nvme_mpath_revalidate_paths(ns);
1991                 blk_stack_limits(&ns->head->disk->queue->limits,
1992                                  &ns->queue->limits, 0);
1993                 ns->head->disk->flags |= GENHD_FL_HIDDEN;
1994                 blk_mq_unfreeze_queue(ns->head->disk->queue);
1995         }
1996
1997         /* Hide the block interface for these devices */
1998         ns->disk->flags |= GENHD_FL_HIDDEN;
1999         set_bit(NVME_NS_READY, &ns->flags);
2000
2001         return 0;
2002 }
2003
2004 static int nvme_update_ns_info_block(struct nvme_ns *ns,
2005                 struct nvme_ns_info *info)
2006 {
2007         struct nvme_id_ns *id;
2008         unsigned lbaf;
2009         int ret;
2010
2011         ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
2012         if (ret)
2013                 return ret;
2014
2015         blk_mq_freeze_queue(ns->disk->queue);
2016         lbaf = nvme_lbaf_index(id->flbas);
2017         ns->lba_shift = id->lbaf[lbaf].ds;
2018         nvme_set_queue_limits(ns->ctrl, ns->queue);
2019
2020         nvme_configure_metadata(ns, id);
2021         nvme_set_chunk_sectors(ns, id);
2022         nvme_update_disk_info(ns->disk, ns, id);
2023
2024         if (ns->head->ids.csi == NVME_CSI_ZNS) {
2025                 ret = nvme_update_zone_info(ns, lbaf);
2026                 if (ret) {
2027                         blk_mq_unfreeze_queue(ns->disk->queue);
2028                         goto out;
2029                 }
2030         }
2031
2032         /*
2033          * Only set the DEAC bit if the device guarantees that reads from
2034          * deallocated data return zeroes.  While the DEAC bit does not
2035          * require that, it must be a no-op if reads from deallocated data
2036          * do not return zeroes.
2037          */
2038         if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
2039                 ns->features |= NVME_NS_DEAC;
2040         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2041         set_bit(NVME_NS_READY, &ns->flags);
2042         blk_mq_unfreeze_queue(ns->disk->queue);
2043
2044         if (blk_queue_is_zoned(ns->queue)) {
2045                 ret = nvme_revalidate_zones(ns);
2046                 if (ret && !nvme_first_scan(ns->disk))
2047                         goto out;
2048         }
2049
2050         if (nvme_ns_head_multipath(ns->head)) {
2051                 blk_mq_freeze_queue(ns->head->disk->queue);
2052                 nvme_update_disk_info(ns->head->disk, ns, id);
2053                 set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
2054                 nvme_mpath_revalidate_paths(ns);
2055                 blk_stack_limits(&ns->head->disk->queue->limits,
2056                                  &ns->queue->limits, 0);
2057                 disk_update_readahead(ns->head->disk);
2058                 blk_mq_unfreeze_queue(ns->head->disk->queue);
2059         }
2060
2061         ret = 0;
2062 out:
2063         /*
2064          * If probing fails due to an unsupported feature, hide the block device,
2065          * but still allow other access.
2066          */
2067         if (ret == -ENODEV) {
2068                 ns->disk->flags |= GENHD_FL_HIDDEN;
2069                 set_bit(NVME_NS_READY, &ns->flags);
2070                 ret = 0;
2071         }
2072         kfree(id);
2073         return ret;
2074 }
2075
2076 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
2077 {
2078         switch (info->ids.csi) {
2079         case NVME_CSI_ZNS:
2080                 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
2081                         dev_info(ns->ctrl->device,
2082         "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
2083                                 info->nsid);
2084                         return nvme_update_ns_info_generic(ns, info);
2085                 }
2086                 return nvme_update_ns_info_block(ns, info);
2087         case NVME_CSI_NVM:
2088                 return nvme_update_ns_info_block(ns, info);
2089         default:
2090                 dev_info(ns->ctrl->device,
2091                         "block device for nsid %u not supported (csi %u)\n",
2092                         info->nsid, info->ids.csi);
2093                 return nvme_update_ns_info_generic(ns, info);
2094         }
2095 }
2096
2097 static char nvme_pr_type(enum pr_type type)
2098 {
2099         switch (type) {
2100         case PR_WRITE_EXCLUSIVE:
2101                 return 1;
2102         case PR_EXCLUSIVE_ACCESS:
2103                 return 2;
2104         case PR_WRITE_EXCLUSIVE_REG_ONLY:
2105                 return 3;
2106         case PR_EXCLUSIVE_ACCESS_REG_ONLY:
2107                 return 4;
2108         case PR_WRITE_EXCLUSIVE_ALL_REGS:
2109                 return 5;
2110         case PR_EXCLUSIVE_ACCESS_ALL_REGS:
2111                 return 6;
2112         default:
2113                 return 0;
2114         }
2115 }
2116
2117 static int nvme_send_ns_head_pr_command(struct block_device *bdev,
2118                 struct nvme_command *c, u8 data[16])
2119 {
2120         struct nvme_ns_head *head = bdev->bd_disk->private_data;
2121         int srcu_idx = srcu_read_lock(&head->srcu);
2122         struct nvme_ns *ns = nvme_find_path(head);
2123         int ret = -EWOULDBLOCK;
2124
2125         if (ns) {
2126                 c->common.nsid = cpu_to_le32(ns->head->ns_id);
2127                 ret = nvme_submit_sync_cmd(ns->queue, c, data, 16);
2128         }
2129         srcu_read_unlock(&head->srcu, srcu_idx);
2130         return ret;
2131 }
2132
2133 static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
2134                 u8 data[16])
2135 {
2136         c->common.nsid = cpu_to_le32(ns->head->ns_id);
2137         return nvme_submit_sync_cmd(ns->queue, c, data, 16);
2138 }
2139
2140 static int nvme_sc_to_pr_err(int nvme_sc)
2141 {
2142         if (nvme_is_path_error(nvme_sc))
2143                 return PR_STS_PATH_FAILED;
2144
2145         switch (nvme_sc) {
2146         case NVME_SC_SUCCESS:
2147                 return PR_STS_SUCCESS;
2148         case NVME_SC_RESERVATION_CONFLICT:
2149                 return PR_STS_RESERVATION_CONFLICT;
2150         case NVME_SC_ONCS_NOT_SUPPORTED:
2151                 return -EOPNOTSUPP;
2152         case NVME_SC_BAD_ATTRIBUTES:
2153         case NVME_SC_INVALID_OPCODE:
2154         case NVME_SC_INVALID_FIELD:
2155         case NVME_SC_INVALID_NS:
2156                 return -EINVAL;
2157         default:
2158                 return PR_STS_IOERR;
2159         }
2160 }
2161
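/*
 * Build and issue an NVMe reservation command.  cdw10 carries the
 * reservation action in bits 2:0, the Ignore Existing Key flag in bit 3
 * and, for acquire and release, the reservation type in bits 15:8.  The
 * 16-byte payload holds the current reservation key in the first 8 bytes
 * and the new/preempt key in the second 8 bytes for the actions that use
 * one.
 */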
2162 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
2163                                 u64 key, u64 sa_key, u8 op)
2164 {
2165         struct nvme_command c = { };
2166         u8 data[16] = { 0, };
2167         int ret;
2168
2169         put_unaligned_le64(key, &data[0]);
2170         put_unaligned_le64(sa_key, &data[8]);
2171
2172         c.common.opcode = op;
2173         c.common.cdw10 = cpu_to_le32(cdw10);
2174
2175         if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
2176             bdev->bd_disk->fops == &nvme_ns_head_ops)
2177                 ret = nvme_send_ns_head_pr_command(bdev, &c, data);
2178         else
2179                 ret = nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c,
2180                                               data);
2181         if (ret < 0)
2182                 return ret;
2183
2184         return nvme_sc_to_pr_err(ret);
2185 }
2186
2187 static int nvme_pr_register(struct block_device *bdev, u64 old,
2188                 u64 new, unsigned flags)
2189 {
2190         u32 cdw10;
2191
2192         if (flags & ~PR_FL_IGNORE_KEY)
2193                 return -EOPNOTSUPP;
2194
2195         cdw10 = old ? 2 : 0;
2196         cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
2197         cdw10 |= (1 << 30) | (1 << 31); /* CPTPL = 11b: persist through power loss */
2198         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
2199 }
2200
2201 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
2202                 enum pr_type type, unsigned flags)
2203 {
2204         u32 cdw10;
2205
2206         if (flags & ~PR_FL_IGNORE_KEY)
2207                 return -EOPNOTSUPP;
2208
2209         cdw10 = nvme_pr_type(type) << 8;
2210         cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
2211         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
2212 }
2213
2214 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
2215                 enum pr_type type, bool abort)
2216 {
2217         u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2218
2219         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
2220 }
2221
2222 static int nvme_pr_clear(struct block_device *bdev, u64 key)
2223 {
2224         u32 cdw10 = 1 | (key ? 0 : 1 << 3);
2225
2226         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2227 }
2228
2229 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2230 {
2231         u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);
2232
2233         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2234 }
2235
2236 const struct pr_ops nvme_pr_ops = {
2237         .pr_register    = nvme_pr_register,
2238         .pr_reserve     = nvme_pr_reserve,
2239         .pr_release     = nvme_pr_release,
2240         .pr_preempt     = nvme_pr_preempt,
2241         .pr_clear       = nvme_pr_clear,
2242 };
2243
2244 #ifdef CONFIG_BLK_SED_OPAL
2245 static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2246                 bool send)
2247 {
2248         struct nvme_ctrl *ctrl = data;
2249         struct nvme_command cmd = { };
2250
2251         if (send)
2252                 cmd.common.opcode = nvme_admin_security_send;
2253         else
2254                 cmd.common.opcode = nvme_admin_security_recv;
2255         cmd.common.nsid = 0;
2256         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2257         cmd.common.cdw11 = cpu_to_le32(len);
2258
2259         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2260                         NVME_QID_ANY, 1, 0);
2261 }
2262
2263 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2264 {
2265         if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
2266                 if (!ctrl->opal_dev)
2267                         ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
2268                 else if (was_suspended)
2269                         opal_unlock_from_suspend(ctrl->opal_dev);
2270         } else {
2271                 free_opal_dev(ctrl->opal_dev);
2272                 ctrl->opal_dev = NULL;
2273         }
2274 }
2275 #else
2276 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2277 {
2278 }
2279 #endif /* CONFIG_BLK_SED_OPAL */
2280
2281 #ifdef CONFIG_BLK_DEV_ZONED
2282 static int nvme_report_zones(struct gendisk *disk, sector_t sector,
2283                 unsigned int nr_zones, report_zones_cb cb, void *data)
2284 {
2285         return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
2286                         data);
2287 }
2288 #else
2289 #define nvme_report_zones       NULL
2290 #endif /* CONFIG_BLK_DEV_ZONED */
2291
2292 static const struct block_device_operations nvme_bdev_ops = {
2293         .owner          = THIS_MODULE,
2294         .ioctl          = nvme_ioctl,
2295         .compat_ioctl   = blkdev_compat_ptr_ioctl,
2296         .open           = nvme_open,
2297         .release        = nvme_release,
2298         .getgeo         = nvme_getgeo,
2299         .report_zones   = nvme_report_zones,
2300         .pr_ops         = &nvme_pr_ops,
2301 };
2302
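/*
 * Poll CSTS every 1-2ms until the masked value matches, giving up if the
 * register reads all ones (device gone), a fatal signal is pending, or the
 * timeout (in seconds) expires.
 */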
2303 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
2304                 u32 timeout, const char *op)
2305 {
2306         unsigned long timeout_jiffies = jiffies + timeout * HZ;
2307         u32 csts;
2308         int ret;
2309
2310         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2311                 if (csts == ~0)
2312                         return -ENODEV;
2313                 if ((csts & mask) == val)
2314                         break;
2315
2316                 usleep_range(1000, 2000);
2317                 if (fatal_signal_pending(current))
2318                         return -EINTR;
2319                 if (time_after(jiffies, timeout_jiffies)) {
2320                         dev_err(ctrl->device,
2321                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2322                                 op, csts);
2323                         return -ENODEV;
2324                 }
2325         }
2326
2327         return ret;
2328 }
2329
2330 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2331 {
2332         int ret;
2333
2334         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2335         if (shutdown)
2336                 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2337         else
2338                 ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2339
2340         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2341         if (ret)
2342                 return ret;
2343
2344         if (shutdown) {
2345                 return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
2346                                        NVME_CSTS_SHST_CMPLT,
2347                                        ctrl->shutdown_timeout, "shutdown");
2348         }
2349         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2350                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2351         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
2352                                (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
2353 }
2354 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2355
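/*
 * Bring the controller to the ready state: validate CAP.MPSMIN against the
 * host page size, program CC (command set, memory page size, arbitration
 * and queue entry sizes), then set CC.EN and wait for CSTS.RDY.  When the
 * controller supports CRMS the readiness timeout is taken from the CRTO
 * register (CRIMT with CC.CRIME set, else CRWMT) instead of CAP.TO.
 */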
2356 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2357 {
2358         unsigned dev_page_min;
2359         u32 timeout;
2360         int ret;
2361
2362         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2363         if (ret) {
2364                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2365                 return ret;
2366         }
2367         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2368
2369         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2370                 dev_err(ctrl->device,
2371                         "Minimum device page size %u too large for host (%u)\n",
2372                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2373                 return -ENODEV;
2374         }
2375
2376         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2377                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2378         else
2379                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2380
2381         if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
2382                 u32 crto;
2383
2384                 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
2385                 if (ret) {
2386                         dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
2387                                 ret);
2388                         return ret;
2389                 }
2390
2391                 if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
2392                         ctrl->ctrl_config |= NVME_CC_CRIME;
2393                         timeout = NVME_CRTO_CRIMT(crto);
2394                 } else {
2395                         timeout = NVME_CRTO_CRWMT(crto);
2396                 }
2397         } else {
2398                 timeout = NVME_CAP_TIMEOUT(ctrl->cap);
2399         }
2400
2401         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2402         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2403         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2404         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2405         if (ret)
2406                 return ret;
2407
2408         /* Flush write to device (required if transport is PCI) */
2409         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
2410         if (ret)
2411                 return ret;
2412
2413         ctrl->ctrl_config |= NVME_CC_ENABLE;
2414         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2415         if (ret)
2416                 return ret;
2417         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
2418                                (timeout + 1) / 2, "initialisation");
2419 }
2420 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2421
2422 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2423 {
2424         __le64 ts;
2425         int ret;
2426
2427         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2428                 return 0;
2429
2430         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2431         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2432                         NULL);
2433         if (ret)
2434                 dev_warn_once(ctrl->device,
2435                         "could not set timestamp (%d)\n", ret);
2436         return ret;
2437 }
2438
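/*
 * Enable optional host behaviour via the Host Behavior Support feature:
 * Advanced Command Retry Enable (ACRE) when the controller reports command
 * retry delay times, and LBA Format Extension Enable (LBAFEE) when extended
 * LBA formats (ELBAS) are supported.
 */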
2439 static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
2440 {
2441         struct nvme_feat_host_behavior *host;
2442         u8 acre = 0, lbafee = 0;
2443         int ret;
2444
2445         /* Don't bother enabling the feature if retry delay is not reported */
2446         if (ctrl->crdt[0])
2447                 acre = NVME_ENABLE_ACRE;
2448         if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
2449                 lbafee = NVME_ENABLE_LBAFEE;
2450
2451         if (!acre && !lbafee)
2452                 return 0;
2453
2454         host = kzalloc(sizeof(*host), GFP_KERNEL);
2455         if (!host)
2456                 return 0;
2457
2458         host->acre = acre;
2459         host->lbafee = lbafee;
2460         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2461                                 host, sizeof(*host), NULL);
2462         kfree(host);
2463         return ret;
2464 }
2465
2466 /*
2467  * The function checks whether the given total (exlat + enlat) latency of
2468  * a power state allows the latter to be used as an APST transition target.
2469  * It does so by comparing the latency to the primary and secondary latency
2470  * tolerances defined by module params. If there's a match, the corresponding
2471  * timeout value is returned and the matching tolerance index (1 or 2) is
2472  * reported.
2473  */
2474 static bool nvme_apst_get_transition_time(u64 total_latency,
2475                 u64 *transition_time, unsigned *last_index)
2476 {
2477         if (total_latency <= apst_primary_latency_tol_us) {
2478                 if (*last_index == 1)
2479                         return false;
2480                 *last_index = 1;
2481                 *transition_time = apst_primary_timeout_ms;
2482                 return true;
2483         }
2484         if (apst_secondary_timeout_ms &&
2485                 total_latency <= apst_secondary_latency_tol_us) {
2486                 if (*last_index <= 2)
2487                         return false;
2488                 *last_index = 2;
2489                 *transition_time = apst_secondary_timeout_ms;
2490                 return true;
2491         }
2492         return false;
2493 }
2494
2495 /*
2496  * APST (Autonomous Power State Transition) lets us program a table of power
2497  * state transitions that the controller will perform automatically.
2498  *
2499  * Depending on module params, one of the two supported techniques will be used:
2500  *
2501  * - If the parameters provide explicit timeouts and tolerances, they will be
2502  *   used to build a table with up to 2 non-operational states to transition to.
2503  *   The default parameter values were selected based on the values used by
2504  *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
2505  *   regeneration of the APST table in the event of switching between external
2506  *   and battery power, the timeouts and tolerances reflect a compromise
2507  *   between values used by Microsoft for AC and battery scenarios.
2508  * - If not, we'll configure the table with a simple heuristic: we are willing
2509  *   to spend at most 2% of the time transitioning between power states.
2510  *   Therefore, when running in any given state, we will enter the next
2511  *   lower-power non-operational state after waiting 50 * (enlat + exlat)
2512  *   microseconds, as long as that state's exit latency is under the requested
2513  *   maximum latency.
2514  *
2515  * We will not autonomously enter any non-operational state for which the total
2516  * latency exceeds ps_max_latency_us.
2517  *
2518  * Users can set ps_max_latency_us to zero to turn off APST.
2519  */
2520 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2521 {
2522         struct nvme_feat_auto_pst *table;
2523         unsigned apste = 0;
2524         u64 max_lat_us = 0;
2525         __le64 target = 0;
2526         int max_ps = -1;
2527         int state;
2528         int ret;
2529         unsigned last_lt_index = UINT_MAX;
2530
2531         /*
2532          * If APST isn't supported or if we haven't been initialized yet,
2533          * then don't do anything.
2534          */
2535         if (!ctrl->apsta)
2536                 return 0;
2537
2538         if (ctrl->npss > 31) {
2539                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2540                 return 0;
2541         }
2542
2543         table = kzalloc(sizeof(*table), GFP_KERNEL);
2544         if (!table)
2545                 return 0;
2546
2547         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2548                 /* Turn off APST. */
2549                 dev_dbg(ctrl->device, "APST disabled\n");
2550                 goto done;
2551         }
2552
2553         /*
2554          * Walk through all states from lowest- to highest-power.
2555          * According to the spec, lower-numbered states use more power.  NPSS,
2556          * despite the name, is the index of the lowest-power state, not the
2557          * number of states.
2558          */
2559         for (state = (int)ctrl->npss; state >= 0; state--) {
2560                 u64 total_latency_us, exit_latency_us, transition_ms;
2561
2562                 if (target)
2563                         table->entries[state] = target;
2564
2565                 /*
2566                  * Don't allow transitions to the deepest state if it's quirked
2567                  * off.
2568                  */
2569                 if (state == ctrl->npss &&
2570                     (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2571                         continue;
2572
2573                 /*
2574                  * Is this state a useful non-operational state for higher-power
2575                  * states to autonomously transition to?
2576                  */
2577                 if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
2578                         continue;
2579
2580                 exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2581                 if (exit_latency_us > ctrl->ps_max_latency_us)
2582                         continue;
2583
2584                 total_latency_us = exit_latency_us +
2585                         le32_to_cpu(ctrl->psd[state].entry_lat);
2586
2587                 /*
2588                  * This state is good. It can be used as the APST idle target
2589                  * for higher power states.
2590                  */
2591                 if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
2592                         if (!nvme_apst_get_transition_time(total_latency_us,
2593                                         &transition_ms, &last_lt_index))
2594                                 continue;
2595                 } else {
2596                         transition_ms = total_latency_us + 19;
2597                         do_div(transition_ms, 20);
2598                         if (transition_ms > (1 << 24) - 1)
2599                                 transition_ms = (1 << 24) - 1;
2600                 }
2601
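		/*
		 * An APST table entry packs the idle transition power state
		 * into bits 7:3 and the idle time prior to transition, in
		 * milliseconds, into bits 31:8, which is what the shifts
		 * below encode.
		 */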
2602                 target = cpu_to_le64((state << 3) | (transition_ms << 8));
2603                 if (max_ps == -1)
2604                         max_ps = state;
2605                 if (total_latency_us > max_lat_us)
2606                         max_lat_us = total_latency_us;
2607         }
2608
2609         if (max_ps == -1)
2610                 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2611         else
2612                 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2613                         max_ps, max_lat_us, (int)sizeof(*table), table);
2614         apste = 1;
2615
2616 done:
2617         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2618                                 table, sizeof(*table), NULL);
2619         if (ret)
2620                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2621         kfree(table);
2622         return ret;
2623 }
2624
2625 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2626 {
2627         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2628         u64 latency;
2629
2630         switch (val) {
2631         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2632         case PM_QOS_LATENCY_ANY:
2633                 latency = U64_MAX;
2634                 break;
2635
2636         default:
2637                 latency = val;
2638         }
2639
2640         if (ctrl->ps_max_latency_us != latency) {
2641                 ctrl->ps_max_latency_us = latency;
2642                 if (ctrl->state == NVME_CTRL_LIVE)
2643                         nvme_configure_apst(ctrl);
2644         }
2645 }
2646
2647 struct nvme_core_quirk_entry {
2648         /*
2649          * NVMe model and firmware strings are padded with spaces.  For
2650          * simplicity, strings in the quirk table are padded with NULLs
2651          * instead.
2652          */
2653         u16 vid;
2654         const char *mn;
2655         const char *fr;
2656         unsigned long quirks;
2657 };
2658
2659 static const struct nvme_core_quirk_entry core_quirks[] = {
2660         {
2661                 /*
2662                  * This Toshiba device seems to die using any APST states.  See:
2663                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2664                  */
2665                 .vid = 0x1179,
2666                 .mn = "THNSF5256GPUK TOSHIBA",
2667                 .quirks = NVME_QUIRK_NO_APST,
2668         },
2669         {
2670                 /*
2671                  * This LiteON CL1-3D*-Q11 firmware version has a race
2672                  * condition triggered by suspend-to-idle transitions.
2673                  * LiteON has resolved the problem in newer firmware.
2674                  */
2675                 .vid = 0x14a4,
2676                 .fr = "22301111",
2677                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2678         },
2679         {
2680                 /*
2681                  * This Kioxia CD6-V Series / HPE PE8030 device times out and
2682                  * aborts I/O during any load, but more easily reproducible
2683                  * with discards (fstrim).
2684                  *
2685                  * The device is left in a state where it is also not possible
2686                  * to use "nvme set-feature" to disable APST, but booting with
2687                  * nvme_core.default_ps_max_latency=0 works.
2688                  */
2689                 .vid = 0x1e0f,
2690                 .mn = "KCD6XVUL6T40",
2691                 .quirks = NVME_QUIRK_NO_APST,
2692         },
2693         {
2694                 /*
2695                  * The external Samsung X5 SSD fails initialization without a
2696                  * delay before checking if it is ready and has a whole set of
2697                  * other problems.  To make this even more interesting, it
2698                  * shares the PCI ID with internal Samsung 970 Evo Plus that
2699                  * does not need or want these quirks.
2700                  */
2701                 .vid = 0x144d,
2702                 .mn = "Samsung Portable SSD X5",
2703                 .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
2704                           NVME_QUIRK_NO_DEEPEST_PS |
2705                           NVME_QUIRK_IGNORE_DEV_SUBNQN,
2706         }
2707 };
2708
2709 /* match is null-terminated but idstr is space-padded. */
2710 static bool string_matches(const char *idstr, const char *match, size_t len)
2711 {
2712         size_t matchlen;
2713
2714         if (!match)
2715                 return true;
2716
2717         matchlen = strlen(match);
2718         WARN_ON_ONCE(matchlen > len);
2719
2720         if (memcmp(idstr, match, matchlen))
2721                 return false;
2722
2723         for (; matchlen < len; matchlen++)
2724                 if (idstr[matchlen] != ' ')
2725                         return false;
2726
2727         return true;
2728 }
2729
2730 static bool quirk_matches(const struct nvme_id_ctrl *id,
2731                           const struct nvme_core_quirk_entry *q)
2732 {
2733         return q->vid == le16_to_cpu(id->vid) &&
2734                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2735                 string_matches(id->fr, q->fr, sizeof(id->fr));
2736 }
2737
2738 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2739                 struct nvme_id_ctrl *id)
2740 {
2741         size_t nqnlen;
2742         int off;
2743
2744         if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2745                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2746                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2747                         strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2748                         return;
2749                 }
2750
2751                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2752                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2753         }
2754
2755         /*
2756          * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
2757          * Base Specification 2.0.  It is slightly different from the format
2758          * specified there due to historic reasons, and we can't change it now.
2759          */
2760         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2761                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2762                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2763         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2764         off += sizeof(id->sn);
2765         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2766         off += sizeof(id->mn);
2767         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2768 }
2769
2770 static void nvme_release_subsystem(struct device *dev)
2771 {
2772         struct nvme_subsystem *subsys =
2773                 container_of(dev, struct nvme_subsystem, dev);
2774
2775         if (subsys->instance >= 0)
2776                 ida_free(&nvme_instance_ida, subsys->instance);
2777         kfree(subsys);
2778 }
2779
2780 static void nvme_destroy_subsystem(struct kref *ref)
2781 {
2782         struct nvme_subsystem *subsys =
2783                         container_of(ref, struct nvme_subsystem, ref);
2784
2785         mutex_lock(&nvme_subsystems_lock);
2786         list_del(&subsys->entry);
2787         mutex_unlock(&nvme_subsystems_lock);
2788
2789         ida_destroy(&subsys->ns_ida);
2790         device_del(&subsys->dev);
2791         put_device(&subsys->dev);
2792 }
2793
2794 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2795 {
2796         kref_put(&subsys->ref, nvme_destroy_subsystem);
2797 }
2798
2799 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2800 {
2801         struct nvme_subsystem *subsys;
2802
2803         lockdep_assert_held(&nvme_subsystems_lock);
2804
2805         /*
2806          * Fail matches for discovery subsystems.  This results in each
2807          * discovery controller being bound to a unique subsystem.
2808          * This avoids issues with validating controller values
2809          * that can only be true when there is a single unique subsystem.
2810          * There may be multiple and completely independent entities
2811          * that provide discovery controllers.
2812          */
2813         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2814                 return NULL;
2815
2816         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2817                 if (strcmp(subsys->subnqn, subsysnqn))
2818                         continue;
2819                 if (!kref_get_unless_zero(&subsys->ref))
2820                         continue;
2821                 return subsys;
2822         }
2823
2824         return NULL;
2825 }
2826
2827 #define SUBSYS_ATTR_RO(_name, _mode, _show)                     \
2828         struct device_attribute subsys_attr_##_name = \
2829                 __ATTR(_name, _mode, _show, NULL)
2830
2831 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2832                                     struct device_attribute *attr,
2833                                     char *buf)
2834 {
2835         struct nvme_subsystem *subsys =
2836                 container_of(dev, struct nvme_subsystem, dev);
2837
2838         return sysfs_emit(buf, "%s\n", subsys->subnqn);
2839 }
2840 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2841
2842 static ssize_t nvme_subsys_show_type(struct device *dev,
2843                                     struct device_attribute *attr,
2844                                     char *buf)
2845 {
2846         struct nvme_subsystem *subsys =
2847                 container_of(dev, struct nvme_subsystem, dev);
2848
2849         switch (subsys->subtype) {
2850         case NVME_NQN_DISC:
2851                 return sysfs_emit(buf, "discovery\n");
2852         case NVME_NQN_NVME:
2853                 return sysfs_emit(buf, "nvm\n");
2854         default:
2855                 return sysfs_emit(buf, "reserved\n");
2856         }
2857 }
2858 static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
2859
2860 #define nvme_subsys_show_str_function(field)                            \
2861 static ssize_t subsys_##field##_show(struct device *dev,                \
2862                             struct device_attribute *attr, char *buf)   \
2863 {                                                                       \
2864         struct nvme_subsystem *subsys =                                 \
2865                 container_of(dev, struct nvme_subsystem, dev);          \
2866         return sysfs_emit(buf, "%.*s\n",                                \
2867                            (int)sizeof(subsys->field), subsys->field);  \
2868 }                                                                       \
2869 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2870
2871 nvme_subsys_show_str_function(model);
2872 nvme_subsys_show_str_function(serial);
2873 nvme_subsys_show_str_function(firmware_rev);
2874
2875 static struct attribute *nvme_subsys_attrs[] = {
2876         &subsys_attr_model.attr,
2877         &subsys_attr_serial.attr,
2878         &subsys_attr_firmware_rev.attr,
2879         &subsys_attr_subsysnqn.attr,
2880         &subsys_attr_subsystype.attr,
2881 #ifdef CONFIG_NVME_MULTIPATH
2882         &subsys_attr_iopolicy.attr,
2883 #endif
2884         NULL,
2885 };
2886
2887 static const struct attribute_group nvme_subsys_attrs_group = {
2888         .attrs = nvme_subsys_attrs,
2889 };
2890
2891 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2892         &nvme_subsys_attrs_group,
2893         NULL,
2894 };
2895
2896 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
2897 {
2898         return ctrl->opts && ctrl->opts->discovery_nqn;
2899 }
2900
2901 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2902                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2903 {
2904         struct nvme_ctrl *tmp;
2905
2906         lockdep_assert_held(&nvme_subsystems_lock);
2907
2908         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2909                 if (nvme_state_terminal(tmp))
2910                         continue;
2911
2912                 if (tmp->cntlid == ctrl->cntlid) {
2913                         dev_err(ctrl->device,
2914                                 "Duplicate cntlid %u with %s, subsys %s, rejecting\n",
2915                                 ctrl->cntlid, dev_name(tmp->device),
2916                                 subsys->subnqn);
2917                         return false;
2918                 }
2919
2920                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2921                     nvme_discovery_ctrl(ctrl))
2922                         continue;
2923
2924                 dev_err(ctrl->device,
2925                         "Subsystem does not support multiple controllers\n");
2926                 return false;
2927         }
2928
2929         return true;
2930 }
2931
2932 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2933 {
2934         struct nvme_subsystem *subsys, *found;
2935         int ret;
2936
2937         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2938         if (!subsys)
2939                 return -ENOMEM;
2940
2941         subsys->instance = -1;
2942         mutex_init(&subsys->lock);
2943         kref_init(&subsys->ref);
2944         INIT_LIST_HEAD(&subsys->ctrls);
2945         INIT_LIST_HEAD(&subsys->nsheads);
2946         nvme_init_subnqn(subsys, ctrl, id);
2947         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2948         memcpy(subsys->model, id->mn, sizeof(subsys->model));
2949         subsys->vendor_id = le16_to_cpu(id->vid);
2950         subsys->cmic = id->cmic;
2951
2952         /* Versions prior to 1.4 don't necessarily report a valid type */
2953         if (id->cntrltype == NVME_CTRL_DISC ||
2954             !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
2955                 subsys->subtype = NVME_NQN_DISC;
2956         else
2957                 subsys->subtype = NVME_NQN_NVME;
2958
2959         if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
2960                 dev_err(ctrl->device,
2961                         "Subsystem %s is not a discovery controller\n",
2962                         subsys->subnqn);
2963                 kfree(subsys);
2964                 return -EINVAL;
2965         }
2966         subsys->awupf = le16_to_cpu(id->awupf);
2967         nvme_mpath_default_iopolicy(subsys);
2968
2969         subsys->dev.class = nvme_subsys_class;
2970         subsys->dev.release = nvme_release_subsystem;
2971         subsys->dev.groups = nvme_subsys_attrs_groups;
2972         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
2973         device_initialize(&subsys->dev);
2974
2975         mutex_lock(&nvme_subsystems_lock);
2976         found = __nvme_find_get_subsystem(subsys->subnqn);
2977         if (found) {
2978                 put_device(&subsys->dev);
2979                 subsys = found;
2980
2981                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2982                         ret = -EINVAL;
2983                         goto out_put_subsystem;
2984                 }
2985         } else {
2986                 ret = device_add(&subsys->dev);
2987                 if (ret) {
2988                         dev_err(ctrl->device,
2989                                 "failed to register subsystem device.\n");
2990                         put_device(&subsys->dev);
2991                         goto out_unlock;
2992                 }
2993                 ida_init(&subsys->ns_ida);
2994                 list_add_tail(&subsys->entry, &nvme_subsystems);
2995         }
2996
2997         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2998                                 dev_name(ctrl->device));
2999         if (ret) {
3000                 dev_err(ctrl->device,
3001                         "failed to create sysfs link from subsystem.\n");
3002                 goto out_put_subsystem;
3003         }
3004
3005         if (!found)
3006                 subsys->instance = ctrl->instance;
3007         ctrl->subsys = subsys;
3008         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
3009         mutex_unlock(&nvme_subsystems_lock);
3010         return 0;
3011
3012 out_put_subsystem:
3013         nvme_put_subsystem(subsys);
3014 out_unlock:
3015         mutex_unlock(&nvme_subsystems_lock);
3016         return ret;
3017 }
3018
3019 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
3020                 void *log, size_t size, u64 offset)
3021 {
3022         struct nvme_command c = { };
3023         u32 dwlen = nvme_bytes_to_numd(size);
3024
3025         c.get_log_page.opcode = nvme_admin_get_log_page;
3026         c.get_log_page.nsid = cpu_to_le32(nsid);
3027         c.get_log_page.lid = log_page;
3028         c.get_log_page.lsp = lsp;
3029         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
3030         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
3031         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
3032         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3033         c.get_log_page.csi = csi;
3034
3035         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
3036 }
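
/*
 * Illustrative usage sketch (not taken verbatim from a caller in this file):
 * reading the SMART / Health log for all namespaces.
 *
 *	struct nvme_smart_log smart;
 *	int error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
 *				 NVME_CSI_NVM, &smart, sizeof(smart), 0);
 *
 * sizeof(smart) is 512 bytes, i.e. 128 dwords, so the zero-based NUMD is 127;
 * after the 16/16 bit split above that becomes numdl = 127 and numdu = 0.
 */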
3037
3038 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3039                                 struct nvme_effects_log **log)
3040 {
3041         struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
3042         int ret;
3043
3044         if (cel)
3045                 goto out;
3046
3047         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3048         if (!cel)
3049                 return -ENOMEM;
3050
3051         ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3052                         cel, sizeof(*cel), 0);
3053         if (ret) {
3054                 kfree(cel);
3055                 return ret;
3056         }
3057
3058         xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3059 out:
3060         *log = cel;
3061         return 0;
3062 }
3063
3064 static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
3065 {
3066         u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
3067
3068         if (check_shl_overflow(1U, units + page_shift - 9, &val))
3069                 return UINT_MAX;
3070         return val;
3071 }
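
/*
 * Worked example: with CAP.MPSMIN == 0 the controller page size is 4 KiB and
 * page_shift is 12, so a units value of 5 (e.g. MDTS = 5, 2^5 pages = 128 KiB)
 * yields 1 << (5 + 12 - 9) = 256 sectors; the "- 9" converts bytes into
 * 512-byte block layer sectors.
 */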
3072
3073 static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
3074 {
3075         struct nvme_command c = { };
3076         struct nvme_id_ctrl_nvm *id;
3077         int ret;
3078
3079         if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
3080                 ctrl->max_discard_sectors = UINT_MAX;
3081                 ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
3082         } else {
3083                 ctrl->max_discard_sectors = 0;
3084                 ctrl->max_discard_segments = 0;
3085         }
3086
3087         /*
3088          * Even though the NVMe spec explicitly states that MDTS is not applicable
3089          * to Write Zeroes, we are cautious and limit the size to the
3090          * controller's max_hw_sectors value, which is based on the MDTS field
3091          * and possibly other limiting factors.
3092          */
3093         if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
3094             !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
3095                 ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
3096         else
3097                 ctrl->max_zeroes_sectors = 0;
3098
3099         if (nvme_ctrl_limited_cns(ctrl))
3100                 return 0;
3101
3102         id = kzalloc(sizeof(*id), GFP_KERNEL);
3103         if (!id)
3104                 return -ENOMEM;
3105
3106         c.identify.opcode = nvme_admin_identify;
3107         c.identify.cns = NVME_ID_CNS_CS_CTRL;
3108         c.identify.csi = NVME_CSI_NVM;
3109
3110         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
3111         if (ret)
3112                 goto free_data;
3113
3114         if (id->dmrl)
3115                 ctrl->max_discard_segments = id->dmrl;
3116         ctrl->dmrsl = le32_to_cpu(id->dmrsl);
3117         if (id->wzsl)
3118                 ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
3119
3120 free_data:
3121         kfree(id);
3122         return ret;
3123 }
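
/*
 * Example of the resulting limits (illustrative values): a controller that
 * reports DMRL = 255 and WZSL = 6 with a 4 KiB MPSMIN ends up with
 * max_discard_segments = 255 and max_zeroes_sectors =
 * nvme_mps_to_sectors(ctrl, 6) = 512 sectors (256 KiB).
 */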
3124
3125 static int nvme_init_identify(struct nvme_ctrl *ctrl)
3126 {
3127         struct nvme_id_ctrl *id;
3128         u32 max_hw_sectors;
3129         bool prev_apst_enabled;
3130         int ret;
3131
3132         ret = nvme_identify_ctrl(ctrl, &id);
3133         if (ret) {
3134                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3135                 return -EIO;
3136         }
3137
3138         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3139                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3140                 if (ret < 0)
3141                         goto out_free;
3142         }
3143
3144         if (!(ctrl->ops->flags & NVME_F_FABRICS))
3145                 ctrl->cntlid = le16_to_cpu(id->cntlid);
3146
3147         if (!ctrl->identified) {
3148                 unsigned int i;
3149
3150                 /*
3151                  * Check for quirks.  Quirks can depend on the firmware version,
3152                  * so, in principle, the set of quirks present can change
3153                  * across a reset.  As a possible future enhancement, we
3154                  * could re-scan for quirks every time we reinitialize
3155                  * the device, but we'd have to make sure that the driver
3156                  * behaves intelligently if the quirks change.
3157                  */
3158                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
3159                         if (quirk_matches(id, &core_quirks[i]))
3160                                 ctrl->quirks |= core_quirks[i].quirks;
3161                 }
3162
3163                 ret = nvme_init_subsystem(ctrl, id);
3164                 if (ret)
3165                         goto out_free;
3166         }
3167         memcpy(ctrl->subsys->firmware_rev, id->fr,
3168                sizeof(ctrl->subsys->firmware_rev));
3169
3170         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3171                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3172                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3173         }
3174
3175         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3176         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3177         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3178
3179         ctrl->oacs = le16_to_cpu(id->oacs);
3180         ctrl->oncs = le16_to_cpu(id->oncs);
3181         ctrl->mtfa = le16_to_cpu(id->mtfa);
3182         ctrl->oaes = le32_to_cpu(id->oaes);
3183         ctrl->wctemp = le16_to_cpu(id->wctemp);
3184         ctrl->cctemp = le16_to_cpu(id->cctemp);
3185
3186         atomic_set(&ctrl->abort_limit, id->acl + 1);
3187         ctrl->vwc = id->vwc;
3188         if (id->mdts)
3189                 max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
3190         else
3191                 max_hw_sectors = UINT_MAX;
3192         ctrl->max_hw_sectors =
3193                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3194
3195         nvme_set_queue_limits(ctrl, ctrl->admin_q);
3196         ctrl->sgls = le32_to_cpu(id->sgls);
3197         ctrl->kas = le16_to_cpu(id->kas);
3198         ctrl->max_namespaces = le32_to_cpu(id->mnan);
3199         ctrl->ctratt = le32_to_cpu(id->ctratt);
3200
3201         ctrl->cntrltype = id->cntrltype;
3202         ctrl->dctype = id->dctype;
3203
3204         if (id->rtd3e) {
3205                 /* us -> s */
3206                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3207
3208                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3209                                                  shutdown_timeout, 60);
3210
3211                 if (ctrl->shutdown_timeout != shutdown_timeout)
3212                         dev_info(ctrl->device,
3213                                  "Shutdown timeout set to %u seconds\n",
3214                                  ctrl->shutdown_timeout);
3215         } else
3216                 ctrl->shutdown_timeout = shutdown_timeout;
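
        /*
         * Example: a controller reporting RTD3E = 8,000,000 us has a
         * transition time of 8 s, which falls inside the clamp range of
         * [shutdown_timeout, 60] (5..60 s with the default module parameter),
         * so ctrl->shutdown_timeout becomes 8 and the message above is logged.
         */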
3217
3218         ctrl->npss = id->npss;
3219         ctrl->apsta = id->apsta;
3220         prev_apst_enabled = ctrl->apst_enabled;
3221         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3222                 if (force_apst && id->apsta) {
3223                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3224                         ctrl->apst_enabled = true;
3225                 } else {
3226                         ctrl->apst_enabled = false;
3227                 }
3228         } else {
3229                 ctrl->apst_enabled = id->apsta;
3230         }
3231         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3232
3233         if (ctrl->ops->flags & NVME_F_FABRICS) {
3234                 ctrl->icdoff = le16_to_cpu(id->icdoff);
3235                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3236                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3237                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3238
3239                 /*
3240                  * In fabrics we need to verify that the cntlid matches the one
3241                  * returned by the admin connect command.
3242                  */
3243                 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3244                         dev_err(ctrl->device,
3245                                 "Mismatching cntlid: Connect %u vs Identify "
3246                                 "%u, rejecting\n",
3247                                 ctrl->cntlid, le16_to_cpu(id->cntlid));
3248                         ret = -EINVAL;
3249                         goto out_free;
3250                 }
3251
3252                 if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3253                         dev_err(ctrl->device,
3254                                 "keep-alive support is mandatory for fabrics\n");
3255                         ret = -EINVAL;
3256                         goto out_free;
3257                 }
3258         } else {
3259                 ctrl->hmpre = le32_to_cpu(id->hmpre);
3260                 ctrl->hmmin = le32_to_cpu(id->hmmin);
3261                 ctrl->hmminds = le32_to_cpu(id->hmminds);
3262                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3263         }
3264
3265         ret = nvme_mpath_init_identify(ctrl, id);
3266         if (ret < 0)
3267                 goto out_free;
3268
3269         if (ctrl->apst_enabled && !prev_apst_enabled)
3270                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3271         else if (!ctrl->apst_enabled && prev_apst_enabled)
3272                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3273
3274 out_free:
3275         kfree(id);
3276         return ret;
3277 }
3278
3279 /*
3280  * Initialize the cached copies of the Identify data and various controller
3281  * registers in our nvme_ctrl structure.  This should be called as soon as
3282  * the admin queue is fully up and running.
3283  */
3284 int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
3285 {
3286         int ret;
3287
3288         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3289         if (ret) {
3290                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3291                 return ret;
3292         }
3293
3294         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3295
3296         if (ctrl->vs >= NVME_VS(1, 1, 0))
3297                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3298
3299         ret = nvme_init_identify(ctrl);
3300         if (ret)
3301                 return ret;
3302
3303         ret = nvme_configure_apst(ctrl);
3304         if (ret < 0)
3305                 return ret;
3306
3307         ret = nvme_configure_timestamp(ctrl);
3308         if (ret < 0)
3309                 return ret;
3310
3311         ret = nvme_configure_host_options(ctrl);
3312         if (ret < 0)
3313                 return ret;
3314
3315         nvme_configure_opal(ctrl, was_suspended);
3316
3317         if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3318                 /*
3319                  * Do not return errors unless we are in a controller reset;
3320                  * the controller works perfectly fine without hwmon.
3321                  */
3322                 ret = nvme_hwmon_init(ctrl);
3323                 if (ret == -EINTR)
3324                         return ret;
3325         }
3326
3327         ctrl->identified = true;
3328
3329         return 0;
3330 }
3331 EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
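
/*
 * Call-order sketch (assumption about typical transports, not a new rule):
 * a transport such as nvme-pci invokes nvme_init_ctrl_finish() from its
 * reset path once the admin queue is live, then sets up the I/O queues and
 * finally calls nvme_start_ctrl().
 */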
3332
3333 static int nvme_dev_open(struct inode *inode, struct file *file)
3334 {
3335         struct nvme_ctrl *ctrl =
3336                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3337
3338         switch (ctrl->state) {
3339         case NVME_CTRL_LIVE:
3340                 break;
3341         default:
3342                 return -EWOULDBLOCK;
3343         }
3344
3345         nvme_get_ctrl(ctrl);
3346         if (!try_module_get(ctrl->ops->module)) {
3347                 nvme_put_ctrl(ctrl);
3348                 return -EINVAL;
3349         }
3350
3351         file->private_data = ctrl;
3352         return 0;
3353 }
3354
3355 static int nvme_dev_release(struct inode *inode, struct file *file)
3356 {
3357         struct nvme_ctrl *ctrl =
3358                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3359
3360         module_put(ctrl->ops->module);
3361         nvme_put_ctrl(ctrl);
3362         return 0;
3363 }
3364
3365 static const struct file_operations nvme_dev_fops = {
3366         .owner          = THIS_MODULE,
3367         .open           = nvme_dev_open,
3368         .release        = nvme_dev_release,
3369         .unlocked_ioctl = nvme_dev_ioctl,
3370         .compat_ioctl   = compat_ptr_ioctl,
3371         .uring_cmd      = nvme_dev_uring_cmd,
3372 };
3373
3374 static ssize_t nvme_sysfs_reset(struct device *dev,
3375                                 struct device_attribute *attr, const char *buf,
3376                                 size_t count)
3377 {
3378         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3379         int ret;
3380
3381         ret = nvme_reset_ctrl_sync(ctrl);
3382         if (ret < 0)
3383                 return ret;
3384         return count;
3385 }
3386 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
3387
3388 static ssize_t nvme_sysfs_rescan(struct device *dev,
3389                                 struct device_attribute *attr, const char *buf,
3390                                 size_t count)
3391 {
3392         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3393
3394         nvme_queue_scan(ctrl);
3395         return count;
3396 }
3397 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
3398
3399 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
3400 {
3401         struct gendisk *disk = dev_to_disk(dev);
3402
3403         if (disk->fops == &nvme_bdev_ops)
3404                 return nvme_get_ns_from_dev(dev)->head;
3405         else
3406                 return disk->private_data;
3407 }
3408
3409 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
3410                 char *buf)
3411 {
3412         struct nvme_ns_head *head = dev_to_ns_head(dev);
3413         struct nvme_ns_ids *ids = &head->ids;
3414         struct nvme_subsystem *subsys = head->subsys;
3415         int serial_len = sizeof(subsys->serial);
3416         int model_len = sizeof(subsys->model);
3417
3418         if (!uuid_is_null(&ids->uuid))
3419                 return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
3420
3421         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3422                 return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
3423
3424         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3425                 return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
3426
3427         while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
3428                                   subsys->serial[serial_len - 1] == '\0'))
3429                 serial_len--;
3430         while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
3431                                  subsys->model[model_len - 1] == '\0'))
3432                 model_len--;
3433
3434         return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3435                 serial_len, subsys->serial, model_len, subsys->model,
3436                 head->ns_id);
3437 }
3438 static DEVICE_ATTR_RO(wwid);
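
/*
 * Resulting formats (illustrative): "uuid.<36-char UUID>" when a UUID is
 * present, "eui.<16 or 32 hex digits>" for an EUI-64 or NGUID, and otherwise
 * the legacy fallback "nvme.<vid>-<serial bytes in hex>-<model bytes in
 * hex>-<nsid>", with trailing spaces and NUL bytes trimmed from the serial
 * and model strings first.
 */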
3439
3440 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
3441                 char *buf)
3442 {
3443         return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3444 }
3445 static DEVICE_ATTR_RO(nguid);
3446
3447 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
3448                 char *buf)
3449 {
3450         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3451
3452         /* For backward compatibility, expose the NGUID to userspace if
3453          * we have no UUID set.
3454          */
3455         if (uuid_is_null(&ids->uuid)) {
3456                 dev_warn_ratelimited(dev,
3457                         "No UUID available providing old NGUID\n");
3458                 return sysfs_emit(buf, "%pU\n", ids->nguid);
3459         }
3460         return sysfs_emit(buf, "%pU\n", &ids->uuid);
3461 }
3462 static DEVICE_ATTR_RO(uuid);
3463
3464 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
3465                 char *buf)
3466 {
3467         return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3468 }
3469 static DEVICE_ATTR_RO(eui);
3470
3471 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
3472                 char *buf)
3473 {
3474         return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3475 }
3476 static DEVICE_ATTR_RO(nsid);
3477
3478 static struct attribute *nvme_ns_id_attrs[] = {
3479         &dev_attr_wwid.attr,
3480         &dev_attr_uuid.attr,
3481         &dev_attr_nguid.attr,
3482         &dev_attr_eui.attr,
3483         &dev_attr_nsid.attr,
3484 #ifdef CONFIG_NVME_MULTIPATH
3485         &dev_attr_ana_grpid.attr,
3486         &dev_attr_ana_state.attr,
3487 #endif
3488         NULL,
3489 };
3490
3491 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
3492                 struct attribute *a, int n)
3493 {
3494         struct device *dev = container_of(kobj, struct device, kobj);
3495         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3496
3497         if (a == &dev_attr_uuid.attr) {
3498                 if (uuid_is_null(&ids->uuid) &&
3499                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3500                         return 0;
3501         }
3502         if (a == &dev_attr_nguid.attr) {
3503                 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3504                         return 0;
3505         }
3506         if (a == &dev_attr_eui.attr) {
3507                 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3508                         return 0;
3509         }
3510 #ifdef CONFIG_NVME_MULTIPATH
3511         if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
3512                 if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */
3513                         return 0;
3514                 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
3515                         return 0;
3516         }
3517 #endif
3518         return a->mode;
3519 }
3520
3521 static const struct attribute_group nvme_ns_id_attr_group = {
3522         .attrs          = nvme_ns_id_attrs,
3523         .is_visible     = nvme_ns_id_attrs_are_visible,
3524 };
3525
3526 const struct attribute_group *nvme_ns_id_attr_groups[] = {
3527         &nvme_ns_id_attr_group,
3528         NULL,
3529 };
3530
3531 #define nvme_show_str_function(field)                                           \
3532 static ssize_t  field##_show(struct device *dev,                                \
3533                             struct device_attribute *attr, char *buf)           \
3534 {                                                                               \
3535         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3536         return sysfs_emit(buf, "%.*s\n",                                        \
3537                 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field);         \
3538 }                                                                               \
3539 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3540
3541 nvme_show_str_function(model);
3542 nvme_show_str_function(serial);
3543 nvme_show_str_function(firmware_rev);
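
/*
 * These expand to model_show(), serial_show() and firmware_rev_show() and
 * appear per controller as e.g. /sys/class/nvme/nvme<N>/model, mirroring the
 * subsystem-level attributes of the same names.
 */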
3544
3545 #define nvme_show_int_function(field)                                           \
3546 static ssize_t  field##_show(struct device *dev,                                \
3547                             struct device_attribute *attr, char *buf)           \
3548 {                                                                               \
3549         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3550         return sysfs_emit(buf, "%d\n", ctrl->field);                            \
3551 }                                                                               \
3552 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3553
3554 nvme_show_int_function(cntlid);
3555 nvme_show_int_function(numa_node);
3556 nvme_show_int_function(queue_count);
3557 nvme_show_int_function(sqsize);
3558 nvme_show_int_function(kato);
3559
3560 static ssize_t nvme_sysfs_delete(struct device *dev,
3561                                 struct device_attribute *attr, const char *buf,
3562                                 size_t count)
3563 {
3564         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3565
3566         if (device_remove_file_self(dev, attr))
3567                 nvme_delete_ctrl_sync(ctrl);
3568         return count;
3569 }
3570 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
3571
3572 static ssize_t nvme_sysfs_show_transport(struct device *dev,
3573                                          struct device_attribute *attr,
3574                                          char *buf)
3575 {
3576         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3577
3578         return sysfs_emit(buf, "%s\n", ctrl->ops->name);
3579 }
3580 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
3581
3582 static ssize_t nvme_sysfs_show_state(struct device *dev,
3583                                      struct device_attribute *attr,
3584                                      char *buf)
3585 {
3586         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3587         static const char *const state_name[] = {
3588                 [NVME_CTRL_NEW]         = "new",
3589                 [NVME_CTRL_LIVE]        = "live",
3590                 [NVME_CTRL_RESETTING]   = "resetting",
3591                 [NVME_CTRL_CONNECTING]  = "connecting",
3592                 [NVME_CTRL_DELETING]    = "deleting",
3593                 [NVME_CTRL_DELETING_NOIO] = "deleting (no IO)",
3594                 [NVME_CTRL_DEAD]        = "dead",
3595         };
3596
3597         if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
3598             state_name[ctrl->state])
3599                 return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
3600
3601         return sysfs_emit(buf, "unknown state\n");
3602 }
3603
3604 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
3605
3606 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
3607                                          struct device_attribute *attr,
3608                                          char *buf)
3609 {
3610         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3611
3612         return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn);
3613 }
3614 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
3615
3616 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3617                                         struct device_attribute *attr,
3618                                         char *buf)
3619 {
3620         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3621
3622         return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn);
3623 }
3624 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3625
3626 static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3627                                         struct device_attribute *attr,
3628                                         char *buf)
3629 {
3630         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3631
3632         return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id);
3633 }
3634 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3635
3636 static ssize_t nvme_sysfs_show_address(struct device *dev,
3637                                          struct device_attribute *attr,
3638                                          char *buf)
3639 {
3640         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3641
3642         return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
3643 }
3644 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3645
3646 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3647                 struct device_attribute *attr, char *buf)
3648 {
3649         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3650         struct nvmf_ctrl_options *opts = ctrl->opts;
3651
3652         if (ctrl->opts->max_reconnects == -1)
3653                 return sysfs_emit(buf, "off\n");
3654         return sysfs_emit(buf, "%d\n",
3655                           opts->max_reconnects * opts->reconnect_delay);
3656 }
3657
3658 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3659                 struct device_attribute *attr, const char *buf, size_t count)
3660 {
3661         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3662         struct nvmf_ctrl_options *opts = ctrl->opts;
3663         int ctrl_loss_tmo, err;
3664
3665         err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3666         if (err)
3667                 return -EINVAL;
3668
3669         if (ctrl_loss_tmo < 0)
3670                 opts->max_reconnects = -1;
3671         else
3672                 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3673                                                 opts->reconnect_delay);
3674         return count;
3675 }
3676 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3677         nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
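
/*
 * Worked example: with reconnect_delay = 10 s, writing "600" to ctrl_loss_tmo
 * stores max_reconnects = DIV_ROUND_UP(600, 10) = 60, i.e. up to 60 reconnect
 * attempts before the controller is torn down; any negative value maps to -1
 * and is reported back as "off" (reconnect forever).
 */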
3678
3679 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3680                 struct device_attribute *attr, char *buf)
3681 {
3682         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3683
3684         if (ctrl->opts->reconnect_delay == -1)
3685                 return sysfs_emit(buf, "off\n");
3686         return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
3687 }
3688
3689 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3690                 struct device_attribute *attr, const char *buf, size_t count)
3691 {
3692         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3693         unsigned int v;
3694         int err;
3695
3696         err = kstrtou32(buf, 10, &v);
3697         if (err)
3698                 return err;
3699
3700         ctrl->opts->reconnect_delay = v;
3701         return count;
3702 }
3703 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3704         nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
3705
3706 static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev,
3707                 struct device_attribute *attr, char *buf)
3708 {
3709         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3710
3711         if (ctrl->opts->fast_io_fail_tmo == -1)
3712                 return sysfs_emit(buf, "off\n");
3713         return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo);
3714 }
3715
3716 static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev,
3717                 struct device_attribute *attr, const char *buf, size_t count)
3718 {
3719         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3720         struct nvmf_ctrl_options *opts = ctrl->opts;
3721         int fast_io_fail_tmo, err;
3722
3723         err = kstrtoint(buf, 10, &fast_io_fail_tmo);
3724         if (err)
3725                 return -EINVAL;
3726
3727         if (fast_io_fail_tmo < 0)
3728                 opts->fast_io_fail_tmo = -1;
3729         else
3730                 opts->fast_io_fail_tmo = fast_io_fail_tmo;
3731         return count;
3732 }
3733 static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
3734         nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store);
3735
3736 static ssize_t cntrltype_show(struct device *dev,
3737                               struct device_attribute *attr, char *buf)
3738 {
3739         static const char * const type[] = {
3740                 [NVME_CTRL_IO] = "io\n",
3741                 [NVME_CTRL_DISC] = "discovery\n",
3742                 [NVME_CTRL_ADMIN] = "admin\n",
3743         };
3744         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3745
3746         if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype])
3747                 return sysfs_emit(buf, "reserved\n");
3748
3749         return sysfs_emit(buf, type[ctrl->cntrltype]);
3750 }
3751 static DEVICE_ATTR_RO(cntrltype);
3752
3753 static ssize_t dctype_show(struct device *dev,
3754                            struct device_attribute *attr, char *buf)
3755 {
3756         static const char * const type[] = {
3757                 [NVME_DCTYPE_NOT_REPORTED] = "none\n",
3758                 [NVME_DCTYPE_DDC] = "ddc\n",
3759                 [NVME_DCTYPE_CDC] = "cdc\n",
3760         };
3761         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3762
3763         if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype])
3764                 return sysfs_emit(buf, "reserved\n");
3765
3766         return sysfs_emit(buf, type[ctrl->dctype]);
3767 }
3768 static DEVICE_ATTR_RO(dctype);
3769
3770 #ifdef CONFIG_NVME_AUTH
3771 static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
3772                 struct device_attribute *attr, char *buf)
3773 {
3774         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3775         struct nvmf_ctrl_options *opts = ctrl->opts;
3776
3777         if (!opts->dhchap_secret)
3778                 return sysfs_emit(buf, "none\n");
3779         return sysfs_emit(buf, "%s\n", opts->dhchap_secret);
3780 }
3781
3782 static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
3783                 struct device_attribute *attr, const char *buf, size_t count)
3784 {
3785         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3786         struct nvmf_ctrl_options *opts = ctrl->opts;
3787         char *dhchap_secret;
3788
3789         if (!ctrl->opts->dhchap_secret)
3790                 return -EINVAL;
3791         if (count < 7)
3792                 return -EINVAL;
3793         if (memcmp(buf, "DHHC-1:", 7))
3794                 return -EINVAL;
3795
3796         dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
3797         if (!dhchap_secret)
3798                 return -ENOMEM;
3799         memcpy(dhchap_secret, buf, count);
3800         nvme_auth_stop(ctrl);
3801         if (strcmp(dhchap_secret, opts->dhchap_secret)) {
3802                 struct nvme_dhchap_key *key, *host_key;
3803                 int ret;
3804
3805                 ret = nvme_auth_generate_key(dhchap_secret, &key);
3806                 if (ret)
3807                         return ret;
3808                 kfree(opts->dhchap_secret);
3809                 opts->dhchap_secret = dhchap_secret;
3810                 host_key = ctrl->host_key;
3811                 mutex_lock(&ctrl->dhchap_auth_mutex);
3812                 ctrl->host_key = key;
3813                 mutex_unlock(&ctrl->dhchap_auth_mutex);
3814                 nvme_auth_free_key(host_key);
3815         }
3816         /* Start re-authentication */
3817         dev_info(ctrl->device, "re-authenticating controller\n");
3818         queue_work(nvme_wq, &ctrl->dhchap_auth_work);
3819
3820         return count;
3821 }
3822 static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
3823         nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
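
/*
 * Illustrative sysfs usage (the exact key string is a placeholder):
 *
 *	echo "DHHC-1:00:<base64 secret>:" > /sys/class/nvme/nvme0/dhchap_secret
 *
 * The write is only accepted for controllers that were set up with a
 * DH-HMAC-CHAP host secret, must start with the "DHHC-1:" prefix, and kicks
 * off immediate re-authentication on nvme_wq.
 */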
3824
3825 static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
3826                 struct device_attribute *attr, char *buf)
3827 {
3828         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3829         struct nvmf_ctrl_options *opts = ctrl->opts;
3830
3831         if (!opts->dhchap_ctrl_secret)
3832                 return sysfs_emit(buf, "none\n");
3833         return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret);
3834 }
3835
3836 static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
3837                 struct device_attribute *attr, const char *buf, size_t count)
3838 {
3839         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3840         struct nvmf_ctrl_options *opts = ctrl->opts;
3841         char *dhchap_secret;
3842
3843         if (!ctrl->opts->dhchap_ctrl_secret)
3844                 return -EINVAL;
3845         if (count < 7)
3846                 return -EINVAL;
3847         if (memcmp(buf, "DHHC-1:", 7))
3848                 return -EINVAL;
3849
3850         dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
3851         if (!dhchap_secret)
3852                 return -ENOMEM;
3853         memcpy(dhchap_secret, buf, count);
3854         nvme_auth_stop(ctrl);
3855         if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
3856                 struct nvme_dhchap_key *key, *ctrl_key;
3857                 int ret;
3858
3859                 ret = nvme_auth_generate_key(dhchap_secret, &key);
3860                 if (ret)
3861                         return ret;
3862                 kfree(opts->dhchap_ctrl_secret);
3863                 opts->dhchap_ctrl_secret = dhchap_secret;
3864                 ctrl_key = ctrl->ctrl_key;
3865                 mutex_lock(&ctrl->dhchap_auth_mutex);
3866                 ctrl->ctrl_key = key;
3867                 mutex_unlock(&ctrl->dhchap_auth_mutex);
3868                 nvme_auth_free_key(ctrl_key);
3869         }
3870         /* Start re-authentication */
3871         dev_info(ctrl->device, "re-authenticating controller\n");
3872         queue_work(nvme_wq, &ctrl->dhchap_auth_work);
3873
3874         return count;
3875 }
3876 static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
3877         nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
3878 #endif
3879
3880 static struct attribute *nvme_dev_attrs[] = {
3881         &dev_attr_reset_controller.attr,
3882         &dev_attr_rescan_controller.attr,
3883         &dev_attr_model.attr,
3884         &dev_attr_serial.attr,
3885         &dev_attr_firmware_rev.attr,
3886         &dev_attr_cntlid.attr,
3887         &dev_attr_delete_controller.attr,
3888         &dev_attr_transport.attr,
3889         &dev_attr_subsysnqn.attr,
3890         &dev_attr_address.attr,
3891         &dev_attr_state.attr,
3892         &dev_attr_numa_node.attr,
3893         &dev_attr_queue_count.attr,
3894         &dev_attr_sqsize.attr,
3895         &dev_attr_hostnqn.attr,
3896         &dev_attr_hostid.attr,
3897         &dev_attr_ctrl_loss_tmo.attr,
3898         &dev_attr_reconnect_delay.attr,
3899         &dev_attr_fast_io_fail_tmo.attr,
3900         &dev_attr_kato.attr,
3901         &dev_attr_cntrltype.attr,
3902         &dev_attr_dctype.attr,
3903 #ifdef CONFIG_NVME_AUTH
3904         &dev_attr_dhchap_secret.attr,
3905         &dev_attr_dhchap_ctrl_secret.attr,
3906 #endif
3907         NULL
3908 };
3909
3910 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
3911                 struct attribute *a, int n)
3912 {
3913         struct device *dev = container_of(kobj, struct device, kobj);
3914         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3915
3916         if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
3917                 return 0;
3918         if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3919                 return 0;
3920         if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3921                 return 0;
3922         if (a == &dev_attr_hostid.attr && !ctrl->opts)
3923                 return 0;
3924         if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
3925                 return 0;
3926         if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
3927                 return 0;
3928         if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
3929                 return 0;
3930 #ifdef CONFIG_NVME_AUTH
3931         if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
3932                 return 0;
3933         if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
3934                 return 0;
3935 #endif
3936
3937         return a->mode;
3938 }
3939
3940 const struct attribute_group nvme_dev_attrs_group = {
3941         .attrs          = nvme_dev_attrs,
3942         .is_visible     = nvme_dev_attrs_are_visible,
3943 };
3944 EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);
3945
3946 static const struct attribute_group *nvme_dev_attr_groups[] = {
3947         &nvme_dev_attrs_group,
3948         NULL,
3949 };
3950
3951 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
3952                 unsigned nsid)
3953 {
3954         struct nvme_ns_head *h;
3955
3956         lockdep_assert_held(&ctrl->subsys->lock);
3957
3958         list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
3959                 /*
3960                  * Private namespaces can share NSIDs under some conditions.
3961                  * In that case we can't use the same ns_head for namespaces
3962                  * with the same NSID.
3963                  */
3964                 if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
3965                         continue;
3966                 if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3967                         return h;
3968         }
3969
3970         return NULL;
3971 }
3972
3973 static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3974                 struct nvme_ns_ids *ids)
3975 {
3976         bool has_uuid = !uuid_is_null(&ids->uuid);
3977         bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
3978         bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3979         struct nvme_ns_head *h;
3980
3981         lockdep_assert_held(&subsys->lock);
3982
3983         list_for_each_entry(h, &subsys->nsheads, entry) {
3984                 if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
3985                         return -EINVAL;
3986                 if (has_nguid &&
3987                     memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
3988                         return -EINVAL;
3989                 if (has_eui64 &&
3990                     memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3991                         return -EINVAL;
3992         }
3993
3994         return 0;
3995 }
3996
3997 static void nvme_cdev_rel(struct device *dev)
3998 {
3999         ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
4000 }
4001
4002 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
4003 {
4004         cdev_device_del(cdev, cdev_device);
4005         put_device(cdev_device);
4006 }
4007
4008 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
4009                 const struct file_operations *fops, struct module *owner)
4010 {
4011         int minor, ret;
4012
4013         minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
4014         if (minor < 0)
4015                 return minor;
4016         cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
4017         cdev_device->class = nvme_ns_chr_class;
4018         cdev_device->release = nvme_cdev_rel;
4019         device_initialize(cdev_device);
4020         cdev_init(cdev, fops);
4021         cdev->owner = owner;
4022         ret = cdev_device_add(cdev, cdev_device);
4023         if (ret)
4024                 put_device(cdev_device);
4025
4026         return ret;
4027 }
4028
4029 static int nvme_ns_chr_open(struct inode *inode, struct file *file)
4030 {
4031         return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
4032 }
4033
4034 static int nvme_ns_chr_release(struct inode *inode, struct file *file)
4035 {
4036         nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
4037         return 0;
4038 }
4039
4040 static const struct file_operations nvme_ns_chr_fops = {
4041         .owner          = THIS_MODULE,
4042         .open           = nvme_ns_chr_open,
4043         .release        = nvme_ns_chr_release,
4044         .unlocked_ioctl = nvme_ns_chr_ioctl,
4045         .compat_ioctl   = compat_ptr_ioctl,
4046         .uring_cmd      = nvme_ns_chr_uring_cmd,
4047         .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
4048 };
4049
4050 static int nvme_add_ns_cdev(struct nvme_ns *ns)
4051 {
4052         int ret;
4053
4054         ns->cdev_device.parent = ns->ctrl->device;
4055         ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
4056                            ns->ctrl->instance, ns->head->instance);
4057         if (ret)
4058                 return ret;
4059
4060         return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
4061                              ns->ctrl->ops->module);
4062 }
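
/*
 * Example: controller instance 0 with namespace head instance 1 results in
 * the generic character node /dev/ng0n1, the ioctl/io_uring passthrough
 * companion of the nvme0n1 block device.
 */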
4063
4064 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
4065                 struct nvme_ns_info *info)
4066 {
4067         struct nvme_ns_head *head;
4068         size_t size = sizeof(*head);
4069         int ret = -ENOMEM;
4070
4071 #ifdef CONFIG_NVME_MULTIPATH
4072         size += num_possible_nodes() * sizeof(struct nvme_ns *);
4073 #endif
4074
4075         head = kzalloc(size, GFP_KERNEL);
4076         if (!head)
4077                 goto out;
4078         ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
4079         if (ret < 0)
4080                 goto out_free_head;
4081         head->instance = ret;
4082         INIT_LIST_HEAD(&head->list);
4083         ret = init_srcu_struct(&head->srcu);
4084         if (ret)
4085                 goto out_ida_remove;
4086         head->subsys = ctrl->subsys;
4087         head->ns_id = info->nsid;
4088         head->ids = info->ids;
4089         head->shared = info->is_shared;
4090         kref_init(&head->ref);
4091
4092         if (head->ids.csi) {
4093                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
4094                 if (ret)
4095                         goto out_cleanup_srcu;
4096         } else
4097                 head->effects = ctrl->effects;
4098
4099         ret = nvme_mpath_alloc_disk(ctrl, head);
4100         if (ret)
4101                 goto out_cleanup_srcu;
4102
4103         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
4104
4105         kref_get(&ctrl->subsys->ref);
4106
4107         return head;
4108 out_cleanup_srcu:
4109         cleanup_srcu_struct(&head->srcu);
4110 out_ida_remove:
4111         ida_free(&ctrl->subsys->ns_ida, head->instance);
4112 out_free_head:
4113         kfree(head);
4114 out:
4115         if (ret > 0)
4116                 ret = blk_status_to_errno(nvme_error_status(ret));
4117         return ERR_PTR(ret);
4118 }
4119
4120 static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
4121                 struct nvme_ns_ids *ids)
4122 {
4123         struct nvme_subsystem *s;
4124         int ret = 0;
4125
4126         /*
4127          * Note that this check is racy as we try to avoid holding the global
4128          * lock over the whole ns_head creation.  But it is only intended as
4129          * a sanity check anyway.
4130          */
4131         mutex_lock(&nvme_subsystems_lock);
4132         list_for_each_entry(s, &nvme_subsystems, entry) {
4133                 if (s == this)
4134                         continue;
4135                 mutex_lock(&s->lock);
4136                 ret = nvme_subsys_check_duplicate_ids(s, ids);
4137                 mutex_unlock(&s->lock);
4138                 if (ret)
4139                         break;
4140         }
4141         mutex_unlock(&nvme_subsystems_lock);
4142
4143         return ret;
4144 }
4145
4146 static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
4147 {
4148         struct nvme_ctrl *ctrl = ns->ctrl;
4149         struct nvme_ns_head *head = NULL;
4150         int ret;
4151
4152         ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
4153         if (ret) {
4154                 dev_err(ctrl->device,
4155                         "globally duplicate IDs for nsid %d\n", info->nsid);
4156                 nvme_print_device_info(ctrl);
4157                 return ret;
4158         }
4159
4160         mutex_lock(&ctrl->subsys->lock);
4161         head = nvme_find_ns_head(ctrl, info->nsid);
4162         if (!head) {
4163                 ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
4164                 if (ret) {
4165                         dev_err(ctrl->device,
4166                                 "duplicate IDs in subsystem for nsid %d\n",
4167                                 info->nsid);
4168                         goto out_unlock;
4169                 }
4170                 head = nvme_alloc_ns_head(ctrl, info);
4171                 if (IS_ERR(head)) {
4172                         ret = PTR_ERR(head);
4173                         goto out_unlock;
4174                 }
4175         } else {
4176                 ret = -EINVAL;
4177                 if (!info->is_shared || !head->shared) {
4178                         dev_err(ctrl->device,
4179                                 "Duplicate unshared namespace %d\n",
4180                                 info->nsid);
4181                         goto out_put_ns_head;
4182                 }
4183                 if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
4184                         dev_err(ctrl->device,
4185                                 "IDs don't match for shared namespace %d\n",
4186                                         info->nsid);
4187                         goto out_put_ns_head;
4188                 }
4189
4190                 if (!multipath && !list_empty(&head->list)) {
4191                         dev_warn(ctrl->device,
4192                                 "Found shared namespace %d, but multipathing not supported.\n",
4193                                 info->nsid);
4194                         dev_warn_once(ctrl->device,
4195                                 "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
4196                 }
4197         }
4198
4199         list_add_tail_rcu(&ns->siblings, &head->list);
4200         ns->head = head;
4201         mutex_unlock(&ctrl->subsys->lock);
4202         return 0;
4203
4204 out_put_ns_head:
4205         nvme_put_ns_head(head);
4206 out_unlock:
4207         mutex_unlock(&ctrl->subsys->lock);
4208         return ret;
4209 }
4210
4211 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4212 {
4213         struct nvme_ns *ns, *ret = NULL;
4214
4215         down_read(&ctrl->namespaces_rwsem);
4216         list_for_each_entry(ns, &ctrl->namespaces, list) {
4217                 if (ns->head->ns_id == nsid) {
4218                         if (!nvme_get_ns(ns))
4219                                 continue;
4220                         ret = ns;
4221                         break;
4222                 }
4223                 if (ns->head->ns_id > nsid)
4224                         break;
4225         }
4226         up_read(&ctrl->namespaces_rwsem);
4227         return ret;
4228 }
4229 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
4230
4231 /*
4232  * Add the namespace to the controller list while keeping the list ordered.
4233  */
4234 static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
4235 {
4236         struct nvme_ns *tmp;
4237
4238         list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
4239                 if (tmp->head->ns_id < ns->head->ns_id) {
4240                         list_add(&ns->list, &tmp->list);
4241                         return;
4242                 }
4243         }
4244         list_add(&ns->list, &ns->ctrl->namespaces);
4245 }
4246
4247 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
4248 {
4249         struct nvme_ns *ns;
4250         struct gendisk *disk;
4251         int node = ctrl->numa_node;
4252
4253         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
4254         if (!ns)
4255                 return;
4256
4257         disk = blk_mq_alloc_disk(ctrl->tagset, ns);
4258         if (IS_ERR(disk))
4259                 goto out_free_ns;
4260         disk->fops = &nvme_bdev_ops;
4261         disk->private_data = ns;
4262
4263         ns->disk = disk;
4264         ns->queue = disk->queue;
4265
4266         if (ctrl->opts && ctrl->opts->data_digest)
4267                 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
4268
4269         blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
4270         if (ctrl->ops->supports_pci_p2pdma &&
4271             ctrl->ops->supports_pci_p2pdma(ctrl))
4272                 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
4273
4274         ns->ctrl = ctrl;
4275         kref_init(&ns->kref);
4276
4277         if (nvme_init_ns_head(ns, info))
4278                 goto out_cleanup_disk;
4279
4280         /*
4281          * If multipathing is enabled, the device name for all disks and not
4282          * just those that represent shared namespaces needs to be based on the
4283          * subsystem instance.  Using the controller instance for private
4284          * namespaces could lead to naming collisions between shared and private
4285          * namespaces if they don't use a common numbering scheme.
4286          *
4287          * If multipathing is not enabled, disk names must use the controller
4288          * instance as shared namespaces will show up as multiple block
4289          * devices.
4290          */
4291         if (ns->head->disk) {
4292                 sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
4293                         ctrl->instance, ns->head->instance);
4294                 disk->flags |= GENHD_FL_HIDDEN;
4295         } else if (multipath) {
4296                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
4297                         ns->head->instance);
4298         } else {
4299                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
4300                         ns->head->instance);
4301         }
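
        /*
         * Naming example: with CONFIG_NVME_MULTIPATH and subsystem instance 0,
         * a shared namespace head 1 reached via controller 2 appears as the
         * hidden per-path disk nvme0c2n1 underneath the multipath node
         * nvme0n1; without multipathing the same namespace is simply nvme2n1.
         */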
4302
4303         if (nvme_update_ns_info(ns, info))
4304                 goto out_unlink_ns;
4305
4306         down_write(&ctrl->namespaces_rwsem);
4307         nvme_ns_add_to_ctrl_list(ns);
4308         up_write(&ctrl->namespaces_rwsem);
4309         nvme_get_ctrl(ctrl);
4310
4311         if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
4312                 goto out_cleanup_ns_from_list;
4313
4314         if (!nvme_ns_head_multipath(ns->head))
4315                 nvme_add_ns_cdev(ns);
4316
4317         nvme_mpath_add_disk(ns, info->anagrpid);
4318         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
4319
4320         return;
4321
4322  out_cleanup_ns_from_list:
4323         nvme_put_ctrl(ctrl);
4324         down_write(&ctrl->namespaces_rwsem);
4325         list_del_init(&ns->list);
4326         up_write(&ctrl->namespaces_rwsem);
4327  out_unlink_ns:
4328         mutex_lock(&ctrl->subsys->lock);
4329         list_del_rcu(&ns->siblings);
4330         if (list_empty(&ns->head->list))
4331                 list_del_init(&ns->head->entry);
4332         mutex_unlock(&ctrl->subsys->lock);
4333         nvme_put_ns_head(ns->head);
4334  out_cleanup_disk:
4335         put_disk(disk);
4336  out_free_ns:
4337         kfree(ns);
4338 }
4339
4340 static void nvme_ns_remove(struct nvme_ns *ns)
4341 {
4342         bool last_path = false;
4343
4344         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
4345                 return;
4346
4347         clear_bit(NVME_NS_READY, &ns->flags);
4348         set_capacity(ns->disk, 0);
4349         nvme_fault_inject_fini(&ns->fault_inject);
4350
4351         /*
4352          * Ensure that !NVME_NS_READY is seen by other threads to prevent
4353          * this ns from going back into current_path.
4354          */
4355         synchronize_srcu(&ns->head->srcu);
4356
4357         /* wait for concurrent submissions */
4358         if (nvme_mpath_clear_current_path(ns))
4359                 synchronize_srcu(&ns->head->srcu);
4360
4361         mutex_lock(&ns->ctrl->subsys->lock);
4362         list_del_rcu(&ns->siblings);
4363         if (list_empty(&ns->head->list)) {
4364                 list_del_init(&ns->head->entry);
4365                 last_path = true;
4366         }
4367         mutex_unlock(&ns->ctrl->subsys->lock);
4368
4369         /* guarantee not available in head->list */
4370         synchronize_srcu(&ns->head->srcu);
4371
4372         if (!nvme_ns_head_multipath(ns->head))
4373                 nvme_cdev_del(&ns->cdev, &ns->cdev_device);
4374         del_gendisk(ns->disk);
4375
4376         down_write(&ns->ctrl->namespaces_rwsem);
4377         list_del_init(&ns->list);
4378         up_write(&ns->ctrl->namespaces_rwsem);
4379
4380         if (last_path)
4381                 nvme_mpath_shutdown_disk(ns->head);
4382         nvme_put_ns(ns);
4383 }
4384
4385 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
4386 {
4387         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4388
4389         if (ns) {
4390                 nvme_ns_remove(ns);
4391                 nvme_put_ns(ns);
4392         }
4393 }
4394
4395 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
4396 {
4397         int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
4398
4399         if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
4400                 dev_err(ns->ctrl->device,
4401                         "identifiers changed for nsid %d\n", ns->head->ns_id);
4402                 goto out;
4403         }
4404
4405         ret = nvme_update_ns_info(ns, info);
4406 out:
4407         /*
4408          * Only remove the namespace if we got a fatal error back from the
4409          * device, otherwise ignore the error and just move on.
4410          *
4411          * TODO: we should probably schedule a delayed retry here.
4412          */
4413         if (ret > 0 && (ret & NVME_SC_DNR))
4414                 nvme_ns_remove(ns);
4415 }
4416
4417 static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4418 {
4419         struct nvme_ns_info info = { .nsid = nsid };
4420         struct nvme_ns *ns;
4421
4422         if (nvme_identify_ns_descs(ctrl, &info))
4423                 return;
4424
4425         if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
4426                 dev_warn(ctrl->device,
4427                         "command set not reported for nsid: %d\n", nsid);
4428                 return;
4429         }
4430
4431         /*
4432          * If available, try to use the Command Set Independent Identify
4433          * Namespace data structure to find all the generic information that is
4434          * needed to set up a namespace.  If not, fall back to the legacy version.
4435          */
4436         if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
4437             (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) {
4438                 if (nvme_ns_info_from_id_cs_indep(ctrl, &info))
4439                         return;
4440         } else {
4441                 if (nvme_ns_info_from_identify(ctrl, &info))
4442                         return;
4443         }
4444
4445         /*
4446          * Ignore the namespace if it is not ready. We will get an AEN once it
4447          * becomes ready and restart the scan.
4448          */
4449         if (!info.is_ready)
4450                 return;
4451
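        /*
         * Revalidate an already known namespace against the freshly read
         * identify data, or allocate a brand new one for a previously
         * unseen NSID.
         */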
4452         ns = nvme_find_get_ns(ctrl, nsid);
4453         if (ns) {
4454                 nvme_validate_ns(ns, &info);
4455                 nvme_put_ns(ns);
4456         } else {
4457                 nvme_alloc_ns(ctrl, &info);
4458         }
4459 }
4460
4461 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4462                                         unsigned nsid)
4463 {
4464         struct nvme_ns *ns, *next;
4465         LIST_HEAD(rm_list);
4466
4467         down_write(&ctrl->namespaces_rwsem);
4468         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4469                 if (ns->head->ns_id > nsid)
4470                         list_move_tail(&ns->list, &rm_list);
4471         }
4472         up_write(&ctrl->namespaces_rwsem);
4473
4474         list_for_each_entry_safe(ns, next, &rm_list, list)
4475                 nvme_ns_remove(ns);
4477 }
4478
4479 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4480 {
4481         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4482         __le32 *ns_list;
4483         u32 prev = 0;
4484         int ret = 0, i;
4485
4486         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4487         if (!ns_list)
4488                 return -ENOMEM;
4489
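        /*
         * Each Identify Active Namespace ID List page returns up to 1024
         * NSIDs larger than the NSID in the command, in increasing order
         * and zero terminated.  Scan every reported NSID and remove any
         * namespace in the gaps, as those are no longer active.
         */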
4490         for (;;) {
4491                 struct nvme_command cmd = {
4492                         .identify.opcode        = nvme_admin_identify,
4493                         .identify.cns           = NVME_ID_CNS_NS_ACTIVE_LIST,
4494                         .identify.nsid          = cpu_to_le32(prev),
4495                 };
4496
4497                 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4498                                             NVME_IDENTIFY_DATA_SIZE);
4499                 if (ret) {
4500                         dev_warn(ctrl->device,
4501                                 "Identify NS List failed (status=0x%x)\n", ret);
4502                         goto free;
4503                 }
4504
4505                 for (i = 0; i < nr_entries; i++) {
4506                         u32 nsid = le32_to_cpu(ns_list[i]);
4507
4508                         if (!nsid)      /* end of the list? */
4509                                 goto out;
4510                         nvme_scan_ns(ctrl, nsid);
4511                         while (++prev < nsid)
4512                                 nvme_ns_remove_by_nsid(ctrl, prev);
4513                 }
4514         }
4515  out:
4516         nvme_remove_invalid_namespaces(ctrl, prev);
4517  free:
4518         kfree(ns_list);
4519         return ret;
4520 }
4521
4522 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4523 {
4524         struct nvme_id_ctrl *id;
4525         u32 nn, i;
4526
4527         if (nvme_identify_ctrl(ctrl, &id))
4528                 return;
4529         nn = le32_to_cpu(id->nn);
4530         kfree(id);
4531
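        /*
         * Brute-force fallback: probe every possible NSID from 1 up to the
         * Number of Namespaces (NN) reported by Identify Controller, then
         * prune anything above NN.
         */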
4532         for (i = 1; i <= nn; i++)
4533                 nvme_scan_ns(ctrl, i);
4534
4535         nvme_remove_invalid_namespaces(ctrl, nn);
4536 }
4537
4538 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4539 {
4540         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4541         __le32 *log;
4542         int error;
4543
4544         log = kzalloc(log_size, GFP_KERNEL);
4545         if (!log)
4546                 return;
4547
4548         /*
4549          * We need to read the log to clear the AEN, but we don't want to rely
4550          * on it for the changed namespace information as userspace could have
4551          * raced with us in reading the log page, which could cause us to miss
4552          * updates.
4553          */
4554         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4555                         NVME_CSI_NVM, log, log_size, 0);
4556         if (error)
4557                 dev_warn(ctrl->device,
4558                         "reading changed ns log failed: %d\n", error);
4559
4560         kfree(log);
4561 }
4562
4563 static void nvme_scan_work(struct work_struct *work)
4564 {
4565         struct nvme_ctrl *ctrl =
4566                 container_of(work, struct nvme_ctrl, scan_work);
4567         int ret;
4568
4569         /* No tagset on a live ctrl means IO queues could not be created */
4570         if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
4571                 return;
4572
4573         /*
4574          * Identify Controller limits can change at controller reset due to a
4575          * new firmware download, and even though this is not common we cannot
4576          * ignore such a scenario.  The controller's non-MDTS limits are reported
4577          * in units of logical blocks, which depend on the format of the attached
4578          * namespace.  Hence re-read the limits at the time of ns allocation.
4579          */
4580         ret = nvme_init_non_mdts_limits(ctrl);
4581         if (ret < 0) {
4582                 dev_warn(ctrl->device,
4583                         "reading non-mdts-limits failed: %d\n", ret);
4584                 return;
4585         }
4586
4587         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4588                 dev_info(ctrl->device, "rescanning namespaces.\n");
4589                 nvme_clear_changed_ns_log(ctrl);
4590         }
4591
4592         mutex_lock(&ctrl->scan_lock);
4593         if (nvme_ctrl_limited_cns(ctrl)) {
4594                 nvme_scan_ns_sequential(ctrl);
4595         } else {
4596                 /*
4597                  * Fall back to sequential scan if DNR is set, to handle broken
4598                  * devices which should support Identify NS List (as per the VS
4599                  * they report) but don't actually support it.
4600                  */
4601                 ret = nvme_scan_ns_list(ctrl);
4602                 if (ret > 0 && ret & NVME_SC_DNR)
4603                         nvme_scan_ns_sequential(ctrl);
4604         }
4605         mutex_unlock(&ctrl->scan_lock);
4606 }
4607
4608 /*
4609  * This function iterates the namespace list unlocked to allow recovery from
4610  * controller failure. It is up to the caller to ensure the namespace list is
4611  * not modified by scan work while this function is executing.
4612  */
4613 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4614 {
4615         struct nvme_ns *ns, *next;
4616         LIST_HEAD(ns_list);
4617
4618         /*
4619          * Make sure to requeue I/O to all namespaces, as such I/O may be
4620          * issued by the scan itself and must complete for the scan_work
4621          * to make progress.
4622          */
4623         nvme_mpath_clear_ctrl_paths(ctrl);
4624
4625         /* prevent racing with ns scanning */
4626         flush_work(&ctrl->scan_work);
4627
4628         /*
4629          * The dead state indicates the controller was not gracefully
4630          * disconnected. In that case, we won't be able to flush any data while
4631          * removing the namespaces' disks; fail all the queues now to avoid
4632          * potentially having to clean up the failed sync later.
4633          */
4634         if (ctrl->state == NVME_CTRL_DEAD) {
4635                 nvme_mark_namespaces_dead(ctrl);
4636                 nvme_unquiesce_io_queues(ctrl);
4637         }
4638
4639         /* this is a no-op when called from the controller reset handler */
4640         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4641
4642         down_write(&ctrl->namespaces_rwsem);
4643         list_splice_init(&ctrl->namespaces, &ns_list);
4644         up_write(&ctrl->namespaces_rwsem);
4645
4646         list_for_each_entry_safe(ns, next, &ns_list, list)
4647                 nvme_ns_remove(ns);
4648 }
4649 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4650
4651 static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env)
4652 {
4653         const struct nvme_ctrl *ctrl =
4654                 container_of(dev, struct nvme_ctrl, ctrl_device);
4655         struct nvmf_ctrl_options *opts = ctrl->opts;
4656         int ret;
4657
4658         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4659         if (ret)
4660                 return ret;
4661
4662         if (opts) {
4663                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4664                 if (ret)
4665                         return ret;
4666
4667                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4668                                 opts->trsvcid ?: "none");
4669                 if (ret)
4670                         return ret;
4671
4672                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4673                                 opts->host_traddr ?: "none");
4674                 if (ret)
4675                         return ret;
4676
4677                 ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
4678                                 opts->host_iface ?: "none");
4679         }
4680         return ret;
4681 }
4682
4683 static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
4684 {
4685         char *envp[2] = { envdata, NULL };
4686
4687         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4688 }
4689
4690 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4691 {
4692         char *envp[2] = { NULL, NULL };
4693         u32 aen_result = ctrl->aen_result;
4694
4695         ctrl->aen_result = 0;
4696         if (!aen_result)
4697                 return;
4698
4699         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4700         if (!envp[0])
4701                 return;
4702         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4703         kfree(envp[0]);
4704 }
4705
4706 static void nvme_async_event_work(struct work_struct *work)
4707 {
4708         struct nvme_ctrl *ctrl =
4709                 container_of(work, struct nvme_ctrl, async_event_work);
4710
4711         nvme_aen_uevent(ctrl);
4712
4713         /*
4714          * The transport drivers must guarantee AER submission here is safe by
4715          * flushing ctrl async_event_work after changing the controller state
4716          * from LIVE and before freeing the admin queue.
4717          */
4718         if (ctrl->state == NVME_CTRL_LIVE)
4719                 ctrl->ops->submit_async_event(ctrl);
4720 }
4721
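/*
 * Check whether a firmware activation is still in progress: the controller
 * must be enabled and report Processing Paused (CSTS.PP).  A CSTS value of
 * all ones means the register read failed, e.g. because the device is gone.
 */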
4722 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4723 {
4725         u32 csts;
4726
4727         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4728                 return false;
4729
4730         if (csts == ~0)
4731                 return false;
4732
4733         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4734 }
4735
4736 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4737 {
4738         struct nvme_fw_slot_info_log *log;
4739
4740         log = kmalloc(sizeof(*log), GFP_KERNEL);
4741         if (!log)
4742                 return;
4743
4744         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4745                         log, sizeof(*log), 0))
4746                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4747         kfree(log);
4748 }
4749
4750 static void nvme_fw_act_work(struct work_struct *work)
4751 {
4752         struct nvme_ctrl *ctrl = container_of(work,
4753                                 struct nvme_ctrl, fw_act_work);
4754         unsigned long fw_act_timeout;
4755
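        /*
         * MTFA (Maximum Time for Firmware Activation) is reported by the
         * controller in 100 millisecond units; if it is not reported, fall
         * back to the admin command timeout.
         */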
4756         if (ctrl->mtfa)
4757                 fw_act_timeout = jiffies +
4758                                 msecs_to_jiffies(ctrl->mtfa * 100);
4759         else
4760                 fw_act_timeout = jiffies +
4761                                 msecs_to_jiffies(admin_timeout * 1000);
4762
4763         nvme_quiesce_io_queues(ctrl);
4764         while (nvme_ctrl_pp_status(ctrl)) {
4765                 if (time_after(jiffies, fw_act_timeout)) {
4766                         dev_warn(ctrl->device,
4767                                 "Fw activation timeout, reset controller\n");
4768                         nvme_try_sched_reset(ctrl);
4769                         return;
4770                 }
4771                 msleep(100);
4772         }
4773
4774         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4775                 return;
4776
4777         nvme_unquiesce_io_queues(ctrl);
4778         /* read FW slot information to clear the AER */
4779         nvme_get_fw_slot_info(ctrl);
4780
4781         queue_work(nvme_wq, &ctrl->async_event_work);
4782 }
4783
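/*
 * The AER completion result (CQE dword 0) encodes the asynchronous event
 * type in bits 2:0 and the event information (subtype) in bits 15:8.
 */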
4784 static u32 nvme_aer_type(u32 result)
4785 {
4786         return result & 0x7;
4787 }
4788
4789 static u32 nvme_aer_subtype(u32 result)
4790 {
4791         return (result & 0xff00) >> 8;
4792 }
4793
4794 static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4795 {
4796         u32 aer_notice_type = nvme_aer_subtype(result);
4797         bool requeue = true;
4798
4799         trace_nvme_async_event(ctrl, aer_notice_type);
4800
4801         switch (aer_notice_type) {
4802         case NVME_AER_NOTICE_NS_CHANGED:
4803                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4804                 nvme_queue_scan(ctrl);
4805                 break;
4806         case NVME_AER_NOTICE_FW_ACT_STARTING:
4807                 /*
4808                  * We are (ab)using the RESETTING state to prevent subsequent
4809                  * recovery actions from interfering with the controller's
4810                  * firmware activation.
4811                  */
4812                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
4813                         nvme_auth_stop(ctrl);
4814                         requeue = false;
4815                         queue_work(nvme_wq, &ctrl->fw_act_work);
4816                 }
4817                 break;
4818 #ifdef CONFIG_NVME_MULTIPATH
4819         case NVME_AER_NOTICE_ANA:
4820                 if (!ctrl->ana_log_buf)
4821                         break;
4822                 queue_work(nvme_wq, &ctrl->ana_work);
4823                 break;
4824 #endif
4825         case NVME_AER_NOTICE_DISC_CHANGED:
4826                 ctrl->aen_result = result;
4827                 break;
4828         default:
4829                 dev_warn(ctrl->device, "async event result %08x\n", result);
4830         }
4831         return requeue;
4832 }
4833
4834 static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
4835 {
4836         trace_nvme_async_event(ctrl, NVME_AER_ERROR);
4837         dev_warn(ctrl->device, "resetting controller due to AER\n");
4838         nvme_reset_ctrl(ctrl);
4839 }
4840
4841 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4842                 volatile union nvme_result *res)
4843 {
4844         u32 result = le32_to_cpu(res->u32);
4845         u32 aer_type = nvme_aer_type(result);
4846         u32 aer_subtype = nvme_aer_subtype(result);
4847         bool requeue = true;
4848
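        /*
         * Bit 0 of the completion status field is the phase tag; shift it
         * out before comparing the status code.
         */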
4849         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4850                 return;
4851
4852         switch (aer_type) {
4853         case NVME_AER_NOTICE:
4854                 requeue = nvme_handle_aen_notice(ctrl, result);
4855                 break;
4856         case NVME_AER_ERROR:
4857                 /*
4858                  * For a persistent internal error, don't run async_event_work
4859                  * to submit a new AER. The controller reset will do it.
4860                  */
4861                 if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
4862                         nvme_handle_aer_persistent_error(ctrl);
4863                         return;
4864                 }
4865                 fallthrough;
4866         case NVME_AER_SMART:
4867         case NVME_AER_CSS:
4868         case NVME_AER_VS:
4869                 trace_nvme_async_event(ctrl, aer_type);
4870                 ctrl->aen_result = result;
4871                 break;
4872         default:
4873                 break;
4874         }
4875
4876         if (requeue)
4877                 queue_work(nvme_wq, &ctrl->async_event_work);
4878 }
4879 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4880
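/*
 * Allocate the admin tag set and create the admin request queue, plus the
 * separate fabrics queue for fabrics controllers.  Undone by
 * nvme_remove_admin_tag_set().
 */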
4881 int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4882                 const struct blk_mq_ops *ops, unsigned int cmd_size)
4883 {
4884         int ret;
4885
4886         memset(set, 0, sizeof(*set));
4887         set->ops = ops;
4888         set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
4889         if (ctrl->ops->flags & NVME_F_FABRICS)
4890                 set->reserved_tags = NVMF_RESERVED_TAGS;
4891         set->numa_node = ctrl->numa_node;
4892         set->flags = BLK_MQ_F_NO_SCHED;
4893         if (ctrl->ops->flags & NVME_F_BLOCKING)
4894                 set->flags |= BLK_MQ_F_BLOCKING;
4895         set->cmd_size = cmd_size;
4896         set->driver_data = ctrl;
4897         set->nr_hw_queues = 1;
4898         set->timeout = NVME_ADMIN_TIMEOUT;
4899         ret = blk_mq_alloc_tag_set(set);
4900         if (ret)
4901                 return ret;
4902
4903         ctrl->admin_q = blk_mq_init_queue(set);
4904         if (IS_ERR(ctrl->admin_q)) {
4905                 ret = PTR_ERR(ctrl->admin_q);
4906                 goto out_free_tagset;
4907         }
4908
4909         if (ctrl->ops->flags & NVME_F_FABRICS) {
4910                 ctrl->fabrics_q = blk_mq_init_queue(set);
4911                 if (IS_ERR(ctrl->fabrics_q)) {
4912                         ret = PTR_ERR(ctrl->fabrics_q);
4913                         goto out_cleanup_admin_q;
4914                 }
4915         }
4916
4917         ctrl->admin_tagset = set;
4918         return 0;
4919
4920 out_cleanup_admin_q:
4921         blk_mq_destroy_queue(ctrl->admin_q);
4922         blk_put_queue(ctrl->admin_q);
4923 out_free_tagset:
4924         blk_mq_free_tag_set(set);
4925         ctrl->admin_q = NULL;
4926         ctrl->fabrics_q = NULL;
4927         return ret;
4928 }
4929 EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
4930
4931 void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
4932 {
4933         blk_mq_destroy_queue(ctrl->admin_q);
4934         blk_put_queue(ctrl->admin_q);
4935         if (ctrl->ops->flags & NVME_F_FABRICS) {
4936                 blk_mq_destroy_queue(ctrl->fabrics_q);
4937                 blk_put_queue(ctrl->fabrics_q);
4938         }
4939         blk_mq_free_tag_set(ctrl->admin_tagset);
4940 }
4941 EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
4942
4943 int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4944                 const struct blk_mq_ops *ops, unsigned int nr_maps,
4945                 unsigned int cmd_size)
4946 {
4947         int ret;
4948
4949         memset(set, 0, sizeof(*set));
4950         set->ops = ops;
4951         set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1);
4952         /*
4953          * Some Apple controllers require tags to be unique across admin and
4954          * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
4955          */
4956         if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
4957                 set->reserved_tags = NVME_AQ_DEPTH;
4958         else if (ctrl->ops->flags & NVME_F_FABRICS)
4959                 set->reserved_tags = NVMF_RESERVED_TAGS;
4960         set->numa_node = ctrl->numa_node;
4961         set->flags = BLK_MQ_F_SHOULD_MERGE;
4962         if (ctrl->ops->flags & NVME_F_BLOCKING)
4963                 set->flags |= BLK_MQ_F_BLOCKING;
4964         set->cmd_size = cmd_size;
4965         set->driver_data = ctrl;
4966         set->nr_hw_queues = ctrl->queue_count - 1;
4967         set->timeout = NVME_IO_TIMEOUT;
4968         set->nr_maps = nr_maps;
4969         ret = blk_mq_alloc_tag_set(set);
4970         if (ret)
4971                 return ret;
4972
4973         if (ctrl->ops->flags & NVME_F_FABRICS) {
4974                 ctrl->connect_q = blk_mq_init_queue(set);
4975                 if (IS_ERR(ctrl->connect_q)) {
4976                         ret = PTR_ERR(ctrl->connect_q);
4977                         goto out_free_tag_set;
4978                 }
4979                 blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
4980                                    ctrl->connect_q);
4981         }
4982
4983         ctrl->tagset = set;
4984         return 0;
4985
4986 out_free_tag_set:
4987         blk_mq_free_tag_set(set);
4988         ctrl->connect_q = NULL;
4989         return ret;
4990 }
4991 EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
4992
4993 void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
4994 {
4995         if (ctrl->ops->flags & NVME_F_FABRICS) {
4996                 blk_mq_destroy_queue(ctrl->connect_q);
4997                 blk_put_queue(ctrl->connect_q);
4998         }
4999         blk_mq_free_tag_set(ctrl->tagset);
5000 }
5001 EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
5002
5003 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
5004 {
5005         nvme_mpath_stop(ctrl);
5006         nvme_auth_stop(ctrl);
5007         nvme_stop_keep_alive(ctrl);
5008         nvme_stop_failfast_work(ctrl);
5009         flush_work(&ctrl->async_event_work);
5010         cancel_work_sync(&ctrl->fw_act_work);
5011         if (ctrl->ops->stop_ctrl)
5012                 ctrl->ops->stop_ctrl(ctrl);
5013 }
5014 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
5015
5016 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
5017 {
5018         nvme_start_keep_alive(ctrl);
5019
5020         nvme_enable_aen(ctrl);
5021
5022         /*
5023          * Persistent discovery controllers need to send an indication to
5024          * userspace to re-read the discovery log page and learn about changes
5025          * that were missed.  We identify persistent discovery controllers by
5026          * checking that they started once before and hence are reconnecting.
5027          */
5028         if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
5029             nvme_discovery_ctrl(ctrl))
5030                 nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
5031
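        /*
         * ctrl->queue_count includes the admin queue, so I/O queues were
         * only created if it is greater than one.
         */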
5032         if (ctrl->queue_count > 1) {
5033                 nvme_queue_scan(ctrl);
5034                 nvme_unquiesce_io_queues(ctrl);
5035                 nvme_mpath_update(ctrl);
5036         }
5037
5038         nvme_change_uevent(ctrl, "NVME_EVENT=connected");
5039 }
5040 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
5041
5042 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
5043 {
5044         nvme_hwmon_exit(ctrl);
5045         nvme_fault_inject_fini(&ctrl->fault_inject);
5046         dev_pm_qos_hide_latency_tolerance(ctrl->device);
5047         cdev_device_del(&ctrl->cdev, ctrl->device);
5048         nvme_put_ctrl(ctrl);
5049 }
5050 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
5051
5052 static void nvme_free_cels(struct nvme_ctrl *ctrl)
5053 {
5054         struct nvme_effects_log *cel;
5055         unsigned long i;
5056
5057         xa_for_each(&ctrl->cels, i, cel) {
5058                 xa_erase(&ctrl->cels, i);
5059                 kfree(cel);
5060         }
5061
5062         xa_destroy(&ctrl->cels);
5063 }
5064
5065 static void nvme_free_ctrl(struct device *dev)
5066 {
5067         struct nvme_ctrl *ctrl =
5068                 container_of(dev, struct nvme_ctrl, ctrl_device);
5069         struct nvme_subsystem *subsys = ctrl->subsys;
5070
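        /*
         * If the subsystem took over this controller's instance number for
         * its own name, the ida entry is released together with the
         * subsystem instead of here.
         */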
5071         if (!subsys || ctrl->instance != subsys->instance)
5072                 ida_free(&nvme_instance_ida, ctrl->instance);
5073
5074         nvme_free_cels(ctrl);
5075         nvme_mpath_uninit(ctrl);
5076         nvme_auth_stop(ctrl);
5077         nvme_auth_free(ctrl);
5078         __free_page(ctrl->discard_page);
5079         free_opal_dev(ctrl->opal_dev);
5080
5081         if (subsys) {
5082                 mutex_lock(&nvme_subsystems_lock);
5083                 list_del(&ctrl->subsys_entry);
5084                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
5085                 mutex_unlock(&nvme_subsystems_lock);
5086         }
5087
5088         ctrl->ops->free_ctrl(ctrl);
5089
5090         if (subsys)
5091                 nvme_put_subsystem(subsys);
5092 }
5093
5094 /*
5095  * Initialize an NVMe controller structure.  This needs to be called during
5096  * the earliest initialization so that we have the initialized structures
5097  * around during probing.
5098  */
5099 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
5100                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
5101 {
5102         int ret;
5103
5104         ctrl->state = NVME_CTRL_NEW;
5105         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
5106         spin_lock_init(&ctrl->lock);
5107         mutex_init(&ctrl->scan_lock);
5108         INIT_LIST_HEAD(&ctrl->namespaces);
5109         xa_init(&ctrl->cels);
5110         init_rwsem(&ctrl->namespaces_rwsem);
5111         ctrl->dev = dev;
5112         ctrl->ops = ops;
5113         ctrl->quirks = quirks;
5114         ctrl->numa_node = NUMA_NO_NODE;
5115         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
5116         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
5117         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
5118         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
5119         init_waitqueue_head(&ctrl->state_wq);
5120
5121         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
5122         INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
5123         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
5124         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
5125
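        /*
         * The single pre-allocated discard page is used as a fallback
         * payload for DSM (discard) commands when a range buffer cannot be
         * allocated in the I/O path, so the largest possible range list
         * must fit into one page.
         */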
5126         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
5127                         PAGE_SIZE);
5128         ctrl->discard_page = alloc_page(GFP_KERNEL);
5129         if (!ctrl->discard_page) {
5130                 ret = -ENOMEM;
5131                 goto out;
5132         }
5133
5134         ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
5135         if (ret < 0)
5136                 goto out;
5137         ctrl->instance = ret;
5138
5139         device_initialize(&ctrl->ctrl_device);
5140         ctrl->device = &ctrl->ctrl_device;
5141         ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
5142                         ctrl->instance);
5143         ctrl->device->class = nvme_class;
5144         ctrl->device->parent = ctrl->dev;
5145         if (ops->dev_attr_groups)
5146                 ctrl->device->groups = ops->dev_attr_groups;
5147         else
5148                 ctrl->device->groups = nvme_dev_attr_groups;
5149         ctrl->device->release = nvme_free_ctrl;
5150         dev_set_drvdata(ctrl->device, ctrl);
5151         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
5152         if (ret)
5153                 goto out_release_instance;
5154
5155         nvme_get_ctrl(ctrl);
5156         cdev_init(&ctrl->cdev, &nvme_dev_fops);
5157         ctrl->cdev.owner = ops->module;
5158         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
5159         if (ret)
5160                 goto out_free_name;
5161
5162         /*
5163          * Initialize latency tolerance controls.  The sysfs files won't
5164          * be visible to userspace unless the device actually supports APST.
5165          */
5166         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
5167         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
5168                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
5169
5170         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
5171         nvme_mpath_init_ctrl(ctrl);
5172         ret = nvme_auth_init_ctrl(ctrl);
5173         if (ret)
5174                 goto out_free_cdev;
5175
5176         return 0;
5177 out_free_cdev:
5178         cdev_device_del(&ctrl->cdev, ctrl->device);
5179 out_free_name:
5180         nvme_put_ctrl(ctrl);
5181         kfree_const(ctrl->device->kobj.name);
5182 out_release_instance:
5183         ida_free(&nvme_instance_ida, ctrl->instance);
5184 out:
5185         if (ctrl->discard_page)
5186                 __free_page(ctrl->discard_page);
5187         return ret;
5188 }
5189 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
5190
5191 /* let I/O to all namespaces fail in preparation for surprise removal */
5192 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
5193 {
5194         struct nvme_ns *ns;
5195
5196         down_read(&ctrl->namespaces_rwsem);
5197         list_for_each_entry(ns, &ctrl->namespaces, list)
5198                 blk_mark_disk_dead(ns->disk);
5199         up_read(&ctrl->namespaces_rwsem);
5200 }
5201 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
5202
5203 void nvme_unfreeze(struct nvme_ctrl *ctrl)
5204 {
5205         struct nvme_ns *ns;
5206
5207         down_read(&ctrl->namespaces_rwsem);
5208         list_for_each_entry(ns, &ctrl->namespaces, list)
5209                 blk_mq_unfreeze_queue(ns->queue);
5210         up_read(&ctrl->namespaces_rwsem);
5211 }
5212 EXPORT_SYMBOL_GPL(nvme_unfreeze);
5213
5214 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
5215 {
5216         struct nvme_ns *ns;
5217
5218         down_read(&ctrl->namespaces_rwsem);
5219         list_for_each_entry(ns, &ctrl->namespaces, list) {
5220                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
5221                 if (timeout <= 0)
5222                         break;
5223         }
5224         up_read(&ctrl->namespaces_rwsem);
5225         return timeout;
5226 }
5227 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
5228
5229 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
5230 {
5231         struct nvme_ns *ns;
5232
5233         down_read(&ctrl->namespaces_rwsem);
5234         list_for_each_entry(ns, &ctrl->namespaces, list)
5235                 blk_mq_freeze_queue_wait(ns->queue);
5236         up_read(&ctrl->namespaces_rwsem);
5237 }
5238 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
5239
5240 void nvme_start_freeze(struct nvme_ctrl *ctrl)
5241 {
5242         struct nvme_ns *ns;
5243
5244         down_read(&ctrl->namespaces_rwsem);
5245         list_for_each_entry(ns, &ctrl->namespaces, list)
5246                 blk_freeze_queue_start(ns->queue);
5247         up_read(&ctrl->namespaces_rwsem);
5248 }
5249 EXPORT_SYMBOL_GPL(nvme_start_freeze);
5250
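/*
 * The NVME_CTRL_STOPPED flag keeps quiesce and unquiesce balanced: a second
 * quiesce only waits for the already started quiesce to finish, and
 * unquiesce is a no-op unless the queues were actually stopped.
 */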
5251 void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
5252 {
5253         if (!ctrl->tagset)
5254                 return;
5255         if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
5256                 blk_mq_quiesce_tagset(ctrl->tagset);
5257         else
5258                 blk_mq_wait_quiesce_done(ctrl->tagset);
5259 }
5260 EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
5261
5262 void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
5263 {
5264         if (!ctrl->tagset)
5265                 return;
5266         if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
5267                 blk_mq_unquiesce_tagset(ctrl->tagset);
5268 }
5269 EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
5270
5271 void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
5272 {
5273         if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
5274                 blk_mq_quiesce_queue(ctrl->admin_q);
5275         else
5276                 blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
5277 }
5278 EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
5279
5280 void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
5281 {
5282         if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
5283                 blk_mq_unquiesce_queue(ctrl->admin_q);
5284 }
5285 EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
5286
5287 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
5288 {
5289         struct nvme_ns *ns;
5290
5291         down_read(&ctrl->namespaces_rwsem);
5292         list_for_each_entry(ns, &ctrl->namespaces, list)
5293                 blk_sync_queue(ns->queue);
5294         up_read(&ctrl->namespaces_rwsem);
5295 }
5296 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
5297
5298 void nvme_sync_queues(struct nvme_ctrl *ctrl)
5299 {
5300         nvme_sync_io_queues(ctrl);
5301         if (ctrl->admin_q)
5302                 blk_sync_queue(ctrl->admin_q);
5303 }
5304 EXPORT_SYMBOL_GPL(nvme_sync_queues);
5305
5306 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
5307 {
5308         if (file->f_op != &nvme_dev_fops)
5309                 return NULL;
5310         return file->private_data;
5311 }
5312 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
5313
5314 /*
5315  * Check we didn't inadvertently grow the command structure sizes:
5316  */
5317 static inline void _nvme_check_size(void)
5318 {
5319         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
5320         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
5321         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
5322         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
5323         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
5324         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
5325         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
5326         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
5327         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
5328         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
5329         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
5330         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
5331         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
5332         BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
5333                         NVME_IDENTIFY_DATA_SIZE);
5334         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
5335         BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
5336         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
5337         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
5338         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
5339         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
5340         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
5341         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
5342         BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
5343 }
5344
5346 static int __init nvme_core_init(void)
5347 {
5348         int result = -ENOMEM;
5349
5350         _nvme_check_size();
5351
5352         nvme_wq = alloc_workqueue("nvme-wq",
5353                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
5354         if (!nvme_wq)
5355                 goto out;
5356
5357         nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
5358                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
5359         if (!nvme_reset_wq)
5360                 goto destroy_wq;
5361
5362         nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
5363                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
5364         if (!nvme_delete_wq)
5365                 goto destroy_reset_wq;
5366
5367         result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
5368                         NVME_MINORS, "nvme");
5369         if (result < 0)
5370                 goto destroy_delete_wq;
5371
5372         nvme_class = class_create(THIS_MODULE, "nvme");
5373         if (IS_ERR(nvme_class)) {
5374                 result = PTR_ERR(nvme_class);
5375                 goto unregister_chrdev;
5376         }
5377         nvme_class->dev_uevent = nvme_class_uevent;
5378
5379         nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
5380         if (IS_ERR(nvme_subsys_class)) {
5381                 result = PTR_ERR(nvme_subsys_class);
5382                 goto destroy_class;
5383         }
5384
5385         result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
5386                                      "nvme-generic");
5387         if (result < 0)
5388                 goto destroy_subsys_class;
5389
5390         nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic");
5391         if (IS_ERR(nvme_ns_chr_class)) {
5392                 result = PTR_ERR(nvme_ns_chr_class);
5393                 goto unregister_generic_ns;
5394         }
5395
5396         result = nvme_init_auth();
5397         if (result)
5398                 goto destroy_ns_chr;
5399         return 0;
5400
5401 destroy_ns_chr:
5402         class_destroy(nvme_ns_chr_class);
5403 unregister_generic_ns:
5404         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5405 destroy_subsys_class:
5406         class_destroy(nvme_subsys_class);
5407 destroy_class:
5408         class_destroy(nvme_class);
5409 unregister_chrdev:
5410         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5411 destroy_delete_wq:
5412         destroy_workqueue(nvme_delete_wq);
5413 destroy_reset_wq:
5414         destroy_workqueue(nvme_reset_wq);
5415 destroy_wq:
5416         destroy_workqueue(nvme_wq);
5417 out:
5418         return result;
5419 }
5420
5421 static void __exit nvme_core_exit(void)
5422 {
5423         nvme_exit_auth();
5424         class_destroy(nvme_ns_chr_class);
5425         class_destroy(nvme_subsys_class);
5426         class_destroy(nvme_class);
5427         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5428         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5429         destroy_workqueue(nvme_delete_wq);
5430         destroy_workqueue(nvme_reset_wq);
5431         destroy_workqueue(nvme_wq);
5432         ida_destroy(&nvme_ns_chr_minor_ida);
5433         ida_destroy(&nvme_instance_ida);
5434 }
5435
5436 MODULE_LICENSE("GPL");
5437 MODULE_VERSION("1.0");
5438 module_init(nvme_core_init);
5439 module_exit(nvme_core_exit);