nvmet: use new ana_log_size instead the old one
drivers/nvme/host/multipath.c (linux-block.git)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags)
{
	if (!multipath)
		return false;
	if (!ns->head->disk) {
		sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance,
			ns->head->instance);
		return true;
	}
	sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance,
		ns->ctrl->instance, ns->head->instance);
	*flags = GENHD_FL_HIDDEN;
	return true;
}

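/*
 * Called when a request failed on its path: move its bios back to the
 * multipath node's requeue list so that a fresh path can be selected,
 * complete the original request, and kick the requeue work.
 */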
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next)
		bio_set_dev(bio, ns->head->disk->part0);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->disk)
			kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->scan_lock);
	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		if (nvme_mpath_clear_current_path(ns))
			kblockd_schedule_work(&ns->head->requeue_work);
	up_read(&ctrl->namespaces_rwsem);
	mutex_unlock(&ctrl->scan_lock);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    test_bit(NVME_NS_REMOVING, &ns->flags))
		return true;
	return false;
}

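/*
 * Select the enabled path with the best ANA state, preferring optimized
 * over non-optimized paths and, under the NUMA I/O policy, the smallest
 * NUMA distance from @node; cache the result in current_path[node].
 */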
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

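/*
 * Round-robin path selection: starting after the previously used path,
 * return the next enabled path, preferring an optimized one and falling
 * back to a non-optimized path or to the old path itself if necessary.
 */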
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

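/*
 * Fast path lookup: reuse the path cached for this NUMA node while it is
 * still optimized; otherwise, or under the round-robin policy, select a
 * new path.
 */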
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

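/*
 * An I/O may become serviceable again as long as at least one controller
 * is live, resetting or connecting and has not had its failfast timeout
 * expire; only fail I/O outright when no such controller remains.
 */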
static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	blk_queue_split(&bio);

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		ret = submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
{
	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns_head(disk->private_data);
}

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	if (ret)
		kfree_const(head->cdev_device.kobj.name);
	return ret;
}

static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		/*
		 * Reset disk to the mpath node and resubmit to select a new
		 * path.
		 */
		bio_set_dev(bio, head->disk->part0);
		submit_bio_noacct(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct request_queue *q;
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing data
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
		return 0;

	q = blk_alloc_queue(ctrl->numa_node);
	if (!q)
		goto out;
	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(q, 512);
	blk_set_stacking_limits(&q->limits);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(q, vwc, vwc);

	head->disk = alloc_disk(0);
	if (!head->disk)
		goto out_cleanup_queue;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	head->disk->queue = q;
	head->disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	return 0;

out_cleanup_queue:
	blk_cleanup_queue(q);
out:
	return -ENOMEM;
}

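/*
 * Register the multipath node the first time one of its paths goes live
 * and, if this path is optimized, pre-populate the per-NUMA-node path
 * cache before requeued I/O is resubmitted.
 */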
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;

	if (!head->disk)
		return;

	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		device_add_disk(&head->subsys->dev, head->disk,
				nvme_ns_id_attr_groups);
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

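/*
 * Walk the ANA log page in ctrl->ana_log_buf and invoke @cb for each group
 * descriptor, verifying that every descriptor and its NSID list fit within
 * ana_log_size before they are dereferenced.
 */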
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = nr_nsids * sizeof(__le32);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	if (nvme_state_is_live(ns->ana_state))
		nvme_mpath_set_live(ns);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid = le32_to_cpu(desc->nsids[n]);

		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  We'll still eventually
	 * time out once all groups are in change state, so this isn't a big
	 * deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = id->anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
#endif
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (head->disk->flags & GENHD_FL_UP) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_queue(head->disk->queue);
	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		/*
		 * if device_add_disk wasn't called, prevent the disk release
		 * from putting a bogus reference on the request queue
		 */
		head->disk->queue = NULL;
	}
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

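/*
 * Parse the ANA fields of the Identify Controller data, (re)allocate the
 * ANA log buffer based on the reported group and namespace counts, and
 * perform the initial ANA log read.
 */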
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		kfree(ctrl->ana_log_buf);
		ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
}