Commit | Line | Data |
---|---|---|
bc50ad75 | 1 | // SPDX-License-Identifier: GPL-2.0 |
32acab31 | 2 | /* |
0d0b660f | 3 | * Copyright (c) 2017-2018 Christoph Hellwig. |
32acab31 CH |
4 | */ |
5 | ||
b2ce4d90 | 6 | #include <linux/backing-dev.h> |
32acab31 | 7 | #include <linux/moduleparam.h> |
2796b569 | 8 | #include <trace/events/block.h> |
32acab31 CH |
9 | #include "nvme.h" |
10 | ||
/*
 * Module parameter gating the multipath code in this file; it defaults to
 * enabled and is exposed with mode 0444.
 */
static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");
15 | ||
b9156dae SG |
16 | void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) |
17 | { | |
18 | struct nvme_ns_head *h; | |
19 | ||
20 | lockdep_assert_held(&subsys->lock); | |
21 | list_for_each_entry(h, &subsys->nsheads, entry) | |
22 | if (h->disk) | |
23 | blk_mq_unfreeze_queue(h->disk->queue); | |
24 | } | |
25 | ||
26 | void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) | |
27 | { | |
28 | struct nvme_ns_head *h; | |
29 | ||
30 | lockdep_assert_held(&subsys->lock); | |
31 | list_for_each_entry(h, &subsys->nsheads, entry) | |
32 | if (h->disk) | |
33 | blk_mq_freeze_queue_wait(h->disk->queue); | |
34 | } | |
35 | ||
36 | void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) | |
37 | { | |
38 | struct nvme_ns_head *h; | |
39 | ||
40 | lockdep_assert_held(&subsys->lock); | |
41 | list_for_each_entry(h, &subsys->nsheads, entry) | |
42 | if (h->disk) | |
43 | blk_freeze_queue_start(h->disk->queue); | |
44 | } | |
45 | ||
a785dbcc KB |
46 | /* |
47 | * If multipathing is enabled we need to always use the subsystem instance | |
48 | * number for numbering our devices to avoid conflicts between subsystems that | |
49 | * have multiple controllers and thus use the multipath-aware subsystem node | |
50 | * and those that have a single controller and use the controller node | |
51 | * directly. | |
52 | */ | |
9953ab0c | 53 | bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) |
a785dbcc | 54 | { |
9953ab0c CH |
55 | if (!multipath) |
56 | return false; | |
57 | if (!ns->head->disk) { | |
58 | sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, | |
59 | ns->head->instance); | |
60 | return true; | |
a785dbcc | 61 | } |
9953ab0c CH |
62 | sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, |
63 | ns->ctrl->instance, ns->head->instance); | |
64 | *flags = GENHD_FL_HIDDEN; | |
65 | return true; | |
a785dbcc KB |
66 | } |
67 | ||
5ddaabe8 | 68 | void nvme_failover_req(struct request *req) |
32acab31 CH |
69 | { |
70 | struct nvme_ns *ns = req->q->queuedata; | |
5ddaabe8 | 71 | u16 status = nvme_req(req)->status & 0x7ff; |
32acab31 | 72 | unsigned long flags; |
ce86dad2 | 73 | struct bio *bio; |
32acab31 | 74 | |
5ddaabe8 CH |
75 | nvme_mpath_clear_current_path(ns); |
76 | ||
77 | /* | |
78 | * If we got back an ANA error, we know the controller is alive but not | |
79 | * ready to serve this namespace. Kick of a re-read of the ANA | |
80 | * information page, and just try any other available path for now. | |
81 | */ | |
82 | if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { | |
83 | set_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
84 | queue_work(nvme_wq, &ns->ctrl->ana_work); | |
0d0b660f CH |
85 | } |
86 | ||
764e9332 | 87 | spin_lock_irqsave(&ns->head->requeue_lock, flags); |
ce86dad2 DW |
88 | for (bio = req->bio; bio; bio = bio->bi_next) |
89 | bio_set_dev(bio, ns->head->disk->part0); | |
764e9332 JM |
90 | blk_steal_bios(&ns->head->requeue_list, req); |
91 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags); | |
764e9332 | 92 | |
5ddaabe8 | 93 | blk_mq_end_request(req, 0); |
32acab31 CH |
94 | kblockd_schedule_work(&ns->head->requeue_work); |
95 | } | |
96 | ||
32acab31 CH |
97 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) |
98 | { | |
99 | struct nvme_ns *ns; | |
100 | ||
765cc031 | 101 | down_read(&ctrl->namespaces_rwsem); |
32acab31 CH |
102 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
103 | if (ns->head->disk) | |
104 | kblockd_schedule_work(&ns->head->requeue_work); | |
105 | } | |
765cc031 | 106 | up_read(&ctrl->namespaces_rwsem); |
32acab31 CH |
107 | } |
108 | ||
/*
 * Printable names for the ANA states, indexed by state value.  Index 0 is
 * not a valid state and maps to a placeholder string.
 */
static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};
117 | ||
0157ec8d | 118 | bool nvme_mpath_clear_current_path(struct nvme_ns *ns) |
32acab31 | 119 | { |
f3334447 | 120 | struct nvme_ns_head *head = ns->head; |
0157ec8d | 121 | bool changed = false; |
f3334447 CH |
122 | int node; |
123 | ||
124 | if (!head) | |
0157ec8d | 125 | goto out; |
f3334447 CH |
126 | |
127 | for_each_node(node) { | |
0157ec8d | 128 | if (ns == rcu_access_pointer(head->current_path[node])) { |
f3334447 | 129 | rcu_assign_pointer(head->current_path[node], NULL); |
0157ec8d SG |
130 | changed = true; |
131 | } | |
f3334447 | 132 | } |
0157ec8d SG |
133 | out: |
134 | return changed; | |
135 | } | |
136 | ||
137 | void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) | |
138 | { | |
139 | struct nvme_ns *ns; | |
140 | ||
141 | mutex_lock(&ctrl->scan_lock); | |
763303a8 | 142 | down_read(&ctrl->namespaces_rwsem); |
0157ec8d SG |
143 | list_for_each_entry(ns, &ctrl->namespaces, list) |
144 | if (nvme_mpath_clear_current_path(ns)) | |
145 | kblockd_schedule_work(&ns->head->requeue_work); | |
763303a8 | 146 | up_read(&ctrl->namespaces_rwsem); |
0157ec8d | 147 | mutex_unlock(&ctrl->scan_lock); |
f3334447 CH |
148 | } |
149 | ||
ca7ae5c9 HR |
150 | static bool nvme_path_is_disabled(struct nvme_ns *ns) |
151 | { | |
ecca390e SG |
152 | /* |
153 | * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should | |
154 | * still be able to complete assuming that the controller is connected. | |
155 | * Otherwise it will fail immediately and return to the requeue list. | |
156 | */ | |
157 | if (ns->ctrl->state != NVME_CTRL_LIVE && | |
158 | ns->ctrl->state != NVME_CTRL_DELETING) | |
159 | return true; | |
160 | if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || | |
161 | test_bit(NVME_NS_REMOVING, &ns->flags)) | |
162 | return true; | |
163 | return false; | |
ca7ae5c9 HR |
164 | } |
165 | ||
f3334447 CH |
166 | static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) |
167 | { | |
168 | int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; | |
169 | struct nvme_ns *found = NULL, *fallback = NULL, *ns; | |
32acab31 CH |
170 | |
171 | list_for_each_entry_rcu(ns, &head->list, siblings) { | |
ca7ae5c9 | 172 | if (nvme_path_is_disabled(ns)) |
0d0b660f | 173 | continue; |
f3334447 | 174 | |
75c10e73 HR |
175 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) |
176 | distance = node_distance(node, ns->ctrl->numa_node); | |
177 | else | |
178 | distance = LOCAL_DISTANCE; | |
f3334447 | 179 | |
0d0b660f CH |
180 | switch (ns->ana_state) { |
181 | case NVME_ANA_OPTIMIZED: | |
f3334447 CH |
182 | if (distance < found_distance) { |
183 | found_distance = distance; | |
184 | found = ns; | |
185 | } | |
186 | break; | |
0d0b660f | 187 | case NVME_ANA_NONOPTIMIZED: |
f3334447 CH |
188 | if (distance < fallback_distance) { |
189 | fallback_distance = distance; | |
190 | fallback = ns; | |
191 | } | |
0d0b660f CH |
192 | break; |
193 | default: | |
194 | break; | |
32acab31 CH |
195 | } |
196 | } | |
197 | ||
f3334447 CH |
198 | if (!found) |
199 | found = fallback; | |
200 | if (found) | |
201 | rcu_assign_pointer(head->current_path[node], found); | |
202 | return found; | |
0d0b660f CH |
203 | } |
204 | ||
75c10e73 HR |
205 | static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, |
206 | struct nvme_ns *ns) | |
207 | { | |
208 | ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, | |
209 | siblings); | |
210 | if (ns) | |
211 | return ns; | |
212 | return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); | |
213 | } | |
214 | ||
215 | static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, | |
216 | int node, struct nvme_ns *old) | |
217 | { | |
e398863b | 218 | struct nvme_ns *ns, *found = NULL; |
75c10e73 | 219 | |
2032d074 HR |
220 | if (list_is_singular(&head->list)) { |
221 | if (nvme_path_is_disabled(old)) | |
222 | return NULL; | |
75c10e73 | 223 | return old; |
2032d074 | 224 | } |
75c10e73 HR |
225 | |
226 | for (ns = nvme_next_ns(head, old); | |
d1bcf006 | 227 | ns && ns != old; |
75c10e73 | 228 | ns = nvme_next_ns(head, ns)) { |
ca7ae5c9 | 229 | if (nvme_path_is_disabled(ns)) |
75c10e73 HR |
230 | continue; |
231 | ||
232 | if (ns->ana_state == NVME_ANA_OPTIMIZED) { | |
233 | found = ns; | |
234 | goto out; | |
235 | } | |
236 | if (ns->ana_state == NVME_ANA_NONOPTIMIZED) | |
e398863b | 237 | found = ns; |
75c10e73 HR |
238 | } |
239 | ||
93eb0381 MW |
240 | /* |
241 | * The loop above skips the current path for round-robin semantics. | |
242 | * Fall back to the current path if either: | |
243 | * - no other optimized path found and current is optimized, | |
244 | * - no other usable path found and current is usable. | |
245 | */ | |
3f6e3246 | 246 | if (!nvme_path_is_disabled(old) && |
93eb0381 | 247 | (old->ana_state == NVME_ANA_OPTIMIZED || |
e398863b | 248 | (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) |
93eb0381 MW |
249 | return old; |
250 | ||
e398863b | 251 | if (!found) |
75c10e73 | 252 | return NULL; |
75c10e73 HR |
253 | out: |
254 | rcu_assign_pointer(head->current_path[node], found); | |
255 | return found; | |
256 | } | |
257 | ||
0d0b660f CH |
258 | static inline bool nvme_path_is_optimized(struct nvme_ns *ns) |
259 | { | |
260 | return ns->ctrl->state == NVME_CTRL_LIVE && | |
261 | ns->ana_state == NVME_ANA_OPTIMIZED; | |
32acab31 CH |
262 | } |
263 | ||
264 | inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) | |
265 | { | |
f3334447 CH |
266 | int node = numa_node_id(); |
267 | struct nvme_ns *ns; | |
32acab31 | 268 | |
f3334447 | 269 | ns = srcu_dereference(head->current_path[node], &head->srcu); |
fbd6a42d HR |
270 | if (unlikely(!ns)) |
271 | return __nvme_find_path(head, node); | |
272 | ||
273 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) | |
274 | return nvme_round_robin_path(head, node, ns); | |
275 | if (unlikely(!nvme_path_is_optimized(ns))) | |
276 | return __nvme_find_path(head, node); | |
32acab31 CH |
277 | return ns; |
278 | } | |
279 | ||
0157ec8d SG |
280 | static bool nvme_available_path(struct nvme_ns_head *head) |
281 | { | |
282 | struct nvme_ns *ns; | |
283 | ||
284 | list_for_each_entry_rcu(ns, &head->list, siblings) { | |
8c4dfea9 VG |
285 | if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) |
286 | continue; | |
0157ec8d SG |
287 | switch (ns->ctrl->state) { |
288 | case NVME_CTRL_LIVE: | |
289 | case NVME_CTRL_RESETTING: | |
290 | case NVME_CTRL_CONNECTING: | |
291 | /* fallthru */ | |
292 | return true; | |
293 | default: | |
294 | break; | |
295 | } | |
296 | } | |
297 | return false; | |
298 | } | |
299 | ||
1496bd49 | 300 | static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) |
32acab31 | 301 | { |
309dca30 | 302 | struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; |
32acab31 CH |
303 | struct device *dev = disk_to_dev(head->disk); |
304 | struct nvme_ns *ns; | |
305 | blk_qc_t ret = BLK_QC_T_NONE; | |
306 | int srcu_idx; | |
307 | ||
525aa5a7 | 308 | /* |
f695ca38 CH |
309 | * The namespace might be going away and the bio might be moved to a |
310 | * different queue via blk_steal_bios(), so we need to use the bio_split | |
311 | * pool from the original queue to allocate the bvecs from. | |
525aa5a7 | 312 | */ |
f695ca38 | 313 | blk_queue_split(&bio); |
525aa5a7 | 314 | |
32acab31 CH |
315 | srcu_idx = srcu_read_lock(&head->srcu); |
316 | ns = nvme_find_path(head); | |
317 | if (likely(ns)) { | |
a7c7f7b2 | 318 | bio_set_dev(bio, ns->disk->part0); |
32acab31 | 319 | bio->bi_opf |= REQ_NVME_MPATH; |
1c02fca6 | 320 | trace_block_bio_remap(bio, disk_devt(ns->head->disk), |
2796b569 | 321 | bio->bi_iter.bi_sector); |
5a6c35f9 | 322 | ret = submit_bio_noacct(bio); |
0157ec8d SG |
323 | } else if (nvme_available_path(head)) { |
324 | dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); | |
32acab31 CH |
325 | |
326 | spin_lock_irq(&head->requeue_lock); | |
327 | bio_list_add(&head->requeue_list, bio); | |
328 | spin_unlock_irq(&head->requeue_lock); | |
329 | } else { | |
0157ec8d | 330 | dev_warn_ratelimited(dev, "no available path - failing I/O\n"); |
32acab31 CH |
331 | |
332 | bio->bi_status = BLK_STS_IOERR; | |
333 | bio_endio(bio); | |
334 | } | |
335 | ||
336 | srcu_read_unlock(&head->srcu, srcu_idx); | |
337 | return ret; | |
338 | } | |
339 | ||
1496bd49 CH |
340 | static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) |
341 | { | |
342 | if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) | |
343 | return -ENXIO; | |
344 | return 0; | |
345 | } | |
346 | ||
347 | static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) | |
348 | { | |
349 | nvme_put_ns_head(disk->private_data); | |
350 | } | |
351 | ||
/* Block device operations installed on the multipath node's gendisk. */
const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_report_zones,
	.pr_ops		= &nvme_pr_ops,
};
362 | ||
2637baed MI |
363 | static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) |
364 | { | |
365 | return container_of(cdev, struct nvme_ns_head, cdev); | |
366 | } | |
367 | ||
368 | static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) | |
369 | { | |
370 | if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) | |
371 | return -ENXIO; | |
372 | return 0; | |
373 | } | |
374 | ||
375 | static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) | |
376 | { | |
377 | nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); | |
378 | return 0; | |
379 | } | |
380 | ||
/* File operations for the namespace head character device. */
static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};
388 | ||
389 | static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) | |
390 | { | |
391 | int ret; | |
392 | ||
393 | head->cdev_device.parent = &head->subsys->dev; | |
394 | ret = dev_set_name(&head->cdev_device, "ng%dn%d", | |
395 | head->subsys->instance, head->instance); | |
396 | if (ret) | |
397 | return ret; | |
398 | ret = nvme_cdev_add(&head->cdev, &head->cdev_device, | |
399 | &nvme_ns_head_chr_fops, THIS_MODULE); | |
400 | if (ret) | |
401 | kfree_const(head->cdev_device.kobj.name); | |
402 | return ret; | |
403 | } | |
404 | ||
32acab31 CH |
405 | static void nvme_requeue_work(struct work_struct *work) |
406 | { | |
407 | struct nvme_ns_head *head = | |
408 | container_of(work, struct nvme_ns_head, requeue_work); | |
409 | struct bio *bio, *next; | |
410 | ||
411 | spin_lock_irq(&head->requeue_lock); | |
412 | next = bio_list_get(&head->requeue_list); | |
413 | spin_unlock_irq(&head->requeue_lock); | |
414 | ||
415 | while ((bio = next) != NULL) { | |
416 | next = bio->bi_next; | |
417 | bio->bi_next = NULL; | |
418 | ||
419 | /* | |
420 | * Reset disk to the mpath node and resubmit to select a new | |
421 | * path. | |
422 | */ | |
a7c7f7b2 | 423 | bio_set_dev(bio, head->disk->part0); |
ed00aabd | 424 | submit_bio_noacct(bio); |
32acab31 CH |
425 | } |
426 | } | |
427 | ||
428 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) | |
429 | { | |
430 | struct request_queue *q; | |
431 | bool vwc = false; | |
432 | ||
0d0b660f | 433 | mutex_init(&head->lock); |
32acab31 CH |
434 | bio_list_init(&head->requeue_list); |
435 | spin_lock_init(&head->requeue_lock); | |
436 | INIT_WORK(&head->requeue_work, nvme_requeue_work); | |
437 | ||
438 | /* | |
439 | * Add a multipath node if the subsystems supports multiple controllers. | |
440 | * We also do this for private namespaces as the namespace sharing data could | |
441 | * change after a rescan. | |
442 | */ | |
92decf11 | 443 | if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) |
32acab31 CH |
444 | return 0; |
445 | ||
c62b37d9 | 446 | q = blk_alloc_queue(ctrl->numa_node); |
32acab31 CH |
447 | if (!q) |
448 | goto out; | |
8b904b5b | 449 | blk_queue_flag_set(QUEUE_FLAG_NONROT, q); |
32acab31 CH |
450 | /* set to a default value for 512 until disk is validated */ |
451 | blk_queue_logical_block_size(q, 512); | |
8f676b85 | 452 | blk_set_stacking_limits(&q->limits); |
32acab31 CH |
453 | |
454 | /* we need to propagate up the VMC settings */ | |
455 | if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) | |
456 | vwc = true; | |
457 | blk_queue_write_cache(q, vwc, vwc); | |
458 | ||
459 | head->disk = alloc_disk(0); | |
460 | if (!head->disk) | |
461 | goto out_cleanup_queue; | |
462 | head->disk->fops = &nvme_ns_head_ops; | |
463 | head->disk->private_data = head; | |
464 | head->disk->queue = q; | |
465 | head->disk->flags = GENHD_FL_EXT_DEVT; | |
466 | sprintf(head->disk->disk_name, "nvme%dn%d", | |
467 | ctrl->subsys->instance, head->instance); | |
468 | return 0; | |
469 | ||
470 | out_cleanup_queue: | |
471 | blk_cleanup_queue(q); | |
472 | out: | |
473 | return -ENOMEM; | |
474 | } | |
475 | ||
0d0b660f | 476 | static void nvme_mpath_set_live(struct nvme_ns *ns) |
32acab31 | 477 | { |
0d0b660f CH |
478 | struct nvme_ns_head *head = ns->head; |
479 | ||
32acab31 CH |
480 | if (!head->disk) |
481 | return; | |
9bd82b1a | 482 | |
2637baed | 483 | if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { |
33b14f67 HR |
484 | device_add_disk(&head->subsys->dev, head->disk, |
485 | nvme_ns_id_attr_groups); | |
2637baed MI |
486 | nvme_add_ns_head_cdev(head); |
487 | } | |
0d0b660f | 488 | |
d8a22f85 | 489 | mutex_lock(&head->lock); |
886fabf6 KB |
490 | if (nvme_path_is_optimized(ns)) { |
491 | int node, srcu_idx; | |
492 | ||
493 | srcu_idx = srcu_read_lock(&head->srcu); | |
494 | for_each_node(node) | |
495 | __nvme_find_path(head, node); | |
496 | srcu_read_unlock(&head->srcu, srcu_idx); | |
497 | } | |
e164471d | 498 | mutex_unlock(&head->lock); |
886fabf6 | 499 | |
e164471d SG |
500 | synchronize_srcu(&head->srcu); |
501 | kblockd_schedule_work(&head->requeue_work); | |
0d0b660f CH |
502 | } |
503 | ||
504 | static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, | |
505 | int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, | |
506 | void *)) | |
507 | { | |
508 | void *base = ctrl->ana_log_buf; | |
509 | size_t offset = sizeof(struct nvme_ana_rsp_hdr); | |
510 | int error, i; | |
511 | ||
512 | lockdep_assert_held(&ctrl->ana_lock); | |
513 | ||
514 | for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { | |
515 | struct nvme_ana_group_desc *desc = base + offset; | |
64fab729 PS |
516 | u32 nr_nsids; |
517 | size_t nsid_buf_size; | |
518 | ||
519 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) | |
520 | return -EINVAL; | |
521 | ||
522 | nr_nsids = le32_to_cpu(desc->nnsids); | |
523 | nsid_buf_size = nr_nsids * sizeof(__le32); | |
0d0b660f CH |
524 | |
525 | if (WARN_ON_ONCE(desc->grpid == 0)) | |
526 | return -EINVAL; | |
527 | if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) | |
528 | return -EINVAL; | |
529 | if (WARN_ON_ONCE(desc->state == 0)) | |
530 | return -EINVAL; | |
531 | if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) | |
532 | return -EINVAL; | |
533 | ||
534 | offset += sizeof(*desc); | |
535 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) | |
536 | return -EINVAL; | |
537 | ||
538 | error = cb(ctrl, desc, data); | |
539 | if (error) | |
540 | return error; | |
541 | ||
542 | offset += nsid_buf_size; | |
0d0b660f CH |
543 | } |
544 | ||
545 | return 0; | |
546 | } | |
547 | ||
548 | static inline bool nvme_state_is_live(enum nvme_ana_state state) | |
549 | { | |
550 | return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; | |
551 | } | |
552 | ||
553 | static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, | |
554 | struct nvme_ns *ns) | |
555 | { | |
0d0b660f CH |
556 | ns->ana_grpid = le32_to_cpu(desc->grpid); |
557 | ns->ana_state = desc->state; | |
558 | clear_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
559 | ||
cc2278c4 | 560 | if (nvme_state_is_live(ns->ana_state)) |
0d0b660f | 561 | nvme_mpath_set_live(ns); |
0d0b660f CH |
562 | } |
563 | ||
564 | static int nvme_update_ana_state(struct nvme_ctrl *ctrl, | |
565 | struct nvme_ana_group_desc *desc, void *data) | |
566 | { | |
567 | u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; | |
568 | unsigned *nr_change_groups = data; | |
569 | struct nvme_ns *ns; | |
570 | ||
592b6e7b | 571 | dev_dbg(ctrl->device, "ANA group %d: %s.\n", |
0d0b660f CH |
572 | le32_to_cpu(desc->grpid), |
573 | nvme_ana_state_names[desc->state]); | |
574 | ||
575 | if (desc->state == NVME_ANA_CHANGE) | |
576 | (*nr_change_groups)++; | |
577 | ||
578 | if (!nr_nsids) | |
579 | return 0; | |
580 | ||
657f1975 | 581 | down_read(&ctrl->namespaces_rwsem); |
0d0b660f | 582 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
e01f91df AE |
583 | unsigned nsid = le32_to_cpu(desc->nsids[n]); |
584 | ||
585 | if (ns->head->ns_id < nsid) | |
0d0b660f | 586 | continue; |
e01f91df AE |
587 | if (ns->head->ns_id == nsid) |
588 | nvme_update_ns_ana_state(desc, ns); | |
0d0b660f CH |
589 | if (++n == nr_nsids) |
590 | break; | |
591 | } | |
657f1975 | 592 | up_read(&ctrl->namespaces_rwsem); |
0d0b660f CH |
593 | return 0; |
594 | } | |
595 | ||
86cccfbf | 596 | static int nvme_read_ana_log(struct nvme_ctrl *ctrl) |
0d0b660f CH |
597 | { |
598 | u32 nr_change_groups = 0; | |
599 | int error; | |
600 | ||
601 | mutex_lock(&ctrl->ana_lock); | |
be93e87e | 602 | error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, |
0d0b660f CH |
603 | ctrl->ana_log_buf, ctrl->ana_log_size, 0); |
604 | if (error) { | |
605 | dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); | |
606 | goto out_unlock; | |
607 | } | |
608 | ||
609 | error = nvme_parse_ana_log(ctrl, &nr_change_groups, | |
610 | nvme_update_ana_state); | |
611 | if (error) | |
612 | goto out_unlock; | |
613 | ||
614 | /* | |
615 | * In theory we should have an ANATT timer per group as they might enter | |
616 | * the change state at different times. But that is a lot of overhead | |
617 | * just to protect against a target that keeps entering new changes | |
618 | * states while never finishing previous ones. But we'll still | |
619 | * eventually time out once all groups are in change state, so this | |
620 | * isn't a big deal. | |
621 | * | |
622 | * We also double the ANATT value to provide some slack for transports | |
623 | * or AEN processing overhead. | |
624 | */ | |
625 | if (nr_change_groups) | |
626 | mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); | |
627 | else | |
628 | del_timer_sync(&ctrl->anatt_timer); | |
629 | out_unlock: | |
630 | mutex_unlock(&ctrl->ana_lock); | |
631 | return error; | |
632 | } | |
633 | ||
634 | static void nvme_ana_work(struct work_struct *work) | |
635 | { | |
636 | struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); | |
637 | ||
ecca390e SG |
638 | if (ctrl->state != NVME_CTRL_LIVE) |
639 | return; | |
640 | ||
86cccfbf | 641 | nvme_read_ana_log(ctrl); |
0d0b660f CH |
642 | } |
643 | ||
644 | static void nvme_anatt_timeout(struct timer_list *t) | |
645 | { | |
646 | struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); | |
647 | ||
648 | dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); | |
649 | nvme_reset_ctrl(ctrl); | |
650 | } | |
651 | ||
652 | void nvme_mpath_stop(struct nvme_ctrl *ctrl) | |
653 | { | |
654 | if (!nvme_ctrl_use_ana(ctrl)) | |
655 | return; | |
656 | del_timer_sync(&ctrl->anatt_timer); | |
657 | cancel_work_sync(&ctrl->ana_work); | |
658 | } | |
659 | ||
/*
 * Define a device_attribute named subsys_attr_<_name> with the given mode
 * and show/store handlers.
 */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name = \
		__ATTR(_name, _mode, _show, _store)
663 | ||
/* sysfs names of the I/O policies, indexed by policy value. */
static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};
668 | ||
669 | static ssize_t nvme_subsys_iopolicy_show(struct device *dev, | |
670 | struct device_attribute *attr, char *buf) | |
671 | { | |
672 | struct nvme_subsystem *subsys = | |
673 | container_of(dev, struct nvme_subsystem, dev); | |
674 | ||
bff4bcf3 DW |
675 | return sysfs_emit(buf, "%s\n", |
676 | nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); | |
75c10e73 HR |
677 | } |
678 | ||
679 | static ssize_t nvme_subsys_iopolicy_store(struct device *dev, | |
680 | struct device_attribute *attr, const char *buf, size_t count) | |
681 | { | |
682 | struct nvme_subsystem *subsys = | |
683 | container_of(dev, struct nvme_subsystem, dev); | |
684 | int i; | |
685 | ||
686 | for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { | |
687 | if (sysfs_streq(buf, nvme_iopolicy_names[i])) { | |
688 | WRITE_ONCE(subsys->iopolicy, i); | |
689 | return count; | |
690 | } | |
691 | } | |
692 | ||
693 | return -EINVAL; | |
694 | } | |
695 | SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, | |
696 | nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); | |
697 | ||
0d0b660f CH |
698 | static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, |
699 | char *buf) | |
700 | { | |
bff4bcf3 | 701 | return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); |
0d0b660f CH |
702 | } |
703 | DEVICE_ATTR_RO(ana_grpid); | |
704 | ||
705 | static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, | |
706 | char *buf) | |
707 | { | |
708 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | |
709 | ||
bff4bcf3 | 710 | return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); |
0d0b660f CH |
711 | } |
712 | DEVICE_ATTR_RO(ana_state); | |
713 | ||
489dd102 | 714 | static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, |
0d0b660f CH |
715 | struct nvme_ana_group_desc *desc, void *data) |
716 | { | |
489dd102 | 717 | struct nvme_ana_group_desc *dst = data; |
0d0b660f | 718 | |
489dd102 AE |
719 | if (desc->grpid != dst->grpid) |
720 | return 0; | |
0d0b660f | 721 | |
489dd102 AE |
722 | *dst = *desc; |
723 | return -ENXIO; /* just break out of the loop */ | |
0d0b660f CH |
724 | } |
725 | ||
726 | void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) | |
727 | { | |
728 | if (nvme_ctrl_use_ana(ns->ctrl)) { | |
489dd102 AE |
729 | struct nvme_ana_group_desc desc = { |
730 | .grpid = id->anagrpid, | |
731 | .state = 0, | |
732 | }; | |
733 | ||
0d0b660f CH |
734 | mutex_lock(&ns->ctrl->ana_lock); |
735 | ns->ana_grpid = le32_to_cpu(id->anagrpid); | |
489dd102 | 736 | nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); |
0d0b660f | 737 | mutex_unlock(&ns->ctrl->ana_lock); |
489dd102 AE |
738 | if (desc.state) { |
739 | /* found the group desc: update */ | |
740 | nvme_update_ns_ana_state(&desc, ns); | |
dd8f7fa9 HR |
741 | } else { |
742 | /* group desc not found: trigger a re-read */ | |
743 | set_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
744 | queue_work(nvme_wq, &ns->ctrl->ana_work); | |
489dd102 | 745 | } |
0d0b660f | 746 | } else { |
e234f1f8 | 747 | ns->ana_state = NVME_ANA_OPTIMIZED; |
0d0b660f | 748 | nvme_mpath_set_live(ns); |
9bd82b1a | 749 | } |
b2ce4d90 | 750 | |
1cb039f3 CH |
751 | if (blk_queue_stable_writes(ns->queue) && ns->head->disk) |
752 | blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, | |
753 | ns->head->disk->queue); | |
73a1a229 KB |
754 | #ifdef CONFIG_BLK_DEV_ZONED |
755 | if (blk_queue_is_zoned(ns->queue) && ns->head->disk) | |
756 | ns->head->disk->queue->nr_zones = ns->queue->nr_zones; | |
757 | #endif | |
32acab31 CH |
758 | } |
759 | ||
760 | void nvme_mpath_remove_disk(struct nvme_ns_head *head) | |
761 | { | |
762 | if (!head->disk) | |
763 | return; | |
2637baed MI |
764 | if (head->disk->flags & GENHD_FL_UP) { |
765 | nvme_cdev_del(&head->cdev, &head->cdev_device); | |
0d0b660f | 766 | del_gendisk(head->disk); |
2637baed | 767 | } |
32acab31 CH |
768 | blk_set_queue_dying(head->disk->queue); |
769 | /* make sure all pending bios are cleaned up */ | |
770 | kblockd_schedule_work(&head->requeue_work); | |
771 | flush_work(&head->requeue_work); | |
772 | blk_cleanup_queue(head->disk->queue); | |
c3124466 SG |
773 | if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { |
774 | /* | |
775 | * if device_add_disk wasn't called, prevent | |
776 | * disk release to put a bogus reference on the | |
777 | * request queue | |
778 | */ | |
779 | head->disk->queue = NULL; | |
780 | } | |
32acab31 CH |
781 | put_disk(head->disk); |
782 | } | |
0d0b660f | 783 | |
5e1f6899 | 784 | void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) |
0d0b660f | 785 | { |
5e1f6899 CH |
786 | mutex_init(&ctrl->ana_lock); |
787 | timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); | |
788 | INIT_WORK(&ctrl->ana_work, nvme_ana_work); | |
789 | } | |
790 | ||
791 | int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | |
792 | { | |
793 | size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; | |
794 | size_t ana_log_size; | |
795 | int error = 0; | |
0d0b660f | 796 | |
66b20ac0 | 797 | /* check if multipath is enabled and we have the capability */ |
92decf11 KB |
798 | if (!multipath || !ctrl->subsys || |
799 | !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) | |
0d0b660f CH |
800 | return 0; |
801 | ||
802 | ctrl->anacap = id->anacap; | |
803 | ctrl->anatt = id->anatt; | |
804 | ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); | |
805 | ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); | |
806 | ||
5e1f6899 CH |
807 | ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + |
808 | ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + | |
809 | ctrl->max_namespaces * sizeof(__le32); | |
810 | if (ana_log_size > max_transfer_size) { | |
0d0b660f | 811 | dev_err(ctrl->device, |
5e1f6899 CH |
812 | "ANA log page size (%zd) larger than MDTS (%zd).\n", |
813 | ana_log_size, max_transfer_size); | |
0d0b660f | 814 | dev_err(ctrl->device, "disabling ANA support.\n"); |
5e1f6899 | 815 | goto out_uninit; |
0d0b660f | 816 | } |
5e1f6899 CH |
817 | if (ana_log_size > ctrl->ana_log_size) { |
818 | nvme_mpath_stop(ctrl); | |
819 | kfree(ctrl->ana_log_buf); | |
e181811b | 820 | ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL); |
5e1f6899 CH |
821 | if (!ctrl->ana_log_buf) |
822 | return -ENOMEM; | |
bb830add | 823 | } |
5e1f6899 | 824 | ctrl->ana_log_size = ana_log_size; |
86cccfbf | 825 | error = nvme_read_ana_log(ctrl); |
0d0b660f | 826 | if (error) |
5e1f6899 | 827 | goto out_uninit; |
0d0b660f | 828 | return 0; |
5e1f6899 CH |
829 | |
830 | out_uninit: | |
831 | nvme_mpath_uninit(ctrl); | |
bb830add | 832 | return error; |
0d0b660f CH |
833 | } |
834 | ||
835 | void nvme_mpath_uninit(struct nvme_ctrl *ctrl) | |
836 | { | |
837 | kfree(ctrl->ana_log_buf); | |
c7055fd1 | 838 | ctrl->ana_log_buf = NULL; |
0d0b660f | 839 | } |