Commit | Line | Data |
---|---|---|
bc50ad75 | 1 | // SPDX-License-Identifier: GPL-2.0 |
32acab31 | 2 | /* |
0d0b660f | 3 | * Copyright (c) 2017-2018 Christoph Hellwig. |
32acab31 CH |
4 | */ |
5 | ||
6 | #include <linux/moduleparam.h> | |
2796b569 | 7 | #include <trace/events/block.h> |
32acab31 CH |
8 | #include "nvme.h" |
9 | ||
10 | static bool multipath = true; | |
5cadde80 | 11 | module_param(multipath, bool, 0444); |
32acab31 CH |
12 | MODULE_PARM_DESC(multipath, |
13 | "turn on native support for multiple controllers per subsystem"); | |
14 | ||
b9156dae SG |
15 | void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) |
16 | { | |
17 | struct nvme_ns_head *h; | |
18 | ||
19 | lockdep_assert_held(&subsys->lock); | |
20 | list_for_each_entry(h, &subsys->nsheads, entry) | |
21 | if (h->disk) | |
22 | blk_mq_unfreeze_queue(h->disk->queue); | |
23 | } | |
24 | ||
25 | void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) | |
26 | { | |
27 | struct nvme_ns_head *h; | |
28 | ||
29 | lockdep_assert_held(&subsys->lock); | |
30 | list_for_each_entry(h, &subsys->nsheads, entry) | |
31 | if (h->disk) | |
32 | blk_mq_freeze_queue_wait(h->disk->queue); | |
33 | } | |
34 | ||
35 | void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) | |
36 | { | |
37 | struct nvme_ns_head *h; | |
38 | ||
39 | lockdep_assert_held(&subsys->lock); | |
40 | list_for_each_entry(h, &subsys->nsheads, entry) | |
41 | if (h->disk) | |
42 | blk_freeze_queue_start(h->disk->queue); | |
43 | } | |
44 | ||
a785dbcc KB |
45 | /* |
46 | * If multipathing is enabled we need to always use the subsystem instance | |
47 | * number for numbering our devices to avoid conflicts between subsystems that | |
48 | * have multiple controllers and thus use the multipath-aware subsystem node | |
49 | * and those that have a single controller and use the controller node | |
50 | * directly. | |
51 | */ | |
52 | void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, | |
53 | struct nvme_ctrl *ctrl, int *flags) | |
54 | { | |
55 | if (!multipath) { | |
56 | sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); | |
57 | } else if (ns->head->disk) { | |
58 | sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, | |
8a03b27e | 59 | ctrl->instance, ns->head->instance); |
a785dbcc KB |
60 | *flags = GENHD_FL_HIDDEN; |
61 | } else { | |
62 | sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, | |
63 | ns->head->instance); | |
64 | } | |
65 | } | |
66 | ||
32acab31 CH |
67 | void nvme_failover_req(struct request *req) |
68 | { | |
69 | struct nvme_ns *ns = req->q->queuedata; | |
0d0b660f | 70 | u16 status = nvme_req(req)->status; |
32acab31 CH |
71 | unsigned long flags; |
72 | ||
73 | spin_lock_irqsave(&ns->head->requeue_lock, flags); | |
74 | blk_steal_bios(&ns->head->requeue_list, req); | |
75 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags); | |
76 | blk_mq_end_request(req, 0); | |
77 | ||
0d0b660f CH |
78 | switch (status & 0x7ff) { |
79 | case NVME_SC_ANA_TRANSITION: | |
80 | case NVME_SC_ANA_INACCESSIBLE: | |
81 | case NVME_SC_ANA_PERSISTENT_LOSS: | |
82 | /* | |
83 | * If we got back an ANA error we know the controller is alive, | |
84 | * but not ready to serve this namespaces. The spec suggests | |
85 | * we should update our general state here, but due to the fact | |
86 | * that the admin and I/O queues are not serialized that is | |
87 | * fundamentally racy. So instead just clear the current path, | |
88 | * mark the the path as pending and kick of a re-read of the ANA | |
89 | * log page ASAP. | |
90 | */ | |
91 | nvme_mpath_clear_current_path(ns); | |
92 | if (ns->ctrl->ana_log_buf) { | |
93 | set_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
94 | queue_work(nvme_wq, &ns->ctrl->ana_work); | |
95 | } | |
96 | break; | |
783f4a44 | 97 | case NVME_SC_HOST_PATH_ERROR: |
2dc3947b | 98 | case NVME_SC_HOST_ABORTED_CMD: |
783f4a44 JS |
99 | /* |
100 | * Temporary transport disruption in talking to the controller. | |
101 | * Try to send on a new path. | |
102 | */ | |
103 | nvme_mpath_clear_current_path(ns); | |
104 | break; | |
0d0b660f CH |
105 | default: |
106 | /* | |
107 | * Reset the controller for any non-ANA error as we don't know | |
108 | * what caused the error. | |
109 | */ | |
110 | nvme_reset_ctrl(ns->ctrl); | |
111 | break; | |
112 | } | |
113 | ||
32acab31 CH |
114 | kblockd_schedule_work(&ns->head->requeue_work); |
115 | } | |
116 | ||
32acab31 CH |
117 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) |
118 | { | |
119 | struct nvme_ns *ns; | |
120 | ||
765cc031 | 121 | down_read(&ctrl->namespaces_rwsem); |
32acab31 CH |
122 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
123 | if (ns->head->disk) | |
124 | kblockd_schedule_work(&ns->head->requeue_work); | |
125 | } | |
765cc031 | 126 | up_read(&ctrl->namespaces_rwsem); |
32acab31 CH |
127 | } |
128 | ||
0d0b660f CH |
129 | static const char *nvme_ana_state_names[] = { |
130 | [0] = "invalid state", | |
131 | [NVME_ANA_OPTIMIZED] = "optimized", | |
132 | [NVME_ANA_NONOPTIMIZED] = "non-optimized", | |
133 | [NVME_ANA_INACCESSIBLE] = "inaccessible", | |
134 | [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", | |
135 | [NVME_ANA_CHANGE] = "change", | |
136 | }; | |
137 | ||
0157ec8d | 138 | bool nvme_mpath_clear_current_path(struct nvme_ns *ns) |
32acab31 | 139 | { |
f3334447 | 140 | struct nvme_ns_head *head = ns->head; |
0157ec8d | 141 | bool changed = false; |
f3334447 CH |
142 | int node; |
143 | ||
144 | if (!head) | |
0157ec8d | 145 | goto out; |
f3334447 CH |
146 | |
147 | for_each_node(node) { | |
0157ec8d | 148 | if (ns == rcu_access_pointer(head->current_path[node])) { |
f3334447 | 149 | rcu_assign_pointer(head->current_path[node], NULL); |
0157ec8d SG |
150 | changed = true; |
151 | } | |
f3334447 | 152 | } |
0157ec8d SG |
153 | out: |
154 | return changed; | |
155 | } | |
156 | ||
157 | void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) | |
158 | { | |
159 | struct nvme_ns *ns; | |
160 | ||
161 | mutex_lock(&ctrl->scan_lock); | |
763303a8 | 162 | down_read(&ctrl->namespaces_rwsem); |
0157ec8d SG |
163 | list_for_each_entry(ns, &ctrl->namespaces, list) |
164 | if (nvme_mpath_clear_current_path(ns)) | |
165 | kblockd_schedule_work(&ns->head->requeue_work); | |
763303a8 | 166 | up_read(&ctrl->namespaces_rwsem); |
0157ec8d | 167 | mutex_unlock(&ctrl->scan_lock); |
f3334447 CH |
168 | } |
169 | ||
ca7ae5c9 HR |
170 | static bool nvme_path_is_disabled(struct nvme_ns *ns) |
171 | { | |
172 | return ns->ctrl->state != NVME_CTRL_LIVE || | |
04e70bd4 HR |
173 | test_bit(NVME_NS_ANA_PENDING, &ns->flags) || |
174 | test_bit(NVME_NS_REMOVING, &ns->flags); | |
ca7ae5c9 HR |
175 | } |
176 | ||
f3334447 CH |
177 | static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) |
178 | { | |
179 | int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; | |
180 | struct nvme_ns *found = NULL, *fallback = NULL, *ns; | |
32acab31 CH |
181 | |
182 | list_for_each_entry_rcu(ns, &head->list, siblings) { | |
ca7ae5c9 | 183 | if (nvme_path_is_disabled(ns)) |
0d0b660f | 184 | continue; |
f3334447 | 185 | |
75c10e73 HR |
186 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) |
187 | distance = node_distance(node, ns->ctrl->numa_node); | |
188 | else | |
189 | distance = LOCAL_DISTANCE; | |
f3334447 | 190 | |
0d0b660f CH |
191 | switch (ns->ana_state) { |
192 | case NVME_ANA_OPTIMIZED: | |
f3334447 CH |
193 | if (distance < found_distance) { |
194 | found_distance = distance; | |
195 | found = ns; | |
196 | } | |
197 | break; | |
0d0b660f | 198 | case NVME_ANA_NONOPTIMIZED: |
f3334447 CH |
199 | if (distance < fallback_distance) { |
200 | fallback_distance = distance; | |
201 | fallback = ns; | |
202 | } | |
0d0b660f CH |
203 | break; |
204 | default: | |
205 | break; | |
32acab31 CH |
206 | } |
207 | } | |
208 | ||
f3334447 CH |
209 | if (!found) |
210 | found = fallback; | |
211 | if (found) | |
212 | rcu_assign_pointer(head->current_path[node], found); | |
213 | return found; | |
0d0b660f CH |
214 | } |
215 | ||
75c10e73 HR |
216 | static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, |
217 | struct nvme_ns *ns) | |
218 | { | |
219 | ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, | |
220 | siblings); | |
221 | if (ns) | |
222 | return ns; | |
223 | return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); | |
224 | } | |
225 | ||
226 | static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, | |
227 | int node, struct nvme_ns *old) | |
228 | { | |
229 | struct nvme_ns *ns, *found, *fallback = NULL; | |
230 | ||
2032d074 HR |
231 | if (list_is_singular(&head->list)) { |
232 | if (nvme_path_is_disabled(old)) | |
233 | return NULL; | |
75c10e73 | 234 | return old; |
2032d074 | 235 | } |
75c10e73 HR |
236 | |
237 | for (ns = nvme_next_ns(head, old); | |
238 | ns != old; | |
239 | ns = nvme_next_ns(head, ns)) { | |
ca7ae5c9 | 240 | if (nvme_path_is_disabled(ns)) |
75c10e73 HR |
241 | continue; |
242 | ||
243 | if (ns->ana_state == NVME_ANA_OPTIMIZED) { | |
244 | found = ns; | |
245 | goto out; | |
246 | } | |
247 | if (ns->ana_state == NVME_ANA_NONOPTIMIZED) | |
248 | fallback = ns; | |
249 | } | |
250 | ||
251 | if (!fallback) | |
252 | return NULL; | |
253 | found = fallback; | |
254 | out: | |
255 | rcu_assign_pointer(head->current_path[node], found); | |
256 | return found; | |
257 | } | |
258 | ||
0d0b660f CH |
259 | static inline bool nvme_path_is_optimized(struct nvme_ns *ns) |
260 | { | |
261 | return ns->ctrl->state == NVME_CTRL_LIVE && | |
262 | ns->ana_state == NVME_ANA_OPTIMIZED; | |
32acab31 CH |
263 | } |
264 | ||
265 | inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) | |
266 | { | |
f3334447 CH |
267 | int node = numa_node_id(); |
268 | struct nvme_ns *ns; | |
32acab31 | 269 | |
f3334447 | 270 | ns = srcu_dereference(head->current_path[node], &head->srcu); |
75c10e73 HR |
271 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns) |
272 | ns = nvme_round_robin_path(head, node, ns); | |
0d0b660f | 273 | if (unlikely(!ns || !nvme_path_is_optimized(ns))) |
f3334447 | 274 | ns = __nvme_find_path(head, node); |
32acab31 CH |
275 | return ns; |
276 | } | |
277 | ||
0157ec8d SG |
278 | static bool nvme_available_path(struct nvme_ns_head *head) |
279 | { | |
280 | struct nvme_ns *ns; | |
281 | ||
282 | list_for_each_entry_rcu(ns, &head->list, siblings) { | |
283 | switch (ns->ctrl->state) { | |
284 | case NVME_CTRL_LIVE: | |
285 | case NVME_CTRL_RESETTING: | |
286 | case NVME_CTRL_CONNECTING: | |
287 | /* fallthru */ | |
288 | return true; | |
289 | default: | |
290 | break; | |
291 | } | |
292 | } | |
293 | return false; | |
294 | } | |
295 | ||
32acab31 CH |
296 | static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, |
297 | struct bio *bio) | |
298 | { | |
299 | struct nvme_ns_head *head = q->queuedata; | |
300 | struct device *dev = disk_to_dev(head->disk); | |
301 | struct nvme_ns *ns; | |
302 | blk_qc_t ret = BLK_QC_T_NONE; | |
303 | int srcu_idx; | |
304 | ||
525aa5a7 HR |
305 | /* |
306 | * The namespace might be going away and the bio might | |
307 | * be moved to a different queue via blk_steal_bios(), | |
308 | * so we need to use the bio_split pool from the original | |
309 | * queue to allocate the bvecs from. | |
310 | */ | |
311 | blk_queue_split(q, &bio); | |
312 | ||
32acab31 CH |
313 | srcu_idx = srcu_read_lock(&head->srcu); |
314 | ns = nvme_find_path(head); | |
315 | if (likely(ns)) { | |
316 | bio->bi_disk = ns->disk; | |
317 | bio->bi_opf |= REQ_NVME_MPATH; | |
2796b569 HR |
318 | trace_block_bio_remap(bio->bi_disk->queue, bio, |
319 | disk_devt(ns->head->disk), | |
320 | bio->bi_iter.bi_sector); | |
32acab31 | 321 | ret = direct_make_request(bio); |
0157ec8d SG |
322 | } else if (nvme_available_path(head)) { |
323 | dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); | |
32acab31 CH |
324 | |
325 | spin_lock_irq(&head->requeue_lock); | |
326 | bio_list_add(&head->requeue_list, bio); | |
327 | spin_unlock_irq(&head->requeue_lock); | |
328 | } else { | |
0157ec8d | 329 | dev_warn_ratelimited(dev, "no available path - failing I/O\n"); |
32acab31 CH |
330 | |
331 | bio->bi_status = BLK_STS_IOERR; | |
332 | bio_endio(bio); | |
333 | } | |
334 | ||
335 | srcu_read_unlock(&head->srcu, srcu_idx); | |
336 | return ret; | |
337 | } | |
338 | ||
32acab31 CH |
339 | static void nvme_requeue_work(struct work_struct *work) |
340 | { | |
341 | struct nvme_ns_head *head = | |
342 | container_of(work, struct nvme_ns_head, requeue_work); | |
343 | struct bio *bio, *next; | |
344 | ||
345 | spin_lock_irq(&head->requeue_lock); | |
346 | next = bio_list_get(&head->requeue_list); | |
347 | spin_unlock_irq(&head->requeue_lock); | |
348 | ||
349 | while ((bio = next) != NULL) { | |
350 | next = bio->bi_next; | |
351 | bio->bi_next = NULL; | |
352 | ||
353 | /* | |
354 | * Reset disk to the mpath node and resubmit to select a new | |
355 | * path. | |
356 | */ | |
357 | bio->bi_disk = head->disk; | |
358 | generic_make_request(bio); | |
359 | } | |
360 | } | |
361 | ||
362 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) | |
363 | { | |
364 | struct request_queue *q; | |
365 | bool vwc = false; | |
366 | ||
0d0b660f | 367 | mutex_init(&head->lock); |
32acab31 CH |
368 | bio_list_init(&head->requeue_list); |
369 | spin_lock_init(&head->requeue_lock); | |
370 | INIT_WORK(&head->requeue_work, nvme_requeue_work); | |
371 | ||
372 | /* | |
373 | * Add a multipath node if the subsystems supports multiple controllers. | |
374 | * We also do this for private namespaces as the namespace sharing data could | |
375 | * change after a rescan. | |
376 | */ | |
377 | if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) | |
378 | return 0; | |
379 | ||
103e515e | 380 | q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); |
32acab31 CH |
381 | if (!q) |
382 | goto out; | |
383 | q->queuedata = head; | |
384 | blk_queue_make_request(q, nvme_ns_head_make_request); | |
8b904b5b | 385 | blk_queue_flag_set(QUEUE_FLAG_NONROT, q); |
32acab31 CH |
386 | /* set to a default value for 512 until disk is validated */ |
387 | blk_queue_logical_block_size(q, 512); | |
8f676b85 | 388 | blk_set_stacking_limits(&q->limits); |
32acab31 CH |
389 | |
390 | /* we need to propagate up the VMC settings */ | |
391 | if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) | |
392 | vwc = true; | |
393 | blk_queue_write_cache(q, vwc, vwc); | |
394 | ||
395 | head->disk = alloc_disk(0); | |
396 | if (!head->disk) | |
397 | goto out_cleanup_queue; | |
398 | head->disk->fops = &nvme_ns_head_ops; | |
399 | head->disk->private_data = head; | |
400 | head->disk->queue = q; | |
401 | head->disk->flags = GENHD_FL_EXT_DEVT; | |
402 | sprintf(head->disk->disk_name, "nvme%dn%d", | |
403 | ctrl->subsys->instance, head->instance); | |
404 | return 0; | |
405 | ||
406 | out_cleanup_queue: | |
407 | blk_cleanup_queue(q); | |
408 | out: | |
409 | return -ENOMEM; | |
410 | } | |
411 | ||
0d0b660f | 412 | static void nvme_mpath_set_live(struct nvme_ns *ns) |
32acab31 | 413 | { |
0d0b660f CH |
414 | struct nvme_ns_head *head = ns->head; |
415 | ||
416 | lockdep_assert_held(&ns->head->lock); | |
417 | ||
32acab31 CH |
418 | if (!head->disk) |
419 | return; | |
9bd82b1a | 420 | |
33b14f67 HR |
421 | if (!(head->disk->flags & GENHD_FL_UP)) |
422 | device_add_disk(&head->subsys->dev, head->disk, | |
423 | nvme_ns_id_attr_groups); | |
0d0b660f | 424 | |
886fabf6 KB |
425 | if (nvme_path_is_optimized(ns)) { |
426 | int node, srcu_idx; | |
427 | ||
428 | srcu_idx = srcu_read_lock(&head->srcu); | |
429 | for_each_node(node) | |
430 | __nvme_find_path(head, node); | |
431 | srcu_read_unlock(&head->srcu, srcu_idx); | |
432 | } | |
433 | ||
504db087 | 434 | synchronize_srcu(&ns->head->srcu); |
0d0b660f CH |
435 | kblockd_schedule_work(&ns->head->requeue_work); |
436 | } | |
437 | ||
438 | static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, | |
439 | int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, | |
440 | void *)) | |
441 | { | |
442 | void *base = ctrl->ana_log_buf; | |
443 | size_t offset = sizeof(struct nvme_ana_rsp_hdr); | |
444 | int error, i; | |
445 | ||
446 | lockdep_assert_held(&ctrl->ana_lock); | |
447 | ||
448 | for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { | |
449 | struct nvme_ana_group_desc *desc = base + offset; | |
64fab729 PS |
450 | u32 nr_nsids; |
451 | size_t nsid_buf_size; | |
452 | ||
453 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) | |
454 | return -EINVAL; | |
455 | ||
456 | nr_nsids = le32_to_cpu(desc->nnsids); | |
457 | nsid_buf_size = nr_nsids * sizeof(__le32); | |
0d0b660f CH |
458 | |
459 | if (WARN_ON_ONCE(desc->grpid == 0)) | |
460 | return -EINVAL; | |
461 | if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) | |
462 | return -EINVAL; | |
463 | if (WARN_ON_ONCE(desc->state == 0)) | |
464 | return -EINVAL; | |
465 | if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) | |
466 | return -EINVAL; | |
467 | ||
468 | offset += sizeof(*desc); | |
469 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) | |
470 | return -EINVAL; | |
471 | ||
472 | error = cb(ctrl, desc, data); | |
473 | if (error) | |
474 | return error; | |
475 | ||
476 | offset += nsid_buf_size; | |
0d0b660f CH |
477 | } |
478 | ||
479 | return 0; | |
480 | } | |
481 | ||
482 | static inline bool nvme_state_is_live(enum nvme_ana_state state) | |
483 | { | |
484 | return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; | |
485 | } | |
486 | ||
487 | static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, | |
488 | struct nvme_ns *ns) | |
489 | { | |
0d0b660f | 490 | mutex_lock(&ns->head->lock); |
0d0b660f CH |
491 | ns->ana_grpid = le32_to_cpu(desc->grpid); |
492 | ns->ana_state = desc->state; | |
493 | clear_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
494 | ||
cc2278c4 | 495 | if (nvme_state_is_live(ns->ana_state)) |
0d0b660f CH |
496 | nvme_mpath_set_live(ns); |
497 | mutex_unlock(&ns->head->lock); | |
498 | } | |
499 | ||
500 | static int nvme_update_ana_state(struct nvme_ctrl *ctrl, | |
501 | struct nvme_ana_group_desc *desc, void *data) | |
502 | { | |
503 | u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; | |
504 | unsigned *nr_change_groups = data; | |
505 | struct nvme_ns *ns; | |
506 | ||
592b6e7b | 507 | dev_dbg(ctrl->device, "ANA group %d: %s.\n", |
0d0b660f CH |
508 | le32_to_cpu(desc->grpid), |
509 | nvme_ana_state_names[desc->state]); | |
510 | ||
511 | if (desc->state == NVME_ANA_CHANGE) | |
512 | (*nr_change_groups)++; | |
513 | ||
514 | if (!nr_nsids) | |
515 | return 0; | |
516 | ||
517 | down_write(&ctrl->namespaces_rwsem); | |
518 | list_for_each_entry(ns, &ctrl->namespaces, list) { | |
e01f91df AE |
519 | unsigned nsid = le32_to_cpu(desc->nsids[n]); |
520 | ||
521 | if (ns->head->ns_id < nsid) | |
0d0b660f | 522 | continue; |
e01f91df AE |
523 | if (ns->head->ns_id == nsid) |
524 | nvme_update_ns_ana_state(desc, ns); | |
0d0b660f CH |
525 | if (++n == nr_nsids) |
526 | break; | |
527 | } | |
528 | up_write(&ctrl->namespaces_rwsem); | |
0d0b660f CH |
529 | return 0; |
530 | } | |
531 | ||
86cccfbf | 532 | static int nvme_read_ana_log(struct nvme_ctrl *ctrl) |
0d0b660f CH |
533 | { |
534 | u32 nr_change_groups = 0; | |
535 | int error; | |
536 | ||
537 | mutex_lock(&ctrl->ana_lock); | |
86cccfbf | 538 | error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, |
0d0b660f CH |
539 | ctrl->ana_log_buf, ctrl->ana_log_size, 0); |
540 | if (error) { | |
541 | dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); | |
542 | goto out_unlock; | |
543 | } | |
544 | ||
545 | error = nvme_parse_ana_log(ctrl, &nr_change_groups, | |
546 | nvme_update_ana_state); | |
547 | if (error) | |
548 | goto out_unlock; | |
549 | ||
550 | /* | |
551 | * In theory we should have an ANATT timer per group as they might enter | |
552 | * the change state at different times. But that is a lot of overhead | |
553 | * just to protect against a target that keeps entering new changes | |
554 | * states while never finishing previous ones. But we'll still | |
555 | * eventually time out once all groups are in change state, so this | |
556 | * isn't a big deal. | |
557 | * | |
558 | * We also double the ANATT value to provide some slack for transports | |
559 | * or AEN processing overhead. | |
560 | */ | |
561 | if (nr_change_groups) | |
562 | mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); | |
563 | else | |
564 | del_timer_sync(&ctrl->anatt_timer); | |
565 | out_unlock: | |
566 | mutex_unlock(&ctrl->ana_lock); | |
567 | return error; | |
568 | } | |
569 | ||
570 | static void nvme_ana_work(struct work_struct *work) | |
571 | { | |
572 | struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); | |
573 | ||
86cccfbf | 574 | nvme_read_ana_log(ctrl); |
0d0b660f CH |
575 | } |
576 | ||
577 | static void nvme_anatt_timeout(struct timer_list *t) | |
578 | { | |
579 | struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); | |
580 | ||
581 | dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); | |
582 | nvme_reset_ctrl(ctrl); | |
583 | } | |
584 | ||
585 | void nvme_mpath_stop(struct nvme_ctrl *ctrl) | |
586 | { | |
587 | if (!nvme_ctrl_use_ana(ctrl)) | |
588 | return; | |
589 | del_timer_sync(&ctrl->anatt_timer); | |
590 | cancel_work_sync(&ctrl->ana_work); | |
591 | } | |
592 | ||
75c10e73 HR |
/*
 * Define the device attribute subsys_attr_<_name> with the given mode and
 * show/store handlers.
 */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)			\
	struct device_attribute subsys_attr_##_name =			\
		__ATTR(_name, _mode, _show, _store)
596 | ||
597 | static const char *nvme_iopolicy_names[] = { | |
598 | [NVME_IOPOLICY_NUMA] = "numa", | |
599 | [NVME_IOPOLICY_RR] = "round-robin", | |
600 | }; | |
601 | ||
602 | static ssize_t nvme_subsys_iopolicy_show(struct device *dev, | |
603 | struct device_attribute *attr, char *buf) | |
604 | { | |
605 | struct nvme_subsystem *subsys = | |
606 | container_of(dev, struct nvme_subsystem, dev); | |
607 | ||
608 | return sprintf(buf, "%s\n", | |
609 | nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); | |
610 | } | |
611 | ||
612 | static ssize_t nvme_subsys_iopolicy_store(struct device *dev, | |
613 | struct device_attribute *attr, const char *buf, size_t count) | |
614 | { | |
615 | struct nvme_subsystem *subsys = | |
616 | container_of(dev, struct nvme_subsystem, dev); | |
617 | int i; | |
618 | ||
619 | for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { | |
620 | if (sysfs_streq(buf, nvme_iopolicy_names[i])) { | |
621 | WRITE_ONCE(subsys->iopolicy, i); | |
622 | return count; | |
623 | } | |
624 | } | |
625 | ||
626 | return -EINVAL; | |
627 | } | |
628 | SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, | |
629 | nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); | |
630 | ||
0d0b660f CH |
631 | static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, |
632 | char *buf) | |
633 | { | |
634 | return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); | |
635 | } | |
636 | DEVICE_ATTR_RO(ana_grpid); | |
637 | ||
638 | static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, | |
639 | char *buf) | |
640 | { | |
641 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | |
642 | ||
643 | return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); | |
644 | } | |
645 | DEVICE_ATTR_RO(ana_state); | |
646 | ||
647 | static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl, | |
648 | struct nvme_ana_group_desc *desc, void *data) | |
649 | { | |
650 | struct nvme_ns *ns = data; | |
651 | ||
652 | if (ns->ana_grpid == le32_to_cpu(desc->grpid)) { | |
653 | nvme_update_ns_ana_state(desc, ns); | |
654 | return -ENXIO; /* just break out of the loop */ | |
655 | } | |
656 | ||
657 | return 0; | |
658 | } | |
659 | ||
660 | void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) | |
661 | { | |
662 | if (nvme_ctrl_use_ana(ns->ctrl)) { | |
663 | mutex_lock(&ns->ctrl->ana_lock); | |
664 | ns->ana_grpid = le32_to_cpu(id->anagrpid); | |
665 | nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state); | |
666 | mutex_unlock(&ns->ctrl->ana_lock); | |
667 | } else { | |
668 | mutex_lock(&ns->head->lock); | |
669 | ns->ana_state = NVME_ANA_OPTIMIZED; | |
670 | nvme_mpath_set_live(ns); | |
671 | mutex_unlock(&ns->head->lock); | |
9bd82b1a | 672 | } |
32acab31 CH |
673 | } |
674 | ||
675 | void nvme_mpath_remove_disk(struct nvme_ns_head *head) | |
676 | { | |
677 | if (!head->disk) | |
678 | return; | |
33b14f67 | 679 | if (head->disk->flags & GENHD_FL_UP) |
0d0b660f | 680 | del_gendisk(head->disk); |
32acab31 CH |
681 | blk_set_queue_dying(head->disk->queue); |
682 | /* make sure all pending bios are cleaned up */ | |
683 | kblockd_schedule_work(&head->requeue_work); | |
684 | flush_work(&head->requeue_work); | |
685 | blk_cleanup_queue(head->disk->queue); | |
686 | put_disk(head->disk); | |
687 | } | |
0d0b660f CH |
688 | |
689 | int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | |
690 | { | |
691 | int error; | |
692 | ||
66b20ac0 MR |
693 | /* check if multipath is enabled and we have the capability */ |
694 | if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3))) | |
0d0b660f CH |
695 | return 0; |
696 | ||
697 | ctrl->anacap = id->anacap; | |
698 | ctrl->anatt = id->anatt; | |
699 | ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); | |
700 | ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); | |
701 | ||
702 | mutex_init(&ctrl->ana_lock); | |
703 | timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); | |
704 | ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + | |
705 | ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); | |
78a61cd4 | 706 | ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); |
0d0b660f CH |
707 | |
708 | if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { | |
709 | dev_err(ctrl->device, | |
710 | "ANA log page size (%zd) larger than MDTS (%d).\n", | |
711 | ctrl->ana_log_size, | |
712 | ctrl->max_hw_sectors << SECTOR_SHIFT); | |
713 | dev_err(ctrl->device, "disabling ANA support.\n"); | |
714 | return 0; | |
715 | } | |
716 | ||
717 | INIT_WORK(&ctrl->ana_work, nvme_ana_work); | |
718 | ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); | |
bb830add SD |
719 | if (!ctrl->ana_log_buf) { |
720 | error = -ENOMEM; | |
0d0b660f | 721 | goto out; |
bb830add | 722 | } |
0d0b660f | 723 | |
86cccfbf | 724 | error = nvme_read_ana_log(ctrl); |
0d0b660f CH |
725 | if (error) |
726 | goto out_free_ana_log_buf; | |
727 | return 0; | |
728 | out_free_ana_log_buf: | |
729 | kfree(ctrl->ana_log_buf); | |
c7055fd1 | 730 | ctrl->ana_log_buf = NULL; |
0d0b660f | 731 | out: |
bb830add | 732 | return error; |
0d0b660f CH |
733 | } |
734 | ||
735 | void nvme_mpath_uninit(struct nvme_ctrl *ctrl) | |
736 | { | |
737 | kfree(ctrl->ana_log_buf); | |
c7055fd1 | 738 | ctrl->ana_log_buf = NULL; |
0d0b660f CH |
739 | } |
740 |