// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
        ZONE_COND_NAME(NOT_WP),
        ZONE_COND_NAME(EMPTY),
        ZONE_COND_NAME(IMP_OPEN),
        ZONE_COND_NAME(EXP_OPEN),
        ZONE_COND_NAME(CLOSED),
        ZONE_COND_NAME(READONLY),
        ZONE_COND_NAME(FULL),
        ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @link: To list the plug in the zone write plug error list of the disk.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is
 *       reset or finished and when the zone becomes full (last write BIO to
 *       the zone completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the
 *             zone as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
        struct hlist_node       node;
        struct list_head        link;
        atomic_t                ref;
        spinlock_t              lock;
        unsigned int            flags;
        unsigned int            zone_no;
        unsigned int            wp_offset;
        struct bio_list         bio_list;
        struct work_struct      bio_work;
        struct rcu_head         rcu_head;
        struct gendisk          *disk;
};

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will
 *    be recovered with a report zone to update the zone write pointer offset.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or becomes
 *    full, to prevent new references to the zone write plug from being taken
 *    for newly incoming BIOs. A zone write plug flagged with this flag will
 *    be freed once all remaining references from BIOs or functions are
 *    dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED          (1U << 0)
#define BLK_ZONE_WPLUG_ERROR            (1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED         (1U << 2)

#define BLK_ZONE_WPLUG_BUSY     (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)

/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * an invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
        static const char *zone_cond_str = "UNKNOWN";

        if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
                zone_cond_str = zone_cond_name[zone_cond];

        return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
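
/*
 * Example (illustrative): blk_zone_cond_str(BLK_ZONE_COND_EXP_OPEN) returns
 * "EXP_OPEN", while any condition value without an entry in zone_cond_name[]
 * returns "UNKNOWN".
 */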

/*
 * Return true if a request is a write request that needs zone write locking.
 */
bool blk_req_needs_zone_write_lock(struct request *rq)
{
        if (!rq->q->disk->seq_zones_wlock)
                return false;

        return blk_rq_is_seq_zoned_write(rq);
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);

bool blk_req_zone_write_trylock(struct request *rq)
{
        unsigned int zno = blk_rq_zone_no(rq);

        if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
                return false;

        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;

        return true;
}
EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);

void __blk_req_zone_write_lock(struct request *rq)
{
        if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
                                          rq->q->disk->seq_zones_wlock)))
                return;

        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);

void __blk_req_zone_write_unlock(struct request *rq)
{
        rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
        if (rq->q->disk->seq_zones_wlock)
                WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
                                                 rq->q->disk->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);

/**
 * bdev_nr_zones - Get number of zones
 * @bdev: Target device
 *
 * Return the total number of zones of a zoned block device. For a block
 * device without zone capabilities, the number of zones is always 0.
 */
unsigned int bdev_nr_zones(struct block_device *bdev)
{
        sector_t zone_sectors = bdev_zone_sectors(bdev);

        if (!bdev_is_zoned(bdev))
                return 0;
        return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
                ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(bdev_nr_zones);
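
/*
 * Example (illustrative): for a hypothetical 100 GiB device (209715200
 * 512B sectors) with 256 MiB zones (524288 sectors, ilog2 = 19),
 * bdev_nr_zones() returns (209715200 + 524287) >> 19 = 400 zones. The shift
 * is a round-up division that accounts for a possibly smaller last zone; it
 * works because zone sizes are always a power-of-two number of sectors.
 */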

/**
 * blkdev_report_zones - Get zone information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at
 *    most @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data)
{
        struct gendisk *disk = bdev->bd_disk;
        sector_t capacity = get_capacity(disk);

        if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
                return -EOPNOTSUPP;

        if (!nr_zones || sector >= capacity)
                return 0;

        return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
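
/*
 * Example (illustrative sketch, not part of this file): counting implicitly
 * open zones with a report_zones_cb callback. The callback name, counter and
 * calling context below are hypothetical.
 *
 *      static int count_imp_open_cb(struct blk_zone *zone, unsigned int idx,
 *                                   void *data)
 *      {
 *              unsigned int *nr_imp_open = data;
 *
 *              if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
 *                      (*nr_imp_open)++;
 *              return 0;
 *      }
 *
 *      unsigned int nr_imp_open = 0;
 *      int ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *                                    count_imp_open_cb, &nr_imp_open);
 */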

static inline unsigned long *blk_alloc_zone_bitmap(int node,
                                                   unsigned int nr_zones)
{
        return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
                            GFP_NOIO, node);
}

static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
                                  void *data)
{
        /*
         * For an all-zones reset, ignore conventional, empty, read-only
         * and offline zones.
         */
        switch (zone->cond) {
        case BLK_ZONE_COND_NOT_WP:
        case BLK_ZONE_COND_EMPTY:
        case BLK_ZONE_COND_READONLY:
        case BLK_ZONE_COND_OFFLINE:
                return 0;
        default:
                set_bit(idx, (unsigned long *)data);
                return 0;
        }
}

static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
{
        struct gendisk *disk = bdev->bd_disk;
        sector_t capacity = bdev_nr_sectors(bdev);
        sector_t zone_sectors = bdev_zone_sectors(bdev);
        unsigned long *need_reset;
        struct bio *bio = NULL;
        sector_t sector = 0;
        int ret;

        need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
        if (!need_reset)
                return -ENOMEM;

        ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
                                       blk_zone_need_reset_cb, need_reset);
        if (ret < 0)
                goto out_free_need_reset;

        ret = 0;
        while (sector < capacity) {
                if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
                        sector += zone_sectors;
                        continue;
                }

                bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
                                   GFP_KERNEL);
                bio->bi_iter.bi_sector = sector;
                sector += zone_sectors;

                /* This may take a while, so be nice to others */
                cond_resched();
        }

        if (bio) {
                ret = submit_bio_wait(bio);
                bio_put(bio);
        }

out_free_need_reset:
        kfree(need_reset);
        return ret;
}

static int blkdev_zone_reset_all(struct block_device *bdev)
{
        struct bio bio;

        bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
        return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors; must be at least the length of one zone
 *              and must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
                     sector_t sector, sector_t nr_sectors)
{
        struct request_queue *q = bdev_get_queue(bdev);
        sector_t zone_sectors = bdev_zone_sectors(bdev);
        sector_t capacity = bdev_nr_sectors(bdev);
        sector_t end_sector = sector + nr_sectors;
        struct bio *bio = NULL;
        int ret = 0;

        if (!bdev_is_zoned(bdev))
                return -EOPNOTSUPP;

        if (bdev_read_only(bdev))
                return -EPERM;

        if (!op_is_zone_mgmt(op))
                return -EOPNOTSUPP;

        if (end_sector <= sector || end_sector > capacity)
                /* Out of range */
                return -EINVAL;

        /* Check alignment (handle eventual smaller last zone) */
        if (!bdev_is_zone_start(bdev, sector))
                return -EINVAL;

        if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
                return -EINVAL;

        /*
         * In the case of a zone reset operation over all zones,
         * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
         * command. For other devices, we emulate this command behavior by
         * identifying the zones needing a reset.
         */
        if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
                if (!blk_queue_zone_resetall(q))
                        return blkdev_zone_reset_all_emulated(bdev);
                return blkdev_zone_reset_all(bdev);
        }

        while (sector < end_sector) {
                bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
                bio->bi_iter.bi_sector = sector;
                sector += zone_sectors;

                /* This may take a while, so be nice to others */
                cond_resched();
        }

        ret = submit_bio_wait(bio);
        bio_put(bio);

        return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
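
/*
 * Example (illustrative sketch): finishing the zone containing a given
 * sector, e.g. once a filesystem stops writing to that zone. @bdev and
 * @sector are hypothetical; since zone sizes are a power of two, masking
 * with (zone_sectors - 1) yields the zone start sector.
 *
 *      sector_t zone_sectors = bdev_zone_sectors(bdev);
 *      sector_t zone_start = sector & ~(zone_sectors - 1);
 *      int ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_FINISH, zone_start,
 *                                 zone_sectors);
 */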

struct zone_report_args {
        struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
                                    void *data)
{
        struct zone_report_args *args = data;

        if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
                return -EFAULT;
        return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
                              unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct zone_report_args args;
        struct blk_zone_report rep;
        int ret;

        if (!argp)
                return -EINVAL;

        if (!bdev_is_zoned(bdev))
                return -ENOTTY;

        if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
                return -EFAULT;

        if (!rep.nr_zones)
                return -EINVAL;

        args.zones = argp + sizeof(struct blk_zone_report);
        ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
                                  blkdev_copy_zone_to_user, &args);
        if (ret < 0)
                return ret;

        rep.nr_zones = ret;
        rep.flags = BLK_ZONE_REP_CAPACITY;
        if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
                return -EFAULT;
        return 0;
}

static int blkdev_truncate_zone_range(struct block_device *bdev,
                blk_mode_t mode, const struct blk_zone_range *zrange)
{
        loff_t start, end;

        if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
            zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
                /* Out of range */
                return -EINVAL;

        start = zrange->sector << SECTOR_SHIFT;
        end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

        return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
                           unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct blk_zone_range zrange;
        enum req_op op;
        int ret;

        if (!argp)
                return -EINVAL;

        if (!bdev_is_zoned(bdev))
                return -ENOTTY;

        if (!(mode & BLK_OPEN_WRITE))
                return -EBADF;

        if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
                return -EFAULT;

        switch (cmd) {
        case BLKRESETZONE:
                op = REQ_OP_ZONE_RESET;

                /* Invalidate the page cache, including dirty pages. */
                filemap_invalidate_lock(bdev->bd_inode->i_mapping);
                ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
                if (ret)
                        goto fail;
                break;
        case BLKOPENZONE:
                op = REQ_OP_ZONE_OPEN;
                break;
        case BLKCLOSEZONE:
                op = REQ_OP_ZONE_CLOSE;
                break;
        case BLKFINISHZONE:
                op = REQ_OP_ZONE_FINISH;
                break;
        default:
                return -ENOTTY;
        }

        ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
        if (cmd == BLKRESETZONE)
                filemap_invalidate_unlock(bdev->bd_inode->i_mapping);

        return ret;
}

static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
{
        if (!disk->conv_zones_bitmap)
                return false;
        return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
                                   struct blk_zone_wplug *zwplug)
{
        struct blk_zone_wplug *zwplg;
        unsigned long flags;
        unsigned int idx =
                hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

        /*
         * Add the new zone write plug to the hash table, but carefully as we
         * are racing with other submission contexts, so we may already have a
         * zone write plug for the same zone.
         */
        spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
        hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
                if (zwplg->zone_no == zwplug->zone_no) {
                        spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
                        return false;
                }
        }
        hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
        spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

        return true;
}

static void disk_remove_zone_wplug(struct gendisk *disk,
                                   struct blk_zone_wplug *zwplug)
{
        unsigned long flags;

        spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
        zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
        atomic_dec(&zwplug->ref);
        hlist_del_init_rcu(&zwplug->node);
        spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
                                                 struct blk_zone_wplug *zwplug)
{
        /* If the zone is still busy, the plug cannot be removed. */
        if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
                return false;

        /* We can remove zone write plugs for zones that are empty or full. */
        return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity;
}

static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
                                                  sector_t sector)
{
        unsigned int zno = disk_zone_no(disk, sector);
        unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
        struct blk_zone_wplug *zwplug;

        rcu_read_lock();

        hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
                if (zwplug->zone_no == zno &&
                    atomic_inc_not_zero(&zwplug->ref)) {
                        rcu_read_unlock();
                        return zwplug;
                }
        }

        rcu_read_unlock();

        return NULL;
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
        struct blk_zone_wplug *zwplug =
                container_of(rcu_head, struct blk_zone_wplug, rcu_head);

        mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
        if (atomic_dec_and_test(&zwplug->ref)) {
                WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
                WARN_ON_ONCE(!list_empty(&zwplug->link));

                call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
        }
}

static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
                                        sector_t sector, gfp_t gfp_mask,
                                        unsigned long *flags)
{
        unsigned int zno = disk_zone_no(disk, sector);
        struct blk_zone_wplug *zwplug;

again:
        zwplug = disk_get_zone_wplug(disk, sector);
        if (zwplug) {
                /*
                 * Check that a BIO completion or a zone reset or finish
                 * operation has not already removed the zone write plug from
                 * the hash table and dropped its reference count. In that
                 * case, we need to get a new plug, so start over from the
                 * beginning.
                 */
                spin_lock_irqsave(&zwplug->lock, *flags);
                if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
                        spin_unlock_irqrestore(&zwplug->lock, *flags);
                        disk_put_zone_wplug(zwplug);
                        goto again;
                }
                return zwplug;
        }

        /*
         * Allocate and initialize a zone write plug with an extra reference
         * so that it is not freed when the zone write plug becomes idle
         * without the zone being full.
         */
        zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
        if (!zwplug)
                return NULL;

        INIT_HLIST_NODE(&zwplug->node);
        INIT_LIST_HEAD(&zwplug->link);
        atomic_set(&zwplug->ref, 2);
        spin_lock_init(&zwplug->lock);
        zwplug->flags = 0;
        zwplug->zone_no = zno;
        zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
        bio_list_init(&zwplug->bio_list);
        INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
        zwplug->disk = disk;

        spin_lock_irqsave(&zwplug->lock, *flags);

        /*
         * Insert the new zone write plug in the hash table. This can fail only
         * if another context already inserted a plug. Retry from the beginning
         * in that case.
         */
        if (!disk_insert_zone_wplug(disk, zwplug)) {
                spin_unlock_irqrestore(&zwplug->lock, *flags);
                mempool_free(zwplug, disk->zone_wplugs_pool);
                goto again;
        }

        return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct bio *bio)
{
        struct request_queue *q = bio->bi_bdev->bd_disk->queue;

        bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
        bio_io_error(bio);
        blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
        struct bio *bio;

        while ((bio = bio_list_pop(&zwplug->bio_list))) {
                blk_zone_wplug_bio_io_error(bio);
                disk_put_zone_wplug(zwplug);
        }
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
 * with the assumed write pointer location of the zone when the BIO will
 * be unplugged.
 */
static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
                                            struct blk_zone_wplug *zwplug)
{
        unsigned int zone_capacity = disk->zone_capacity;
        unsigned int wp_offset = zwplug->wp_offset;
        struct bio_list bl = BIO_EMPTY_LIST;
        struct bio *bio;

        while ((bio = bio_list_pop(&zwplug->bio_list))) {
                if (wp_offset >= zone_capacity ||
                    (bio_op(bio) != REQ_OP_ZONE_APPEND &&
                     bio_offset_from_zone_start(bio) != wp_offset)) {
                        blk_zone_wplug_bio_io_error(bio);
                        disk_put_zone_wplug(zwplug);
                        continue;
                }

                wp_offset += bio_sectors(bio);
                bio_list_add(&bl, bio);
        }

        bio_list_merge(&zwplug->bio_list, &bl);
}

/*
 * Set a zone write plug write pointer offset to either 0 (zone reset case)
 * or to the zone size (zone finish case). This aborts all plugged BIOs, which
 * is fine as doing a zone reset or zone finish while writes are in-flight is
 * a user error that will most likely cause all plugged BIOs to fail anyway.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
                                          struct blk_zone_wplug *zwplug,
                                          unsigned int wp_offset)
{
        unsigned long flags;

        spin_lock_irqsave(&zwplug->lock, flags);

        /*
         * Make sure that a BIO completion or another zone reset or finish
         * operation has not already removed the plug from the hash table.
         */
        if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
                spin_unlock_irqrestore(&zwplug->lock, flags);
                return;
        }

        /* Update the zone write pointer and abort all plugged BIOs. */
        zwplug->wp_offset = wp_offset;
        disk_zone_wplug_abort(zwplug);

        /*
         * Updating the write pointer offset puts back the zone
         * in a good state. So clear the error flag and decrement the
         * error count if we were in error state.
         */
        if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
                zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
                spin_lock(&disk->zone_wplugs_lock);
                list_del_init(&zwplug->link);
                spin_unlock(&disk->zone_wplugs_lock);
        }

        /*
         * The zone write plug now has no BIO plugged: remove it from the
         * hash table so that it cannot be seen. The plug will be freed
         * when the last reference is dropped.
         */
        if (disk_should_remove_zone_wplug(disk, zwplug))
                disk_remove_zone_wplug(disk, zwplug);

        spin_unlock_irqrestore(&zwplug->lock, flags);
}

static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
                                                  unsigned int wp_offset)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        sector_t sector = bio->bi_iter.bi_sector;
        struct blk_zone_wplug *zwplug;

        /* Conventional zones cannot be reset nor finished. */
        if (disk_zone_is_conv(disk, sector)) {
                bio_io_error(bio);
                return true;
        }

        /*
         * If we have a zone write plug, set its write pointer offset to 0
         * (reset case) or to the zone size (finish case). This will abort all
         * BIOs plugged for the target zone. It is fine as resetting or
         * finishing zones while writes are still in-flight will result in the
         * writes failing anyway.
         */
        zwplug = disk_get_zone_wplug(disk, sector);
        if (zwplug) {
                disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
                disk_put_zone_wplug(zwplug);
        }

        return false;
}

static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        struct blk_zone_wplug *zwplug;
        sector_t sector;

        /*
         * Set the write pointer offset of all zone write plugs to 0. This will
         * abort all plugged BIOs. It is fine as resetting zones while writes
         * are still in-flight will result in the writes failing anyway.
         */
        for (sector = 0; sector < get_capacity(disk);
             sector += disk->queue->limits.chunk_sectors) {
                zwplug = disk_get_zone_wplug(disk, sector);
                if (zwplug) {
                        disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
                        disk_put_zone_wplug(zwplug);
                }
        }

        return false;
}

static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
                                          struct bio *bio, unsigned int nr_segs)
{
        /*
         * Grab an extra reference on the BIO request queue usage counter.
         * This reference will be reused to submit a request for the BIO for
         * blk-mq devices and dropped when the BIO is failed and after
         * it is issued in the case of BIO-based devices.
         */
        percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

        /*
         * The BIO is being plugged and thus will have to wait for the ongoing
         * write and for all other writes already plugged. So polling makes
         * no sense.
         */
        bio_clear_polled(bio);

        /*
         * Reuse the poll cookie field to store the number of segments when
         * split to the hardware limits.
         */
        bio->__bi_nr_segments = nr_segs;

        /*
         * We always receive BIOs after they are split and ready to be issued.
         * The block layer passes the parts of a split BIO in order, and the
         * user must also issue writes sequentially. So simply add the new BIO
         * at the tail of the list to preserve the sequential write order.
         */
        bio_list_add(&zwplug->bio_list, bio);
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
        struct blk_zone_wplug *zwplug;
        unsigned long flags;

        /*
         * If the BIO was already plugged, then we were called through
         * blk_zone_write_plug_attempt_merge() -> blk_attempt_bio_merge().
         * For this case, blk_zone_write_plug_attempt_merge() will handle the
         * zone write pointer offset update.
         */
        if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
                return;

        bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

        /*
         * Increase the plug reference count and advance the zone write
         * pointer offset.
         */
        zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
                                     bio->bi_iter.bi_sector);
        spin_lock_irqsave(&zwplug->lock, flags);
        zwplug->wp_offset += bio_sectors(bio);
        spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_attempt_merge(struct request *req)
{
        sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
        struct request_queue *q = req->q;
        struct gendisk *disk = q->disk;
        unsigned int zone_capacity = disk->zone_capacity;
        struct blk_zone_wplug *zwplug =
                disk_get_zone_wplug(disk, blk_rq_pos(req));
        unsigned long flags;
        struct bio *bio;

        /*
         * Completion of this request needs to be handled with
         * blk_zone_write_plug_complete_request().
         */
        req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

        if (blk_queue_nomerges(q))
                return;

        /*
         * Walk through the list of plugged BIOs to check if they can be merged
         * into the back of the request.
         */
        spin_lock_irqsave(&zwplug->lock, flags);
        while (zwplug->wp_offset < zone_capacity) {
                bio = bio_list_peek(&zwplug->bio_list);
                if (!bio)
                        break;

                if (bio->bi_iter.bi_sector != req_back_sector ||
                    !blk_rq_merge_ok(req, bio))
                        break;

                WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
                             !bio->__bi_nr_segments);

                bio_list_pop(&zwplug->bio_list);
                if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
                    BIO_MERGE_OK) {
                        bio_list_add_head(&zwplug->bio_list, bio);
                        break;
                }

                /*
                 * Drop the extra reference on the queue usage we got when
                 * plugging the BIO and advance the write pointer offset.
                 */
                blk_queue_exit(q);
                zwplug->wp_offset += bio_sectors(bio);

                req_back_sector += bio_sectors(bio);
        }
        spin_unlock_irqrestore(&zwplug->lock, flags);
}

static inline void disk_zone_wplug_set_error(struct gendisk *disk,
                                             struct blk_zone_wplug *zwplug)
{
        if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) {
                unsigned long flags;

                /*
                 * Increase the plug reference count. The reference will be
                 * dropped in disk_zone_wplugs_work() once the error state
                 * is handled.
                 */
                zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
                atomic_inc(&zwplug->ref);

                spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
                list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
                spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
        }
}

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular writes when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
                                       struct bio *bio)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;

        /*
         * Check that the user is not attempting to write to a full zone.
         * We know such BIO will fail, and that would potentially overflow our
         * write pointer offset beyond the end of the zone.
         */
        if (zwplug->wp_offset >= disk->zone_capacity)
                goto err;

        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                /*
                 * Use a regular write starting at the current write pointer.
                 * Similarly to native zone append operations, do not allow
                 * merging.
                 */
                bio->bi_opf &= ~REQ_OP_MASK;
                bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
                bio->bi_iter.bi_sector += zwplug->wp_offset;

                /*
                 * Remember that this BIO is in fact a zone append operation
                 * so that we can restore its operation code on completion.
                 */
                bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
        } else {
                /*
                 * Check for non-sequential writes early because we avoid a
                 * whole lot of error handling trouble if we don't send it off
                 * to the driver.
                 */
                if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
                        goto err;
        }

        /* Advance the zone write pointer offset. */
        zwplug->wp_offset += bio_sectors(bio);

        return true;

err:
        /* We detected an invalid write BIO: schedule error recovery. */
        disk_zone_wplug_set_error(disk, zwplug);
        kblockd_schedule_work(&disk->zone_wplugs_work);
        return false;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        sector_t sector = bio->bi_iter.bi_sector;
        struct blk_zone_wplug *zwplug;
        gfp_t gfp_mask = GFP_NOIO;
        unsigned long flags;

        /*
         * BIOs must be fully contained within a zone so that we use the
         * correct zone write plug for the entire BIO. For blk-mq devices, the
         * block layer should already have done any splitting required to
         * ensure this and this BIO should thus not be straddling zone
         * boundaries. For BIO-based devices, it is the responsibility of the
         * driver to split the bio before submitting it.
         */
        if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
                bio_io_error(bio);
                return true;
        }

        /* Conventional zones do not need write plugging. */
        if (disk_zone_is_conv(disk, sector)) {
                /* Zone append to conventional zones is not allowed. */
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                        bio_io_error(bio);
                        return true;
                }
                return false;
        }

        if (bio->bi_opf & REQ_NOWAIT)
                gfp_mask = GFP_NOWAIT;

        zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
        if (!zwplug) {
                bio_io_error(bio);
                return true;
        }

        /* Indicate that this BIO is being handled using zone write plugging. */
        bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

        /*
         * If the zone is already plugged or has a pending error, add the BIO
         * to the plug BIO list. Otherwise, plug and let the BIO execute.
         */
        if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
                goto plug;

        /*
         * If an error is detected when preparing the BIO, add it to the BIO
         * list so that error recovery can deal with it.
         */
        if (!blk_zone_wplug_prepare_bio(zwplug, bio))
                goto plug;

        zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

        spin_unlock_irqrestore(&zwplug->lock, flags);

        return false;

plug:
        zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
        blk_zone_wplug_add_bio(zwplug, bio, nr_segs);

        spin_unlock_irqrestore(&zwplug->lock, flags);

        return true;
}

/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
        struct block_device *bdev = bio->bi_bdev;

        if (!bdev->bd_disk->zone_wplugs_hash)
                return false;

        /*
         * If the BIO already has the plugging flag set, then it was already
         * handled through this path and this is a submission from the zone
         * plug bio submit work.
         */
        if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
                return false;

        /*
         * We do not need to do anything special for empty flush BIOs, e.g.
         * BIOs such as issued by blkdev_issue_flush(). This is because it is
         * the responsibility of the user to first wait for the completion of
         * write operations for flush to have any effect on the persistence of
         * the written data.
         */
        if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
                return false;

        /*
         * Regular writes and write zeroes need to be handled through the
         * target zone write plug. This includes writes with REQ_FUA |
         * REQ_PREFLUSH, which may need to go through the flush machinery
         * depending on the target device capabilities. Plugging such writes
         * is fine as the flush machinery operates at the request level, below
         * the plug, and completion of the flush sequence will go through the
         * regular BIO completion, which will handle zone write plugging.
         * Zone append operations for devices that requested emulation must
         * also be plugged so that these BIOs can be changed into regular
         * write BIOs.
         * Zone reset, reset all and finish commands need special treatment
         * to correctly track the write pointer offset of zones. These commands
         * are not plugged as we do not need serialization with write
         * operations. It is the responsibility of the user to not issue reset
         * and finish commands when write operations are in flight.
         */
        switch (bio_op(bio)) {
        case REQ_OP_ZONE_APPEND:
                if (!bdev_emulates_zone_append(bdev))
                        return false;
                fallthrough;
        case REQ_OP_WRITE:
        case REQ_OP_WRITE_ZEROES:
                return blk_zone_wplug_handle_write(bio, nr_segs);
        case REQ_OP_ZONE_RESET:
                return blk_zone_wplug_handle_reset_or_finish(bio, 0);
        case REQ_OP_ZONE_FINISH:
                return blk_zone_wplug_handle_reset_or_finish(bio,
                                                bdev_zone_sectors(bdev));
        case REQ_OP_ZONE_RESET_ALL:
                return blk_zone_wplug_handle_reset_all(bio);
        default:
                return false;
        }

        return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
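
/*
 * Example (illustrative sketch): a BIO-based zoned driver, e.g. a device
 * mapper target, is expected to call blk_zone_plug_bio() from its
 * ->submit_bio() path and stop processing any BIO that was plugged.
 * The function names below are hypothetical; BIO-based drivers pass
 * nr_segs = 0 since splitting is their own responsibility.
 *
 *      static void foo_submit_bio(struct bio *bio)
 *      {
 *              if (blk_zone_plug_bio(bio, 0))
 *                      return;
 *              foo_process_bio(bio);
 *      }
 *
 * where foo_process_bio() stands for the driver's normal BIO handling.
 */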

static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
                                       struct blk_zone_wplug *zwplug)
{
        unsigned long flags;

        spin_lock_irqsave(&zwplug->lock, flags);

        /*
         * If we had an error, schedule error recovery. The recovery work
         * will restart submission of plugged BIOs.
         */
        if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
                spin_unlock_irqrestore(&zwplug->lock, flags);
                kblockd_schedule_work(&disk->zone_wplugs_work);
                return;
        }

        /* Schedule submission of the next plugged BIO if we have one. */
        if (!bio_list_empty(&zwplug->bio_list)) {
                spin_unlock_irqrestore(&zwplug->lock, flags);
                kblockd_schedule_work(&zwplug->bio_work);
                return;
        }

        zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

        /*
         * If the zone is full (it was fully written or finished) or empty
         * (it was reset), remove its zone write plug from the hash table.
         */
        if (disk_should_remove_zone_wplug(disk, zwplug))
                disk_remove_zone_wplug(disk, zwplug);

        spin_unlock_irqrestore(&zwplug->lock, flags);
}

void blk_zone_write_plug_bio_endio(struct bio *bio)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        struct blk_zone_wplug *zwplug =
                disk_get_zone_wplug(bio->bi_bdev->bd_disk,
                                    bio->bi_iter.bi_sector);
        unsigned long flags;

        if (WARN_ON_ONCE(!zwplug))
                return;

        /* Make sure we do not see this BIO again by clearing the plug flag. */
        bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

        /*
         * If this is a regular write emulating a zone append operation,
         * restore the original operation code.
         */
        if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
                bio->bi_opf &= ~REQ_OP_MASK;
                bio->bi_opf |= REQ_OP_ZONE_APPEND;
        }

        /*
         * If the BIO failed, mark the plug as having an error to trigger
         * recovery.
         */
        if (bio->bi_status != BLK_STS_OK) {
                spin_lock_irqsave(&zwplug->lock, flags);
                disk_zone_wplug_set_error(disk, zwplug);
                spin_unlock_irqrestore(&zwplug->lock, flags);
        }

        /*
         * For BIO-based devices, blk_zone_write_plug_complete_request()
         * is not called. So we need to schedule execution of the next
         * plugged BIO here.
         */
        if (bio->bi_bdev->bd_has_submit_bio)
                disk_zone_wplug_unplug_bio(disk, zwplug);

        /* Drop the reference we took when the BIO was issued. */
        atomic_dec(&zwplug->ref);
        disk_put_zone_wplug(zwplug);
}

void blk_zone_write_plug_complete_request(struct request *req)
{
        struct gendisk *disk = req->q->disk;
        struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, req->__sector);

        if (WARN_ON_ONCE(!zwplug))
                return;

        req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

        disk_zone_wplug_unplug_bio(disk, zwplug);

        /*
         * Drop the reference we took when the request was initialized in
         * blk_zone_write_plug_attempt_merge().
         */
        atomic_dec(&zwplug->ref);
        disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
        struct blk_zone_wplug *zwplug =
                container_of(work, struct blk_zone_wplug, bio_work);
        struct block_device *bdev;
        unsigned long flags;
        struct bio *bio;

        /*
         * Submit the next plugged BIO. If we do not have any, clear
         * the plugged flag.
         */
        spin_lock_irqsave(&zwplug->lock, flags);

        bio = bio_list_pop(&zwplug->bio_list);
        if (!bio) {
                zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
                spin_unlock_irqrestore(&zwplug->lock, flags);
                return;
        }

        if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
                /* Error recovery will decide what to do with the BIO. */
                bio_list_add_head(&zwplug->bio_list, bio);
                spin_unlock_irqrestore(&zwplug->lock, flags);
                return;
        }

        spin_unlock_irqrestore(&zwplug->lock, flags);

        bdev = bio->bi_bdev;
        submit_bio_noacct_nocheck(bio);

        /*
         * blk-mq devices will reuse the extra reference on the request queue
         * usage counter we took when the BIO was plugged, but the submission
         * path for BIO-based devices will not do that. So drop this extra
         * reference here.
         */
        if (bdev->bd_has_submit_bio)
                blk_queue_exit(bdev->bd_disk->queue);
}

static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
        switch (zone->cond) {
        case BLK_ZONE_COND_IMP_OPEN:
        case BLK_ZONE_COND_EXP_OPEN:
        case BLK_ZONE_COND_CLOSED:
                return zone->wp - zone->start;
        case BLK_ZONE_COND_FULL:
                return zone->len;
        case BLK_ZONE_COND_EMPTY:
                return 0;
        case BLK_ZONE_COND_NOT_WP:
        case BLK_ZONE_COND_OFFLINE:
        case BLK_ZONE_COND_READONLY:
        default:
                /*
                 * Conventional, offline and read-only zones do not have a
                 * valid write pointer.
                 */
                return UINT_MAX;
        }
}
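
/*
 * Example (illustrative): for a zone with zone->start == 524288,
 * zone->wp == 526336 and zone->cond == BLK_ZONE_COND_IMP_OPEN,
 * blk_zone_wp_offset() returns 526336 - 524288 = 2048 sectors, i.e. 1 MiB
 * written since the start of the zone.
 */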

static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
                                         unsigned int idx, void *data)
{
        struct blk_zone *zonep = data;

        *zonep = *zone;
        return 0;
}

static void disk_zone_wplug_handle_error(struct gendisk *disk,
                                         struct blk_zone_wplug *zwplug)
{
        sector_t zone_start_sector =
                bdev_zone_sectors(disk->part0) * zwplug->zone_no;
        unsigned int noio_flag;
        struct blk_zone zone;
        unsigned long flags;
        int ret;

        /* Get the current zone information from the device. */
        noio_flag = memalloc_noio_save();
        ret = disk->fops->report_zones(disk, zone_start_sector, 1,
                                       blk_zone_wplug_report_zone_cb, &zone);
        memalloc_noio_restore(noio_flag);

        spin_lock_irqsave(&zwplug->lock, flags);

        /*
         * A zone reset or finish may have cleared the error already. In such
         * a case, do nothing as the report zones may have seen the "old" write
         * pointer value before the reset/finish operation completed.
         */
        if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
                goto unlock;

        zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;

        if (ret != 1) {
                /*
                 * We failed to get the zone information, meaning that
                 * something is likely really wrong with the device. Abort all
                 * remaining plugged BIOs as otherwise we could end up waiting
                 * forever for plugged BIOs to complete if there is an ongoing
                 * queue freeze.
                 */
                disk_zone_wplug_abort(zwplug);
                goto unplug;
        }

        /* Update the zone write pointer offset. */
        zwplug->wp_offset = blk_zone_wp_offset(&zone);
        disk_zone_wplug_abort_unaligned(disk, zwplug);

        /* Restart BIO submission if we still have any BIO left. */
        if (!bio_list_empty(&zwplug->bio_list)) {
                WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
                kblockd_schedule_work(&zwplug->bio_work);
                goto unlock;
        }

unplug:
        zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
        if (disk_should_remove_zone_wplug(disk, zwplug))
                disk_remove_zone_wplug(disk, zwplug);

unlock:
        spin_unlock_irqrestore(&zwplug->lock, flags);
}

static void disk_zone_wplugs_work(struct work_struct *work)
{
        struct gendisk *disk =
                container_of(work, struct gendisk, zone_wplugs_work);
        struct blk_zone_wplug *zwplug;
        unsigned long flags;

        spin_lock_irqsave(&disk->zone_wplugs_lock, flags);

        while (!list_empty(&disk->zone_wplugs_err_list)) {
                zwplug = list_first_entry(&disk->zone_wplugs_err_list,
                                          struct blk_zone_wplug, link);
                list_del_init(&zwplug->link);
                spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

                disk_zone_wplug_handle_error(disk, zwplug);
                disk_put_zone_wplug(zwplug);

                spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
        }

        spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
        return 1U << disk->zone_wplugs_hash_bits;
}

void disk_init_zone_resources(struct gendisk *disk)
{
        spin_lock_init(&disk->zone_wplugs_lock);
        INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
        INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that
 * is, 9 bits. For a disk that has no limits, the mempool size defaults to
 * 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS            9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE        128
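
/*
 * Example (illustrative): a disk reporting max_open_zones == 128 and no
 * active zone limit gets a mempool of 128 plugs and
 * zone_wplugs_hash_bits = min(ilog2(128) + 1, 9) = 8, that is, a hash
 * table of 256 hlist heads (2 KB with 8-byte pointers).
 */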

static int disk_alloc_zone_resources(struct gendisk *disk,
                                     unsigned int pool_size)
{
        unsigned int i;

        disk->zone_wplugs_hash_bits =
                min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

        disk->zone_wplugs_hash =
                kcalloc(disk_zone_wplugs_hash_size(disk),
                        sizeof(struct hlist_head), GFP_KERNEL);
        if (!disk->zone_wplugs_hash)
                return -ENOMEM;

        for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
                INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

        disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
                                                sizeof(struct blk_zone_wplug));
        if (!disk->zone_wplugs_pool) {
                kfree(disk->zone_wplugs_hash);
                disk->zone_wplugs_hash = NULL;
                disk->zone_wplugs_hash_bits = 0;
                return -ENOMEM;
        }

        return 0;
}
1467 | static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) | |
1468 | { | |
1469 | struct blk_zone_wplug *zwplug; | |
1470 | unsigned int i; | |
1471 | ||
1472 | if (!disk->zone_wplugs_hash) | |
1473 | return; | |
1474 | ||
1475 | /* Free all the zone write plugs we have. */ | |
1476 | for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { | |
1477 | while (!hlist_empty(&disk->zone_wplugs_hash[i])) { | |
1478 | zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, | |
1479 | struct blk_zone_wplug, node); | |
1480 | atomic_inc(&zwplug->ref); | |
1481 | disk_remove_zone_wplug(disk, zwplug); | |
1482 | disk_put_zone_wplug(zwplug); | |
1483 | } | |
1484 | } | |
1485 | ||
1486 | kfree(disk->zone_wplugs_hash); | |
1487 | disk->zone_wplugs_hash = NULL; | |
1488 | disk->zone_wplugs_hash_bits = 0; | |
1489 | } | |
1490 | ||
1491 | void disk_free_zone_resources(struct gendisk *disk) | |
1492 | { | |
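/*
 * The error handling work references zone write plugs, so it must be
 * cancelled before the zone write plug hash table is destroyed.
 */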
1493 | cancel_work_sync(&disk->zone_wplugs_work); | |
1494 | ||
1495 | disk_destroy_zone_wplugs_hash_table(disk); | |
1496 | ||
1497 | /* | |
1498 | * Wait for the zone write plugs to be RCU-freed before | |
1499 | * destroying the mempool. | |
1500 | */ | |
1501 | rcu_barrier(); | |
1502 | ||
1503 | mempool_destroy(disk->zone_wplugs_pool); | |
1504 | disk->zone_wplugs_pool = NULL; | |
1505 | ||
d86e716a CH |
1506 | kfree(disk->conv_zones_bitmap); |
1507 | disk->conv_zones_bitmap = NULL; | |
1508 | kfree(disk->seq_zones_wlock); | |
1509 | disk->seq_zones_wlock = NULL; | |
dd291d77 DLM |
1510 | |
1511 | disk->zone_capacity = 0; | |
1512 | disk->nr_zones = 0; | |
1513 | } | |
1514 | ||
946dd71e DLM |
1515 | static inline bool disk_need_zone_resources(struct gendisk *disk) |
1516 | { | |
1517 | /* | |
1518 | * All mq zoned devices need zone resources so that the block layer | |
1519 | * can automatically handle write BIO plugging. BIO-based device drivers | |
1520 | * (e.g. DM devices) are normally responsible for handling zone write | |
1521 | * ordering and do not need zone resources, unless the driver requires | |
1522 | * zone append emulation. | |
1523 | */ | |
1524 | return queue_is_mq(disk->queue) || | |
1525 | queue_emulates_zone_append(disk->queue); | |
1526 | } | |
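/*
 * For instance (illustrative, not an exhaustive list): a blk-mq disk
 * such as an NVMe ZNS or SCSI ZBC drive always needs zone resources,
 * while a BIO-based stacked driver that orders zone writes itself only
 * needs them if its queue reports that it emulates zone append.
 */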
1527 | ||
dd291d77 DLM |
1528 | static int disk_revalidate_zone_resources(struct gendisk *disk, |
1529 | unsigned int nr_zones) | |
1530 | { | |
1531 | struct queue_limits *lim = &disk->queue->limits; | |
1532 | unsigned int pool_size; | |
1533 | ||
946dd71e DLM |
1534 | if (!disk_need_zone_resources(disk)) |
1535 | return 0; | |
1536 | ||
dd291d77 DLM |
1537 | /* |
1538 | * If the device has no limit on the maximum number of open and active | |
1539 | * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. | |
1540 | */ | |
1541 | pool_size = max(lim->max_open_zones, lim->max_active_zones); | |
1542 | if (!pool_size) | |
1543 | pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); | |
1544 | ||
1545 | if (!disk->zone_wplugs_hash) | |
1546 | return disk_alloc_zone_resources(disk, pool_size); | |
1547 | ||
1548 | /* Resize the zone write plug memory pool if needed. */ | |
1549 | if (disk->zone_wplugs_pool->min_nr != pool_size) | |
1550 | return mempool_resize(disk->zone_wplugs_pool, pool_size); | |
1551 | ||
1552 | return 0; | |
bf505456 DLM |
1553 | } |
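/*
 * Note on the resize path above: mempool_resize() only adjusts the
 * number of pre-allocated elements (min_nr) and may fail with -ENOMEM
 * when growing the pool, hence its return value is propagated.
 */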
1554 | ||
d4100351 CH |
1555 | struct blk_revalidate_zone_args { |
1556 | struct gendisk *disk; | |
f216fdd7 | 1557 | unsigned long *conv_zones_bitmap; |
d4100351 | 1558 | unsigned long *seq_zones_wlock; |
e94f5819 | 1559 | unsigned int nr_zones; |
ecfe43b1 | 1560 | unsigned int zone_capacity; |
d4100351 CH |
1561 | sector_t sector; |
1562 | }; | |
1563 | ||
843283e9 DLM |
1564 | /* |
1565 | * Update the disk zone resources information and device queue limits. | |
1566 | * The disk queue is frozen when this is executed. | |
1567 | */ | |
1568 | static int disk_update_zone_resources(struct gendisk *disk, | |
1569 | struct blk_revalidate_zone_args *args) | |
1570 | { | |
1571 | struct request_queue *q = disk->queue; | |
1572 | struct queue_limits lim; | |
1573 | ||
1574 | disk->nr_zones = args->nr_zones; | |
1575 | disk->zone_capacity = args->zone_capacity; | |
1576 | swap(disk->seq_zones_wlock, args->seq_zones_wlock); | |
1577 | swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); | |
1578 | ||
1579 | /* | |
1580 | * If the device has no limit on the maximum number of open and active | |
1581 | * zones, set its max open zone limit to the mempool size to indicate | |
1582 | * to the user that there is a potential performance impact due to | |
1583 | * dynamic zone write plug allocation when simultaneously writing to | |
1584 | * more zones than the size of the mempool. | |
1585 | */ | |
1586 | if (disk->zone_wplugs_pool) { | |
1587 | lim = queue_limits_start_update(q); | |
1588 | if (!lim.max_open_zones && !lim.max_active_zones) | |
1589 | lim.max_open_zones = disk->zone_wplugs_pool->min_nr; | |
1590 | return queue_limits_commit_update(q, &lim); | |
1591 | } | |
1592 | ||
1593 | return 0; | |
1594 | } | |
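/*
 * Example of the limit update above (illustrative): a zoned disk that
 * reports neither a max open nor a max active zone limit ends up with
 * the default pool of 128 zone write plugs, so after revalidation
 * userspace sees max_open_zones = 128 (e.g. through the queue sysfs
 * attributes of the disk).
 */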
1595 | ||
d9dd7308 DLM |
1596 | /* |
1597 | * Helper function to check the validity of zones of a zoned block device. | |
1598 | */ | |
d4100351 CH |
1599 | static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, |
1600 | void *data) | |
d9dd7308 | 1601 | { |
d4100351 CH |
1602 | struct blk_revalidate_zone_args *args = data; |
1603 | struct gendisk *disk = args->disk; | |
d9dd7308 | 1604 | struct request_queue *q = disk->queue; |
d9dd7308 | 1605 | sector_t capacity = get_capacity(disk); |
03e51c4a | 1606 | sector_t zone_sectors = q->limits.chunk_sectors; |
dd291d77 DLM |
1607 | struct blk_zone_wplug *zwplug; |
1608 | unsigned long flags; | |
1609 | unsigned int wp_offset; | |
03e51c4a DLM |
1610 | |
1611 | /* Check for bad zones and holes in the zone report */ | |
1612 | if (zone->start != args->sector) { | |
1613 | pr_warn("%s: Zone gap at sectors %llu..%llu\n", | |
1614 | disk->disk_name, args->sector, zone->start); | |
1615 | return -ENODEV; | |
1616 | } | |
1617 | ||
1618 | if (zone->start >= capacity || !zone->len) { | |
1619 | pr_warn("%s: Invalid zone start %llu, length %llu\n", | |
1620 | disk->disk_name, zone->start, zone->len); | |
1621 | return -ENODEV; | |
1622 | } | |
d9dd7308 DLM |
1623 | |
1624 | /* | |
1625 | * All zones must have the same size, with the possible exception of a | |
1626 | * smaller last zone. | |
1627 | */ | |
03e51c4a DLM |
1628 | if (zone->start + zone->len < capacity) { |
1629 | if (zone->len != zone_sectors) { | |
6c6b3549 CH |
1630 | pr_warn("%s: Invalid zoned device with non constant zone size\n", |
1631 | disk->disk_name); | |
1632 | return -ENODEV; | |
1633 | } | |
03e51c4a DLM |
1634 | } else if (zone->len > zone_sectors) { |
1635 | pr_warn("%s: Invalid zoned device with larger last zone size\n", | |
1636 | disk->disk_name); | |
d4100351 | 1637 | return -ENODEV; |
d9dd7308 DLM |
1638 | } |
1639 | ||
ecfe43b1 DLM |
1640 | if (!zone->capacity || zone->capacity > zone->len) { |
1641 | pr_warn("%s: Invalid zone capacity\n", | |
1642 | disk->disk_name); | |
1643 | return -ENODEV; | |
1644 | } | |
1645 | ||
d9dd7308 DLM |
1646 | /* Check zone type */ |
1647 | switch (zone->type) { | |
1648 | case BLK_ZONE_TYPE_CONVENTIONAL: | |
ecfe43b1 DLM |
1649 | if (zone->capacity != zone->len) { |
1650 | pr_warn("%s: Invalid conventional zone capacity\n", | |
1651 | disk->disk_name); | |
1652 | return -ENODEV; | |
1653 | } | |
946dd71e DLM |
1654 | |
1655 | if (!disk_need_zone_resources(disk)) | |
1656 | break; | |
e94f5819 CH |
1657 | if (!args->conv_zones_bitmap) { |
1658 | args->conv_zones_bitmap = | |
1659 | blk_alloc_zone_bitmap(q->node, args->nr_zones); | |
1660 | if (!args->conv_zones_bitmap) | |
1661 | return -ENOMEM; | |
1662 | } | |
1663 | set_bit(idx, args->conv_zones_bitmap); | |
1664 | break; | |
d9dd7308 | 1665 | case BLK_ZONE_TYPE_SEQWRITE_REQ: |
e94f5819 CH |
1666 | if (!args->seq_zones_wlock) { |
1667 | args->seq_zones_wlock = | |
1668 | blk_alloc_zone_bitmap(q->node, args->nr_zones); | |
1669 | if (!args->seq_zones_wlock) | |
1670 | return -ENOMEM; | |
1671 | } | |
ecfe43b1 DLM |
1672 | |
1673 | /* | |
1674 | * Remember the capacity of the first sequential zone and check | |
1675 | * if it is constant for all zones. | |
1676 | */ | |
1677 | if (!args->zone_capacity) | |
1678 | args->zone_capacity = zone->capacity; | |
1679 | if (zone->capacity != args->zone_capacity) { | |
1680 | pr_warn("%s: Invalid variable zone capacity\n", | |
1681 | disk->disk_name); | |
1682 | return -ENODEV; | |
1683 | } | |
dd291d77 DLM |
1684 | |
1685 | /* | |
1686 | * We need to track the write pointer of all zones that are neither | |
1687 | * empty nor full. So make sure we have a zone write plug for such | |
946dd71e | 1688 | * zones if the device has a zone write plug hash table. |
dd291d77 DLM |
1689 | */ |
1690 | wp_offset = blk_zone_wp_offset(zone); | |
946dd71e DLM |
1691 | if (disk->zone_wplugs_hash && |
1692 | wp_offset && wp_offset < zone_sectors) { | |
dd291d77 DLM |
1693 | zwplug = disk_get_and_lock_zone_wplug(disk, zone->start, |
1694 | GFP_NOIO, &flags); | |
1695 | if (!zwplug) | |
1696 | return -ENOMEM; | |
1697 | spin_unlock_irqrestore(&zwplug->lock, flags); | |
1698 | disk_put_zone_wplug(zwplug); | |
1699 | } | |
1700 | ||
d9dd7308 | 1701 | break; |
587371ed | 1702 | case BLK_ZONE_TYPE_SEQWRITE_PREF: |
d9dd7308 DLM |
1703 | default: |
1704 | pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", | |
1705 | disk->disk_name, (int)zone->type, zone->start); | |
d4100351 | 1706 | return -ENODEV; |
d9dd7308 DLM |
1707 | } |
1708 | ||
d4100351 CH |
1709 | args->sector += zone->len; |
1710 | return 0; | |
1711 | } | |
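/*
 * Worked example of the checks above (illustrative numbers): with
 * zone_sectors = 8192 and a capacity of 28672 sectors, a valid report
 * describes zones starting at sectors 0, 8192, 16384 and 24576, all
 * 8192 sectors long except the last one which may be 4096 sectors. A
 * gap between zones, a different length for a middle zone, or a last
 * zone longer than 8192 sectors all fail with -ENODEV.
 */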
1712 | ||
bf505456 DLM |
1713 | /** |
1714 | * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps | |
1715 | * @disk: Target disk | |
e732671a | 1716 | * @update_driver_data: Callback to update driver data on the frozen disk |
bf505456 | 1717 | * |
03e51c4a DLM |
1718 | * Helper function for low-level device drivers to check, (re)allocate and | |
1719 | * initialize a disk request queue's zone bitmaps. This function should normally | |
1720 | * be called within the disk ->revalidate method for blk-mq based drivers. | |
1721 | * Before calling this function, the device driver must already have set the | |
1722 | * device zone size (chunk_sector limit) and the max zone append limit. | |
946dd71e DLM |
1723 | * BIO based drivers can also use this function as long as the device queue |
1724 | * can be safely frozen. | |
e732671a DLM |
1725 | * If the @update_driver_data callback function is not NULL, the callback is |
1726 | * executed with the device request queue frozen after all zones have been | |
1727 | * checked. | |
bf505456 | 1728 | */ |
e732671a DLM |
1729 | int blk_revalidate_disk_zones(struct gendisk *disk, |
1730 | void (*update_driver_data)(struct gendisk *disk)) | |
bf505456 DLM |
1731 | { |
1732 | struct request_queue *q = disk->queue; | |
03e51c4a DLM |
1733 | sector_t zone_sectors = q->limits.chunk_sectors; |
1734 | sector_t capacity = get_capacity(disk); | |
1735 | struct blk_revalidate_zone_args args = { }; | |
6c6b3549 | 1736 | unsigned int noio_flag; |
dd291d77 | 1737 | int ret = -ENOMEM; |
bf505456 | 1738 | |
c98c3d09 CH |
1739 | if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) |
1740 | return -EIO; | |
bf505456 | 1741 | |
03e51c4a DLM |
1742 | if (!capacity) |
1743 | return -ENODEV; | |
1744 | ||
1745 | /* | |
1746 | * Checks that the device driver indicated a valid zone size and that | |
1747 | * the max zone append limit is set. | |
1748 | */ | |
1749 | if (!zone_sectors || !is_power_of_2(zone_sectors)) { | |
1750 | pr_warn("%s: Invalid non power of two zone size (%llu)\n", | |
1751 | disk->disk_name, zone_sectors); | |
1752 | return -ENODEV; | |
1753 | } | |
1754 | ||
ccdbf0aa | 1755 | if (!queue_max_zone_append_sectors(q)) { |
03e51c4a DLM |
1756 | pr_warn("%s: Invalid 0 maximum zone append limit\n", |
1757 | disk->disk_name); | |
1758 | return -ENODEV; | |
1759 | } | |
1a1206dc | 1760 | |
e94f5819 | 1761 | /* |
6c6b3549 CH |
1762 | * Ensure that all memory allocations in this context are done as if |
1763 | * GFP_NOIO was specified. | |
e94f5819 | 1764 | */ |
03e51c4a DLM |
1765 | args.disk = disk; |
1766 | args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); | |
6c6b3549 | 1767 | noio_flag = memalloc_noio_save(); |
dd291d77 DLM |
1768 | ret = disk_revalidate_zone_resources(disk, args.nr_zones); |
1769 | if (ret) { | |
1770 | memalloc_noio_restore(noio_flag); | |
1771 | return ret; | |
1772 | } | |
6c6b3549 CH |
1773 | ret = disk->fops->report_zones(disk, 0, UINT_MAX, |
1774 | blk_revalidate_zone_cb, &args); | |
2afdeb23 DLM |
1775 | if (!ret) { |
1776 | pr_warn("%s: No zones reported\n", disk->disk_name); | |
1777 | ret = -ENODEV; | |
1778 | } | |
6c6b3549 | 1779 | memalloc_noio_restore(noio_flag); |
bf505456 | 1780 | |
2afdeb23 DLM |
1781 | /* |
1782 | * If zones were reported, make sure that the entire disk capacity | |
1783 | * has been checked. | |
1784 | */ | |
03e51c4a | 1785 | if (ret > 0 && args.sector != capacity) { |
2afdeb23 DLM |
1786 | pr_warn("%s: Missing zones from sector %llu\n", |
1787 | disk->disk_name, args.sector); | |
1788 | ret = -ENODEV; | |
1789 | } | |
1790 | ||
bf505456 | 1791 | /* |
6c6b3549 CH |
1792 | * Install the new bitmaps and update nr_zones only once the queue is |
1793 | * stopped and all I/Os are completed (i.e. a scheduler is not | |
1794 | * referencing the bitmaps). | |
bf505456 DLM |
1795 | */ |
1796 | blk_mq_freeze_queue(q); | |
2afdeb23 | 1797 | if (ret > 0) { |
843283e9 | 1798 | ret = disk_update_zone_resources(disk, &args); |
e732671a DLM |
1799 | if (update_driver_data) |
1800 | update_driver_data(disk); | |
d4100351 | 1801 | } else { |
bf505456 | 1802 | pr_warn("%s: failed to revalidate zones\n", disk->disk_name); |
bf505456 | 1803 | } |
843283e9 DLM |
1804 | if (ret) |
1805 | disk_free_zone_resources(disk); | |
d4100351 | 1806 | blk_mq_unfreeze_queue(q); |
bf505456 | 1807 | |
d4100351 | 1808 | kfree(args.seq_zones_wlock); |
f216fdd7 | 1809 | kfree(args.conv_zones_bitmap); |
ecfe43b1 | 1810 | |
bf505456 DLM |
1811 | return ret; |
1812 | } | |
1813 | EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); |
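/*
 * Minimal usage sketch (illustrative only: the "foo" driver and its
 * functions are hypothetical, not part of this file). A blk-mq driver
 * sets the zone size (chunk_sectors) and max zone append limits first,
 * then revalidates from its disk revalidation path.
 */
static void foo_update_driver_data(struct gendisk *disk)
{
	/* Runs with the queue frozen, after all zones have been checked. */
}

static int foo_revalidate_zones(struct gendisk *disk)
{
	/*
	 * disk->queue->limits.chunk_sectors and the max zone append
	 * limit must already be set before this call.
	 */
	return blk_revalidate_disk_zones(disk, foo_update_driver_data);
}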