Commit | Line | Data |
---|---|---|
3dcf60bc | 1 | // SPDX-License-Identifier: GPL-2.0 |
6a0cb1bc HR |
2 | /* |
3 | * Zoned block device handling | |
4 | * | |
5 | * Copyright (c) 2015, Hannes Reinecke | |
6 | * Copyright (c) 2015, SUSE Linux GmbH | |
7 | * | |
8 | * Copyright (c) 2016, Damien Le Moal | |
9 | * Copyright (c) 2016, Western Digital | |
dd291d77 | 10 | * Copyright (c) 2024, Western Digital Corporation or its affiliates. |
6a0cb1bc HR |
11 | */ |
12 | ||
13 | #include <linux/kernel.h> | |
14 | #include <linux/module.h> | |
6a0cb1bc | 15 | #include <linux/blkdev.h> |
bf505456 | 16 | #include <linux/blk-mq.h> |
26202928 DLM |
17 | #include <linux/mm.h> |
18 | #include <linux/vmalloc.h> | |
bd976e52 | 19 | #include <linux/sched/mm.h> |
dd291d77 DLM |
20 | #include <linux/spinlock.h> |
21 | #include <linux/atomic.h> | |
22 | #include <linux/mempool.h> | |
6a0cb1bc | 23 | |
a2d6b3a2 | 24 | #include "blk.h" |
dd291d77 | 25 | #include "blk-mq-sched.h" |
d9f1439a | 26 | #include "blk-mq-debugfs.h" |
a2d6b3a2 | 27 | |
02694e86 CK |
28 | #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name |
29 | static const char *const zone_cond_name[] = { | |
30 | ZONE_COND_NAME(NOT_WP), | |
31 | ZONE_COND_NAME(EMPTY), | |
32 | ZONE_COND_NAME(IMP_OPEN), | |
33 | ZONE_COND_NAME(EXP_OPEN), | |
34 | ZONE_COND_NAME(CLOSED), | |
35 | ZONE_COND_NAME(READONLY), | |
36 | ZONE_COND_NAME(FULL), | |
37 | ZONE_COND_NAME(OFFLINE), | |
38 | }; | |
39 | #undef ZONE_COND_NAME | |
40 | ||
dd291d77 DLM |
41 | /* |
42 | * Per-zone write plug. | |
43 | * @node: hlist_node structure for managing the plug using a hash table. | |
44 | * @link: To list the plug in the zone write plug error list of the disk. | |
45 | * @ref: Zone write plug reference counter. A zone write plug reference is | |
46 | * always at least 1 when the plug is hashed in the disk plug hash table. | |
47 | * The reference is incremented whenever a new BIO needing plugging is | |
48 | * submitted and when a function needs to manipulate a plug. The | |
49 | * reference count is decremented whenever a plugged BIO completes and | |
50 | * when a function that referenced the plug returns. The initial | |
51 | * reference is dropped whenever the zone of the zone write plug is reset, | |
52 | * finished and when the zone becomes full (last write BIO to the zone | |
53 | * completes). | |
54 | * @lock: Spinlock to atomically manipulate the plug. | |
55 | * @flags: Flags indicating the plug state. | |
56 | * @zone_no: The number of the zone the plug is managing. | |
57 | * @wp_offset: The zone write pointer location relative to the start of the zone | |
58 | * as a number of 512B sectors. | |
59 | * @bio_list: The list of BIOs that are currently plugged. | |
60 | * @bio_work: Work struct to handle issuing of plugged BIOs | |
61 | * @rcu_head: RCU head to free zone write plugs with an RCU grace period. | |
62 | * @disk: The gendisk the plug belongs to. | |
63 | */ | |
64 | struct blk_zone_wplug { | |
65 | struct hlist_node node; | |
66 | struct list_head link; | |
67 | atomic_t ref; | |
68 | spinlock_t lock; | |
69 | unsigned int flags; | |
70 | unsigned int zone_no; | |
71 | unsigned int wp_offset; | |
72 | struct bio_list bio_list; | |
73 | struct work_struct bio_work; | |
74 | struct rcu_head rcu_head; | |
75 | struct gendisk *disk; | |
76 | }; | |
77 | ||
78 | /* | |
79 | * Zone write plug flags bits: | |
80 | * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, | |
81 | * that is, that write BIOs are being throttled due to a write BIO already | |
82 | * being executed or the zone write plug bio list is not empty. | |
83 | * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be | |
84 | * recovered with a report zone to update the zone write pointer offset. | |
85 | * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed | |
86 | * from the disk hash table and that the initial reference to the zone | |
87 | * write plug set when the plug was first added to the hash table has been | |
88 | * dropped. This flag is set when a zone is reset, finished or become full, | |
89 | * to prevent new references to the zone write plug to be taken for | |
90 | * newly incoming BIOs. A zone write plug flagged with this flag will be | |
91 | * freed once all remaining references from BIOs or functions are dropped. | |
92 | */ | |
93 | #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) | |
94 | #define BLK_ZONE_WPLUG_ERROR (1U << 1) | |
95 | #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) | |
96 | ||
97 | #define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR) | |
98 | ||
02694e86 CK |
99 | /** |
100 | * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. | |
101 | * @zone_cond: BLK_ZONE_COND_XXX. | |
102 | * | |
103 | * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX | |
104 | * into string format. Useful in the debugging and tracing zone conditions. For | |
105 | * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". | |
106 | */ | |
107 | const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) | |
108 | { | |
109 | static const char *zone_cond_str = "UNKNOWN"; | |
110 | ||
111 | if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) | |
112 | zone_cond_str = zone_cond_name[zone_cond]; | |
113 | ||
114 | return zone_cond_str; | |
115 | } | |
116 | EXPORT_SYMBOL_GPL(blk_zone_cond_str); | |
117 | ||
a91e1380 | 118 | /** |
b623e347 CH |
119 | * bdev_nr_zones - Get number of zones |
120 | * @bdev: Target device | |
a91e1380 | 121 | * |
9b38bb4b CH |
122 | * Return the total number of zones of a zoned block device. For a block |
123 | * device without zone capabilities, the number of zones is always 0. | |
a91e1380 | 124 | */ |
b623e347 | 125 | unsigned int bdev_nr_zones(struct block_device *bdev) |
a91e1380 | 126 | { |
b623e347 | 127 | sector_t zone_sectors = bdev_zone_sectors(bdev); |
a91e1380 | 128 | |
b623e347 | 129 | if (!bdev_is_zoned(bdev)) |
a91e1380 | 130 | return 0; |
b623e347 CH |
131 | return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> |
132 | ilog2(zone_sectors); | |
a91e1380 | 133 | } |
b623e347 | 134 | EXPORT_SYMBOL_GPL(bdev_nr_zones); |
a91e1380 | 135 | |
6a0cb1bc HR |
136 | /** |
137 | * blkdev_report_zones - Get zones information | |
138 | * @bdev: Target block device | |
139 | * @sector: Sector from which to report zones | |
d4100351 CH |
140 | * @nr_zones: Maximum number of zones to report |
141 | * @cb: Callback function called for each reported zone | |
142 | * @data: Private data for the callback | |
6a0cb1bc HR |
143 | * |
144 | * Description: | |
d4100351 CH |
145 | * Get zone information starting from the zone containing @sector for at most |
146 | * @nr_zones, and call @cb for each zone reported by the device. | |
147 | * To report all zones in a device starting from @sector, the BLK_ALL_ZONES | |
148 | * constant can be passed to @nr_zones. | |
149 | * Returns the number of zones reported by the device, or a negative errno | |
150 | * value in case of failure. | |
151 | * | |
152 | * Note: The caller must use memalloc_noXX_save/restore() calls to control | |
153 | * memory allocations done within this function. | |
6a0cb1bc | 154 | */ |
e76239a3 | 155 | int blkdev_report_zones(struct block_device *bdev, sector_t sector, |
d4100351 | 156 | unsigned int nr_zones, report_zones_cb cb, void *data) |
6a0cb1bc | 157 | { |
ceeb373a | 158 | struct gendisk *disk = bdev->bd_disk; |
5eac3eb3 | 159 | sector_t capacity = get_capacity(disk); |
6a0cb1bc | 160 | |
edd1dbc8 | 161 | if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) |
e76239a3 | 162 | return -EOPNOTSUPP; |
6a0cb1bc | 163 | |
d4100351 | 164 | if (!nr_zones || sector >= capacity) |
6a0cb1bc | 165 | return 0; |
6a0cb1bc | 166 | |
d4100351 | 167 | return disk->fops->report_zones(disk, sector, nr_zones, cb, data); |
6a0cb1bc HR |
168 | } |
169 | EXPORT_SYMBOL_GPL(blkdev_report_zones); | |
170 | ||
1ee533ec DLM |
171 | static inline unsigned long *blk_alloc_zone_bitmap(int node, |
172 | unsigned int nr_zones) | |
6e33dbf2 | 173 | { |
1ee533ec DLM |
174 | return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), |
175 | GFP_NOIO, node); | |
176 | } | |
6e33dbf2 | 177 | |
1ee533ec DLM |
178 | static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, |
179 | void *data) | |
180 | { | |
6e33dbf2 | 181 | /* |
1ee533ec DLM |
182 | * For an all-zones reset, ignore conventional, empty, read-only |
183 | * and offline zones. | |
6e33dbf2 | 184 | */ |
1ee533ec DLM |
185 | switch (zone->cond) { |
186 | case BLK_ZONE_COND_NOT_WP: | |
187 | case BLK_ZONE_COND_EMPTY: | |
188 | case BLK_ZONE_COND_READONLY: | |
189 | case BLK_ZONE_COND_OFFLINE: | |
190 | return 0; | |
191 | default: | |
192 | set_bit(idx, (unsigned long *)data); | |
193 | return 0; | |
194 | } | |
195 | } | |
196 | ||
71f4ecdb | 197 | static int blkdev_zone_reset_all_emulated(struct block_device *bdev) |
1ee533ec | 198 | { |
d86e716a | 199 | struct gendisk *disk = bdev->bd_disk; |
375c140c CH |
200 | sector_t capacity = bdev_nr_sectors(bdev); |
201 | sector_t zone_sectors = bdev_zone_sectors(bdev); | |
1ee533ec DLM |
202 | unsigned long *need_reset; |
203 | struct bio *bio = NULL; | |
204 | sector_t sector = 0; | |
205 | int ret; | |
206 | ||
d86e716a | 207 | need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones); |
1ee533ec DLM |
208 | if (!need_reset) |
209 | return -ENOMEM; | |
210 | ||
d86e716a CH |
211 | ret = disk->fops->report_zones(disk, 0, disk->nr_zones, |
212 | blk_zone_need_reset_cb, need_reset); | |
1ee533ec DLM |
213 | if (ret < 0) |
214 | goto out_free_need_reset; | |
215 | ||
216 | ret = 0; | |
217 | while (sector < capacity) { | |
d86e716a | 218 | if (!test_bit(disk_zone_no(disk, sector), need_reset)) { |
1ee533ec DLM |
219 | sector += zone_sectors; |
220 | continue; | |
221 | } | |
222 | ||
0a3140ea | 223 | bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, |
71f4ecdb | 224 | GFP_KERNEL); |
1ee533ec DLM |
225 | bio->bi_iter.bi_sector = sector; |
226 | sector += zone_sectors; | |
227 | ||
228 | /* This may take a while, so be nice to others */ | |
229 | cond_resched(); | |
230 | } | |
231 | ||
232 | if (bio) { | |
233 | ret = submit_bio_wait(bio); | |
234 | bio_put(bio); | |
235 | } | |
236 | ||
237 | out_free_need_reset: | |
238 | kfree(need_reset); | |
239 | return ret; | |
240 | } | |
241 | ||
71f4ecdb | 242 | static int blkdev_zone_reset_all(struct block_device *bdev) |
1ee533ec DLM |
243 | { |
244 | struct bio bio; | |
245 | ||
49add496 | 246 | bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); |
1ee533ec | 247 | return submit_bio_wait(&bio); |
6e33dbf2 CK |
248 | } |
249 | ||
6a0cb1bc | 250 | /** |
6c1b1da5 | 251 | * blkdev_zone_mgmt - Execute a zone management operation on a range of zones |
6a0cb1bc | 252 | * @bdev: Target block device |
6c1b1da5 AJ |
253 | * @op: Operation to be performed on the zones |
254 | * @sector: Start sector of the first zone to operate on | |
255 | * @nr_sectors: Number of sectors, should be at least the length of one zone and | |
256 | * must be zone size aligned. | |
6a0cb1bc HR |
257 | * |
258 | * Description: | |
6c1b1da5 | 259 | * Perform the specified operation on the range of zones specified by |
6a0cb1bc HR |
260 | * @sector..@sector+@nr_sectors. Specifying the entire disk sector range |
261 | * is valid, but the specified range should not contain conventional zones. | |
6c1b1da5 AJ |
262 | * The operation to execute on each zone can be a zone reset, open, close |
263 | * or finish request. | |
6a0cb1bc | 264 | */ |
ff07a02e | 265 | int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, |
71f4ecdb | 266 | sector_t sector, sector_t nr_sectors) |
6a0cb1bc HR |
267 | { |
268 | struct request_queue *q = bdev_get_queue(bdev); | |
375c140c CH |
269 | sector_t zone_sectors = bdev_zone_sectors(bdev); |
270 | sector_t capacity = bdev_nr_sectors(bdev); | |
6a0cb1bc | 271 | sector_t end_sector = sector + nr_sectors; |
a2d6b3a2 | 272 | struct bio *bio = NULL; |
1ee533ec | 273 | int ret = 0; |
6a0cb1bc | 274 | |
edd1dbc8 | 275 | if (!bdev_is_zoned(bdev)) |
6a0cb1bc HR |
276 | return -EOPNOTSUPP; |
277 | ||
a2d6b3a2 DLM |
278 | if (bdev_read_only(bdev)) |
279 | return -EPERM; | |
280 | ||
6c1b1da5 AJ |
281 | if (!op_is_zone_mgmt(op)) |
282 | return -EOPNOTSUPP; | |
283 | ||
11bde986 | 284 | if (end_sector <= sector || end_sector > capacity) |
6a0cb1bc HR |
285 | /* Out of range */ |
286 | return -EINVAL; | |
287 | ||
288 | /* Check alignment (handle eventual smaller last zone) */ | |
e29b2100 | 289 | if (!bdev_is_zone_start(bdev, sector)) |
6a0cb1bc HR |
290 | return -EINVAL; |
291 | ||
e29b2100 | 292 | if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) |
6a0cb1bc HR |
293 | return -EINVAL; |
294 | ||
1ee533ec DLM |
295 | /* |
296 | * In the case of a zone reset operation over all zones, | |
297 | * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this | |
298 | * command. For other devices, we emulate this command behavior by | |
299 | * identifying the zones needing a reset. | |
300 | */ | |
301 | if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { | |
302 | if (!blk_queue_zone_resetall(q)) | |
71f4ecdb JT |
303 | return blkdev_zone_reset_all_emulated(bdev); |
304 | return blkdev_zone_reset_all(bdev); | |
1ee533ec DLM |
305 | } |
306 | ||
6a0cb1bc | 307 | while (sector < end_sector) { |
71f4ecdb | 308 | bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); |
c7a1d926 | 309 | bio->bi_iter.bi_sector = sector; |
6a0cb1bc HR |
310 | sector += zone_sectors; |
311 | ||
312 | /* This may take a while, so be nice to others */ | |
313 | cond_resched(); | |
6a0cb1bc HR |
314 | } |
315 | ||
a2d6b3a2 DLM |
316 | ret = submit_bio_wait(bio); |
317 | bio_put(bio); | |
318 | ||
a2d6b3a2 | 319 | return ret; |
6a0cb1bc | 320 | } |
6c1b1da5 | 321 | EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); |
3ed05a98 | 322 | |
d4100351 CH |
323 | struct zone_report_args { |
324 | struct blk_zone __user *zones; | |
325 | }; | |
326 | ||
327 | static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, | |
328 | void *data) | |
329 | { | |
330 | struct zone_report_args *args = data; | |
331 | ||
332 | if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) | |
333 | return -EFAULT; | |
334 | return 0; | |
335 | } | |
336 | ||
56c4bddb | 337 | /* |
3ed05a98 ST |
338 | * BLKREPORTZONE ioctl processing. |
339 | * Called from blkdev_ioctl. | |
340 | */ | |
5e4ea834 CH |
341 | int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, |
342 | unsigned long arg) | |
3ed05a98 ST |
343 | { |
344 | void __user *argp = (void __user *)arg; | |
d4100351 | 345 | struct zone_report_args args; |
3ed05a98 | 346 | struct blk_zone_report rep; |
3ed05a98 ST |
347 | int ret; |
348 | ||
349 | if (!argp) | |
350 | return -EINVAL; | |
351 | ||
edd1dbc8 | 352 | if (!bdev_is_zoned(bdev)) |
3ed05a98 ST |
353 | return -ENOTTY; |
354 | ||
3ed05a98 ST |
355 | if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) |
356 | return -EFAULT; | |
357 | ||
358 | if (!rep.nr_zones) | |
359 | return -EINVAL; | |
360 | ||
d4100351 CH |
361 | args.zones = argp + sizeof(struct blk_zone_report); |
362 | ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, | |
363 | blkdev_copy_zone_to_user, &args); | |
364 | if (ret < 0) | |
365 | return ret; | |
3ed05a98 | 366 | |
d4100351 | 367 | rep.nr_zones = ret; |
82394db7 | 368 | rep.flags = BLK_ZONE_REP_CAPACITY; |
d4100351 CH |
369 | if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) |
370 | return -EFAULT; | |
371 | return 0; | |
3ed05a98 ST |
372 | } |
373 | ||
05bdb996 CH |
374 | static int blkdev_truncate_zone_range(struct block_device *bdev, |
375 | blk_mode_t mode, const struct blk_zone_range *zrange) | |
e5113505 SK |
376 | { |
377 | loff_t start, end; | |
378 | ||
379 | if (zrange->sector + zrange->nr_sectors <= zrange->sector || | |
380 | zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) | |
381 | /* Out of range */ | |
382 | return -EINVAL; | |
383 | ||
384 | start = zrange->sector << SECTOR_SHIFT; | |
385 | end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; | |
386 | ||
387 | return truncate_bdev_range(bdev, mode, start, end); | |
388 | } | |
389 | ||
56c4bddb | 390 | /* |
e876df1f | 391 | * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. |
3ed05a98 ST |
392 | * Called from blkdev_ioctl. |
393 | */ | |
05bdb996 | 394 | int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, |
e876df1f | 395 | unsigned int cmd, unsigned long arg) |
3ed05a98 ST |
396 | { |
397 | void __user *argp = (void __user *)arg; | |
3ed05a98 | 398 | struct blk_zone_range zrange; |
ff07a02e | 399 | enum req_op op; |
e5113505 | 400 | int ret; |
3ed05a98 ST |
401 | |
402 | if (!argp) | |
403 | return -EINVAL; | |
404 | ||
edd1dbc8 | 405 | if (!bdev_is_zoned(bdev)) |
3ed05a98 ST |
406 | return -ENOTTY; |
407 | ||
05bdb996 | 408 | if (!(mode & BLK_OPEN_WRITE)) |
3ed05a98 ST |
409 | return -EBADF; |
410 | ||
411 | if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) | |
412 | return -EFAULT; | |
413 | ||
e876df1f AJ |
414 | switch (cmd) { |
415 | case BLKRESETZONE: | |
416 | op = REQ_OP_ZONE_RESET; | |
e5113505 SK |
417 | |
418 | /* Invalidate the page cache, including dirty pages. */ | |
224941e8 | 419 | filemap_invalidate_lock(bdev->bd_mapping); |
e5113505 SK |
420 | ret = blkdev_truncate_zone_range(bdev, mode, &zrange); |
421 | if (ret) | |
86399ea0 | 422 | goto fail; |
e876df1f AJ |
423 | break; |
424 | case BLKOPENZONE: | |
425 | op = REQ_OP_ZONE_OPEN; | |
426 | break; | |
427 | case BLKCLOSEZONE: | |
428 | op = REQ_OP_ZONE_CLOSE; | |
429 | break; | |
430 | case BLKFINISHZONE: | |
431 | op = REQ_OP_ZONE_FINISH; | |
432 | break; | |
433 | default: | |
434 | return -ENOTTY; | |
435 | } | |
436 | ||
71f4ecdb | 437 | ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); |
e5113505 | 438 | |
86399ea0 SK |
439 | fail: |
440 | if (cmd == BLKRESETZONE) | |
224941e8 | 441 | filemap_invalidate_unlock(bdev->bd_mapping); |
e5113505 SK |
442 | |
443 | return ret; | |
3ed05a98 | 444 | } |
bf505456 | 445 | |
dd291d77 DLM |
446 | static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) |
447 | { | |
448 | if (!disk->conv_zones_bitmap) | |
449 | return false; | |
450 | return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); | |
451 | } | |
452 | ||
453 | static bool disk_insert_zone_wplug(struct gendisk *disk, | |
454 | struct blk_zone_wplug *zwplug) | |
455 | { | |
456 | struct blk_zone_wplug *zwplg; | |
457 | unsigned long flags; | |
458 | unsigned int idx = | |
459 | hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); | |
460 | ||
461 | /* | |
462 | * Add the new zone write plug to the hash table, but carefully as we | |
463 | * are racing with other submission context, so we may already have a | |
464 | * zone write plug for the same zone. | |
465 | */ | |
466 | spin_lock_irqsave(&disk->zone_wplugs_lock, flags); | |
467 | hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { | |
468 | if (zwplg->zone_no == zwplug->zone_no) { | |
469 | spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); | |
470 | return false; | |
471 | } | |
472 | } | |
473 | hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); | |
474 | spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); | |
475 | ||
476 | return true; | |
477 | } | |
478 | ||
dd291d77 DLM |
479 | static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, |
480 | sector_t sector) | |
481 | { | |
482 | unsigned int zno = disk_zone_no(disk, sector); | |
483 | unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); | |
484 | struct blk_zone_wplug *zwplug; | |
485 | ||
486 | rcu_read_lock(); | |
487 | ||
488 | hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { | |
489 | if (zwplug->zone_no == zno && | |
490 | atomic_inc_not_zero(&zwplug->ref)) { | |
491 | rcu_read_unlock(); | |
492 | return zwplug; | |
493 | } | |
494 | } | |
495 | ||
496 | rcu_read_unlock(); | |
497 | ||
498 | return NULL; | |
499 | } | |
500 | ||
501 | static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) | |
502 | { | |
503 | struct blk_zone_wplug *zwplug = | |
504 | container_of(rcu_head, struct blk_zone_wplug, rcu_head); | |
505 | ||
506 | mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); | |
507 | } | |
508 | ||
509 | static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) | |
510 | { | |
511 | if (atomic_dec_and_test(&zwplug->ref)) { | |
512 | WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); | |
513 | WARN_ON_ONCE(!list_empty(&zwplug->link)); | |
79ae35a4 | 514 | WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); |
dd291d77 DLM |
515 | |
516 | call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); | |
517 | } | |
518 | } | |
519 | ||
79ae35a4 DLM |
520 | static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, |
521 | struct blk_zone_wplug *zwplug) | |
522 | { | |
7b295187 DLM |
523 | /* If the zone write plug was already removed, we are done. */ |
524 | if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) | |
525 | return false; | |
526 | ||
527 | /* If the zone write plug is still busy, it cannot be removed. */ | |
79ae35a4 DLM |
528 | if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) |
529 | return false; | |
530 | ||
7b295187 DLM |
531 | /* |
532 | * Completions of BIOs with blk_zone_write_plug_bio_endio() may | |
533 | * happen after handling a request completion with | |
347bde9d | 534 | * blk_zone_write_plug_finish_request() (e.g. with split BIOs |
7b295187 DLM |
535 | * that are chained). In such case, disk_zone_wplug_unplug_bio() |
536 | * should not attempt to remove the zone write plug until all BIO | |
537 | * completions are seen. Check by looking at the zone write plug | |
538 | * reference count, which is 2 when the plug is unused (one reference | |
539 | * taken when the plug was allocated and another reference taken by the | |
540 | * caller context). | |
541 | */ | |
542 | if (atomic_read(&zwplug->ref) > 2) | |
543 | return false; | |
544 | ||
79ae35a4 DLM |
545 | /* We can remove zone write plugs for zones that are empty or full. */ |
546 | return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; | |
547 | } | |
548 | ||
549 | static void disk_remove_zone_wplug(struct gendisk *disk, | |
550 | struct blk_zone_wplug *zwplug) | |
551 | { | |
552 | unsigned long flags; | |
553 | ||
554 | /* If the zone write plug was already removed, we have nothing to do. */ | |
555 | if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) | |
556 | return; | |
557 | ||
558 | /* | |
559 | * Mark the zone write plug as unhashed and drop the extra reference we | |
560 | * took when the plug was inserted in the hash table. | |
561 | */ | |
562 | zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; | |
563 | spin_lock_irqsave(&disk->zone_wplugs_lock, flags); | |
564 | hlist_del_init_rcu(&zwplug->node); | |
565 | spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); | |
566 | disk_put_zone_wplug(zwplug); | |
567 | } | |
568 | ||
dd291d77 DLM |
569 | static void blk_zone_wplug_bio_work(struct work_struct *work); |
570 | ||
571 | /* | |
572 | * Get a reference on the write plug for the zone containing @sector. | |
573 | * If the plug does not exist, it is allocated and hashed. | |
574 | * Return a pointer to the zone write plug with the plug spinlock held. | |
575 | */ | |
576 | static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, | |
577 | sector_t sector, gfp_t gfp_mask, | |
578 | unsigned long *flags) | |
bf505456 | 579 | { |
dd291d77 DLM |
580 | unsigned int zno = disk_zone_no(disk, sector); |
581 | struct blk_zone_wplug *zwplug; | |
582 | ||
583 | again: | |
584 | zwplug = disk_get_zone_wplug(disk, sector); | |
585 | if (zwplug) { | |
586 | /* | |
587 | * Check that a BIO completion or a zone reset or finish | |
588 | * operation has not already removed the zone write plug from | |
589 | * the hash table and dropped its reference count. In such case, | |
590 | * we need to get a new plug so start over from the beginning. | |
591 | */ | |
592 | spin_lock_irqsave(&zwplug->lock, *flags); | |
593 | if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { | |
594 | spin_unlock_irqrestore(&zwplug->lock, *flags); | |
595 | disk_put_zone_wplug(zwplug); | |
596 | goto again; | |
597 | } | |
598 | return zwplug; | |
599 | } | |
600 | ||
601 | /* | |
602 | * Allocate and initialize a zone write plug with an extra reference | |
603 | * so that it is not freed when the zone write plug becomes idle without | |
604 | * the zone being full. | |
605 | */ | |
606 | zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); | |
607 | if (!zwplug) | |
608 | return NULL; | |
609 | ||
610 | INIT_HLIST_NODE(&zwplug->node); | |
611 | INIT_LIST_HEAD(&zwplug->link); | |
612 | atomic_set(&zwplug->ref, 2); | |
613 | spin_lock_init(&zwplug->lock); | |
614 | zwplug->flags = 0; | |
615 | zwplug->zone_no = zno; | |
616 | zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); | |
617 | bio_list_init(&zwplug->bio_list); | |
618 | INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); | |
619 | zwplug->disk = disk; | |
620 | ||
621 | spin_lock_irqsave(&zwplug->lock, *flags); | |
622 | ||
623 | /* | |
624 | * Insert the new zone write plug in the hash table. This can fail only | |
625 | * if another context already inserted a plug. Retry from the beginning | |
626 | * in such case. | |
627 | */ | |
628 | if (!disk_insert_zone_wplug(disk, zwplug)) { | |
629 | spin_unlock_irqrestore(&zwplug->lock, *flags); | |
630 | mempool_free(zwplug, disk->zone_wplugs_pool); | |
631 | goto again; | |
632 | } | |
633 | ||
634 | return zwplug; | |
635 | } | |
636 | ||
c9c8aea0 DLM |
637 | static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, |
638 | struct bio *bio) | |
dd291d77 | 639 | { |
c9c8aea0 | 640 | struct request_queue *q = zwplug->disk->queue; |
dd291d77 DLM |
641 | |
642 | bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); | |
643 | bio_io_error(bio); | |
c9c8aea0 | 644 | disk_put_zone_wplug(zwplug); |
dd291d77 DLM |
645 | blk_queue_exit(q); |
646 | } | |
647 | ||
648 | /* | |
649 | * Abort (fail) all plugged BIOs of a zone write plug. | |
650 | */ | |
651 | static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) | |
652 | { | |
653 | struct bio *bio; | |
654 | ||
c9c8aea0 DLM |
655 | while ((bio = bio_list_pop(&zwplug->bio_list))) |
656 | blk_zone_wplug_bio_io_error(zwplug, bio); | |
dd291d77 DLM |
657 | } |
658 | ||
659 | /* | |
660 | * Abort (fail) all plugged BIOs of a zone write plug that are not aligned | |
661 | * with the assumed write pointer location of the zone when the BIO will | |
662 | * be unplugged. | |
663 | */ | |
664 | static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, | |
665 | struct blk_zone_wplug *zwplug) | |
666 | { | |
667 | unsigned int zone_capacity = disk->zone_capacity; | |
668 | unsigned int wp_offset = zwplug->wp_offset; | |
669 | struct bio_list bl = BIO_EMPTY_LIST; | |
670 | struct bio *bio; | |
671 | ||
672 | while ((bio = bio_list_pop(&zwplug->bio_list))) { | |
673 | if (wp_offset >= zone_capacity || | |
9b1ce7f0 DLM |
674 | (bio_op(bio) != REQ_OP_ZONE_APPEND && |
675 | bio_offset_from_zone_start(bio) != wp_offset)) { | |
c9c8aea0 | 676 | blk_zone_wplug_bio_io_error(zwplug, bio); |
dd291d77 DLM |
677 | continue; |
678 | } | |
679 | ||
680 | wp_offset += bio_sectors(bio); | |
681 | bio_list_add(&bl, bio); | |
682 | } | |
683 | ||
684 | bio_list_merge(&zwplug->bio_list, &bl); | |
685 | } | |
686 | ||
19aad274 DLM |
687 | static inline void disk_zone_wplug_set_error(struct gendisk *disk, |
688 | struct blk_zone_wplug *zwplug) | |
689 | { | |
690 | unsigned long flags; | |
691 | ||
692 | if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) | |
693 | return; | |
694 | ||
695 | /* | |
696 | * At this point, we already have a reference on the zone write plug. | |
697 | * However, since we are going to add the plug to the disk zone write | |
698 | * plugs work list, increase its reference count. This reference will | |
699 | * be dropped in disk_zone_wplugs_work() once the error state is | |
700 | * handled, or in disk_zone_wplug_clear_error() if the zone is reset or | |
701 | * finished. | |
702 | */ | |
703 | zwplug->flags |= BLK_ZONE_WPLUG_ERROR; | |
704 | atomic_inc(&zwplug->ref); | |
705 | ||
706 | spin_lock_irqsave(&disk->zone_wplugs_lock, flags); | |
707 | list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); | |
708 | spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); | |
709 | } | |
710 | ||
711 | static inline void disk_zone_wplug_clear_error(struct gendisk *disk, | |
712 | struct blk_zone_wplug *zwplug) | |
713 | { | |
714 | unsigned long flags; | |
715 | ||
716 | if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) | |
717 | return; | |
718 | ||
719 | /* | |
720 | * We are racing with the error handling work which drops the reference | |
721 | * on the zone write plug after handling the error state. So remove the | |
722 | * plug from the error list and drop its reference count only if the | |
723 | * error handling has not yet started, that is, if the zone write plug | |
724 | * is still listed. | |
725 | */ | |
726 | spin_lock_irqsave(&disk->zone_wplugs_lock, flags); | |
727 | if (!list_empty(&zwplug->link)) { | |
728 | list_del_init(&zwplug->link); | |
729 | zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; | |
730 | disk_put_zone_wplug(zwplug); | |
731 | } | |
732 | spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); | |
733 | } | |
734 | ||
dd291d77 DLM |
735 | /* |
736 | * Set a zone write plug write pointer offset to either 0 (zone reset case) | |
737 | * or to the zone size (zone finish case). This aborts all plugged BIOs, which | |
738 | * is fine to do as doing a zone reset or zone finish while writes are in-flight | |
739 | * is a mistake from the user which will most likely cause all plugged BIOs to | |
740 | * fail anyway. | |
741 | */ | |
742 | static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, | |
743 | struct blk_zone_wplug *zwplug, | |
744 | unsigned int wp_offset) | |
745 | { | |
746 | unsigned long flags; | |
747 | ||
748 | spin_lock_irqsave(&zwplug->lock, flags); | |
749 | ||
750 | /* | |
751 | * Make sure that a BIO completion or another zone reset or finish | |
752 | * operation has not already removed the plug from the hash table. | |
753 | */ | |
754 | if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { | |
755 | spin_unlock_irqrestore(&zwplug->lock, flags); | |
756 | return; | |
757 | } | |
758 | ||
759 | /* Update the zone write pointer and abort all plugged BIOs. */ | |
760 | zwplug->wp_offset = wp_offset; | |
761 | disk_zone_wplug_abort(zwplug); | |
762 | ||
763 | /* | |
764 | * Updating the write pointer offset puts back the zone | |
765 | * in a good state. So clear the error flag and decrement the | |
766 | * error count if we were in error state. | |
767 | */ | |
19aad274 | 768 | disk_zone_wplug_clear_error(disk, zwplug); |
dd291d77 DLM |
769 | |
770 | /* | |
771 | * The zone write plug now has no BIO plugged: remove it from the | |
772 | * hash table so that it cannot be seen. The plug will be freed | |
773 | * when the last reference is dropped. | |
774 | */ | |
775 | if (disk_should_remove_zone_wplug(disk, zwplug)) | |
776 | disk_remove_zone_wplug(disk, zwplug); | |
777 | ||
778 | spin_unlock_irqrestore(&zwplug->lock, flags); | |
779 | } | |
780 | ||
781 | static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, | |
782 | unsigned int wp_offset) | |
783 | { | |
784 | struct gendisk *disk = bio->bi_bdev->bd_disk; | |
785 | sector_t sector = bio->bi_iter.bi_sector; | |
786 | struct blk_zone_wplug *zwplug; | |
787 | ||
788 | /* Conventional zones cannot be reset nor finished. */ | |
789 | if (disk_zone_is_conv(disk, sector)) { | |
790 | bio_io_error(bio); | |
791 | return true; | |
792 | } | |
793 | ||
794 | /* | |
795 | * If we have a zone write plug, set its write pointer offset to 0 | |
796 | * (reset case) or to the zone size (finish case). This will abort all | |
797 | * BIOs plugged for the target zone. It is fine as resetting or | |
798 | * finishing zones while writes are still in-flight will result in the | |
799 | * writes failing anyway. | |
800 | */ | |
801 | zwplug = disk_get_zone_wplug(disk, sector); | |
802 | if (zwplug) { | |
803 | disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); | |
804 | disk_put_zone_wplug(zwplug); | |
805 | } | |
806 | ||
807 | return false; | |
808 | } | |
809 | ||
810 | static bool blk_zone_wplug_handle_reset_all(struct bio *bio) | |
811 | { | |
812 | struct gendisk *disk = bio->bi_bdev->bd_disk; | |
813 | struct blk_zone_wplug *zwplug; | |
814 | sector_t sector; | |
815 | ||
816 | /* | |
817 | * Set the write pointer offset of all zone write plugs to 0. This will | |
818 | * abort all plugged BIOs. It is fine as resetting zones while writes | |
819 | * are still in-flight will result in the writes failing anyway. | |
820 | */ | |
821 | for (sector = 0; sector < get_capacity(disk); | |
822 | sector += disk->queue->limits.chunk_sectors) { | |
823 | zwplug = disk_get_zone_wplug(disk, sector); | |
824 | if (zwplug) { | |
825 | disk_zone_wplug_set_wp_offset(disk, zwplug, 0); | |
826 | disk_put_zone_wplug(zwplug); | |
827 | } | |
828 | } | |
829 | ||
830 | return false; | |
831 | } | |
832 | ||
/*
 * Queue @bio at the tail of @zwplug's BIO list for later submission.
 * Called with zwplug->lock held.
 */
static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
					  struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue write sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
}
865 | ||
/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 * Advances the zone write plug write pointer offset to account for the
 * merged BIO, unless the BIO was already plugged (in which case the caller
 * path handles the update).
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
901 | ||
/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	unsigned int zone_capacity = disk->zone_capacity;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request. Stop once the write pointer offset
	 * reaches the zone capacity: any further write would fail anyway.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (zwplug->wp_offset < zone_capacity) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		/* Only contiguous, mergeable BIOs can be appended. */
		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			/* Merge failed: put the BIO back and stop. */
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/*
		 * Drop the extra reference on the queue usage we got when
		 * plugging the BIO and advance the write pointer offset.
		 */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
966 | ||
dd291d77 DLM |
/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 * Called with zwplug->lock held. Returns false and schedules error recovery
 * if the BIO cannot be issued (full zone or misaligned write).
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (zwplug->wp_offset >= disk->zone_capacity)
		goto err;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early because we avoid a
		 * whole lot of error handling trouble if we don't send it off
		 * to the driver.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			goto err;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;

err:
	/* We detected an invalid write BIO: schedule error recovery. */
	disk_zone_wplug_set_error(disk, zwplug);
	kblockd_schedule_work(&disk->zone_wplugs_work);
	return false;
}
1021 | ||
/*
 * Handle a write, write zeroes or (emulated) zone append BIO targeting a
 * zoned device. Returns true if the BIO execution is delayed (plugged or
 * failed), false if the BIO can proceed down the submission path.
 */
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (disk_zone_is_conv(disk, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	/* Honor REQ_NOWAIT when allocating a new plug from the mempool. */
	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged or has a pending error, add the BIO
	 * to the plug BIO list. Otherwise, plug and let the BIO execute.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		goto plug;

	/*
	 * If an error is detected when preparing the BIO, add it to the BIO
	 * list so that error recovery can deal with it.
	 */
	if (!blk_zone_wplug_prepare_bio(zwplug, bio))
		goto plug;

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
	blk_zone_wplug_add_bio(zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}
1093 | ||
1094 | /** | |
1095 | * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging | |
1096 | * @bio: The BIO being submitted | |
1097 | * @nr_segs: The number of physical segments of @bio | |
1098 | * | |
9b1ce7f0 DLM |
1099 | * Handle write, write zeroes and zone append operations requiring emulation |
1100 | * using zone write plugging. | |
dd291d77 DLM |
1101 | * |
1102 | * Return true whenever @bio execution needs to be delayed through the zone | |
1103 | * write plug. Otherwise, return false to let the submission path process | |
1104 | * @bio normally. | |
1105 | */ | |
1106 | bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) | |
1107 | { | |
1108 | struct block_device *bdev = bio->bi_bdev; | |
1109 | ||
1110 | if (!bdev->bd_disk->zone_wplugs_hash) | |
1111 | return false; | |
1112 | ||
1113 | /* | |
1114 | * If the BIO already has the plugging flag set, then it was already | |
1115 | * handled through this path and this is a submission from the zone | |
1116 | * plug bio submit work. | |
1117 | */ | |
1118 | if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) | |
1119 | return false; | |
1120 | ||
1121 | /* | |
1122 | * We do not need to do anything special for empty flush BIOs, e.g | |
1123 | * BIOs such as issued by blkdev_issue_flush(). The is because it is | |
1124 | * the responsibility of the user to first wait for the completion of | |
1125 | * write operations for flush to have any effect on the persistence of | |
1126 | * the written data. | |
1127 | */ | |
1128 | if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) | |
1129 | return false; | |
1130 | ||
1131 | /* | |
1132 | * Regular writes and write zeroes need to be handled through the target | |
1133 | * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH | |
1134 | * which may need to go through the flush machinery depending on the | |
1135 | * target device capabilities. Plugging such writes is fine as the flush | |
1136 | * machinery operates at the request level, below the plug, and | |
1137 | * completion of the flush sequence will go through the regular BIO | |
1138 | * completion, which will handle zone write plugging. | |
9b1ce7f0 DLM |
1139 | * Zone append operations for devices that requested emulation must |
1140 | * also be plugged so that these BIOs can be changed into regular | |
1141 | * write BIOs. | |
dd291d77 DLM |
1142 | * Zone reset, reset all and finish commands need special treatment |
1143 | * to correctly track the write pointer offset of zones. These commands | |
1144 | * are not plugged as we do not need serialization with write | |
1145 | * operations. It is the responsibility of the user to not issue reset | |
1146 | * and finish commands when write operations are in flight. | |
1147 | */ | |
1148 | switch (bio_op(bio)) { | |
9b1ce7f0 DLM |
1149 | case REQ_OP_ZONE_APPEND: |
1150 | if (!bdev_emulates_zone_append(bdev)) | |
1151 | return false; | |
1152 | fallthrough; | |
dd291d77 DLM |
1153 | case REQ_OP_WRITE: |
1154 | case REQ_OP_WRITE_ZEROES: | |
1155 | return blk_zone_wplug_handle_write(bio, nr_segs); | |
1156 | case REQ_OP_ZONE_RESET: | |
1157 | return blk_zone_wplug_handle_reset_or_finish(bio, 0); | |
1158 | case REQ_OP_ZONE_FINISH: | |
1159 | return blk_zone_wplug_handle_reset_or_finish(bio, | |
1160 | bdev_zone_sectors(bdev)); | |
1161 | case REQ_OP_ZONE_RESET_ALL: | |
1162 | return blk_zone_wplug_handle_reset_all(bio); | |
1163 | default: | |
1164 | return false; | |
1165 | } | |
1166 | ||
1167 | return false; | |
1168 | } | |
1169 | EXPORT_SYMBOL_GPL(blk_zone_plug_bio); | |
1170 | ||
9e78c38a DLM |
1171 | static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, |
1172 | struct blk_zone_wplug *zwplug) | |
1173 | { | |
1174 | /* | |
1175 | * Take a reference on the zone write plug and schedule the submission | |
1176 | * of the next plugged BIO. blk_zone_wplug_bio_work() will release the | |
1177 | * reference we take here. | |
1178 | */ | |
1179 | WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); | |
1180 | atomic_inc(&zwplug->ref); | |
1181 | queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); | |
1182 | } | |
1183 | ||
dd291d77 DLM |
1184 | static void disk_zone_wplug_unplug_bio(struct gendisk *disk, |
1185 | struct blk_zone_wplug *zwplug) | |
1186 | { | |
1187 | unsigned long flags; | |
1188 | ||
1189 | spin_lock_irqsave(&zwplug->lock, flags); | |
1190 | ||
1191 | /* | |
1192 | * If we had an error, schedule error recovery. The recovery work | |
1193 | * will restart submission of plugged BIOs. | |
1194 | */ | |
1195 | if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { | |
1196 | spin_unlock_irqrestore(&zwplug->lock, flags); | |
1197 | kblockd_schedule_work(&disk->zone_wplugs_work); | |
1198 | return; | |
1199 | } | |
1200 | ||
1201 | /* Schedule submission of the next plugged BIO if we have one. */ | |
1202 | if (!bio_list_empty(&zwplug->bio_list)) { | |
9e78c38a | 1203 | disk_zone_wplug_schedule_bio_work(disk, zwplug); |
dd291d77 | 1204 | spin_unlock_irqrestore(&zwplug->lock, flags); |
dd291d77 DLM |
1205 | return; |
1206 | } | |
1207 | ||
1208 | zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; | |
1209 | ||
1210 | /* | |
1211 | * If the zone is full (it was fully written or finished, or empty | |
1212 | * (it was reset), remove its zone write plug from the hash table. | |
1213 | */ | |
1214 | if (disk_should_remove_zone_wplug(disk, zwplug)) | |
1215 | disk_remove_zone_wplug(disk, zwplug); | |
1216 | ||
1217 | spin_unlock_irqrestore(&zwplug->lock, flags); | |
1218 | } | |
1219 | ||
/*
 * BIO completion handling for BIOs that went through zone write plugging:
 * restore an emulated zone append op code, record errors for recovery and,
 * for BIO-based devices, trigger submission of the next plugged BIO.
 */
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}

	/*
	 * If the BIO failed, mark the plug as having an error to trigger
	 * recovery.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_error(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bio->bi_bdev->bd_has_submit_bio)
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
1266 | ||
347bde9d | 1267 | void blk_zone_write_plug_finish_request(struct request *req) |
dd291d77 DLM |
1268 | { |
1269 | struct gendisk *disk = req->q->disk; | |
347bde9d | 1270 | struct blk_zone_wplug *zwplug; |
dd291d77 | 1271 | |
347bde9d | 1272 | zwplug = disk_get_zone_wplug(disk, req->__sector); |
dd291d77 DLM |
1273 | if (WARN_ON_ONCE(!zwplug)) |
1274 | return; | |
1275 | ||
1276 | req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; | |
1277 | ||
dd291d77 DLM |
1278 | /* |
1279 | * Drop the reference we took when the request was initialized in | |
096bc7ea | 1280 | * blk_zone_write_plug_init_request(). |
dd291d77 | 1281 | */ |
7b295187 DLM |
1282 | disk_put_zone_wplug(zwplug); |
1283 | ||
1284 | disk_zone_wplug_unplug_bio(disk, zwplug); | |
1285 | ||
1286 | /* Drop the reference we took when entering this function. */ | |
dd291d77 DLM |
1287 | disk_put_zone_wplug(zwplug); |
1288 | } | |
1289 | ||
/*
 * Work function submitting the next plugged BIO of a zone write plug.
 * Runs on the disk zone_wplugs_wq workqueue; the reference taken in
 * disk_zone_wplug_schedule_bio_work() is dropped on exit.
 */
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		/* Error recovery will decide what to do with the BIO. */
		bio_list_add_head(&zwplug->bio_list, bio);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	/* Save bi_bdev: the BIO may be freed once submitted. */
	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev->bd_has_submit_bio)
		blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}
1336 | ||
1337 | static unsigned int blk_zone_wp_offset(struct blk_zone *zone) | |
1338 | { | |
1339 | switch (zone->cond) { | |
1340 | case BLK_ZONE_COND_IMP_OPEN: | |
1341 | case BLK_ZONE_COND_EXP_OPEN: | |
1342 | case BLK_ZONE_COND_CLOSED: | |
1343 | return zone->wp - zone->start; | |
1344 | case BLK_ZONE_COND_FULL: | |
1345 | return zone->len; | |
1346 | case BLK_ZONE_COND_EMPTY: | |
1347 | return 0; | |
1348 | case BLK_ZONE_COND_NOT_WP: | |
1349 | case BLK_ZONE_COND_OFFLINE: | |
1350 | case BLK_ZONE_COND_READONLY: | |
1351 | default: | |
1352 | /* | |
1353 | * Conventional, offline and read-only zones do not have a valid | |
1354 | * write pointer. | |
1355 | */ | |
1356 | return UINT_MAX; | |
1357 | } | |
1358 | } | |
1359 | ||
1360 | static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, | |
1361 | unsigned int idx, void *data) | |
1362 | { | |
1363 | struct blk_zone *zonep = data; | |
1364 | ||
1365 | *zonep = *zone; | |
1366 | return 0; | |
1367 | } | |
1368 | ||
/*
 * Error recovery for a zone write plug: re-read the zone from the device to
 * resynchronize the plug write pointer offset, then restart or abort the
 * remaining plugged BIOs.
 */
static void disk_zone_wplug_handle_error(struct gendisk *disk,
					 struct blk_zone_wplug *zwplug)
{
	sector_t zone_start_sector =
		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
	unsigned int noio_flag;
	struct blk_zone zone;
	unsigned long flags;
	int ret;

	/* Get the current zone information from the device. */
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
				       blk_zone_wplug_report_zone_cb, &zone);
	memalloc_noio_restore(noio_flag);

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * A zone reset or finish may have cleared the error already. In such
	 * case, do nothing as the report zones may have seen the "old" write
	 * pointer value before the reset/finish operation completed.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		goto unlock;

	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;

	if (ret != 1) {
		/*
		 * We failed to get the zone information, meaning that something
		 * is likely really wrong with the device. Abort all remaining
		 * plugged BIOs as otherwise we could end up waiting forever on
		 * plugged BIOs to complete if there is a queue freeze on-going.
		 */
		disk_zone_wplug_abort(zwplug);
		goto unplug;
	}

	/* Update the zone write pointer offset. */
	zwplug->wp_offset = blk_zone_wp_offset(&zone);
	disk_zone_wplug_abort_unaligned(disk, zwplug);

	/* Restart BIO submission if we still have any BIO left. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		goto unlock;
	}

unplug:
	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

unlock:
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
1426 | ||
/*
 * Work function draining the disk error list, running error recovery for
 * each zone write plug queued on it. The list lock is dropped around each
 * disk_zone_wplug_handle_error() call since recovery may sleep.
 */
static void disk_zone_wplugs_work(struct work_struct *work)
{
	struct gendisk *disk =
		container_of(work, struct gendisk, zone_wplugs_work);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);

	while (!list_empty(&disk->zone_wplugs_err_list)) {
		zwplug = list_first_entry(&disk->zone_wplugs_err_list,
					  struct blk_zone_wplug, link);
		list_del_init(&zwplug->link);
		spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

		disk_zone_wplug_handle_error(disk, zwplug);
		disk_put_zone_wplug(zwplug);

		spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	}

	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
1450 | ||
/* Number of hlist heads in the disk zone write plug hash table. */
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}
1455 | ||
/* Initialize the zone write plug fields of a newly allocated gendisk. */
void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
}
1462 | ||
/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
/* Upper bound on zone_wplugs_hash_bits: 2^9 = 512 hlist heads (4KB). */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
/* Mempool size used when the device reports no open/active zone limits. */
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
1471 | ||
/*
 * Allocate the zone write plug resources of a disk: the plug hash table,
 * the plug mempool and the BIO submission workqueue.
 * Returns 0 on success or -ENOMEM, undoing partial allocations on failure.
 */
static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}
1511 | ||
/*
 * Remove and release all zone write plugs still hashed and free the hash
 * table itself. Safe to call when the hash table was never allocated.
 */
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			/* Hold a reference across the unhash, then drop it. */
			atomic_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
}
1535 | ||
/*
 * Release all zoned-device resources of a disk: error work, workqueue,
 * plug hash table, plug mempool and conventional zone bitmap.
 */
void disk_free_zone_resources(struct gendisk *disk)
{
	cancel_work_sync(&disk->zone_wplugs_work);

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/*
	 * Wait for the zone write plugs to be RCU-freed before
	 * destroying the mempool.
	 */
	rcu_barrier();

	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;

	kfree(disk->conv_zones_bitmap);
	disk->conv_zones_bitmap = NULL;
	disk->zone_capacity = 0;
	disk->nr_zones = 0;
}
1561 | ||
946dd71e DLM |
1562 | static inline bool disk_need_zone_resources(struct gendisk *disk) |
1563 | { | |
1564 | /* | |
1565 | * All mq zoned devices need zone resources so that the block layer | |
1566 | * can automatically handle write BIO plugging. BIO-based device drivers | |
1567 | * (e.g. DM devices) are normally responsible for handling zone write | |
1568 | * ordering and do not need zone resources, unless the driver requires | |
1569 | * zone append emulation. | |
1570 | */ | |
1571 | return queue_is_mq(disk->queue) || | |
1572 | queue_emulates_zone_append(disk->queue); | |
1573 | } | |
1574 | ||
dd291d77 DLM |
1575 | static int disk_revalidate_zone_resources(struct gendisk *disk, |
1576 | unsigned int nr_zones) | |
1577 | { | |
1578 | struct queue_limits *lim = &disk->queue->limits; | |
1579 | unsigned int pool_size; | |
1580 | ||
946dd71e DLM |
1581 | if (!disk_need_zone_resources(disk)) |
1582 | return 0; | |
1583 | ||
dd291d77 DLM |
1584 | /* |
1585 | * If the device has no limit on the maximum number of open and active | |
1586 | * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. | |
1587 | */ | |
1588 | pool_size = max(lim->max_open_zones, lim->max_active_zones); | |
1589 | if (!pool_size) | |
1590 | pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); | |
1591 | ||
1592 | if (!disk->zone_wplugs_hash) | |
1593 | return disk_alloc_zone_resources(disk, pool_size); | |
1594 | ||
dd291d77 | 1595 | return 0; |
bf505456 DLM |
1596 | } |
1597 | ||
d4100351 CH |
1598 | struct blk_revalidate_zone_args { |
1599 | struct gendisk *disk; | |
f216fdd7 | 1600 | unsigned long *conv_zones_bitmap; |
e94f5819 | 1601 | unsigned int nr_zones; |
ecfe43b1 | 1602 | unsigned int zone_capacity; |
d4100351 CH |
1603 | sector_t sector; |
1604 | }; | |
1605 | ||
843283e9 DLM |
/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 *
 * Returns 0 on success or a negative error code if the reported zone
 * configuration is invalid or if committing the queue limits fails.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones = 0;
	unsigned int pool_size;
	struct queue_limits lim;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	/*
	 * Publish the new conventional zone bitmap; the old one (if any) ends
	 * up in args and is freed by the caller.
	 */
	swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
	if (disk->conv_zones_bitmap)
		nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
					      disk->nr_zones);
	/* An all-conventional device must not be exposed as zoned. */
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	/* Without a write plug mempool, there are no limits to adjust. */
	if (!disk->zone_wplugs_pool)
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	lim = queue_limits_start_update(q);

	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	/* Resize the mempool to match the (possibly new) zone configuration. */
	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

	return queue_limits_commit_update(q, &lim);
}
1658 | ||
d7580149 DLM |
1659 | static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, |
1660 | struct blk_revalidate_zone_args *args) | |
1661 | { | |
1662 | struct gendisk *disk = args->disk; | |
1663 | struct request_queue *q = disk->queue; | |
1664 | ||
1665 | if (zone->capacity != zone->len) { | |
1666 | pr_warn("%s: Invalid conventional zone capacity\n", | |
1667 | disk->disk_name); | |
1668 | return -ENODEV; | |
1669 | } | |
1670 | ||
1671 | if (!disk_need_zone_resources(disk)) | |
1672 | return 0; | |
1673 | ||
1674 | if (!args->conv_zones_bitmap) { | |
1675 | args->conv_zones_bitmap = | |
1676 | blk_alloc_zone_bitmap(q->node, args->nr_zones); | |
1677 | if (!args->conv_zones_bitmap) | |
1678 | return -ENOMEM; | |
1679 | } | |
1680 | ||
1681 | set_bit(idx, args->conv_zones_bitmap); | |
1682 | ||
1683 | return 0; | |
1684 | } | |
1685 | ||
/*
 * Validate a reported sequential write required zone and, if the disk has
 * a zone write plug hash table, pre-allocate a write plug for any zone
 * that is neither empty nor full so that its write pointer is tracked.
 *
 * Return: 0 on success, -ENODEV for an invalid zone capacity, -ENOMEM on
 * zone write plug allocation failure.
 */
static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * if it is constant for all zones.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * We need to track the write pointer of all zones that are not
	 * empty nor full. So make sure we have a zone write plug for
	 * such zone if the device has a zone write plug hash table.
	 */
	if (!disk->zone_wplugs_hash)
		return 0;

	/* Empty (offset 0) or full (offset >= capacity) zones need no plug. */
	wp_offset = blk_zone_wp_offset(zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	/*
	 * Getting the plug hashes it into the disk table; we only needed it
	 * created, so immediately unlock and drop our reference.
	 */
	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}
1726 | ||
d9dd7308 DLM |
1727 | /* |
1728 | * Helper function to check the validity of zones of a zoned block device. | |
1729 | */ | |
d4100351 CH |
1730 | static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, |
1731 | void *data) | |
d9dd7308 | 1732 | { |
d4100351 CH |
1733 | struct blk_revalidate_zone_args *args = data; |
1734 | struct gendisk *disk = args->disk; | |
d9dd7308 | 1735 | sector_t capacity = get_capacity(disk); |
d7580149 DLM |
1736 | sector_t zone_sectors = disk->queue->limits.chunk_sectors; |
1737 | int ret; | |
03e51c4a DLM |
1738 | |
1739 | /* Check for bad zones and holes in the zone report */ | |
1740 | if (zone->start != args->sector) { | |
1741 | pr_warn("%s: Zone gap at sectors %llu..%llu\n", | |
1742 | disk->disk_name, args->sector, zone->start); | |
1743 | return -ENODEV; | |
1744 | } | |
1745 | ||
1746 | if (zone->start >= capacity || !zone->len) { | |
1747 | pr_warn("%s: Invalid zone start %llu, length %llu\n", | |
1748 | disk->disk_name, zone->start, zone->len); | |
1749 | return -ENODEV; | |
1750 | } | |
d9dd7308 DLM |
1751 | |
1752 | /* | |
1753 | * All zones must have the same size, with the exception on an eventual | |
1754 | * smaller last zone. | |
1755 | */ | |
03e51c4a DLM |
1756 | if (zone->start + zone->len < capacity) { |
1757 | if (zone->len != zone_sectors) { | |
6c6b3549 CH |
1758 | pr_warn("%s: Invalid zoned device with non constant zone size\n", |
1759 | disk->disk_name); | |
1760 | return -ENODEV; | |
1761 | } | |
03e51c4a DLM |
1762 | } else if (zone->len > zone_sectors) { |
1763 | pr_warn("%s: Invalid zoned device with larger last zone size\n", | |
1764 | disk->disk_name); | |
d4100351 | 1765 | return -ENODEV; |
d9dd7308 DLM |
1766 | } |
1767 | ||
ecfe43b1 DLM |
1768 | if (!zone->capacity || zone->capacity > zone->len) { |
1769 | pr_warn("%s: Invalid zone capacity\n", | |
1770 | disk->disk_name); | |
1771 | return -ENODEV; | |
1772 | } | |
1773 | ||
d9dd7308 DLM |
1774 | /* Check zone type */ |
1775 | switch (zone->type) { | |
1776 | case BLK_ZONE_TYPE_CONVENTIONAL: | |
d7580149 | 1777 | ret = blk_revalidate_conv_zone(zone, idx, args); |
e94f5819 | 1778 | break; |
d9dd7308 | 1779 | case BLK_ZONE_TYPE_SEQWRITE_REQ: |
d7580149 | 1780 | ret = blk_revalidate_seq_zone(zone, idx, args); |
d9dd7308 | 1781 | break; |
587371ed | 1782 | case BLK_ZONE_TYPE_SEQWRITE_PREF: |
d9dd7308 DLM |
1783 | default: |
1784 | pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", | |
1785 | disk->disk_name, (int)zone->type, zone->start); | |
d7580149 | 1786 | ret = -ENODEV; |
d9dd7308 DLM |
1787 | } |
1788 | ||
d7580149 DLM |
1789 | if (!ret) |
1790 | args->sector += zone->len; | |
1791 | ||
1792 | return ret; | |
d4100351 CH |
1793 | } |
1794 | ||
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk:	Target disk
 *
 * Helper function for low-level device drivers to check, (re) allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a format).
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sector limit) and the max zone append limit.
 * BIO based drivers can also use this function as long as the device queue
 * can be safely frozen.
 *
 * Return: 0 on success, a negative error code on failure.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int noio_flag;
	int ret = -ENOMEM;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Checks that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	if (!queue_max_zone_append_sectors(q)) {
		pr_warn("%s: Invalid 0 maximum zone append limit\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	args.disk = disk;
	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}
	/* On success, ret is the number of zones reported (> 0). */
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	/*
	 * Set the new disk zone parameters only once the queue is frozen and
	 * all I/Os are completed.
	 */
	blk_mq_freeze_queue(q);
	if (ret > 0)
		ret = disk_update_zone_resources(disk, &args);
	else
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
	/* On any failure, tear down zone resources while still frozen. */
	if (ret)
		disk_free_zone_resources(disk);
	blk_mq_unfreeze_queue(q);

	/* Free the old bitmap swapped out by disk_update_zone_resources(). */
	kfree(args.conv_zones_bitmap);

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
d9f1439a DLM |
1887 | |
1888 | #ifdef CONFIG_BLK_DEBUG_FS | |
1889 | ||
/*
 * Debugfs attribute: dump all hashed zone write plugs of a disk, one line
 * per plug, as "zone_no 0x<flags> ref wp_offset bio_list_size".
 */
int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size, i;
	unsigned long flags;

	/* Nothing to show for a disk without zone write plug resources. */
	if (!disk->zone_wplugs_hash)
		return 0;

	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		hlist_for_each_entry_rcu(zwplug,
					 &disk->zone_wplugs_hash[i], node) {
			/*
			 * Snapshot the plug state under its lock, then print
			 * outside the lock so seq_printf() is not called with
			 * interrupts disabled.
			 */
			spin_lock_irqsave(&zwplug->lock, flags);
			zwp_zone_no = zwplug->zone_no;
			zwp_flags = zwplug->flags;
			zwp_ref = atomic_read(&zwplug->ref);
			zwp_wp_offset = zwplug->wp_offset;
			zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
			spin_unlock_irqrestore(&zwplug->lock, flags);

			seq_printf(m, "%u 0x%x %u %u %u\n",
				   zwp_zone_no, zwp_flags, zwp_ref,
				   zwp_wp_offset, zwp_bio_list_size);
		}
	}
	rcu_read_unlock();

	return 0;
}
1924 | ||
1925 | #endif |