Commit | Line | Data |
---|---|---|
3dcf60bc | 1 | // SPDX-License-Identifier: GPL-2.0 |
6a0cb1bc HR |
2 | /* |
3 | * Zoned block device handling | |
4 | * | |
5 | * Copyright (c) 2015, Hannes Reinecke | |
6 | * Copyright (c) 2015, SUSE Linux GmbH | |
7 | * | |
8 | * Copyright (c) 2016, Damien Le Moal | |
9 | * Copyright (c) 2016, Western Digital | |
dd291d77 | 10 | * Copyright (c) 2024, Western Digital Corporation or its affiliates. |
6a0cb1bc HR |
11 | */ |
12 | ||
13 | #include <linux/kernel.h> | |
14 | #include <linux/module.h> | |
6a0cb1bc | 15 | #include <linux/blkdev.h> |
bf505456 | 16 | #include <linux/blk-mq.h> |
26202928 DLM |
17 | #include <linux/mm.h> |
18 | #include <linux/vmalloc.h> | |
bd976e52 | 19 | #include <linux/sched/mm.h> |
dd291d77 DLM |
20 | #include <linux/spinlock.h> |
21 | #include <linux/atomic.h> | |
22 | #include <linux/mempool.h> | |
6a0cb1bc | 23 | |
a2d6b3a2 | 24 | #include "blk.h" |
dd291d77 | 25 | #include "blk-mq-sched.h" |
d9f1439a | 26 | #include "blk-mq-debugfs.h" |
a2d6b3a2 | 27 | |
/*
 * Map each BLK_ZONE_COND_* condition code to its printable name.
 * Conditions not listed here leave a NULL slot in the array, which
 * blk_zone_cond_str() reports as "UNKNOWN".
 */
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME
40 | ||
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @link: To list the plug in the zone write plug error list of the disk.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	link;
	atomic_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
 *    recovered with a report zone to update the zone write pointer offset.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or become full,
 *    to prevent new references to the zone write plug to be taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_ERROR		(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

/* A plug is busy when it is throttling writes or recovering from an error. */
#define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
02694e86 CK |
99 | /** |
100 | * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. | |
101 | * @zone_cond: BLK_ZONE_COND_XXX. | |
102 | * | |
103 | * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX | |
104 | * into string format. Useful in the debugging and tracing zone conditions. For | |
105 | * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". | |
106 | */ | |
107 | const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) | |
108 | { | |
109 | static const char *zone_cond_str = "UNKNOWN"; | |
110 | ||
111 | if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) | |
112 | zone_cond_str = zone_cond_name[zone_cond]; | |
113 | ||
114 | return zone_cond_str; | |
115 | } | |
116 | EXPORT_SYMBOL_GPL(blk_zone_cond_str); | |
117 | ||
a91e1380 | 118 | /** |
b623e347 CH |
119 | * bdev_nr_zones - Get number of zones |
120 | * @bdev: Target device | |
a91e1380 | 121 | * |
9b38bb4b CH |
122 | * Return the total number of zones of a zoned block device. For a block |
123 | * device without zone capabilities, the number of zones is always 0. | |
a91e1380 | 124 | */ |
b623e347 | 125 | unsigned int bdev_nr_zones(struct block_device *bdev) |
a91e1380 | 126 | { |
b623e347 | 127 | sector_t zone_sectors = bdev_zone_sectors(bdev); |
a91e1380 | 128 | |
b623e347 | 129 | if (!bdev_is_zoned(bdev)) |
a91e1380 | 130 | return 0; |
b623e347 CH |
131 | return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> |
132 | ilog2(zone_sectors); | |
a91e1380 | 133 | } |
b623e347 | 134 | EXPORT_SYMBOL_GPL(bdev_nr_zones); |
a91e1380 | 135 | |
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at most
 *    @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);

	/* Reporting requires a zoned device with a report_zones method. */
	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	/* An empty request or an out-of-range start reports zero zones. */
	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
170 | ||
/*
 * Allocate a zeroed bitmap with one bit per zone, on NUMA node @node.
 * GFP_NOIO avoids recursing into the block layer from this allocation.
 */
static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			    GFP_NOIO, node);
}
6e33dbf2 | 177 | |
1ee533ec DLM |
178 | static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, |
179 | void *data) | |
180 | { | |
6e33dbf2 | 181 | /* |
1ee533ec DLM |
182 | * For an all-zones reset, ignore conventional, empty, read-only |
183 | * and offline zones. | |
6e33dbf2 | 184 | */ |
1ee533ec DLM |
185 | switch (zone->cond) { |
186 | case BLK_ZONE_COND_NOT_WP: | |
187 | case BLK_ZONE_COND_EMPTY: | |
188 | case BLK_ZONE_COND_READONLY: | |
189 | case BLK_ZONE_COND_OFFLINE: | |
190 | return 0; | |
191 | default: | |
192 | set_bit(idx, (unsigned long *)data); | |
193 | return 0; | |
194 | } | |
195 | } | |
196 | ||
/*
 * Emulate REQ_OP_ZONE_RESET_ALL for devices that do not support the command:
 * report all zones to find those needing a reset, then issue one regular
 * zone reset BIO per such zone.
 */
static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	unsigned long *need_reset;
	struct bio *bio = NULL;
	sector_t sector = 0;
	int ret;

	/* One bit per zone, set by blk_zone_need_reset_cb(). */
	need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
	if (!need_reset)
		return -ENOMEM;

	ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
				       blk_zone_need_reset_cb, need_reset);
	if (ret < 0)
		goto out_free_need_reset;

	ret = 0;
	while (sector < capacity) {
		if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
			sector += zone_sectors;
			continue;
		}

		/* Chain a reset BIO for this zone onto the previous ones. */
		bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
				   GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	/* bio is NULL when no zone needed a reset: ret stays 0. */
	if (bio) {
		ret = submit_bio_wait(bio);
		bio_put(bio);
	}

out_free_need_reset:
	kfree(need_reset);
	return ret;
}
241 | ||
/*
 * Reset all zones with a single REQ_OP_ZONE_RESET_ALL BIO, for devices
 * that natively support this command.
 */
static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	/* On-stack BIO is safe: submit_bio_wait() completes it synchronously. */
	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}
249 | ||
6a0cb1bc | 250 | /** |
6c1b1da5 | 251 | * blkdev_zone_mgmt - Execute a zone management operation on a range of zones |
6a0cb1bc | 252 | * @bdev: Target block device |
6c1b1da5 AJ |
253 | * @op: Operation to be performed on the zones |
254 | * @sector: Start sector of the first zone to operate on | |
255 | * @nr_sectors: Number of sectors, should be at least the length of one zone and | |
256 | * must be zone size aligned. | |
6a0cb1bc HR |
257 | * |
258 | * Description: | |
6c1b1da5 | 259 | * Perform the specified operation on the range of zones specified by |
6a0cb1bc HR |
260 | * @sector..@sector+@nr_sectors. Specifying the entire disk sector range |
261 | * is valid, but the specified range should not contain conventional zones. | |
6c1b1da5 AJ |
262 | * The operation to execute on each zone can be a zone reset, open, close |
263 | * or finish request. | |
6a0cb1bc | 264 | */ |
ff07a02e | 265 | int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, |
71f4ecdb | 266 | sector_t sector, sector_t nr_sectors) |
6a0cb1bc HR |
267 | { |
268 | struct request_queue *q = bdev_get_queue(bdev); | |
375c140c CH |
269 | sector_t zone_sectors = bdev_zone_sectors(bdev); |
270 | sector_t capacity = bdev_nr_sectors(bdev); | |
6a0cb1bc | 271 | sector_t end_sector = sector + nr_sectors; |
a2d6b3a2 | 272 | struct bio *bio = NULL; |
1ee533ec | 273 | int ret = 0; |
6a0cb1bc | 274 | |
edd1dbc8 | 275 | if (!bdev_is_zoned(bdev)) |
6a0cb1bc HR |
276 | return -EOPNOTSUPP; |
277 | ||
a2d6b3a2 DLM |
278 | if (bdev_read_only(bdev)) |
279 | return -EPERM; | |
280 | ||
6c1b1da5 AJ |
281 | if (!op_is_zone_mgmt(op)) |
282 | return -EOPNOTSUPP; | |
283 | ||
11bde986 | 284 | if (end_sector <= sector || end_sector > capacity) |
6a0cb1bc HR |
285 | /* Out of range */ |
286 | return -EINVAL; | |
287 | ||
288 | /* Check alignment (handle eventual smaller last zone) */ | |
e29b2100 | 289 | if (!bdev_is_zone_start(bdev, sector)) |
6a0cb1bc HR |
290 | return -EINVAL; |
291 | ||
e29b2100 | 292 | if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) |
6a0cb1bc HR |
293 | return -EINVAL; |
294 | ||
1ee533ec DLM |
295 | /* |
296 | * In the case of a zone reset operation over all zones, | |
297 | * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this | |
298 | * command. For other devices, we emulate this command behavior by | |
299 | * identifying the zones needing a reset. | |
300 | */ | |
301 | if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { | |
302 | if (!blk_queue_zone_resetall(q)) | |
71f4ecdb JT |
303 | return blkdev_zone_reset_all_emulated(bdev); |
304 | return blkdev_zone_reset_all(bdev); | |
1ee533ec DLM |
305 | } |
306 | ||
6a0cb1bc | 307 | while (sector < end_sector) { |
71f4ecdb | 308 | bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); |
c7a1d926 | 309 | bio->bi_iter.bi_sector = sector; |
6a0cb1bc HR |
310 | sector += zone_sectors; |
311 | ||
312 | /* This may take a while, so be nice to others */ | |
313 | cond_resched(); | |
6a0cb1bc HR |
314 | } |
315 | ||
a2d6b3a2 DLM |
316 | ret = submit_bio_wait(bio); |
317 | bio_put(bio); | |
318 | ||
a2d6b3a2 | 319 | return ret; |
6a0cb1bc | 320 | } |
6c1b1da5 | 321 | EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); |
3ed05a98 | 322 | |
/* Context passed to blkdev_copy_zone_to_user() for BLKREPORTZONE. */
struct zone_report_args {
	struct blk_zone __user *zones;
};

/*
 * Report zones callback: copy zone @idx into the user-space array carried
 * by @data. Returns -EFAULT if the copy-out fails.
 */
static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}
336 | ||
/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
		unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	/* The user zone array immediately follows the report header. */
	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	/* Tell user space how many zones were actually reported. */
	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}
373 | ||
05bdb996 CH |
374 | static int blkdev_truncate_zone_range(struct block_device *bdev, |
375 | blk_mode_t mode, const struct blk_zone_range *zrange) | |
e5113505 SK |
376 | { |
377 | loff_t start, end; | |
378 | ||
379 | if (zrange->sector + zrange->nr_sectors <= zrange->sector || | |
380 | zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) | |
381 | /* Out of range */ | |
382 | return -EINVAL; | |
383 | ||
384 | start = zrange->sector << SECTOR_SHIFT; | |
385 | end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; | |
386 | ||
387 | return truncate_bdev_range(bdev, mode, start, end); | |
388 | } | |
389 | ||
/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	/* All of these operations modify the device. */
	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	/* The invalidate lock is only taken for BLKRESETZONE above. */
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_mapping);

	return ret;
}
bf505456 | 445 | |
dd291d77 DLM |
446 | static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) |
447 | { | |
448 | if (!disk->conv_zones_bitmap) | |
449 | return false; | |
450 | return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); | |
451 | } | |
452 | ||
cd639993 DLM |
453 | static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) |
454 | { | |
455 | return zone->start + zone->len >= get_capacity(disk); | |
456 | } | |
457 | ||
29459c3e DLM |
458 | static bool disk_zone_is_full(struct gendisk *disk, |
459 | unsigned int zno, unsigned int offset_in_zone) | |
460 | { | |
461 | if (zno < disk->nr_zones - 1) | |
462 | return offset_in_zone >= disk->zone_capacity; | |
463 | return offset_in_zone >= disk->last_zone_capacity; | |
464 | } | |
465 | ||
/*
 * Return true if the zone managed by @zwplug is full, based on the plug's
 * cached write pointer offset.
 */
static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}
471 | ||
/*
 * Insert a zone write plug into the disk hash table. Returns false if a plug
 * for the same zone was concurrently inserted by another context, in which
 * case the caller must free @zwplug.
 */
static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission context, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}
497 | ||
/*
 * Look up the zone write plug for the zone containing @sector and grab a
 * reference on it. Returns NULL if no plug is hashed for the zone or if the
 * plug's reference count already dropped to zero (plug being freed).
 */
static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
						  sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		/* atomic_inc_not_zero() fails if the plug is being freed. */
		if (zwplug->zone_no == zno &&
		    atomic_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}
519 | ||
/* RCU callback: return a zone write plug to its disk mempool. */
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}
527 | ||
/*
 * Drop a reference on a zone write plug. When the last reference is dropped,
 * the plug must already be unhashed, have no plugged BIOs and not be on the
 * error list; it is then freed after an RCU grace period.
 */
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (atomic_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(!list_empty(&zwplug->link));
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}
538 | ||
/*
 * Check whether a zone write plug can be removed from the disk hash table:
 * it must still be hashed, not busy, not referenced by in-flight BIOs, and
 * its zone must be either empty or full.
 */
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still busy, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (atomic_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}
567 | ||
/*
 * Remove a zone write plug from the disk hash table and drop the hash table
 * reference. The plug itself is freed later, once all other references are
 * dropped (see disk_put_zone_wplug()).
 */
static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}
587 | ||
static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	INIT_LIST_HEAD(&zwplug->link);
	atomic_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	/* chunk_sectors is the zone size; derive the in-zone offset from it. */
	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}
655 | ||
/*
 * Fail a BIO that was plugged in @zwplug: clear its plugging flag, complete
 * it with an error, and release the plug reference and queue usage
 * (presumably taken when the BIO was plugged — see callers).
 */
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	blk_queue_exit(q);
}
666 | ||
667 | /* | |
668 | * Abort (fail) all plugged BIOs of a zone write plug. | |
669 | */ | |
670 | static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) | |
671 | { | |
672 | struct bio *bio; | |
673 | ||
c9c8aea0 DLM |
674 | while ((bio = bio_list_pop(&zwplug->bio_list))) |
675 | blk_zone_wplug_bio_io_error(zwplug, bio); | |
dd291d77 DLM |
676 | } |
677 | ||
678 | /* | |
679 | * Abort (fail) all plugged BIOs of a zone write plug that are not aligned | |
680 | * with the assumed write pointer location of the zone when the BIO will | |
681 | * be unplugged. | |
682 | */ | |
683 | static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, | |
684 | struct blk_zone_wplug *zwplug) | |
685 | { | |
dd291d77 DLM |
686 | unsigned int wp_offset = zwplug->wp_offset; |
687 | struct bio_list bl = BIO_EMPTY_LIST; | |
688 | struct bio *bio; | |
689 | ||
690 | while ((bio = bio_list_pop(&zwplug->bio_list))) { | |
29459c3e | 691 | if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || |
9b1ce7f0 DLM |
692 | (bio_op(bio) != REQ_OP_ZONE_APPEND && |
693 | bio_offset_from_zone_start(bio) != wp_offset)) { | |
c9c8aea0 | 694 | blk_zone_wplug_bio_io_error(zwplug, bio); |
dd291d77 DLM |
695 | continue; |
696 | } | |
697 | ||
698 | wp_offset += bio_sectors(bio); | |
699 | bio_list_add(&bl, bio); | |
700 | } | |
701 | ||
702 | bio_list_merge(&zwplug->bio_list, &bl); | |
703 | } | |
704 | ||
/*
 * Flag a zone write plug as being in error state and queue it on the disk
 * error list for recovery. No-op if the plug is already in error state.
 */
static inline void disk_zone_wplug_set_error(struct gendisk *disk,
					     struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
		return;

	/*
	 * At this point, we already have a reference on the zone write plug.
	 * However, since we are going to add the plug to the disk zone write
	 * plugs work list, increase its reference count. This reference will
	 * be dropped in disk_zone_wplugs_work() once the error state is
	 * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
	 * finished.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
	atomic_inc(&zwplug->ref);

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
728 | ||
/*
 * Clear the error state of a zone write plug and remove it from the disk
 * error list, unless error-handling work already took it off the list.
 */
static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
					       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		return;

	/*
	 * We are racing with the error handling work which drops the reference
	 * on the zone write plug after handling the error state. So remove the
	 * plug from the error list and drop its reference count only if the
	 * error handling has not yet started, that is, if the zone write plug
	 * is still listed.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (!list_empty(&zwplug->link)) {
		list_del_init(&zwplug->link);
		zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
		disk_put_zone_wplug(zwplug);
	}
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
752 | ||
/*
 * Set a zone write plug write pointer offset to either 0 (zone reset case)
 * or to the zone size (zone finish case). This aborts all plugged BIOs, which
 * is fine to do as doing a zone reset or zone finish while writes are in-flight
 * is a mistake from the user which will most likely cause all plugged BIOs to
 * fail anyway.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * Make sure that a BIO completion or another zone reset or finish
	 * operation has not already removed the plug from the hash table.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * Updating the write pointer offset puts back the zone
	 * in a good state. So clear the error flag and decrement the
	 * error count if we were in error state.
	 */
	disk_zone_wplug_clear_error(disk, zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}
798 | ||
799 | static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, | |
800 | unsigned int wp_offset) | |
801 | { | |
802 | struct gendisk *disk = bio->bi_bdev->bd_disk; | |
803 | sector_t sector = bio->bi_iter.bi_sector; | |
804 | struct blk_zone_wplug *zwplug; | |
805 | ||
806 | /* Conventional zones cannot be reset nor finished. */ | |
807 | if (disk_zone_is_conv(disk, sector)) { | |
808 | bio_io_error(bio); | |
809 | return true; | |
810 | } | |
811 | ||
812 | /* | |
813 | * If we have a zone write plug, set its write pointer offset to 0 | |
814 | * (reset case) or to the zone size (finish case). This will abort all | |
815 | * BIOs plugged for the target zone. It is fine as resetting or | |
816 | * finishing zones while writes are still in-flight will result in the | |
817 | * writes failing anyway. | |
818 | */ | |
819 | zwplug = disk_get_zone_wplug(disk, sector); | |
820 | if (zwplug) { | |
821 | disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); | |
822 | disk_put_zone_wplug(zwplug); | |
823 | } | |
824 | ||
825 | return false; | |
826 | } | |
827 | ||
828 | static bool blk_zone_wplug_handle_reset_all(struct bio *bio) | |
829 | { | |
830 | struct gendisk *disk = bio->bi_bdev->bd_disk; | |
831 | struct blk_zone_wplug *zwplug; | |
832 | sector_t sector; | |
833 | ||
834 | /* | |
835 | * Set the write pointer offset of all zone write plugs to 0. This will | |
836 | * abort all plugged BIOs. It is fine as resetting zones while writes | |
837 | * are still in-flight will result in the writes failing anyway. | |
838 | */ | |
839 | for (sector = 0; sector < get_capacity(disk); | |
840 | sector += disk->queue->limits.chunk_sectors) { | |
841 | zwplug = disk_get_zone_wplug(disk, sector); | |
842 | if (zwplug) { | |
843 | disk_zone_wplug_set_wp_offset(disk, zwplug, 0); | |
844 | disk_put_zone_wplug(zwplug); | |
845 | } | |
846 | } | |
847 | ||
848 | return false; | |
849 | } | |
850 | ||
/*
 * Append @bio to the plugged BIO list of @zwplug. Called with zwplug->lock
 * held (see blk_zone_wplug_handle_write()).
 */
static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
                                          struct bio *bio, unsigned int nr_segs)
{
        /*
         * Grab an extra reference on the BIO request queue usage counter.
         * This reference will be reused to submit a request for the BIO for
         * blk-mq devices and dropped when the BIO is failed and after
         * it is issued in the case of BIO-based devices.
         */
        percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

        /*
         * The BIO is being plugged and thus will have to wait for the on-going
         * write and for all other writes already plugged. So polling makes
         * no sense.
         */
        bio_clear_polled(bio);

        /*
         * Reuse the poll cookie field to store the number of segments when
         * split to the hardware limits.
         */
        bio->__bi_nr_segments = nr_segs;

        /*
         * We always receive BIOs after they are split and ready to be issued.
         * The block layer passes the parts of a split BIO in order, and the
         * user must also issue write sequentially. So simply add the new BIO
         * at the tail of the list to preserve the sequential write order.
         */
        bio_list_add(&zwplug->bio_list, bio);
}
883 | ||
/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 * Advances the zone write pointer offset of the target zone to account for
 * the merged BIO.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
        struct blk_zone_wplug *zwplug;
        unsigned long flags;

        /*
         * If the BIO was already plugged, then we were called through
         * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
         * For this case, we already hold a reference on the zone write plug for
         * the BIO and blk_zone_write_plug_init_request() will handle the
         * zone write pointer offset update.
         */
        if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
                return;

        bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

        /*
         * Get a reference on the zone write plug of the target zone and advance
         * the zone write pointer offset. Given that this is a merge, we already
         * have at least one request and one BIO referencing the zone write
         * plug. So this should not fail.
         */
        zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
                                     bio->bi_iter.bi_sector);
        if (WARN_ON_ONCE(!zwplug))
                return;

        spin_lock_irqsave(&zwplug->lock, flags);
        zwplug->wp_offset += bio_sectors(bio);
        spin_unlock_irqrestore(&zwplug->lock, flags);
}
919 | ||
/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged). Also marks the request so its completion is handled by
 * blk_zone_write_plug_finish_request().
 */
void blk_zone_write_plug_init_request(struct request *req)
{
        sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
        struct request_queue *q = req->q;
        struct gendisk *disk = q->disk;
        struct blk_zone_wplug *zwplug =
                disk_get_zone_wplug(disk, blk_rq_pos(req));
        unsigned long flags;
        struct bio *bio;

        if (WARN_ON_ONCE(!zwplug))
                return;

        /*
         * Indicate that completion of this request needs to be handled with
         * blk_zone_write_plug_finish_request(), which will drop the reference
         * on the zone write plug we took above on entry to this function.
         */
        req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

        if (blk_queue_nomerges(q))
                return;

        /*
         * Walk through the list of plugged BIOs to check if they can be merged
         * into the back of the request.
         */
        spin_lock_irqsave(&zwplug->lock, flags);
        while (!disk_zone_wplug_is_full(disk, zwplug)) {
                bio = bio_list_peek(&zwplug->bio_list);
                if (!bio)
                        break;

                /* Only contiguous, mergeable BIOs can be appended. */
                if (bio->bi_iter.bi_sector != req_back_sector ||
                    !blk_rq_merge_ok(req, bio))
                        break;

                WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
                             !bio->__bi_nr_segments);

                bio_list_pop(&zwplug->bio_list);
                if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
                    BIO_MERGE_OK) {
                        /* Merge failed: put the BIO back and stop trying. */
                        bio_list_add_head(&zwplug->bio_list, bio);
                        break;
                }

                /*
                 * Drop the extra reference on the queue usage we got when
                 * plugging the BIO and advance the write pointer offset.
                 */
                blk_queue_exit(q);
                zwplug->wp_offset += bio_sectors(bio);

                req_back_sector += bio_sectors(bio);
        }
        spin_unlock_irqrestore(&zwplug->lock, flags);
}
983 | ||
dd291d77 DLM |
/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 *
 * Called with zwplug->lock held. Returns true if the BIO may be issued,
 * false if it was invalid and error recovery was scheduled.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
                                       struct bio *bio)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;

        /*
         * Check that the user is not attempting to write to a full zone.
         * We know such BIO will fail, and that would potentially overflow our
         * write pointer offset beyond the end of the zone.
         */
        if (disk_zone_wplug_is_full(disk, zwplug))
                goto err;

        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                /*
                 * Use a regular write starting at the current write pointer.
                 * Similarly to native zone append operations, do not allow
                 * merging.
                 */
                bio->bi_opf &= ~REQ_OP_MASK;
                bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
                bio->bi_iter.bi_sector += zwplug->wp_offset;

                /*
                 * Remember that this BIO is in fact a zone append operation
                 * so that we can restore its operation code on completion.
                 */
                bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
        } else {
                /*
                 * Check for non-sequential writes early because we avoid a
                 * whole lot of error handling trouble if we don't send it off
                 * to the driver.
                 */
                if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
                        goto err;
        }

        /* Advance the zone write pointer offset. */
        zwplug->wp_offset += bio_sectors(bio);

        return true;

err:
        /* We detected an invalid write BIO: schedule error recovery. */
        disk_zone_wplug_set_error(disk, zwplug);
        kblockd_schedule_work(&disk->zone_wplugs_work);
        return false;
}
1038 | ||
/*
 * Handle a write, write zeroes or (emulated) zone append BIO using zone
 * write plugging. Returns true if the BIO was consumed (plugged or failed),
 * false if the caller should issue it directly.
 */
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        sector_t sector = bio->bi_iter.bi_sector;
        struct blk_zone_wplug *zwplug;
        gfp_t gfp_mask = GFP_NOIO;
        unsigned long flags;

        /*
         * BIOs must be fully contained within a zone so that we use the correct
         * zone write plug for the entire BIO. For blk-mq devices, the block
         * layer should already have done any splitting required to ensure this
         * and this BIO should thus not be straddling zone boundaries. For
         * BIO-based devices, it is the responsibility of the driver to split
         * the bio before submitting it.
         */
        if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
                bio_io_error(bio);
                return true;
        }

        /* Conventional zones do not need write plugging. */
        if (disk_zone_is_conv(disk, sector)) {
                /* Zone append to conventional zones is not allowed. */
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                        bio_io_error(bio);
                        return true;
                }
                return false;
        }

        /* Honor REQ_NOWAIT by not sleeping on the plug allocation. */
        if (bio->bi_opf & REQ_NOWAIT)
                gfp_mask = GFP_NOWAIT;

        zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
        if (!zwplug) {
                bio_io_error(bio);
                return true;
        }

        /* Indicate that this BIO is being handled using zone write plugging. */
        bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

        /*
         * If the zone is already plugged or has a pending error, add the BIO
         * to the plug BIO list. Otherwise, plug and let the BIO execute.
         */
        if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
                goto plug;

        /*
         * If an error is detected when preparing the BIO, add it to the BIO
         * list so that error recovery can deal with it.
         */
        if (!blk_zone_wplug_prepare_bio(zwplug, bio))
                goto plug;

        zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

        spin_unlock_irqrestore(&zwplug->lock, flags);

        return false;

plug:
        zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
        blk_zone_wplug_add_bio(zwplug, bio, nr_segs);

        spin_unlock_irqrestore(&zwplug->lock, flags);

        return true;
}
1110 | ||
/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
        struct block_device *bdev = bio->bi_bdev;

        /* No hash table means the disk does not use zone write plugging. */
        if (!bdev->bd_disk->zone_wplugs_hash)
                return false;

        /*
         * If the BIO already has the plugging flag set, then it was already
         * handled through this path and this is a submission from the zone
         * plug bio submit work.
         */
        if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
                return false;

        /*
         * We do not need to do anything special for empty flush BIOs, e.g
         * BIOs such as issued by blkdev_issue_flush(). This is because it is
         * the responsibility of the user to first wait for the completion of
         * write operations for flush to have any effect on the persistence of
         * the written data.
         */
        if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
                return false;

        /*
         * Regular writes and write zeroes need to be handled through the target
         * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
         * which may need to go through the flush machinery depending on the
         * target device capabilities. Plugging such writes is fine as the flush
         * machinery operates at the request level, below the plug, and
         * completion of the flush sequence will go through the regular BIO
         * completion, which will handle zone write plugging.
         * Zone append operations for devices that requested emulation must
         * also be plugged so that these BIOs can be changed into regular
         * write BIOs.
         * Zone reset, reset all and finish commands need special treatment
         * to correctly track the write pointer offset of zones. These commands
         * are not plugged as we do not need serialization with write
         * operations. It is the responsibility of the user to not issue reset
         * and finish commands when write operations are in flight.
         */
        switch (bio_op(bio)) {
        case REQ_OP_ZONE_APPEND:
                if (!bdev_emulates_zone_append(bdev))
                        return false;
                fallthrough;
        case REQ_OP_WRITE:
        case REQ_OP_WRITE_ZEROES:
                return blk_zone_wplug_handle_write(bio, nr_segs);
        case REQ_OP_ZONE_RESET:
                return blk_zone_wplug_handle_reset_or_finish(bio, 0);
        case REQ_OP_ZONE_FINISH:
                return blk_zone_wplug_handle_reset_or_finish(bio,
                                                bdev_zone_sectors(bdev));
        case REQ_OP_ZONE_RESET_ALL:
                return blk_zone_wplug_handle_reset_all(bio);
        default:
                return false;
        }

        return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
1187 | ||
9e78c38a DLM |
/*
 * Schedule submission of the next plugged BIO of @zwplug through its BIO
 * work. Called with zwplug->lock held and the plug in the PLUGGED state.
 */
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
                                              struct blk_zone_wplug *zwplug)
{
        /*
         * Take a reference on the zone write plug and schedule the submission
         * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
         * reference we take here.
         */
        WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
        atomic_inc(&zwplug->ref);
        queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}
1200 | ||
dd291d77 DLM |
/*
 * Unplug @zwplug after completion of one of its BIOs: either schedule error
 * recovery, kick submission of the next plugged BIO, or clear the plugged
 * state entirely when the BIO list is empty.
 */
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
                                       struct blk_zone_wplug *zwplug)
{
        unsigned long flags;

        spin_lock_irqsave(&zwplug->lock, flags);

        /*
         * If we had an error, schedule error recovery. The recovery work
         * will restart submission of plugged BIOs.
         */
        if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
                spin_unlock_irqrestore(&zwplug->lock, flags);
                kblockd_schedule_work(&disk->zone_wplugs_work);
                return;
        }

        /* Schedule submission of the next plugged BIO if we have one. */
        if (!bio_list_empty(&zwplug->bio_list)) {
                disk_zone_wplug_schedule_bio_work(disk, zwplug);
                spin_unlock_irqrestore(&zwplug->lock, flags);
                return;
        }

        zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

        /*
         * If the zone is full (it was fully written or finished), or empty
         * (it was reset), remove its zone write plug from the hash table.
         */
        if (disk_should_remove_zone_wplug(disk, zwplug))
                disk_remove_zone_wplug(disk, zwplug);

        spin_unlock_irqrestore(&zwplug->lock, flags);
}
1236 | ||
/*
 * BIO completion handling for zone write plugging: restore an emulated zone
 * append operation code, flag errors for recovery, and for BIO-based devices
 * kick submission of the next plugged BIO.
 */
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        struct blk_zone_wplug *zwplug =
                disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
        unsigned long flags;

        if (WARN_ON_ONCE(!zwplug))
                return;

        /* Make sure we do not see this BIO again by clearing the plug flag. */
        bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

        /*
         * If this is a regular write emulating a zone append operation,
         * restore the original operation code.
         */
        if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
                bio->bi_opf &= ~REQ_OP_MASK;
                bio->bi_opf |= REQ_OP_ZONE_APPEND;
        }

        /*
         * If the BIO failed, mark the plug as having an error to trigger
         * recovery.
         */
        if (bio->bi_status != BLK_STS_OK) {
                spin_lock_irqsave(&zwplug->lock, flags);
                disk_zone_wplug_set_error(disk, zwplug);
                spin_unlock_irqrestore(&zwplug->lock, flags);
        }

        /* Drop the reference we took when the BIO was issued. */
        disk_put_zone_wplug(zwplug);

        /*
         * For BIO-based devices, blk_zone_write_plug_finish_request()
         * is not called. So we need to schedule execution of the next
         * plugged BIO here.
         */
        if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
                disk_zone_wplug_unplug_bio(disk, zwplug);

        /* Drop the reference we took when entering this function. */
        disk_put_zone_wplug(zwplug);
}
1283 | ||
/*
 * Request completion handling for zone write plugging (blk-mq devices):
 * drop the plug reference taken at request initialization and unplug the
 * zone so the next plugged BIO can be submitted.
 */
void blk_zone_write_plug_finish_request(struct request *req)
{
        struct gendisk *disk = req->q->disk;
        struct blk_zone_wplug *zwplug;

        zwplug = disk_get_zone_wplug(disk, req->__sector);
        if (WARN_ON_ONCE(!zwplug))
                return;

        req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

        /*
         * Drop the reference we took when the request was initialized in
         * blk_zone_write_plug_init_request().
         */
        disk_put_zone_wplug(zwplug);

        disk_zone_wplug_unplug_bio(disk, zwplug);

        /* Drop the reference we took when entering this function. */
        disk_put_zone_wplug(zwplug);
}
1306 | ||
/*
 * Zone write plug BIO work: pop and issue the next plugged BIO of a zone,
 * scheduled by disk_zone_wplug_schedule_bio_work().
 */
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
        struct blk_zone_wplug *zwplug =
                container_of(work, struct blk_zone_wplug, bio_work);
        struct block_device *bdev;
        unsigned long flags;
        struct bio *bio;

        /*
         * Submit the next plugged BIO. If we do not have any, clear
         * the plugged flag.
         */
        spin_lock_irqsave(&zwplug->lock, flags);

        bio = bio_list_pop(&zwplug->bio_list);
        if (!bio) {
                zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
                spin_unlock_irqrestore(&zwplug->lock, flags);
                goto put_zwplug;
        }

        if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
                /* Error recovery will decide what to do with the BIO. */
                bio_list_add_head(&zwplug->bio_list, bio);
                spin_unlock_irqrestore(&zwplug->lock, flags);
                goto put_zwplug;
        }

        spin_unlock_irqrestore(&zwplug->lock, flags);

        /* Remember bdev: the BIO must not be touched after submission. */
        bdev = bio->bi_bdev;
        submit_bio_noacct_nocheck(bio);

        /*
         * blk-mq devices will reuse the extra reference on the request queue
         * usage counter we took when the BIO was plugged, but the submission
         * path for BIO-based devices will not do that. So drop this extra
         * reference here.
         */
        if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
                blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
        /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
        disk_put_zone_wplug(zwplug);
}
1353 | ||
1354 | static unsigned int blk_zone_wp_offset(struct blk_zone *zone) | |
1355 | { | |
1356 | switch (zone->cond) { | |
1357 | case BLK_ZONE_COND_IMP_OPEN: | |
1358 | case BLK_ZONE_COND_EXP_OPEN: | |
1359 | case BLK_ZONE_COND_CLOSED: | |
1360 | return zone->wp - zone->start; | |
1361 | case BLK_ZONE_COND_FULL: | |
1362 | return zone->len; | |
1363 | case BLK_ZONE_COND_EMPTY: | |
1364 | return 0; | |
1365 | case BLK_ZONE_COND_NOT_WP: | |
1366 | case BLK_ZONE_COND_OFFLINE: | |
1367 | case BLK_ZONE_COND_READONLY: | |
1368 | default: | |
1369 | /* | |
1370 | * Conventional, offline and read-only zones do not have a valid | |
1371 | * write pointer. | |
1372 | */ | |
1373 | return UINT_MAX; | |
1374 | } | |
1375 | } | |
1376 | ||
1377 | static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, | |
1378 | unsigned int idx, void *data) | |
1379 | { | |
1380 | struct blk_zone *zonep = data; | |
1381 | ||
1382 | *zonep = *zone; | |
1383 | return 0; | |
1384 | } | |
1385 | ||
/*
 * Error recovery for a zone write plug: re-read the zone information from
 * the device, resynchronize the plug write pointer offset with it, abort
 * unaligned plugged BIOs and restart submission of the remaining ones.
 */
static void disk_zone_wplug_handle_error(struct gendisk *disk,
                                         struct blk_zone_wplug *zwplug)
{
        sector_t zone_start_sector =
                bdev_zone_sectors(disk->part0) * zwplug->zone_no;
        unsigned int noio_flag;
        struct blk_zone zone;
        unsigned long flags;
        int ret;

        /* Get the current zone information from the device. */
        noio_flag = memalloc_noio_save();
        ret = disk->fops->report_zones(disk, zone_start_sector, 1,
                                       blk_zone_wplug_report_zone_cb, &zone);
        memalloc_noio_restore(noio_flag);

        spin_lock_irqsave(&zwplug->lock, flags);

        /*
         * A zone reset or finish may have cleared the error already. In such
         * case, do nothing as the report zones may have seen the "old" write
         * pointer value before the reset/finish operation completed.
         */
        if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
                goto unlock;

        zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;

        if (ret != 1) {
                /*
                 * We failed to get the zone information, meaning that something
                 * is likely really wrong with the device. Abort all remaining
                 * plugged BIOs as otherwise we could end up waiting forever on
                 * plugged BIOs to complete if there is a queue freeze on-going.
                 */
                disk_zone_wplug_abort(zwplug);
                goto unplug;
        }

        /* Update the zone write pointer offset. */
        zwplug->wp_offset = blk_zone_wp_offset(&zone);
        disk_zone_wplug_abort_unaligned(disk, zwplug);

        /* Restart BIO submission if we still have any BIO left. */
        if (!bio_list_empty(&zwplug->bio_list)) {
                disk_zone_wplug_schedule_bio_work(disk, zwplug);
                goto unlock;
        }

unplug:
        zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
        if (disk_should_remove_zone_wplug(disk, zwplug))
                disk_remove_zone_wplug(disk, zwplug);

unlock:
        spin_unlock_irqrestore(&zwplug->lock, flags);
}
1443 | ||
/*
 * Error recovery work for a disk: drain the list of zone write plugs that
 * are in the error state, handling each one in turn. The list lock is
 * dropped while handling a plug since disk_zone_wplug_handle_error() may
 * sleep (it calls report_zones()).
 */
static void disk_zone_wplugs_work(struct work_struct *work)
{
        struct gendisk *disk =
                container_of(work, struct gendisk, zone_wplugs_work);
        struct blk_zone_wplug *zwplug;
        unsigned long flags;

        spin_lock_irqsave(&disk->zone_wplugs_lock, flags);

        while (!list_empty(&disk->zone_wplugs_err_list)) {
                zwplug = list_first_entry(&disk->zone_wplugs_err_list,
                                          struct blk_zone_wplug, link);
                list_del_init(&zwplug->link);
                spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

                disk_zone_wplug_handle_error(disk, zwplug);
                /* Drop the reference held by the error list. */
                disk_put_zone_wplug(zwplug);

                spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
        }

        spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
1467 | ||
/* Number of hlist head entries of the disk zone write plug hash table. */
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
        return 1U << disk->zone_wplugs_hash_bits;
}
1472 | ||
/*
 * Initialize the zone write plug fields of a disk that do not require
 * memory allocation (lock, error list and error recovery work).
 */
void disk_init_zone_resources(struct gendisk *disk)
{
        spin_lock_init(&disk->zone_wplugs_lock);
        INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
        INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
}
1479 | ||
/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
1488 | ||
/*
 * Allocate the zone write plug resources of a disk: the plug hash table,
 * the plug mempool and the plug BIO submission workqueue. On failure,
 * everything already allocated is torn down and -ENOMEM is returned.
 */
static int disk_alloc_zone_resources(struct gendisk *disk,
                                     unsigned int pool_size)
{
        unsigned int i;

        disk->zone_wplugs_hash_bits =
                min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

        disk->zone_wplugs_hash =
                kcalloc(disk_zone_wplugs_hash_size(disk),
                        sizeof(struct hlist_head), GFP_KERNEL);
        if (!disk->zone_wplugs_hash)
                return -ENOMEM;

        for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
                INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

        disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
                                                sizeof(struct blk_zone_wplug));
        if (!disk->zone_wplugs_pool)
                goto free_hash;

        disk->zone_wplugs_wq =
                alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
                                pool_size, disk->disk_name);
        if (!disk->zone_wplugs_wq)
                goto destroy_pool;

        return 0;

destroy_pool:
        mempool_destroy(disk->zone_wplugs_pool);
        disk->zone_wplugs_pool = NULL;
free_hash:
        kfree(disk->zone_wplugs_hash);
        disk->zone_wplugs_hash = NULL;
        disk->zone_wplugs_hash_bits = 0;
        return -ENOMEM;
}
1528 | ||
/*
 * Remove and release all zone write plugs still in the disk hash table,
 * then free the table itself.
 */
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
        struct blk_zone_wplug *zwplug;
        unsigned int i;

        if (!disk->zone_wplugs_hash)
                return;

        /* Free all the zone write plugs we have. */
        for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
                while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
                        zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
                                             struct blk_zone_wplug, node);
                        /* Hold a reference across the removal. */
                        atomic_inc(&zwplug->ref);
                        disk_remove_zone_wplug(disk, zwplug);
                        disk_put_zone_wplug(zwplug);
                }
        }

        kfree(disk->zone_wplugs_hash);
        disk->zone_wplugs_hash = NULL;
        disk->zone_wplugs_hash_bits = 0;
}
1552 | ||
/*
 * Free all zone write plug resources of a disk: the error recovery work,
 * the BIO submission workqueue, the plug hash table, the plug mempool and
 * the conventional zones bitmap.
 */
void disk_free_zone_resources(struct gendisk *disk)
{
        /* A NULL pool means zone resources were never allocated. */
        if (!disk->zone_wplugs_pool)
                return;

        cancel_work_sync(&disk->zone_wplugs_work);

        if (disk->zone_wplugs_wq) {
                destroy_workqueue(disk->zone_wplugs_wq);
                disk->zone_wplugs_wq = NULL;
        }

        disk_destroy_zone_wplugs_hash_table(disk);

        /*
         * Wait for the zone write plugs to be RCU-freed before
         * destroying the mempool.
         */
        rcu_barrier();

        mempool_destroy(disk->zone_wplugs_pool);
        disk->zone_wplugs_pool = NULL;

        kfree(disk->conv_zones_bitmap);
        disk->conv_zones_bitmap = NULL;
        disk->zone_capacity = 0;
        disk->last_zone_capacity = 0;
        disk->nr_zones = 0;
}
1582 | ||
946dd71e DLM |
1583 | static inline bool disk_need_zone_resources(struct gendisk *disk) |
1584 | { | |
1585 | /* | |
1586 | * All mq zoned devices need zone resources so that the block layer | |
1587 | * can automatically handle write BIO plugging. BIO-based device drivers | |
1588 | * (e.g. DM devices) are normally responsible for handling zone write | |
1589 | * ordering and do not need zone resources, unless the driver requires | |
1590 | * zone append emulation. | |
1591 | */ | |
1592 | return queue_is_mq(disk->queue) || | |
1593 | queue_emulates_zone_append(disk->queue); | |
1594 | } | |
1595 | ||
dd291d77 DLM |
/*
 * Allocate the disk zone write plug resources on first revalidation, sizing
 * the plug mempool from the device open/active zone limits.
 */
static int disk_revalidate_zone_resources(struct gendisk *disk,
                                          unsigned int nr_zones)
{
        struct queue_limits *lim = &disk->queue->limits;
        unsigned int pool_size;

        if (!disk_need_zone_resources(disk))
                return 0;

        /*
         * If the device has no limit on the maximum number of open and active
         * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
         */
        pool_size = max(lim->max_open_zones, lim->max_active_zones);
        if (!pool_size)
                pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);

        /* Resources already allocated by a previous revalidation. */
        if (!disk->zone_wplugs_hash)
                return disk_alloc_zone_resources(disk, pool_size);

        return 0;
}
1618 | ||
/*
 * Per-call context for the blk_revalidate_disk_zones() report_zones
 * callback (blk_revalidate_zone_cb()).
 */
struct blk_revalidate_zone_args {
	struct gendisk *disk;		/* Disk being revalidated */
	unsigned long *conv_zones_bitmap; /* New conventional zones bitmap */
	unsigned int nr_zones;		/* Expected total number of zones */
	unsigned int zone_capacity;	/* Capacity of non-last seq zones */
	unsigned int last_zone_capacity; /* Capacity of the last zone */
	sector_t sector;		/* Expected start of the next zone */
};
1627 | ||
/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones = 0;
	unsigned int pool_size;
	struct queue_limits lim;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	/* Publish the new bitmap; the caller frees the old one via args. */
	swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
	if (disk->conv_zones_bitmap)
		nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
					      disk->nr_zones);
	/* A zoned device must have at least one sequential write zone. */
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	/* Nothing else to do for devices without zone write plug resources. */
	if (!disk->zone_wplugs_pool)
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	lim = queue_limits_start_update(q);

	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

	return queue_limits_commit_update(q, &lim);
}
1681 | ||
d7580149 DLM |
1682 | static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, |
1683 | struct blk_revalidate_zone_args *args) | |
1684 | { | |
1685 | struct gendisk *disk = args->disk; | |
1686 | struct request_queue *q = disk->queue; | |
1687 | ||
1688 | if (zone->capacity != zone->len) { | |
1689 | pr_warn("%s: Invalid conventional zone capacity\n", | |
1690 | disk->disk_name); | |
1691 | return -ENODEV; | |
1692 | } | |
1693 | ||
29459c3e DLM |
1694 | if (disk_zone_is_last(disk, zone)) |
1695 | args->last_zone_capacity = zone->capacity; | |
1696 | ||
d7580149 DLM |
1697 | if (!disk_need_zone_resources(disk)) |
1698 | return 0; | |
1699 | ||
1700 | if (!args->conv_zones_bitmap) { | |
1701 | args->conv_zones_bitmap = | |
1702 | blk_alloc_zone_bitmap(q->node, args->nr_zones); | |
1703 | if (!args->conv_zones_bitmap) | |
1704 | return -ENOMEM; | |
1705 | } | |
1706 | ||
1707 | set_bit(idx, args->conv_zones_bitmap); | |
1708 | ||
1709 | return 0; | |
1710 | } | |
1711 | ||
/*
 * Check one sequential write required zone and, if needed, pre-allocate a
 * zone write plug for it. Returns 0 on success or a negative error code.
 */
static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * if it is constant for all zones, ignoring the last zone as it can be
	 * smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * We need to track the write pointer of all zones that are not
	 * empty nor full. So make sure we have a zone write plug for
	 * such zone if the device has a zone write plug hash table.
	 */
	if (!disk->zone_wplugs_hash)
		return 0;

	/* Empty (wp at start) and full (wp at/past capacity) zones need none. */
	wp_offset = blk_zone_wp_offset(zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	/*
	 * Getting the plug hashes it for later use; the lock and the extra
	 * reference taken by disk_get_and_lock_zone_wplug() are not needed
	 * here, so release both right away.
	 */
	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}
1755 | ||
/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception on an eventual
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* A zero capacity or a capacity larger than the zone size is invalid. */
	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		/* Sequential write preferred zones are deliberately rejected. */
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
	}

	/* On success, advance the expected start sector of the next zone. */
	if (!ret)
		args->sector += zone->len;

	return ret;
}
1822 | ||
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk:	Target disk
 *
 * Helper function for low-level device drivers to check, (re) allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a format).
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sector limit) and the max zone append limit.
 * BIO based drivers can also use this function as long as the device queue
 * can be safely frozen.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int noio_flag;
	int ret = -ENOMEM;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Checks that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	if (!queue_max_zone_append_sectors(q)) {
		pr_warn("%s: Invalid 0 maximum zone append limit\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	args.disk = disk;
	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}
	/* report_zones() returns the number of zones reported on success. */
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	/*
	 * Set the new disk zone parameters only once the queue is frozen and
	 * all I/Os are completed.
	 */
	blk_mq_freeze_queue(q);
	if (ret > 0)
		ret = disk_update_zone_resources(disk, &args);
	else
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
	if (ret)
		disk_free_zone_resources(disk);
	blk_mq_unfreeze_queue(q);

	/* Free the old bitmap swapped out by disk_update_zone_resources(). */
	kfree(args.conv_zones_bitmap);

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
d9f1439a DLM |
1915 | |
1916 | #ifdef CONFIG_BLK_DEBUG_FS | |
1917 | ||
/*
 * Dump all hashed zone write plugs of a disk (debugfs), one plug per line:
 * zone number, flags, reference count, write pointer offset and the number
 * of plugged BIOs.
 */
int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size, i;
	unsigned long flags;

	if (!disk->zone_wplugs_hash)
		return 0;

	/* Snapshot each plug's fields under its lock, print outside of it. */
	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		hlist_for_each_entry_rcu(zwplug,
					 &disk->zone_wplugs_hash[i], node) {
			spin_lock_irqsave(&zwplug->lock, flags);
			zwp_zone_no = zwplug->zone_no;
			zwp_flags = zwplug->flags;
			zwp_ref = atomic_read(&zwplug->ref);
			zwp_wp_offset = zwplug->wp_offset;
			zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
			spin_unlock_irqrestore(&zwplug->lock, flags);

			seq_printf(m, "%u 0x%x %u %u %u\n",
				   zwp_zone_no, zwp_flags, zwp_ref,
				   zwp_wp_offset, zwp_bio_list_size);
		}
	}
	rcu_read_unlock();

	return 0;
}
1952 | ||
1953 | #endif |