// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is
 *       reset or finished, or when the zone becomes full (the last write BIO
 *       to the zone completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	refcount_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *    write pointer offset and need to update it.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or becomes full,
 *    to prevent new references to the zone write plug from being taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * an invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);

struct disk_report_zones_cb_args {
	struct gendisk	*disk;
	report_zones_cb	user_cb;
	void		*user_data;
};

static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
					   struct blk_zone *zone);

static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
				void *data)
{
	struct disk_report_zones_cb_args *args = data;
	struct gendisk *disk = args->disk;

	if (disk->zone_wplugs_hash)
		disk_zone_wplug_sync_wp_offset(disk, zone);

	if (!args->user_cb)
		return 0;

	return args->user_cb(zone, idx, args->user_data);
}

/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at most
 *    @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	struct disk_report_zones_cb_args args = {
		.disk = disk,
		.user_cb = cb,
		.user_data = data,
	};

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones,
					disk_report_zones_cb, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone and
 *              must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);

struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

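/*
 * Invalidate (truncate) the page cache pages, including dirty pages, over the
 * sector range specified by @zrange, after checking that the range is valid
 * for @bdev.
 */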
static int blkdev_truncate_zone_range(struct block_device *bdev,
		blk_mode_t mode, const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		inode_lock(bdev->bd_mapping->host);
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	if (cmd == BLKRESETZONE) {
		filemap_invalidate_unlock(bdev->bd_mapping);
		inode_unlock(bdev->bd_mapping->host);
	}

	return ret;
}

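/*
 * Zone geometry helpers. The last zone of a disk may be smaller than the
 * other zones, so checking fullness of the last zone must use the disk
 * last zone capacity instead of the regular zone capacity.
 */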
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

static bool disk_zone_is_full(struct gendisk *disk,
			      unsigned int zno, unsigned int offset_in_zone)
{
	if (zno < disk->nr_zones - 1)
		return offset_in_zone >= disk->zone_capacity;
	return offset_in_zone >= disk->last_zone_capacity;
}

static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission contexts, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}

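/*
 * Look up the zone write plug for the zone containing @sector in the disk
 * hash table and take a reference on it. Return NULL if the zone has no
 * hashed write plug.
 */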
static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	if (!atomic_read(&disk->nr_zone_wplugs))
		return NULL;

	return disk_get_hashed_zone_wplug(disk, sector);
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

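/*
 * Drop a zone write plug reference. When the last reference is dropped, the
 * plug must already be unhashed and idle, and it is freed after an RCU grace
 * period.
 */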
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}

static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still plugged, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (refcount_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}

static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct bio *bio;

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);
}

/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);
}

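/*
 * Return the write pointer offset of @zone relative to the zone start, as a
 * number of sectors, or UINT_MAX for zones that do not have a valid write
 * pointer (conventional, offline and read-only zones).
 */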
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a valid
		 * write pointer.
		 */
		return UINT_MAX;
	}
}

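/*
 * If the write plug of @zone is marked as needing a write pointer update
 * (e.g. after a write error), set its write pointer offset using the zone
 * information reported by the device.
 */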
static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
					   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (!zwplug)
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		disk_zone_wplug_set_wp_offset(disk, zwplug,
					      blk_zone_wp_offset(zone));
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

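/*
 * Report the single zone containing @sector so that the write pointer offset
 * of that zone write plug is synchronized through disk_report_zones_cb().
 */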
static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
{
	struct disk_report_zones_cb_args args = {
		.disk = disk,
	};

	return disk->fops->report_zones(disk, sector, 1,
					disk_report_zones_cb, &args);
}

static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
						  unsigned int wp_offset)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/* Conventional zones cannot be reset nor finished. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		bio_io_error(bio);
		return true;
	}

	/*
	 * No-wait reset or finish BIOs do not make much sense as the callers
	 * issue these as blocking operations in most cases. To avoid the BIO
	 * execution potentially failing with BLK_STS_AGAIN, warn about
	 * REQ_NOWAIT being set and ignore that flag.
	 */
	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
		bio->bi_opf &= ~REQ_NOWAIT;

	/*
	 * If we have a zone write plug, set its write pointer offset to 0
	 * (reset case) or to the zone size (finish case). This will abort all
	 * BIOs plugged for the target zone. It is fine as resetting or
	 * finishing zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return false;
}

static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	sector_t sector;

	/*
	 * Set the write pointer offset of all zone write plugs to 0. This will
	 * abort all plugged BIOs. It is fine as resetting zones while writes
	 * are still in-flight will result in the writes failing anyway.
	 */
	for (sector = 0; sector < get_capacity(disk);
	     sector += disk->queue->limits.chunk_sectors) {
		zwplug = disk_get_zone_wplug(disk, sector);
		if (zwplug) {
			spin_lock_irqsave(&zwplug->lock, flags);
			disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
			spin_unlock_irqrestore(&zwplug->lock, flags);
			disk_put_zone_wplug(zwplug);
		}
	}

	return false;
}

static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	refcount_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}

static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
					   struct blk_zone_wplug *zwplug,
					   struct bio *bio, unsigned int nr_segs)
{
	bool schedule_bio_work = false;

	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * REQ_NOWAIT BIOs are always handled using the zone write plug BIO
	 * work, which can block. So clear the REQ_NOWAIT flag and schedule the
	 * work if this is the first BIO we are plugging.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
		bio->bi_opf &= ~REQ_NOWAIT;
	}

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	if (schedule_bio_work)
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	lockdep_assert_held(&zwplug->lock);

	/*
	 * If we lost track of the zone write pointer due to a write error,
	 * the user must either execute a report zones, reset the zone or finish
	 * the zone to recover a reliable write pointer position. Fail BIOs if
	 * the user did not do that as we cannot handle emulated zone append
	 * otherwise.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		return false;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		return false;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early as we know that BIOs
		 * with a start sector not aligned to the zone write pointer
		 * will fail.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			return false;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged, add the BIO to the plug BIO list.
	 * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a
	 * BLK_STS_AGAIN failure if we let the BIO execute.
	 * Otherwise, plug and let the BIO execute.
	 */
	if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) ||
	    (bio->bi_opf & REQ_NOWAIT))
		goto plug;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}

static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have a
	 * zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk leaving
	 * the plug in the disk hash table if the zone is fully written using
	 * zone append operations. Avoid this by removing the zone write plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort the writes as otherwise the plugged BIOs would
	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
	 * return NULL after the plug is removed. Aborting the plugged write
	 * BIOs is consistent with the fact that these writes will most likely
	 * fail anyway as there are no ordering guarantees between zone append
	 * operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_remove_zone_wplug(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (!bdev->bd_disk->zone_wplugs_hash)
		return false;

	/*
	 * If the BIO already has the plugging flag set, then it was already
	 * handled through this path and this is a submission from the zone
	 * plug bio submit work.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return false;

	/*
	 * We do not need to do anything special for empty flush BIOs, e.g.
	 * BIOs such as issued by blkdev_issue_flush(). This is because it is
	 * the responsibility of the user to first wait for the completion of
	 * write operations for flush to have any effect on the persistence of
	 * the written data.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the target
	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
	 * which may need to go through the flush machinery depending on the
	 * target device capabilities. Plugging such writes is fine as the flush
	 * machinery operates at the request level, below the plug, and
	 * completion of the flush sequence will go through the regular BIO
	 * completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev)) {
			blk_zone_wplug_handle_native_zone_append(bio);
			return false;
		}
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
	case REQ_OP_ZONE_FINISH:
		return blk_zone_wplug_handle_reset_or_finish(bio,
						bdev_zone_sectors(bdev));
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_reset_all(bio);
	default:
		return false;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);

static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished) or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

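/*
 * Completion handling for a BIO that went through zone write plugging:
 * restore the operation code of emulated zone append BIOs, abort all plugged
 * BIOs if the BIO failed, and drop the plug references taken at submission.
 */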
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
	}

	/*
	 * If the BIO failed, abort all plugged BIOs and mark the plug as
	 * needing a write pointer update.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_abort(zwplug);
		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

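/*
 * Request completion handling for zoned writes: drop the zone write plug
 * reference taken by blk_zone_write_plug_init_request() and unplug the next
 * plugged BIO, if any.
 */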
void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

again:
	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		blk_zone_wplug_bio_io_error(zwplug, bio);
		goto again;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
		bdev->bd_disk->fops->submit_bio(bio);
		blk_queue_exit(bdev->bd_disk->queue);
	} else {
		blk_mq_submit_bio(bio);
	}

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128

static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	atomic_set(&disk->nr_zone_wplugs, 0);
	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}

1389 | static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) | |
1390 | { | |
1391 | struct blk_zone_wplug *zwplug; | |
1392 | unsigned int i; | |
1393 | ||
1394 | if (!disk->zone_wplugs_hash) | |
1395 | return; | |
1396 | ||
1397 | /* Free all the zone write plugs we have. */ | |
1398 | for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { | |
1399 | while (!hlist_empty(&disk->zone_wplugs_hash[i])) { | |
1400 | zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, | |
1401 | struct blk_zone_wplug, node); | |
4122fef1 | 1402 | refcount_inc(&zwplug->ref); |
dd291d77 DLM |
1403 | disk_remove_zone_wplug(disk, zwplug); |
1404 | disk_put_zone_wplug(zwplug); | |
1405 | } | |
1406 | } | |
1407 | ||
a6aa36e9 | 1408 | WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); |
dd291d77 DLM |
1409 | kfree(disk->zone_wplugs_hash); |
1410 | disk->zone_wplugs_hash = NULL; | |
1411 | disk->zone_wplugs_hash_bits = 0; | |
1412 | } | |
1413 | ||
d7cb6d74 DLM |
1414 | static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, |
1415 | unsigned long *bitmap) | |
1416 | { | |
1417 | unsigned int nr_conv_zones = 0; | |
1418 | unsigned long flags; | |
1419 | ||
1420 | spin_lock_irqsave(&disk->zone_wplugs_lock, flags); | |
1421 | if (bitmap) | |
1422 | nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); | |
1423 | bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, | |
1424 | lockdep_is_held(&disk->zone_wplugs_lock)); | |
1425 | spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); | |
1426 | ||
1427 | kfree_rcu_mightsleep(bitmap); | |
1428 | ||
1429 | return nr_conv_zones; | |
1430 | } | |
1431 | ||
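Reader-side sketch pairing with the RCU publication done by disk_set_conv_zones_bitmap() above: lockless readers only need rcu_read_lock() to test the bitmap. This is a minimal sketch; disk_zone_no() is assumed to be the usual sector-to-zone-number helper.

	static bool example_zone_is_conv(struct gendisk *disk, sector_t sector)
	{
		unsigned long *bitmap;
		bool is_conv = false;

		rcu_read_lock();
		bitmap = rcu_dereference(disk->conv_zones_bitmap);
		if (bitmap)
			is_conv = test_bit(disk_zone_no(disk, sector), bitmap);
		rcu_read_unlock();

		return is_conv;
	}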
dd291d77 DLM |
1432 | void disk_free_zone_resources(struct gendisk *disk) |
1433 | { | |
1933192a DLM |
1434 | if (!disk->zone_wplugs_pool) |
1435 | return; | |
1436 | ||
a8f59e5a DLM |
1437 | if (disk->zone_wplugs_wq) { |
1438 | destroy_workqueue(disk->zone_wplugs_wq); | |
1439 | disk->zone_wplugs_wq = NULL; | |
1440 | } | |
1441 | ||
dd291d77 DLM |
1442 | disk_destroy_zone_wplugs_hash_table(disk); |
1443 | ||
1444 | /* | |
1445 | * Wait for the zone write plugs to be RCU-freed before | |
1446 | * destroying the mempool. | |
1447 | */ | |
1448 | rcu_barrier(); | |
1449 | ||
1450 | mempool_destroy(disk->zone_wplugs_pool); | |
1451 | disk->zone_wplugs_pool = NULL; | |
1452 | ||
d7cb6d74 | 1453 | disk_set_conv_zones_bitmap(disk, NULL); |
dd291d77 | 1454 | disk->zone_capacity = 0; |
29459c3e | 1455 | disk->last_zone_capacity = 0; |
dd291d77 DLM |
1456 | disk->nr_zones = 0; |
1457 | } | |
1458 | ||
946dd71e DLM |
1459 | static inline bool disk_need_zone_resources(struct gendisk *disk) |
1460 | { | |
1461 | /* | |
1462 | * All mq zoned devices need zone resources so that the block layer | |
1463 | * can automatically handle write BIO plugging. BIO-based device drivers | |
1464 | * (e.g. DM devices) are normally responsible for handling zone write | |
1465 | * ordering and do not need zone resources, unless the driver requires | |
1466 | * zone append emulation. | |
1467 | */ | |
1468 | return queue_is_mq(disk->queue) || | |
1469 | queue_emulates_zone_append(disk->queue); | |
1470 | } | |
1471 | ||
dd291d77 DLM |
1472 | static int disk_revalidate_zone_resources(struct gendisk *disk, |
1473 | unsigned int nr_zones) | |
1474 | { | |
1475 | struct queue_limits *lim = &disk->queue->limits; | |
1476 | unsigned int pool_size; | |
1477 | ||
946dd71e DLM |
1478 | if (!disk_need_zone_resources(disk)) |
1479 | return 0; | |
1480 | ||
dd291d77 DLM |
1481 | /* |
1482 | * If the device has no limit on the maximum number of open and active | |
1483 | * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. | |
1484 | */ | |
1485 | pool_size = max(lim->max_open_zones, lim->max_active_zones); | |
1486 | if (!pool_size) | |
1487 | pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); | |
1488 | ||
1489 | if (!disk->zone_wplugs_hash) | |
1490 | return disk_alloc_zone_resources(disk, pool_size); | |
1491 | ||
dd291d77 | 1492 | return 0; |
bf505456 DLM |
1493 | } |
1494 | ||
d4100351 CH |
1495 | struct blk_revalidate_zone_args { |
1496 | struct gendisk *disk; | |
f216fdd7 | 1497 | unsigned long *conv_zones_bitmap; |
e94f5819 | 1498 | unsigned int nr_zones; |
ecfe43b1 | 1499 | unsigned int zone_capacity; |
29459c3e | 1500 | unsigned int last_zone_capacity; |
d4100351 CH |
1501 | sector_t sector; |
1502 | }; | |
1503 | ||
843283e9 DLM |
1504 | /* |
1505 | * Update the disk zone resources information and device queue limits. | |
1506 | * The disk queue is frozen when this is executed. | |
1507 | */ | |
1508 | static int disk_update_zone_resources(struct gendisk *disk, | |
1509 | struct blk_revalidate_zone_args *args) | |
1510 | { | |
1511 | struct request_queue *q = disk->queue; | |
d7cb6d74 | 1512 | unsigned int nr_seq_zones, nr_conv_zones; |
6b7593b5 | 1513 | unsigned int pool_size; |
843283e9 DLM |
1514 | struct queue_limits lim; |
1515 | ||
1516 | disk->nr_zones = args->nr_zones; | |
1517 | disk->zone_capacity = args->zone_capacity; | |
29459c3e | 1518 | disk->last_zone_capacity = args->last_zone_capacity; |
d7cb6d74 DLM |
1519 | nr_conv_zones = |
1520 | disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); | |
6b7593b5 DLM |
1521 | if (nr_conv_zones >= disk->nr_zones) { |
1522 | pr_warn("%s: Invalid number of conventional zones %u / %u\n", | |
1523 | disk->disk_name, nr_conv_zones, disk->nr_zones); | |
1524 | return -ENODEV; | |
1525 | } | |
1526 | ||
e21d12c7 DLM |
1527 | lim = queue_limits_start_update(q); |
1528 | ||
1529 | /* | |
1530 | * Some devices can advertise zone resource limits that are larger than | |
1531 | * the number of sequential zones of the zoned block device, e.g. a | |
1532 | * small ZNS namespace. In that case, assume that the zoned device has | |
1533 | * no zone resource limits. | |
1534 | */ | |
1535 | nr_seq_zones = disk->nr_zones - nr_conv_zones; | |
1536 | if (lim.max_open_zones >= nr_seq_zones) | |
1537 | lim.max_open_zones = 0; | |
1538 | if (lim.max_active_zones >= nr_seq_zones) | |
1539 | lim.max_active_zones = 0; | |
1540 | ||
6b7593b5 | 1541 | if (!disk->zone_wplugs_pool) |
e21d12c7 | 1542 | goto commit; |
843283e9 DLM |
1543 | |
1544 | /* | |
1545 | * If the device has no limit on the maximum number of open and active | |
1546 | * zones, set its max open zone limit to the mempool size to indicate | |
1547 | * to the user that there is a potential performance impact due to | |
1548 | * dynamic zone write plug allocation when simultaneously writing to | |
1549 | * more zones than the size of the mempool. | |
1550 | */ | |
6b7593b5 DLM |
1551 | pool_size = max(lim.max_open_zones, lim.max_active_zones); |
1552 | if (!pool_size) | |
1553 | pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); | |
1554 | ||
1555 | mempool_resize(disk->zone_wplugs_pool, pool_size); | |
1556 | ||
1557 | if (!lim.max_open_zones && !lim.max_active_zones) { | |
1558 | if (pool_size < nr_seq_zones) | |
1559 | lim.max_open_zones = pool_size; | |
1560 | else | |
1561 | lim.max_open_zones = 0; | |
843283e9 DLM |
1562 | } |
1563 | ||
e21d12c7 | 1564 | commit: |
aa427d7b | 1565 | return queue_limits_commit_update_frozen(q, &lim); |
843283e9 DLM |
1566 | } |
1567 | ||
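A worked example for the limit adjustment and pool sizing above (numbers are illustrative): a disk with 10 conventional and 20 sequential zones advertising max_open_zones = 32 has a limit >= nr_seq_zones, so the limit is cleared and treated as "no limit". With no remaining limits, pool_size = min(128, 20) = 20, which is not smaller than nr_seq_zones, so max_open_zones stays 0. Conversely, a disk with 1000 sequential zones and no limits gets pool_size = min(128, 1000) = 128 < 1000, so max_open_zones is reported as 128 to hint at the threshold beyond which zone write plugs are allocated dynamically.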
d7580149 DLM |
1568 | static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, |
1569 | struct blk_revalidate_zone_args *args) | |
1570 | { | |
1571 | struct gendisk *disk = args->disk; | |
d7580149 DLM |
1572 | |
1573 | if (zone->capacity != zone->len) { | |
1574 | pr_warn("%s: Invalid conventional zone capacity\n", | |
1575 | disk->disk_name); | |
1576 | return -ENODEV; | |
1577 | } | |
1578 | ||
29459c3e DLM |
1579 | if (disk_zone_is_last(disk, zone)) |
1580 | args->last_zone_capacity = zone->capacity; | |
1581 | ||
d7580149 DLM |
1582 | if (!disk_need_zone_resources(disk)) |
1583 | return 0; | |
1584 | ||
1585 | if (!args->conv_zones_bitmap) { | |
1586 | args->conv_zones_bitmap = | |
2f20872e | 1587 | bitmap_zalloc(args->nr_zones, GFP_NOIO); |
d7580149 DLM |
1588 | if (!args->conv_zones_bitmap) |
1589 | return -ENOMEM; | |
1590 | } | |
1591 | ||
1592 | set_bit(idx, args->conv_zones_bitmap); | |
1593 | ||
1594 | return 0; | |
1595 | } | |
1596 | ||
1597 | static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, | |
1598 | struct blk_revalidate_zone_args *args) | |
1599 | { | |
1600 | struct gendisk *disk = args->disk; | |
1601 | struct blk_zone_wplug *zwplug; | |
1602 | unsigned int wp_offset; | |
1603 | unsigned long flags; | |
1604 | ||
1605 | /* | |
1606 | * Remember the capacity of the first sequential zone and check | |
cd639993 DLM |
1607 | * if it is constant for all zones, ignoring the last zone as it can be |
1608 | * smaller. | |
d7580149 DLM |
1609 | */ |
1610 | if (!args->zone_capacity) | |
1611 | args->zone_capacity = zone->capacity; | |
29459c3e DLM |
1612 | if (disk_zone_is_last(disk, zone)) { |
1613 | args->last_zone_capacity = zone->capacity; | |
1614 | } else if (zone->capacity != args->zone_capacity) { | |
d7580149 DLM |
1615 | pr_warn("%s: Invalid variable zone capacity\n", |
1616 | disk->disk_name); | |
1617 | return -ENODEV; | |
1618 | } | |
1619 | ||
1620 | /* | |
a6aa36e9 DLM |
1621 | * If the device needs zone append emulation, we need to track the |
1622 | * write pointer of all zones that are neither empty nor full. So make | |
1623 | * sure we have a zone write plug for such zones if the device has a zone | |
1624 | * write plug hash table. | |
d7580149 | 1625 | */ |
a6aa36e9 | 1626 | if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) |
d7580149 DLM |
1627 | return 0; |
1628 | ||
fe0418eb DLM |
1629 | disk_zone_wplug_sync_wp_offset(disk, zone); |
1630 | ||
d7580149 DLM |
1631 | wp_offset = blk_zone_wp_offset(zone); |
1632 | if (!wp_offset || wp_offset >= zone->capacity) | |
1633 | return 0; | |
1634 | ||
1635 | zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); | |
1636 | if (!zwplug) | |
1637 | return -ENOMEM; | |
1638 | spin_unlock_irqrestore(&zwplug->lock, flags); | |
1639 | disk_put_zone_wplug(zwplug); | |
1640 | ||
1641 | return 0; | |
1642 | } | |
1643 | ||
d9dd7308 DLM |
1644 | /* |
1645 | * Helper function to check the validity of zones of a zoned block device. | |
1646 | */ | |
d4100351 CH |
1647 | static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, |
1648 | void *data) | |
d9dd7308 | 1649 | { |
d4100351 CH |
1650 | struct blk_revalidate_zone_args *args = data; |
1651 | struct gendisk *disk = args->disk; | |
d7580149 DLM |
1652 | sector_t zone_sectors = disk->queue->limits.chunk_sectors; |
1653 | int ret; | |
03e51c4a DLM |
1654 | |
1655 | /* Check for bad zones and holes in the zone report */ | |
1656 | if (zone->start != args->sector) { | |
1657 | pr_warn("%s: Zone gap at sectors %llu..%llu\n", | |
1658 | disk->disk_name, args->sector, zone->start); | |
1659 | return -ENODEV; | |
1660 | } | |
1661 | ||
cd639993 | 1662 | if (zone->start >= get_capacity(disk) || !zone->len) { |
03e51c4a DLM |
1663 | pr_warn("%s: Invalid zone start %llu, length %llu\n", |
1664 | disk->disk_name, zone->start, zone->len); | |
1665 | return -ENODEV; | |
1666 | } | |
d9dd7308 DLM |
1667 | |
1668 | /* | |
1669 | * All zones must have the same size, with the exception of a possibly | |
1670 | * smaller last zone. | |
1671 | */ | |
cd639993 | 1672 | if (!disk_zone_is_last(disk, zone)) { |
03e51c4a | 1673 | if (zone->len != zone_sectors) { |
6c6b3549 CH |
1674 | pr_warn("%s: Invalid zoned device with non constant zone size\n", |
1675 | disk->disk_name); | |
1676 | return -ENODEV; | |
1677 | } | |
03e51c4a DLM |
1678 | } else if (zone->len > zone_sectors) { |
1679 | pr_warn("%s: Invalid zoned device with larger last zone size\n", | |
1680 | disk->disk_name); | |
d4100351 | 1681 | return -ENODEV; |
d9dd7308 DLM |
1682 | } |
1683 | ||
ecfe43b1 DLM |
1684 | if (!zone->capacity || zone->capacity > zone->len) { |
1685 | pr_warn("%s: Invalid zone capacity\n", | |
1686 | disk->disk_name); | |
1687 | return -ENODEV; | |
1688 | } | |
1689 | ||
d9dd7308 DLM |
1690 | /* Check zone type */ |
1691 | switch (zone->type) { | |
1692 | case BLK_ZONE_TYPE_CONVENTIONAL: | |
d7580149 | 1693 | ret = blk_revalidate_conv_zone(zone, idx, args); |
e94f5819 | 1694 | break; |
d9dd7308 | 1695 | case BLK_ZONE_TYPE_SEQWRITE_REQ: |
d7580149 | 1696 | ret = blk_revalidate_seq_zone(zone, idx, args); |
d9dd7308 | 1697 | break; |
587371ed | 1698 | case BLK_ZONE_TYPE_SEQWRITE_PREF: |
d9dd7308 DLM |
1699 | default: |
1700 | pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", | |
1701 | disk->disk_name, (int)zone->type, zone->start); | |
d7580149 | 1702 | ret = -ENODEV; |
d9dd7308 DLM |
1703 | } |
1704 | ||
d7580149 DLM |
1705 | if (!ret) |
1706 | args->sector += zone->len; | |
1707 | ||
1708 | return ret; | |
d4100351 CH |
1709 | } |
1710 | ||
bf505456 | 1711 | /** |
02ccd7c3 | 1712 | * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs |
bf505456 DLM |
1713 | * @disk: Target disk |
1714 | * | |
9b3c08b9 DLM |
1715 | * Helper function for low-level device drivers to check, (re)allocate and |
1716 | * initialize resources used for managing zoned disks. This function should | |
1717 | * normally be called by blk-mq based drivers when a zoned gendisk is probed | |
1718 | * and when the zone configuration of the gendisk changes (e.g. after a format). | |
03e51c4a DLM |
1719 | * Before calling this function, the device driver must already have set the |
1720 | * device zone size (chunk_sectors limit) and the max zone append limit. |
946dd71e DLM |
1721 | * BIO-based drivers can also use this function as long as the device queue |
1722 | * can be safely frozen. | |
bf505456 | 1723 | */ |
9b3c08b9 | 1724 | int blk_revalidate_disk_zones(struct gendisk *disk) |
bf505456 DLM |
1725 | { |
1726 | struct request_queue *q = disk->queue; | |
03e51c4a DLM |
1727 | sector_t zone_sectors = q->limits.chunk_sectors; |
1728 | sector_t capacity = get_capacity(disk); | |
1729 | struct blk_revalidate_zone_args args = { }; | |
6c6b3549 | 1730 | unsigned int noio_flag; |
dd291d77 | 1731 | int ret = -ENOMEM; |
bf505456 | 1732 | |
c98c3d09 CH |
1733 | if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) |
1734 | return -EIO; | |
bf505456 | 1735 | |
03e51c4a DLM |
1736 | if (!capacity) |
1737 | return -ENODEV; | |
1738 | ||
1739 | /* | |
1740 | * Checks that the device driver indicated a valid zone size and that | |
1741 | * the max zone append limit is set. | |
1742 | */ | |
1743 | if (!zone_sectors || !is_power_of_2(zone_sectors)) { | |
1744 | pr_warn("%s: Invalid non power of two zone size (%llu)\n", | |
1745 | disk->disk_name, zone_sectors); | |
1746 | return -ENODEV; | |
1747 | } | |
1748 | ||
e94f5819 | 1749 | /* |
6c6b3549 CH |
1750 | * Ensure that all memory allocations in this context are done as if |
1751 | * GFP_NOIO was specified. | |
e94f5819 | 1752 | */ |
03e51c4a DLM |
1753 | args.disk = disk; |
1754 | args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); | |
6c6b3549 | 1755 | noio_flag = memalloc_noio_save(); |
dd291d77 DLM |
1756 | ret = disk_revalidate_zone_resources(disk, args.nr_zones); |
1757 | if (ret) { | |
1758 | memalloc_noio_restore(noio_flag); | |
1759 | return ret; | |
1760 | } | |
fe0418eb | 1761 | |
6c6b3549 CH |
1762 | ret = disk->fops->report_zones(disk, 0, UINT_MAX, |
1763 | blk_revalidate_zone_cb, &args); | |
2afdeb23 DLM |
1764 | if (!ret) { |
1765 | pr_warn("%s: No zones reported\n", disk->disk_name); | |
1766 | ret = -ENODEV; | |
1767 | } | |
6c6b3549 | 1768 | memalloc_noio_restore(noio_flag); |
bf505456 | 1769 | |
2afdeb23 DLM |
1770 | /* |
1771 | * If zones were reported, make sure that the entire disk capacity |
1772 | * has been checked. | |
1773 | */ | |
03e51c4a | 1774 | if (ret > 0 && args.sector != capacity) { |
2afdeb23 DLM |
1775 | pr_warn("%s: Missing zones from sector %llu\n", |
1776 | disk->disk_name, args.sector); | |
1777 | ret = -ENODEV; | |
1778 | } | |
1779 | ||
bf505456 | 1780 | /* |
02ccd7c3 DLM |
1781 | * Set the new disk zone parameters only once the queue is frozen and |
1782 | * all I/Os are completed. | |
bf505456 | 1783 | */ |
9b3c08b9 | 1784 | if (ret > 0) |
843283e9 | 1785 | ret = disk_update_zone_resources(disk, &args); |
9b3c08b9 | 1786 | else |
bf505456 | 1787 | pr_warn("%s: failed to revalidate zones\n", disk->disk_name); |
0b83c86b | 1788 | if (ret) { |
1e1a9cec CH |
1789 | unsigned int memflags = blk_mq_freeze_queue(q); |
1790 | ||
843283e9 | 1791 | disk_free_zone_resources(disk); |
1e1a9cec | 1792 | blk_mq_unfreeze_queue(q, memflags); |
0b83c86b | 1793 | } |
bf505456 DLM |
1794 | |
1795 | return ret; | |
1796 | } | |
1797 | EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); | |
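Caller-side sketch (a hypothetical driver, illustration only) of the setup order the kernel-doc above requires: set the zone size and zone append limit before revalidating. The max_hw_zone_append_sectors field name follows recent kernels and may differ in older releases.

	static int example_driver_setup_zones(struct gendisk *disk,
					      sector_t zone_sectors,
					      unsigned int zone_append_max)
	{
		struct queue_limits lim = queue_limits_start_update(disk->queue);
		int ret;

		lim.chunk_sectors = zone_sectors;	/* zone size, power of two */
		lim.max_hw_zone_append_sectors = zone_append_max;
		ret = queue_limits_commit_update(disk->queue, &lim);
		if (ret)
			return ret;

		return blk_revalidate_disk_zones(disk);
	}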
d9f1439a | 1798 | |
b76b840f DLM |
1799 | /** |
1800 | * blk_zone_issue_zeroout - zero-fill a block range in a zone | |
1801 | * @bdev: blockdev to write | |
1802 | * @sector: start sector | |
1803 | * @nr_sects: number of sectors to write | |
1804 | * @gfp_mask: memory allocation flags (for bio_alloc) | |
1805 | * | |
1806 | * Description: | |
1807 | * Zero-fill a block range in a zone (@sector must be equal to the zone write | |
1808 | * pointer), handling potential errors due to the (initially unknown) lack of | |
1809 | * hardware offload (See blkdev_issue_zeroout()). | |
1810 | */ | |
1811 | int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, | |
1812 | sector_t nr_sects, gfp_t gfp_mask) | |
1813 | { | |
1814 | int ret; | |
1815 | ||
1816 | if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) | |
1817 | return -EIO; | |
1818 | ||
1819 | ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, | |
1820 | BLKDEV_ZERO_NOFALLBACK); | |
1821 | if (ret != -EOPNOTSUPP) | |
1822 | return ret; | |
1823 | ||
1824 | /* | |
1825 | * The failed call to blkdev_issue_zeroout() advanced the zone write | |
1826 | * pointer. Undo this using a zone report to update the zone write | |
1827 | * pointer to the correct current value. | |
1828 | */ | |
1829 | ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); | |
1830 | if (ret != 1) | |
1831 | return ret < 0 ? ret : -EIO; | |
1832 | ||
1833 | /* | |
1834 | * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a | |
1835 | * regular write with zero-pages. | |
1836 | */ | |
1837 | return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); | |
1838 | } | |
1839 | EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); | |
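Caller-side sketch (illustration only): zero the sectors that follow the current write pointer of a zone, e.g. to pad a partially written block. @wp must equal the zone write pointer, as the description above requires; GFP_NOFS mirrors typical file system callers.

	static int example_zero_zone_tail(struct block_device *bdev, sector_t wp,
					  sector_t nr_sects)
	{
		return blk_zone_issue_zeroout(bdev, wp, nr_sects, GFP_NOFS);
	}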
1840 | ||
d9f1439a | 1841 | #ifdef CONFIG_BLK_DEBUG_FS |
cb01ecb7 BVA |
1842 | static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, |
1843 | struct seq_file *m) | |
1844 | { | |
1845 | unsigned int zwp_wp_offset, zwp_flags; | |
1846 | unsigned int zwp_zone_no, zwp_ref; | |
1847 | unsigned int zwp_bio_list_size; | |
1848 | unsigned long flags; | |
1849 | ||
1850 | spin_lock_irqsave(&zwplug->lock, flags); | |
1851 | zwp_zone_no = zwplug->zone_no; | |
1852 | zwp_flags = zwplug->flags; | |
1853 | zwp_ref = refcount_read(&zwplug->ref); | |
1854 | zwp_wp_offset = zwplug->wp_offset; | |
1855 | zwp_bio_list_size = bio_list_size(&zwplug->bio_list); | |
1856 | spin_unlock_irqrestore(&zwplug->lock, flags); | |
1857 | ||
1858 | seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref, | |
1859 | zwp_wp_offset, zwp_bio_list_size); | |
1860 | } | |
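Per the seq_printf() format above, each debugfs line prints the zone number, the plug flags in hex, the reference count, the write pointer offset (in 512B sectors) and the number of plugged BIOs. A hypothetical line for a plugged zone (flags 0x1) with queued BIOs could read:

	12 0x1 5 1024 3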
d9f1439a | 1861 | |
a98b05b0 | 1862 | int queue_zone_wplugs_show(void *data, struct seq_file *m) |
d9f1439a DLM |
1863 | { |
1864 | struct request_queue *q = data; | |
a98b05b0 DLM |
1865 | struct gendisk *disk = q->disk; |
1866 | struct blk_zone_wplug *zwplug; | |
cb01ecb7 | 1867 | unsigned int i; |
d9f1439a | 1868 | |
57787fa4 JT |
1869 | if (!disk->zone_wplugs_hash) |
1870 | return 0; | |
1871 | ||
a98b05b0 | 1872 | rcu_read_lock(); |
cb01ecb7 BVA |
1873 | for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) |
1874 | hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], | |
1875 | node) | |
1876 | queue_zone_wplug_show(zwplug, m); | |
a98b05b0 | 1877 | rcu_read_unlock(); |
d9f1439a DLM |
1878 | |
1879 | return 0; | |
1880 | } | |
1881 | ||
1882 | #endif |