// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on an LRU list based on free
 * space recency. Two passes are used to first prioritize discarding extents
 * and then give bitmap trimming the best opportunity to coalesce.
 * The block_groups are maintained on multiple lists to allow for multiple
 * passes with different discard filter requirements. A delayed work item is
 * used to manage discarding with timeout determined by a max of the delay
 * incurred by the iops rate limit, the byte rate limit, and the max delay of
 * BTRFS_DISCARD_MAX_DELAY_MSEC.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a fully free block group
 * after forgetting it. When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list. Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence. We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only letting
 * trimmed regions merge with other trimmed regions. This can cause
 * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
 * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap. This is a
 * tradeoff between discard state accuracy and the cost of accounting.
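 *
 * Async discard is enabled with the "discard=async" mount option, for example:
 *
 *   mount -o discard=async <device> <mountpoint>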
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
#define BTRFS_DISCARD_MAX_IOPS		(1000U)

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
	0,
	BTRFS_ASYNC_DISCARD_MAX_FILTER,
	BTRFS_ASYNC_DISCARD_MIN_FILTER
};

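/* Return the discard list that @block_group is currently assigned to. */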
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  const struct btrfs_block_group *block_group)
{
	return &discard_ctl->discard_list[block_group->discard_index];
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
						     struct btrfs_fs_info,
						     discard_ctl);

	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

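/*
 * Add @block_group to the discard list for its discard_index, taking a
 * reference if it is not already queued. A newly queued block group (or one
 * coming off the unused list) gets a fresh discard_eligible_time and has its
 * discard cursor marked for reset. Callers must hold discard_ctl->lock.
 */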
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{
	lockdep_assert_held(&discard_ctl->lock);

	if (list_empty(&block_group->discard_list) ||
	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
		block_group->discard_eligible_time = (ktime_get_ns() +
						      BTRFS_DISCARD_DELAY);
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	}
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
}

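/*
 * Locked wrapper around __add_to_discard_list() that filters out block groups
 * that are not data only and bails if async discard is not running.
 */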
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	if (!btrfs_run_discard_work(discard_ctl))
		return;

	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
}

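/*
 * Queue a now unused block group on the special unused discard list with the
 * shorter BTRFS_DISCARD_UNUSED_DELAY, so it gets a final trim before being
 * handed over to the unused_bgs path.
 */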
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	bool queued;

	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
}

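/*
 * Remove @block_group from its discard list, dropping the reference taken
 * when it was queued. Returns true if the block group was the one currently
 * being discarded, so the caller can decide whether to wait for or reschedule
 * the discard work.
 */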
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool running = false;
	bool queued = false;

	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	if (queued)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);

	return running;
}

/*
 * Find the block_group that's up next for discarding.
 *
 * @discard_ctl: discard control
 * @now: current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking each block_group's discard_eligible_time.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *ret_block_group = NULL, *block_group;
	int i;

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		struct list_head *discard_list = &discard_ctl->discard_list[i];

		if (!list_empty(discard_list)) {
			block_group = list_first_entry(discard_list,
						       struct btrfs_block_group,
						       discard_list);

			if (!ret_block_group)
				ret_block_group = block_group;

			if (ret_block_group->discard_eligible_time < now)
				break;

			if (ret_block_group->discard_eligible_time >
			    block_group->discard_eligible_time)
				ret_block_group = block_group;
		}
	}

	return ret_block_group;
}

/*
 * Look up the next block group and set it for use.
 *
 * @discard_ctl: discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now: time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here. Variables related to
 * @discard_state are reset here as needed (e.g. @discard_cursor). @discard_state
 * and @discard_index are remembered as they may change while we're discarding,
 * but we want the discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);
again:
	block_group = find_next_block_group(discard_ctl, now);

	if (block_group && now >= block_group->discard_eligible_time) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    block_group->used != 0) {
			if (btrfs_is_block_group_data_only(block_group)) {
				__add_to_discard_list(discard_ctl, block_group);
				/*
				 * The block group must have been moved to
				 * another discard list even if discard was
				 * disabled in the meantime or a transaction
				 * abort happened, otherwise we can end up in
				 * an infinite loop, always jumping into the
				 * 'again' label and getting this block group
				 * over and over in case there are no other
				 * block groups in the discard lists.
				 */
				ASSERT(block_group->discard_index !=
				       BTRFS_DISCARD_INDEX_UNUSED,
				       "discard_index=%d",
				       block_group->discard_index);
			} else {
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
			}
			goto again;
		}
		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			block_group->discard_cursor = block_group->start;
			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
		}
	}
	if (block_group) {
		btrfs_get_block_group(block_group);
		discard_ctl->block_group = block_group;
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);

	return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group: block group of interest
 * @bytes: recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size. Should a free space that matches
 * a larger filter be returned to the free_space_cache, prioritize that discard
 * by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
	    bytes >= discard_minlen[block_group->discard_index - 1]) {
		int i;

		remove_from_discard_list(discard_ctl, block_group);

		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
		     i++) {
			if (bytes >= discard_minlen[i]) {
				block_group->discard_index = i;
				add_to_discard_list(discard_ctl, block_group);
				break;
			}
		}
	}
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index. If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	block_group->discard_index++;
	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
		block_group->discard_index = 1;
		return;
	}

	add_to_discard_list(discard_ctl, block_group);
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists. If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (remove_from_discard_list(discard_ctl, block_group)) {
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
	}
}

/*
 * Handles queuing the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used == 0)
		add_to_discard_unused_list(discard_ctl, block_group);
	else
		add_to_discard_list(discard_ctl, block_group);

	if (!delayed_work_pending(&discard_ctl->work))
		btrfs_discard_schedule_work(discard_ctl, false);
}

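/*
 * Schedule the delayed discard work, computing the delay from the configured
 * limits and the next block group's discard_eligible_time. Callers hold
 * discard_ctl->lock; btrfs_discard_schedule_work() is the locked wrapper.
 */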
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (block_group) {
		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

		/*
		 * A single delayed workqueue item is responsible for
		 * discarding, so we can manage the bytes rate limit by keeping
		 * track of the previous discard.
		 */
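		/*
		 * For example, a kbps_limit of 100 (102400 B/s) and a previous
		 * discard of 1 MiB yield a bps_delay of roughly 10.24 seconds
		 * before the next discard is issued.
		 */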
		if (kbps_limit && discard_ctl->prev_discard) {
			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);

			delay = max(delay, bps_delay);
		}

		/*
		 * This timeout is to hopefully prevent immediate discarding
		 * in a recently allocated block group.
		 */
		if (now < block_group->discard_eligible_time) {
			u64 bg_timeout = block_group->discard_eligible_time - now;

			delay = max(delay, bg_timeout);
		}

		if (override && discard_ctl->prev_discard) {
			u64 elapsed = now - discard_ctl->prev_discard_time;

			if (delay > elapsed)
				delay -= elapsed;
			else
				delay = 0;
		}

		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
	}
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item. @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit. This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list. If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path. Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used == 0) {
		if (btrfs_is_free_space_trimmed(block_group))
			btrfs_mark_bg_unused(block_group);
		else
			add_to_discard_unused_list(discard_ctl, block_group);
	} else {
		btrfs_update_discard_index(discard_ctl, block_group);
	}
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region. It does this in a two-pass fashion: first extents and second
 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
	struct btrfs_discard_ctl *discard_ctl;
	struct btrfs_block_group *block_group;
	enum btrfs_discard_state discard_state;
	int discard_index = 0;
	u64 trimmed = 0;
	u64 minlen = 0;
	u64 now = ktime_get_ns();

	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
	if (!block_group)
		return;
	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		return;
	}
	if (now < block_group->discard_eligible_time) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}

	/* Perform discarding */
	minlen = discard_minlen[discard_index];

	if (discard_state == BTRFS_DISCARD_BITMAPS) {
		u64 maxlen = 0;

		/*
		 * Use the previous level's minimum discard length as the max
		 * length filter. In the case something is added to make a
		 * region go beyond the max filter, the entire bitmap is set
		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
		 */
		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
			maxlen = discard_minlen[discard_index - 1];

		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}

	/* Determine next steps for a block_group */
	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
		if (discard_state == BTRFS_DISCARD_BITMAPS) {
			btrfs_finish_discard_pass(discard_ctl, block_group);
		} else {
			block_group->discard_cursor = block_group->start;
			spin_lock(&discard_ctl->lock);
			if (block_group->discard_state !=
			    BTRFS_DISCARD_RESET_CURSOR)
				block_group->discard_state =
							BTRFS_DISCARD_BITMAPS;
			spin_unlock(&discard_ctl->lock);
		}
	}

	now = ktime_get_ns();
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay which is based off the total number of
 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	s32 discardable_extents;
	s64 discardable_bytes;
	u32 iops_limit;
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;

	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!discardable_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The following is to fix a potential -1 discrepancy that we're not
	 * sure how to reproduce. But given that this is the only place that
	 * utilizes these numbers and this is only called from
	 * btrfs_finish_extent_commit() which is synchronized, we can correct
	 * it here.
	 */
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);

	if (discardable_extents <= 0) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	iops_limit = READ_ONCE(discard_ctl->iops_limit);

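	/*
	 * For example, the default iops_limit of BTRFS_DISCARD_MAX_IOPS (1000)
	 * results in a base delay of 1ms between discards.
	 */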
	if (iops_limit) {
		delay = MSEC_PER_SEC / iops_limit;
	} else {
		/*
		 * Unset iops_limit means go as fast as possible, so allow a
		 * delay of 0.
		 */
		delay = 0;
		min_delay = 0;
	}

	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay_ms = delay;

	spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl. It maintains a current
 * counter and a previous counter, passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl;
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
	    !btrfs_is_block_group_data_only(block_group))
		return;

	ctl = block_group->free_space_ctl;
	discard_ctl = &block_group->fs_info->discard_ctl;

	lockdep_assert_held(&ctl->tree_lock);
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed. In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit. This
 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group, *next;

	spin_lock(&fs_info->unused_bgs_lock);
	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
				 bg_list) {
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused.
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path. As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_block_group *block_group, *next;
	int i;

	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
}

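/*
 * If async discard is enabled, punt any unused block groups onto the discard
 * lists and mark discard as runnable. Otherwise tear down the async discard
 * state.
 */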
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_cleanup(fs_info);
		return;
	}

	btrfs_discard_punt_unused_bgs_list(fs_info);

	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

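/* Clear BTRFS_FS_DISCARD_RUNNING so no further discard work is started. */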
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

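/*
 * Initialize the discard control structure: the lock, the delayed work item,
 * the per-filter discard lists, and the default limits and counters.
 */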
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	discard_ctl->prev_discard_time = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}

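/*
 * Stop async discard, cancel any pending discard work, and purge the discard
 * lists, sending fully free block groups to the unused_bgs path.
 */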
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
	btrfs_discard_stop(fs_info);
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
}