Merge tag 'pci-v6.16-fixes-3' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci
[linux-block.git] / fs / btrfs / discard.c
CommitLineData
b0643e59
DZ
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/jiffies.h>
4#include <linux/kernel.h>
5#include <linux/ktime.h>
6#include <linux/list.h>
e93591bb 7#include <linux/math64.h>
b0643e59
DZ
8#include <linux/sizes.h>
9#include <linux/workqueue.h>
10#include "ctree.h"
11#include "block-group.h"
12#include "discard.h"
13#include "free-space-cache.h"
fc97a410 14#include "fs.h"
b0643e59 15
dbc2a8c9
DZ
16/*
17 * This contains the logic to handle async discard.
18 *
19 * Async discard manages trimming of free space outside of transaction commit.
20 * Discarding is done by managing the block_groups on a LRU list based on free
21 * space recency. Two passes are used to first prioritize discarding extents
22 * and then allow for trimming in the bitmap the best opportunity to coalesce.
23 * The block_groups are maintained on multiple lists to allow for multiple
24 * passes with different discard filter requirements. A delayed work item is
25 * used to manage discarding with timeout determined by a max of the delay
26 * incurred by the iops rate limit, the byte rate limit, and the max delay of
27 * BTRFS_DISCARD_MAX_DELAY_MSEC.
28 *
29 * Note, this only keeps track of block_groups that are explicitly for data.
30 * Mixed block_groups are not supported.
31 *
32 * The first list is special to manage discarding of fully free block groups.
33 * This is necessary because we issue a final trim for a full free block group
34 * after forgetting it. When a block group becomes unused, instead of directly
35 * being added to the unused_bgs list, we add it to this first list. Then
36 * from there, if it becomes fully discarded, we place it onto the unused_bgs
37 * list.
38 *
39 * The in-memory free space cache serves as the backing state for discard.
40 * Consequently this means there is no persistence. We opt to load all the
41 * block groups in as not discarded, so the mount case degenerates to the
42 * crashing case.
43 *
44 * As the free space cache uses bitmaps, there exists a tradeoff between
45 * ease/efficiency for find_free_extent() and the accuracy of discard state.
46 * Here we opt to let untrimmed regions merge with everything while only letting
47 * trimmed regions merge with other trimmed regions. This can cause
48 * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
49 * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
50 * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
51 * this resets the state and we will retry trimming the whole bitmap. This is a
52 * tradeoff between discard state accuracy and the cost of accounting.
53 */
54
b0643e59
DZ
/*
 * Initial delay before a block group placed on a filtered discard list becomes
 * eligible for discard; gives some chance for block reuse.
 */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
/* Eligibility delay used for block groups placed on the unused list. */
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

/* Lower and upper clamp, in milliseconds, for the base delay between discards. */
#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
/* Value iops_limit is initialised to. */
#define BTRFS_DISCARD_MAX_IOPS		(1000U)
a2309300 62
43dd529a 63/* Monotonically decreasing minimum length filters after index 0 */
7fe6d45e
DZ
64static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
65 0,
66 BTRFS_ASYNC_DISCARD_MAX_FILTER,
67 BTRFS_ASYNC_DISCARD_MIN_FILTER
68};
69
b0643e59 70static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
ca283ea9 71 const struct btrfs_block_group *block_group)
b0643e59
DZ
72{
73 return &discard_ctl->discard_list[block_group->discard_index];
74}
75
bb5167e6
JT
76/*
77 * Determine if async discard should be running.
78 *
79 * @discard_ctl: discard control
80 *
81 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
82 */
ca283ea9 83static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
bb5167e6
JT
84{
85 struct btrfs_fs_info *fs_info = container_of(discard_ctl,
86 struct btrfs_fs_info,
87 discard_ctl);
88
89 return (!(fs_info->sb->s_flags & SB_RDONLY) &&
90 test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
91}
92
2bee7eb8
DZ
93static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
94 struct btrfs_block_group *block_group)
b0643e59 95{
2b5463fc 96 lockdep_assert_held(&discard_ctl->lock);
b0643e59 97
6e80d4f8
DZ
98 if (list_empty(&block_group->discard_list) ||
99 block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
100 if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
101 block_group->discard_index = BTRFS_DISCARD_INDEX_START;
b0643e59
DZ
102 block_group->discard_eligible_time = (ktime_get_ns() +
103 BTRFS_DISCARD_DELAY);
2bee7eb8 104 block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
6e80d4f8 105 }
2b5463fc
BB
106 if (list_empty(&block_group->discard_list))
107 btrfs_get_block_group(block_group);
b0643e59
DZ
108
109 list_move_tail(&block_group->discard_list,
110 get_discard_list(discard_ctl, block_group));
2bee7eb8 111}
b0643e59 112
2bee7eb8
DZ
113static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
114 struct btrfs_block_group *block_group)
115{
5cb0724e
DZ
116 if (!btrfs_is_block_group_data_only(block_group))
117 return;
118
54db6d1b
FM
119 if (!btrfs_run_discard_work(discard_ctl))
120 return;
121
2bee7eb8
DZ
122 spin_lock(&discard_ctl->lock);
123 __add_to_discard_list(discard_ctl, block_group);
b0643e59
DZ
124 spin_unlock(&discard_ctl->lock);
125}
126
6e80d4f8
DZ
127static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
128 struct btrfs_block_group *block_group)
129{
2b5463fc
BB
130 bool queued;
131
6e80d4f8
DZ
132 spin_lock(&discard_ctl->lock);
133
2b5463fc
BB
134 queued = !list_empty(&block_group->discard_list);
135
6e80d4f8
DZ
136 if (!btrfs_run_discard_work(discard_ctl)) {
137 spin_unlock(&discard_ctl->lock);
138 return;
139 }
140
141 list_del_init(&block_group->discard_list);
142
143 block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
144 block_group->discard_eligible_time = (ktime_get_ns() +
145 BTRFS_DISCARD_UNUSED_DELAY);
2bee7eb8 146 block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
2b5463fc
BB
147 if (!queued)
148 btrfs_get_block_group(block_group);
6e80d4f8
DZ
149 list_add_tail(&block_group->discard_list,
150 &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
151
152 spin_unlock(&discard_ctl->lock);
153}
154
b0643e59
DZ
155static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
156 struct btrfs_block_group *block_group)
157{
158 bool running = false;
2b5463fc 159 bool queued = false;
b0643e59
DZ
160
161 spin_lock(&discard_ctl->lock);
162
163 if (block_group == discard_ctl->block_group) {
164 running = true;
165 discard_ctl->block_group = NULL;
166 }
167
168 block_group->discard_eligible_time = 0;
2b5463fc 169 queued = !list_empty(&block_group->discard_list);
b0643e59 170 list_del_init(&block_group->discard_list);
895c6721 171 if (queued)
2b5463fc 172 btrfs_put_block_group(block_group);
b0643e59
DZ
173
174 spin_unlock(&discard_ctl->lock);
175
176 return running;
177}
178
43dd529a
DS
179/*
180 * Find block_group that's up next for discarding.
181 *
182 * @discard_ctl: discard control
183 * @now: current time
b0643e59
DZ
184 *
185 * Iterate over the discard lists to find the next block_group up for
186 * discarding checking the discard_eligible_time of block_group.
187 */
188static struct btrfs_block_group *find_next_block_group(
189 struct btrfs_discard_ctl *discard_ctl,
190 u64 now)
191{
192 struct btrfs_block_group *ret_block_group = NULL, *block_group;
193 int i;
194
195 for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
196 struct list_head *discard_list = &discard_ctl->discard_list[i];
197
198 if (!list_empty(discard_list)) {
199 block_group = list_first_entry(discard_list,
200 struct btrfs_block_group,
201 discard_list);
202
203 if (!ret_block_group)
204 ret_block_group = block_group;
205
206 if (ret_block_group->discard_eligible_time < now)
207 break;
208
209 if (ret_block_group->discard_eligible_time >
210 block_group->discard_eligible_time)
211 ret_block_group = block_group;
212 }
213 }
214
215 return ret_block_group;
216}
217
43dd529a
DS
218/*
219 * Look up next block group and set it for use.
92419695
NB
220 *
221 * @discard_ctl: discard control
2bee7eb8 222 * @discard_state: the discard_state of the block_group after state management
7fe6d45e 223 * @discard_index: the discard_index of the block_group after state management
92419695 224 * @now: time when discard was invoked, in ns
b0643e59 225 *
43dd529a
DS
226 * Wrap find_next_block_group() and set the block_group to be in use.
227 * @discard_state's control flow is managed here. Variables related to
228 * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state
7fe6d45e
DZ
229 * and @discard_index are remembered as it may change while we're discarding,
230 * but we want the discard to execute in the context determined here.
b0643e59
DZ
231 */
232static struct btrfs_block_group *peek_discard_list(
2bee7eb8 233 struct btrfs_discard_ctl *discard_ctl,
7fe6d45e 234 enum btrfs_discard_state *discard_state,
ea9ed87c 235 int *discard_index, u64 now)
b0643e59
DZ
236{
237 struct btrfs_block_group *block_group;
b0643e59
DZ
238
239 spin_lock(&discard_ctl->lock);
2bee7eb8 240again:
b0643e59
DZ
241 block_group = find_next_block_group(discard_ctl, now);
242
ea9ed87c 243 if (block_group && now >= block_group->discard_eligible_time) {
2bee7eb8
DZ
244 if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
245 block_group->used != 0) {
2b5463fc 246 if (btrfs_is_block_group_data_only(block_group)) {
5cb0724e 247 __add_to_discard_list(discard_ctl, block_group);
54db6d1b
FM
248 /*
249 * The block group must have been moved to other
250 * discard list even if discard was disabled in
251 * the meantime or a transaction abort happened,
252 * otherwise we can end up in an infinite loop,
253 * always jumping into the 'again' label and
254 * keep getting this block group over and over
255 * in case there are no other block groups in
256 * the discard lists.
257 */
258 ASSERT(block_group->discard_index !=
1886b77f
FM
259 BTRFS_DISCARD_INDEX_UNUSED,
260 "discard_index=%d",
261 block_group->discard_index);
2b5463fc 262 } else {
5cb0724e 263 list_del_init(&block_group->discard_list);
2b5463fc
BB
264 btrfs_put_block_group(block_group);
265 }
2bee7eb8
DZ
266 goto again;
267 }
268 if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
269 block_group->discard_cursor = block_group->start;
270 block_group->discard_state = BTRFS_DISCARD_EXTENTS;
271 }
ea9ed87c
PB
272 }
273 if (block_group) {
895c6721
BB
274 btrfs_get_block_group(block_group);
275 discard_ctl->block_group = block_group;
2bee7eb8 276 *discard_state = block_group->discard_state;
7fe6d45e 277 *discard_index = block_group->discard_index;
2bee7eb8 278 }
b0643e59
DZ
279 spin_unlock(&discard_ctl->lock);
280
281 return block_group;
282}
283
43dd529a
DS
284/*
285 * Update a block group's filters.
286 *
287 * @block_group: block group of interest
288 * @bytes: recently freed region size after coalescing
7fe6d45e
DZ
289 *
290 * Async discard maintains multiple lists with progressively smaller filters
291 * to prioritize discarding based on size. Should a free space that matches
292 * a larger filter be returned to the free_space_cache, prioritize that discard
293 * by moving @block_group to the proper filter.
294 */
295void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
296 u64 bytes)
297{
298 struct btrfs_discard_ctl *discard_ctl;
299
300 if (!block_group ||
301 !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
302 return;
303
304 discard_ctl = &block_group->fs_info->discard_ctl;
305
306 if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
307 bytes >= discard_minlen[block_group->discard_index - 1]) {
308 int i;
309
310 remove_from_discard_list(discard_ctl, block_group);
311
312 for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
313 i++) {
314 if (bytes >= discard_minlen[i]) {
315 block_group->discard_index = i;
316 add_to_discard_list(discard_ctl, block_group);
317 break;
318 }
319 }
320 }
321}
322
43dd529a
DS
323/*
324 * Move a block group along the discard lists.
325 *
7fe6d45e
DZ
326 * @discard_ctl: discard control
327 * @block_group: block_group of interest
328 *
329 * Increment @block_group's discard_index. If it falls of the list, let it be.
330 * Otherwise add it back to the appropriate list.
331 */
332static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
333 struct btrfs_block_group *block_group)
334{
335 block_group->discard_index++;
336 if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
337 block_group->discard_index = 1;
338 return;
339 }
340
341 add_to_discard_list(discard_ctl, block_group);
342}
343
43dd529a
DS
344/*
345 * Remove a block_group from the discard lists.
346 *
b0643e59
DZ
347 * @discard_ctl: discard control
348 * @block_group: block_group of interest
349 *
43dd529a
DS
350 * Remove @block_group from the discard lists. If necessary, wait on the
351 * current work and then reschedule the delayed work.
b0643e59
DZ
352 */
353void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
354 struct btrfs_block_group *block_group)
355{
356 if (remove_from_discard_list(discard_ctl, block_group)) {
357 cancel_delayed_work_sync(&discard_ctl->work);
358 btrfs_discard_schedule_work(discard_ctl, true);
359 }
360}
361
43dd529a
DS
362/*
363 * Handles queuing the block_groups.
364 *
b0643e59
DZ
365 * @discard_ctl: discard control
366 * @block_group: block_group of interest
367 *
43dd529a 368 * Maintain the LRU order of the discard lists.
b0643e59
DZ
369 */
370void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
371 struct btrfs_block_group *block_group)
372{
373 if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
374 return;
375
6e80d4f8
DZ
376 if (block_group->used == 0)
377 add_to_discard_unused_list(discard_ctl, block_group);
378 else
379 add_to_discard_list(discard_ctl, block_group);
b0643e59
DZ
380
381 if (!delayed_work_pending(&discard_ctl->work))
382 btrfs_discard_schedule_work(discard_ctl, false);
383}
384
8fc05859
PB
385static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
386 u64 now, bool override)
b0643e59
DZ
387{
388 struct btrfs_block_group *block_group;
b0643e59
DZ
389
390 if (!btrfs_run_discard_work(discard_ctl))
8fc05859 391 return;
b0643e59 392 if (!override && delayed_work_pending(&discard_ctl->work))
8fc05859 393 return;
b0643e59
DZ
394
395 block_group = find_next_block_group(discard_ctl, now);
396 if (block_group) {
6e88f116 397 u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
e93591bb
DZ
398 u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
399
400 /*
401 * A single delayed workqueue item is responsible for
402 * discarding, so we can manage the bytes rate limit by keeping
403 * track of the previous discard.
404 */
405 if (kbps_limit && discard_ctl->prev_discard) {
406 u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
407 u64 bps_delay = div64_u64(discard_ctl->prev_discard *
6e88f116 408 NSEC_PER_SEC, bps_limit);
e93591bb 409
6e88f116 410 delay = max(delay, bps_delay);
e93591bb 411 }
a2309300
DZ
412
413 /*
414 * This timeout is to hopefully prevent immediate discarding
415 * in a recently allocated block group.
416 */
417 if (now < block_group->discard_eligible_time) {
418 u64 bg_timeout = block_group->discard_eligible_time - now;
b0643e59 419
6e88f116 420 delay = max(delay, bg_timeout);
a2309300 421 }
b0643e59 422
df903e5d
PB
423 if (override && discard_ctl->prev_discard) {
424 u64 elapsed = now - discard_ctl->prev_discard_time;
425
426 if (delay > elapsed)
427 delay -= elapsed;
428 else
429 delay = 0;
430 }
431
b0643e59 432 mod_delayed_work(discard_ctl->discard_workers,
6e88f116 433 &discard_ctl->work, nsecs_to_jiffies(delay));
b0643e59 434 }
8fc05859
PB
435}
436
437/*
43dd529a
DS
438 * Responsible for scheduling the discard work.
439 *
8fc05859
PB
440 * @discard_ctl: discard control
441 * @override: override the current timer
442 *
443 * Discards are issued by a delayed workqueue item. @override is used to
444 * update the current delay as the baseline delay interval is reevaluated on
445 * transaction commit. This is also maxed with any other rate limit.
446 */
447void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
448 bool override)
449{
450 const u64 now = ktime_get_ns();
451
452 spin_lock(&discard_ctl->lock);
453 __btrfs_discard_schedule_work(discard_ctl, now, override);
b0643e59
DZ
454 spin_unlock(&discard_ctl->lock);
455}
456
43dd529a
DS
457/*
458 * Determine next step of a block_group.
459 *
6e80d4f8
DZ
460 * @discard_ctl: discard control
461 * @block_group: block_group of interest
462 *
43dd529a
DS
463 * Determine the next step for a block group after it's finished going through
464 * a pass on a discard list. If it is unused and fully trimmed, we can mark it
465 * unused and send it to the unused_bgs path. Otherwise, pass it onto the
466 * appropriate filter list or let it fall off.
6e80d4f8
DZ
467 */
468static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
469 struct btrfs_block_group *block_group)
470{
471 remove_from_discard_list(discard_ctl, block_group);
472
473 if (block_group->used == 0) {
474 if (btrfs_is_free_space_trimmed(block_group))
475 btrfs_mark_bg_unused(block_group);
476 else
477 add_to_discard_unused_list(discard_ctl, block_group);
7fe6d45e
DZ
478 } else {
479 btrfs_update_discard_index(discard_ctl, block_group);
6e80d4f8
DZ
480 }
481}
482
43dd529a
DS
483/*
484 * Discard work queue callback
485 *
b0643e59
DZ
486 * @work: work
487 *
43dd529a
DS
488 * Find the next block_group to start discarding and then discard a single
489 * region. It does this in a two-pass fashion: first extents and second
2bee7eb8 490 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
b0643e59
DZ
491 */
492static void btrfs_discard_workfn(struct work_struct *work)
493{
494 struct btrfs_discard_ctl *discard_ctl;
495 struct btrfs_block_group *block_group;
2bee7eb8 496 enum btrfs_discard_state discard_state;
7fe6d45e 497 int discard_index = 0;
b0643e59 498 u64 trimmed = 0;
7fe6d45e 499 u64 minlen = 0;
ea9ed87c 500 u64 now = ktime_get_ns();
b0643e59
DZ
501
502 discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
503
7fe6d45e 504 block_group = peek_discard_list(discard_ctl, &discard_state,
ea9ed87c 505 &discard_index, now);
895c6721 506 if (!block_group)
b0643e59 507 return;
895c6721
BB
508 if (!btrfs_run_discard_work(discard_ctl)) {
509 spin_lock(&discard_ctl->lock);
510 btrfs_put_block_group(block_group);
511 discard_ctl->block_group = NULL;
512 spin_unlock(&discard_ctl->lock);
513 return;
514 }
ea9ed87c 515 if (now < block_group->discard_eligible_time) {
895c6721
BB
516 spin_lock(&discard_ctl->lock);
517 btrfs_put_block_group(block_group);
518 discard_ctl->block_group = NULL;
519 spin_unlock(&discard_ctl->lock);
ea9ed87c
PB
520 btrfs_discard_schedule_work(discard_ctl, false);
521 return;
522 }
b0643e59 523
2bee7eb8 524 /* Perform discarding */
7fe6d45e
DZ
525 minlen = discard_minlen[discard_index];
526
527 if (discard_state == BTRFS_DISCARD_BITMAPS) {
528 u64 maxlen = 0;
529
530 /*
531 * Use the previous levels minimum discard length as the max
532 * length filter. In the case something is added to make a
533 * region go beyond the max filter, the entire bitmap is set
534 * back to BTRFS_TRIM_STATE_UNTRIMMED.
535 */
536 if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
537 maxlen = discard_minlen[discard_index - 1];
538
2bee7eb8
DZ
539 btrfs_trim_block_group_bitmaps(block_group, &trimmed,
540 block_group->discard_cursor,
541 btrfs_block_group_end(block_group),
7fe6d45e 542 minlen, maxlen, true);
9ddf648f 543 discard_ctl->discard_bitmap_bytes += trimmed;
7fe6d45e 544 } else {
2bee7eb8
DZ
545 btrfs_trim_block_group_extents(block_group, &trimmed,
546 block_group->discard_cursor,
547 btrfs_block_group_end(block_group),
7fe6d45e 548 minlen, true);
9ddf648f 549 discard_ctl->discard_extent_bytes += trimmed;
7fe6d45e 550 }
2bee7eb8
DZ
551
552 /* Determine next steps for a block_group */
553 if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
554 if (discard_state == BTRFS_DISCARD_BITMAPS) {
555 btrfs_finish_discard_pass(discard_ctl, block_group);
556 } else {
557 block_group->discard_cursor = block_group->start;
558 spin_lock(&discard_ctl->lock);
559 if (block_group->discard_state !=
560 BTRFS_DISCARD_RESET_CURSOR)
561 block_group->discard_state =
562 BTRFS_DISCARD_BITMAPS;
563 spin_unlock(&discard_ctl->lock);
564 }
565 }
566
1ea2872f 567 now = ktime_get_ns();
2bee7eb8 568 spin_lock(&discard_ctl->lock);
1ea2872f
PB
569 discard_ctl->prev_discard = trimmed;
570 discard_ctl->prev_discard_time = now;
895c6721 571 btrfs_put_block_group(block_group);
2bee7eb8 572 discard_ctl->block_group = NULL;
8fc05859 573 __btrfs_discard_schedule_work(discard_ctl, now, false);
2bee7eb8 574 spin_unlock(&discard_ctl->lock);
b0643e59
DZ
575}
576
43dd529a
DS
577/*
578 * Recalculate the base delay.
579 *
a2309300
DZ
580 * @discard_ctl: discard control
581 *
582 * Recalculate the base delay which is based off the total number of
583 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
584 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
585 */
586void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
587{
588 s32 discardable_extents;
81b29a3b 589 s64 discardable_bytes;
a2309300 590 u32 iops_limit;
ef9cddfe 591 unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
a2309300 592 unsigned long delay;
a2309300
DZ
593
594 discardable_extents = atomic_read(&discard_ctl->discardable_extents);
595 if (!discardable_extents)
596 return;
597
598 spin_lock(&discard_ctl->lock);
599
81b29a3b 600 /*
43dd529a 601 * The following is to fix a potential -1 discrepancy that we're not
81b29a3b
DZ
602 * sure how to reproduce. But given that this is the only place that
603 * utilizes these numbers and this is only called by from
604 * btrfs_finish_extent_commit() which is synchronized, we can correct
605 * here.
606 */
607 if (discardable_extents < 0)
608 atomic_add(-discardable_extents,
609 &discard_ctl->discardable_extents);
610
611 discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
612 if (discardable_bytes < 0)
613 atomic64_add(-discardable_bytes,
614 &discard_ctl->discardable_bytes);
615
616 if (discardable_extents <= 0) {
617 spin_unlock(&discard_ctl->lock);
618 return;
619 }
620
a2309300 621 iops_limit = READ_ONCE(discard_ctl->iops_limit);
ef9cddfe
BB
622
623 if (iops_limit) {
e50404a8 624 delay = MSEC_PER_SEC / iops_limit;
ef9cddfe
BB
625 } else {
626 /*
627 * Unset iops_limit means go as fast as possible, so allow a
628 * delay of 0.
629 */
630 delay = 0;
631 min_delay = 0;
632 }
a2309300 633
ef9cddfe 634 delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
6e88f116 635 discard_ctl->delay_ms = delay;
a2309300
DZ
636
637 spin_unlock(&discard_ctl->lock);
638}
639
43dd529a
DS
640/*
641 * Propagate discard counters.
642 *
dfb79ddb 643 * @block_group: block_group of interest
dfb79ddb 644 *
43dd529a
DS
645 * Propagate deltas of counters up to the discard_ctl. It maintains a current
646 * counter and a previous counter passing the delta up to the global stat.
647 * Then the current counter value becomes the previous counter value.
dfb79ddb 648 */
66b53bae 649void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
dfb79ddb 650{
66b53bae 651 struct btrfs_free_space_ctl *ctl;
dfb79ddb
DZ
652 struct btrfs_discard_ctl *discard_ctl;
653 s32 extents_delta;
5dc7c10b 654 s64 bytes_delta;
dfb79ddb 655
5cb0724e
DZ
656 if (!block_group ||
657 !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
658 !btrfs_is_block_group_data_only(block_group))
dfb79ddb
DZ
659 return;
660
66b53bae 661 ctl = block_group->free_space_ctl;
dfb79ddb
DZ
662 discard_ctl = &block_group->fs_info->discard_ctl;
663
66b53bae 664 lockdep_assert_held(&ctl->tree_lock);
dfb79ddb
DZ
665 extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
666 ctl->discardable_extents[BTRFS_STAT_PREV];
667 if (extents_delta) {
668 atomic_add(extents_delta, &discard_ctl->discardable_extents);
669 ctl->discardable_extents[BTRFS_STAT_PREV] =
670 ctl->discardable_extents[BTRFS_STAT_CURR];
671 }
5dc7c10b
DZ
672
673 bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
674 ctl->discardable_bytes[BTRFS_STAT_PREV];
675 if (bytes_delta) {
676 atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
677 ctl->discardable_bytes[BTRFS_STAT_PREV] =
678 ctl->discardable_bytes[BTRFS_STAT_CURR];
679 }
dfb79ddb
DZ
680}
681
43dd529a
DS
682/*
683 * Punt unused_bgs list to discard lists.
684 *
6e80d4f8
DZ
685 * @fs_info: fs_info of interest
686 *
687 * The unused_bgs list needs to be punted to the discard lists because the
1a9fd417 688 * order of operations is changed. In the normal synchronous discard path, the
6e80d4f8
DZ
689 * block groups are trimmed via a single large trim in transaction commit. This
690 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
691 * it must be done before going down the unused_bgs path.
692 */
693void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
694{
695 struct btrfs_block_group *block_group, *next;
696
697 spin_lock(&fs_info->unused_bgs_lock);
698 /* We enabled async discard, so punt all to the queue */
699 list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
700 bg_list) {
701 list_del_init(&block_group->bg_list);
702 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2b5463fc
BB
703 /*
704 * This put is for the get done by btrfs_mark_bg_unused.
705 * Queueing discard incremented it for discard's reference.
706 */
707 btrfs_put_block_group(block_group);
6e80d4f8
DZ
708 }
709 spin_unlock(&fs_info->unused_bgs_lock);
710}
711
43dd529a
DS
712/*
713 * Purge discard lists.
714 *
6e80d4f8
DZ
715 * @discard_ctl: discard control
716 *
717 * If we are disabling async discard, we may have intercepted block groups that
718 * are completely free and ready for the unused_bgs path. As discarding will
719 * now happen in transaction commit or not at all, we can safely mark the
720 * corresponding block groups as unused and they will be sent on their merry
721 * way to the unused_bgs list.
722 */
723static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
724{
725 struct btrfs_block_group *block_group, *next;
726 int i;
727
728 spin_lock(&discard_ctl->lock);
729 for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
730 list_for_each_entry_safe(block_group, next,
731 &discard_ctl->discard_list[i],
732 discard_list) {
733 list_del_init(&block_group->discard_list);
734 spin_unlock(&discard_ctl->lock);
735 if (block_group->used == 0)
736 btrfs_mark_bg_unused(block_group);
737 spin_lock(&discard_ctl->lock);
2b5463fc 738 btrfs_put_block_group(block_group);
6e80d4f8
DZ
739 }
740 }
741 spin_unlock(&discard_ctl->lock);
742}
743
b0643e59
DZ
744void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
745{
746 if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
747 btrfs_discard_cleanup(fs_info);
748 return;
749 }
750
6e80d4f8
DZ
751 btrfs_discard_punt_unused_bgs_list(fs_info);
752
b0643e59
DZ
753 set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
754}
755
756void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
757{
758 clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
759}
760
761void btrfs_discard_init(struct btrfs_fs_info *fs_info)
762{
763 struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
764 int i;
765
766 spin_lock_init(&discard_ctl->lock);
767 INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
768
769 for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
770 INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
dfb79ddb 771
e93591bb 772 discard_ctl->prev_discard = 0;
df903e5d 773 discard_ctl->prev_discard_time = 0;
dfb79ddb 774 atomic_set(&discard_ctl->discardable_extents, 0);
5dc7c10b 775 atomic64_set(&discard_ctl->discardable_bytes, 0);
19b2a2c7 776 discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
6e88f116 777 discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
a2309300 778 discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
e93591bb 779 discard_ctl->kbps_limit = 0;
9ddf648f
DZ
780 discard_ctl->discard_extent_bytes = 0;
781 discard_ctl->discard_bitmap_bytes = 0;
782 atomic64_set(&discard_ctl->discard_bytes_saved, 0);
b0643e59
DZ
783}
784
785void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
786{
787 btrfs_discard_stop(fs_info);
788 cancel_delayed_work_sync(&fs_info->discard_ctl.work);
6e80d4f8 789 btrfs_discard_purge_list(&fs_info->discard_ctl);
b0643e59 790}