btrfs: calculate discard delay based on number of extents
fs/btrfs/discard.c (linux-block.git)
// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)

/* Target completion latency of discarding all discardable extents */
#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC)
#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
#define BTRFS_DISCARD_MAX_IOPS (10U)

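/*
 * These knobs combine in btrfs_discard_calc_delay(): the delay between
 * discards is BTRFS_DISCARD_TARGET_MSEC divided by the number of discardable
 * extents, clamped between a lower bound derived from the iops_limit (or
 * BTRFS_DISCARD_MIN_DELAY_MSEC when no limit is set) and
 * BTRFS_DISCARD_MAX_DELAY_MSEC.
 */
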
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                          struct btrfs_block_group *block_group)
{
        return &discard_ctl->discard_list[block_group->discard_index];
}

static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                  struct btrfs_block_group *block_group)
{
        if (!btrfs_run_discard_work(discard_ctl))
                return;

        if (list_empty(&block_group->discard_list) ||
            block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
                if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
                        block_group->discard_index = BTRFS_DISCARD_INDEX_START;
                block_group->discard_eligible_time = (ktime_get_ns() +
                                                      BTRFS_DISCARD_DELAY);
                block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
        }

        list_move_tail(&block_group->discard_list,
                       get_discard_list(discard_ctl, block_group));
}

static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                struct btrfs_block_group *block_group)
{
        spin_lock(&discard_ctl->lock);
        __add_to_discard_list(discard_ctl, block_group);
        spin_unlock(&discard_ctl->lock);
}

static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
                                       struct btrfs_block_group *block_group)
{
        spin_lock(&discard_ctl->lock);

        if (!btrfs_run_discard_work(discard_ctl)) {
                spin_unlock(&discard_ctl->lock);
                return;
        }

        list_del_init(&block_group->discard_list);

        block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
        block_group->discard_eligible_time = (ktime_get_ns() +
                                              BTRFS_DISCARD_UNUSED_DELAY);
        block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
        list_add_tail(&block_group->discard_list,
                      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

        spin_unlock(&discard_ctl->lock);
}

static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                     struct btrfs_block_group *block_group)
{
        bool running = false;

        spin_lock(&discard_ctl->lock);

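        /*
         * If this block_group is the one currently being discarded by the
         * work item, clear discard_ctl->block_group and report that back so
         * the caller (e.g. btrfs_discard_cancel_work()) knows it must cancel
         * the in-flight work before rescheduling.
         */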
        if (block_group == discard_ctl->block_group) {
                running = true;
                discard_ctl->block_group = NULL;
        }

        block_group->discard_eligible_time = 0;
        list_del_init(&block_group->discard_list);

        spin_unlock(&discard_ctl->lock);

        return running;
}

/**
 * find_next_block_group - find block_group that's up next for discarding
 * @discard_ctl: discard control
 * @now: current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding by checking the discard_eligible_time of the block_group.
 */
static struct btrfs_block_group *find_next_block_group(
                                        struct btrfs_discard_ctl *discard_ctl,
                                        u64 now)
{
        struct btrfs_block_group *ret_block_group = NULL, *block_group;
        int i;

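        /*
         * Lower-index lists take priority: return the head of the first list
         * whose discard_eligible_time has already passed.  If nothing is
         * eligible yet, fall back to the block_group with the soonest
         * discard_eligible_time.
         */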
        for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
                struct list_head *discard_list = &discard_ctl->discard_list[i];

                if (!list_empty(discard_list)) {
                        block_group = list_first_entry(discard_list,
                                                       struct btrfs_block_group,
                                                       discard_list);

                        if (!ret_block_group)
                                ret_block_group = block_group;

                        if (ret_block_group->discard_eligible_time < now)
                                break;

                        if (ret_block_group->discard_eligible_time >
                            block_group->discard_eligible_time)
                                ret_block_group = block_group;
                }
        }

        return ret_block_group;
}

/**
 * peek_discard_list - wrap find_next_block_group()
 * @discard_ctl: discard control
 * @discard_state: the discard_state of the block_group after state management
 *
 * This wraps find_next_block_group() and sets the block_group to be in use.
 * discard_state's control flow is managed here. Variables related to
 * discard_state are reset here as needed (e.g. discard_cursor). @discard_state
 * is remembered as it may change while we're discarding, but we want the
 * discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
                                        struct btrfs_discard_ctl *discard_ctl,
                                        enum btrfs_discard_state *discard_state)
{
        struct btrfs_block_group *block_group;
        const u64 now = ktime_get_ns();

        spin_lock(&discard_ctl->lock);
again:
        block_group = find_next_block_group(discard_ctl, now);

        if (block_group && now > block_group->discard_eligible_time) {
                if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
                    block_group->used != 0) {
                        __add_to_discard_list(discard_ctl, block_group);
                        goto again;
                }
                if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
                        block_group->discard_cursor = block_group->start;
                        block_group->discard_state = BTRFS_DISCARD_EXTENTS;
                }
                discard_ctl->block_group = block_group;
                *discard_state = block_group->discard_state;
        } else {
                block_group = NULL;
        }

        spin_unlock(&discard_ctl->lock);

        return block_group;
}

/**
 * btrfs_discard_cancel_work - remove a block_group from the discard lists
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * This removes @block_group from the discard lists. If necessary, it waits on
 * the current work and then reschedules the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
                               struct btrfs_block_group *block_group)
{
        if (remove_from_discard_list(discard_ctl, block_group)) {
                cancel_delayed_work_sync(&discard_ctl->work);
                btrfs_discard_schedule_work(discard_ctl, true);
        }
}

/**
 * btrfs_discard_queue_work - handles queuing the block_groups
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * This maintains the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
                              struct btrfs_block_group *block_group)
{
        if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
                return;

        if (block_group->used == 0)
                add_to_discard_unused_list(discard_ctl, block_group);
        else
                add_to_discard_list(discard_ctl, block_group);

        if (!delayed_work_pending(&discard_ctl->work))
                btrfs_discard_schedule_work(discard_ctl, false);
}

/**
 * btrfs_discard_schedule_work - responsible for scheduling the discard work
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item. @override is used to
 * update the current delay as the baseline delay interval is reevaluated
 * on transaction commit. This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
                                 bool override)
{
        struct btrfs_block_group *block_group;
        const u64 now = ktime_get_ns();

        spin_lock(&discard_ctl->lock);

        if (!btrfs_run_discard_work(discard_ctl))
                goto out;

        if (!override && delayed_work_pending(&discard_ctl->work))
                goto out;

        block_group = find_next_block_group(discard_ctl, now);
        if (block_group) {
                unsigned long delay = discard_ctl->delay;

                /*
                 * This timeout is to hopefully prevent immediate discarding
                 * in a recently allocated block group.
                 */
                if (now < block_group->discard_eligible_time) {
                        u64 bg_timeout = block_group->discard_eligible_time - now;

                        delay = max(delay, nsecs_to_jiffies(bg_timeout));
                }

                mod_delayed_work(discard_ctl->discard_workers,
                                 &discard_ctl->work, delay);
        }
out:
        spin_unlock(&discard_ctl->lock);
}

/**
 * btrfs_finish_discard_pass - determine next step of a block_group
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * This determines the next step for a block group after it's finished going
 * through a pass on a discard list. If it is unused and fully trimmed, we can
 * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
 * the appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
                                      struct btrfs_block_group *block_group)
{
        remove_from_discard_list(discard_ctl, block_group);

        if (block_group->used == 0) {
                if (btrfs_is_free_space_trimmed(block_group))
                        btrfs_mark_bg_unused(block_group);
                else
                        add_to_discard_unused_list(discard_ctl, block_group);
        }
}

/**
 * btrfs_discard_workfn - discard work function
 * @work: work
 *
 * This finds the next block_group to start discarding and then discards a
 * single region. It does this in a two-pass fashion: first extents and second
 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
        struct btrfs_discard_ctl *discard_ctl;
        struct btrfs_block_group *block_group;
        enum btrfs_discard_state discard_state;
        u64 trimmed = 0;

        discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

        block_group = peek_discard_list(discard_ctl, &discard_state);
        if (!block_group || !btrfs_run_discard_work(discard_ctl))
                return;

        /* Perform discarding */
        if (discard_state == BTRFS_DISCARD_BITMAPS)
                btrfs_trim_block_group_bitmaps(block_group, &trimmed,
                                               block_group->discard_cursor,
                                               btrfs_block_group_end(block_group),
                                               0, true);
        else
                btrfs_trim_block_group_extents(block_group, &trimmed,
                                               block_group->discard_cursor,
                                               btrfs_block_group_end(block_group),
                                               0, true);

        /* Determine next steps for a block_group */
        if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
                if (discard_state == BTRFS_DISCARD_BITMAPS) {
                        btrfs_finish_discard_pass(discard_ctl, block_group);
                } else {
                        block_group->discard_cursor = block_group->start;
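                        /*
                         * Move on to the bitmap pass unless the block group
                         * was re-queued (which resets discard_state to
                         * BTRFS_DISCARD_RESET_CURSOR) while we were trimming
                         * extents.
                         */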
                        spin_lock(&discard_ctl->lock);
                        if (block_group->discard_state !=
                            BTRFS_DISCARD_RESET_CURSOR)
                                block_group->discard_state =
                                                BTRFS_DISCARD_BITMAPS;
                        spin_unlock(&discard_ctl->lock);
                }
        }

        spin_lock(&discard_ctl->lock);
        discard_ctl->block_group = NULL;
        spin_unlock(&discard_ctl->lock);

        btrfs_discard_schedule_work(discard_ctl, false);
}

/**
 * btrfs_run_discard_work - determines if async discard should be running
 * @discard_ctl: discard control
 *
 * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
        struct btrfs_fs_info *fs_info = container_of(discard_ctl,
                                                     struct btrfs_fs_info,
                                                     discard_ctl);

        return (!(fs_info->sb->s_flags & SB_RDONLY) &&
                test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

/**
 * btrfs_discard_calc_delay - recalculate the base delay
 * @discard_ctl: discard control
 *
 * Recalculate the base delay which is based on the total number of
 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
        s32 discardable_extents;
        u32 iops_limit;
        unsigned long delay;
        unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC;

        discardable_extents = atomic_read(&discard_ctl->discardable_extents);
        if (!discardable_extents)
                return;

        spin_lock(&discard_ctl->lock);

        iops_limit = READ_ONCE(discard_ctl->iops_limit);
        if (iops_limit)
                lower_limit = max_t(unsigned long, lower_limit,
                                    MSEC_PER_SEC / iops_limit);

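        /*
         * Illustrative arithmetic: with 100,000 discardable extents and the
         * default iops_limit of 10, delay = 21,600,000ms / 100,000 = 216ms,
         * which already falls inside the clamp range [100ms, 1000ms].
         */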
        delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
        delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC);
        discard_ctl->delay = msecs_to_jiffies(delay);

        spin_unlock(&discard_ctl->lock);
}

/**
 * btrfs_discard_update_discardable - propagate discard counters
 * @block_group: block_group of interest
 * @ctl: free_space_ctl of @block_group
 *
 * This propagates deltas of counters up to the discard_ctl. It maintains a
 * current counter and a previous counter, passing the delta up to the global
 * stat. Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
                                      struct btrfs_free_space_ctl *ctl)
{
        struct btrfs_discard_ctl *discard_ctl;
        s32 extents_delta;
        s64 bytes_delta;

        if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
                return;

        discard_ctl = &block_group->fs_info->discard_ctl;

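        /*
         * Example: if the previous snapshot recorded 10 discardable extents
         * and the free space ctl now sees 13, extents_delta is 3; the global
         * counter is bumped by 3 and BTRFS_STAT_PREV is brought up to 13 so
         * that the next call only reports further changes.
         */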
        extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
                        ctl->discardable_extents[BTRFS_STAT_PREV];
        if (extents_delta) {
                atomic_add(extents_delta, &discard_ctl->discardable_extents);
                ctl->discardable_extents[BTRFS_STAT_PREV] =
                        ctl->discardable_extents[BTRFS_STAT_CURR];
        }

        bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
                      ctl->discardable_bytes[BTRFS_STAT_PREV];
        if (bytes_delta) {
                atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
                ctl->discardable_bytes[BTRFS_STAT_PREV] =
                        ctl->discardable_bytes[BTRFS_STAT_CURR];
        }
}

/**
 * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed. In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit. This
 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *block_group, *next;

        spin_lock(&fs_info->unused_bgs_lock);
        /* We enabled async discard, so punt all to the queue */
        list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
                                 bg_list) {
                list_del_init(&block_group->bg_list);
                btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
}

/**
 * btrfs_discard_purge_list - purge discard lists
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path. As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
        struct btrfs_block_group *block_group, *next;
        int i;

        spin_lock(&discard_ctl->lock);
        for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
                list_for_each_entry_safe(block_group, next,
                                         &discard_ctl->discard_list[i],
                                         discard_list) {
                        list_del_init(&block_group->discard_list);
                        spin_unlock(&discard_ctl->lock);
                        if (block_group->used == 0)
                                btrfs_mark_bg_unused(block_group);
                        spin_lock(&discard_ctl->lock);
                }
        }
        spin_unlock(&discard_ctl->lock);
}

void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
        if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
                btrfs_discard_cleanup(fs_info);
                return;
        }

        btrfs_discard_punt_unused_bgs_list(fs_info);

        set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
        clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
        struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
        int i;

        spin_lock_init(&discard_ctl->lock);
        INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

        for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
                INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

        atomic_set(&discard_ctl->discardable_extents, 0);
        atomic64_set(&discard_ctl->discardable_bytes, 0);
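        /*
         * Start with the longest delay and the default iops limit;
         * btrfs_discard_calc_delay() recomputes the delay from the number of
         * discardable extents as that count changes.
         */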
        discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC;
        discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
}

void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
        btrfs_discard_stop(fs_info);
        cancel_delayed_work_sync(&fs_info->discard_ctl.work);
        btrfs_discard_purge_list(&fs_info->discard_ctl);
}