// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on an LRU list based on free
 * space recency. Two passes are used: the first prioritizes discarding
 * extents, and the second trims the bitmaps, which gives free space the best
 * opportunity to coalesce before the bitmaps are trimmed. The block_groups
 * are maintained on multiple lists to allow for multiple passes with
 * different discard filter requirements. A delayed work item is used to
 * manage discarding, with a timeout that is the maximum of the delays
 * incurred by the iops rate limit and the byte rate limit, with the base
 * delay capped at BTRFS_DISCARD_MAX_DELAY_MSEC.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a full free block group
 * after forgetting it. When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list. Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence. We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only letting
 * trimmed regions merge with other trimmed regions. This can cause
 * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
 * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap. This is a
 * tradeoff between discard state accuracy and the cost of accounting.
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
#define BTRFS_DISCARD_MAX_IOPS		(1000U)

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
	0,
	BTRFS_ASYNC_DISCARD_MAX_FILTER,
	BTRFS_ASYNC_DISCARD_MIN_FILTER
};

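/*
 * Return the discard list @block_group is currently filtered onto, based on
 * its discard_index.
 */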
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  struct btrfs_block_group *block_group)
{
	return &discard_ctl->discard_list[block_group->discard_index];
}

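/*
 * Queue @block_group on its filter list with discard_ctl->lock already held.
 * Newly queued block groups, and those moving off the unused list, get a
 * fresh eligible time and a reset cursor; the latter also restart at the
 * first filter list. A reference is taken when the block group is newly
 * queued.
 */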
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{
	lockdep_assert_held(&discard_ctl->lock);
	if (!btrfs_run_discard_work(discard_ctl))
		return;

	if (list_empty(&block_group->discard_list) ||
	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
		block_group->discard_eligible_time = (ktime_get_ns() +
						      BTRFS_DISCARD_DELAY);
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	}
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
}

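/*
 * Locked wrapper around __add_to_discard_list() that only queues block groups
 * used exclusively for data, as async discard does not support the others.
 */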
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
}

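/*
 * Queue a now empty @block_group on the special unused list with the shorter
 * BTRFS_DISCARD_UNUSED_DELAY so it can receive a final trim before heading
 * to the unused_bgs path. A reference is taken if it was not already queued.
 */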
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	bool queued;

	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
}

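/*
 * Remove @block_group from whichever discard list it is on, dropping the
 * list's reference unless the discard workfn is currently operating on it.
 * Returns true if the block group was the one being discarded.
 */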
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool running = false;
	bool queued = false;

	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	/*
	 * If the block group is currently running in the discard workfn, we
	 * don't want to deref it, since it's still being used by the workfn.
	 * The workfn will notice this case and deref the block group when it
	 * is finished.
	 */
	if (queued && !running)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);

	return running;
}

/*
 * Find block_group that's up next for discarding.
 *
 * @discard_ctl: discard control
 * @now: current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of each block_group.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *ret_block_group = NULL, *block_group;
	int i;

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		struct list_head *discard_list = &discard_ctl->discard_list[i];

		if (!list_empty(discard_list)) {
			block_group = list_first_entry(discard_list,
						       struct btrfs_block_group,
						       discard_list);

			if (!ret_block_group)
				ret_block_group = block_group;

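			/* The best candidate so far is already eligible, use it */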
			if (ret_block_group->discard_eligible_time < now)
				break;

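			/* Otherwise track whichever candidate becomes eligible soonest */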
			if (ret_block_group->discard_eligible_time >
			    block_group->discard_eligible_time)
				ret_block_group = block_group;
		}
	}

	return ret_block_group;
}

/*
 * Look up next block group and set it for use.
 *
 * @discard_ctl: discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now: time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here. Variables related to
 * @discard_state are reset here as needed (e.g. @discard_cursor).
 * @discard_state and @discard_index are remembered as they may change while
 * we're discarding, but we want the discard to execute in the context
 * determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);
again:
	block_group = find_next_block_group(discard_ctl, now);

	if (block_group && now >= block_group->discard_eligible_time) {
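		/*
		 * A block group queued as unused has gained allocations since:
		 * requeue data block groups on the normal filter lists and
		 * drop anything else.
		 */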
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    block_group->used != 0) {
			if (btrfs_is_block_group_data_only(block_group)) {
				__add_to_discard_list(discard_ctl, block_group);
			} else {
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
			}
			goto again;
		}
		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			block_group->discard_cursor = block_group->start;
			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
		}
		discard_ctl->block_group = block_group;
	}
	if (block_group) {
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);

	return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group: block group of interest
 * @bytes: recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size. Should a free space region that
 * matches a larger filter be returned to the free_space_cache, prioritize
 * that discard by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
	    bytes >= discard_minlen[block_group->discard_index - 1]) {
		int i;

		remove_from_discard_list(discard_ctl, block_group);

		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
		     i++) {
			if (bytes >= discard_minlen[i]) {
				block_group->discard_index = i;
				add_to_discard_list(discard_ctl, block_group);
				break;
			}
		}
	}
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index. If it falls off the list, let it
 * be. Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	block_group->discard_index++;
	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
		block_group->discard_index = 1;
		return;
	}

	add_to_discard_list(discard_ctl, block_group);
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists. If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (remove_from_discard_list(discard_ctl, block_group)) {
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
	}
}

/*
 * Handles queuing the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used == 0)
		add_to_discard_unused_list(discard_ctl, block_group);
	else
		add_to_discard_list(discard_ctl, block_group);

	if (!delayed_work_pending(&discard_ctl->work))
		btrfs_discard_schedule_work(discard_ctl, false);
}

static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (block_group) {
		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

		/*
		 * A single delayed workqueue item is responsible for
		 * discarding, so we can manage the bytes rate limit by keeping
		 * track of the previous discard.
		 */
		if (kbps_limit && discard_ctl->prev_discard) {
			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);

			delay = max(delay, bps_delay);
		}

		/*
		 * This timeout is to hopefully prevent immediate discarding
		 * in a recently allocated block group.
		 */
		if (now < block_group->discard_eligible_time) {
			u64 bg_timeout = block_group->discard_eligible_time - now;

			delay = max(delay, bg_timeout);
		}

		if (override && discard_ctl->prev_discard) {
			u64 elapsed = now - discard_ctl->prev_discard_time;

			if (delay > elapsed)
				delay -= elapsed;
			else
				delay = 0;
		}

		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
	}
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item. @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit. This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list. If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path. Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used == 0) {
		if (btrfs_is_free_space_trimmed(block_group))
			btrfs_mark_bg_unused(block_group);
		else
			add_to_discard_unused_list(discard_ctl, block_group);
	} else {
		btrfs_update_discard_index(discard_ctl, block_group);
	}
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region. It does this in a two-pass fashion: first extents and second
 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
	struct btrfs_discard_ctl *discard_ctl;
	struct btrfs_block_group *block_group;
	enum btrfs_discard_state discard_state;
	int discard_index = 0;
	u64 trimmed = 0;
	u64 minlen = 0;
	u64 now = ktime_get_ns();

	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
	if (!block_group || !btrfs_run_discard_work(discard_ctl))
		return;
	if (now < block_group->discard_eligible_time) {
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}

	/* Perform discarding */
	minlen = discard_minlen[discard_index];

	if (discard_state == BTRFS_DISCARD_BITMAPS) {
		u64 maxlen = 0;

		/*
		 * Use the previous level's minimum discard length as the max
		 * length filter. In the case something is added to make a
		 * region go beyond the max filter, the entire bitmap is set
		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
		 */
		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
			maxlen = discard_minlen[discard_index - 1];

		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}

	/* Determine next steps for a block_group */
	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
		if (discard_state == BTRFS_DISCARD_BITMAPS) {
			btrfs_finish_discard_pass(discard_ctl, block_group);
		} else {
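			/*
			 * The extents pass is done; reset the cursor and move
			 * on to the bitmaps pass, unless an allocation has
			 * already reset the state so the whole block group
			 * will be retried.
			 */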
			block_group->discard_cursor = block_group->start;
			spin_lock(&discard_ctl->lock);
			if (block_group->discard_state !=
			    BTRFS_DISCARD_RESET_CURSOR)
				block_group->discard_state =
							BTRFS_DISCARD_BITMAPS;
			spin_unlock(&discard_ctl->lock);
		}
	}

	now = ktime_get_ns();
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	/*
	 * If the block group was removed from the discard list while it was
	 * running in this workfn, then we didn't deref it, since this function
	 * still owned that reference. But we set the discard_ctl->block_group
	 * back to NULL, so we can use that condition to know that now we need
	 * to deref the block_group.
	 */
	if (discard_ctl->block_group == NULL)
		btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
						     struct btrfs_fs_info,
						     discard_ctl);

	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay which is based off the total number of
 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	s32 discardable_extents;
	s64 discardable_bytes;
	u32 iops_limit;
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;

	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!discardable_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The following is to fix a potential -1 discrepancy that we're not
	 * sure how to reproduce. But given that this is the only place that
	 * utilizes these numbers and this is only called from
	 * btrfs_finish_extent_commit() which is synchronized, we can correct
	 * here.
	 */
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);

	if (discardable_extents <= 0) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	iops_limit = READ_ONCE(discard_ctl->iops_limit);

	if (iops_limit) {
		delay = MSEC_PER_SEC / iops_limit;
	} else {
		/*
		 * Unset iops_limit means go as fast as possible, so allow a
		 * delay of 0.
		 */
		delay = 0;
		min_delay = 0;
	}

	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay_ms = delay;

	spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl. The free space cache
 * maintains a current and a previous counter; the delta between them is
 * passed up to the global stat, and then the current counter value becomes
 * the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl;
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
	    !btrfs_is_block_group_data_only(block_group))
		return;

	ctl = block_group->free_space_ctl;
	discard_ctl = &block_group->fs_info->discard_ctl;

	lockdep_assert_held(&ctl->tree_lock);
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed. In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit. This
 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group, *next;

	spin_lock(&fs_info->unused_bgs_lock);
	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
				 bg_list) {
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused.
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path. As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_block_group *block_group, *next;
	int i;

	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
}

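/*
 * If async discard is enabled, punt any block groups sitting on the
 * unused_bgs list over to the discard lists and allow the discard work to
 * run; otherwise tear the machinery down.
 */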
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_cleanup(fs_info);
		return;
	}

	btrfs_discard_punt_unused_bgs_list(fs_info);

	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

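/* Initialize the discard control's lists, work item, and default limits */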
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	discard_ctl->prev_discard_time = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}

void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
	btrfs_discard_stop(fs_info);
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
}