Commit | Line | Data |
---|---|---|
b0643e59 DZ |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include <linux/jiffies.h> | |
4 | #include <linux/kernel.h> | |
5 | #include <linux/ktime.h> | |
6 | #include <linux/list.h> | |
e93591bb | 7 | #include <linux/math64.h> |
b0643e59 DZ |
8 | #include <linux/sizes.h> |
9 | #include <linux/workqueue.h> | |
10 | #include "ctree.h" | |
11 | #include "block-group.h" | |
12 | #include "discard.h" | |
13 | #include "free-space-cache.h" | |
14 | ||
dbc2a8c9 DZ |
15 | /* |
16 | * This contains the logic to handle async discard. | |
17 | * | |
18 | * Async discard manages trimming of free space outside of transaction commit. | |
19 | * Discarding is done by managing the block_groups on a LRU list based on free | |
20 | * space recency. Two passes are used to first prioritize discarding extents | |
21 | * and then allow for trimming in the bitmap the best opportunity to coalesce. | |
22 | * The block_groups are maintained on multiple lists to allow for multiple | |
23 | * passes with different discard filter requirements. A delayed work item is | |
24 | * used to manage discarding with timeout determined by a max of the delay | |
25 | * incurred by the iops rate limit, the byte rate limit, and the max delay of | |
26 | * BTRFS_DISCARD_MAX_DELAY_MSEC. | |
27 | * | |
28 | * Note, this only keeps track of block_groups that are explicitly for data. | |
29 | * Mixed block_groups are not supported. | |
30 | * | |
31 | * The first list is special to manage discarding of fully free block groups. | |
32 | * This is necessary because we issue a final trim for a full free block group | |
33 | * after forgetting it. When a block group becomes unused, instead of directly | |
34 | * being added to the unused_bgs list, we add it to this first list. Then | |
35 | * from there, if it becomes fully discarded, we place it onto the unused_bgs | |
36 | * list. | |
37 | * | |
38 | * The in-memory free space cache serves as the backing state for discard. | |
39 | * Consequently this means there is no persistence. We opt to load all the | |
40 | * block groups in as not discarded, so the mount case degenerates to the | |
41 | * crashing case. | |
42 | * | |
43 | * As the free space cache uses bitmaps, there exists a tradeoff between | |
44 | * ease/efficiency for find_free_extent() and the accuracy of discard state. | |
45 | * Here we opt to let untrimmed regions merge with everything while only letting | |
46 | * trimmed regions merge with other trimmed regions. This can cause | |
47 | * overtrimming, but the coalescing benefit seems to be worth it. Additionally, | |
48 | * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, | |
49 | * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, | |
50 | * this resets the state and we will retry trimming the whole bitmap. This is a | |
51 | * tradeoff between discard state accuracy and the cost of accounting. | |
52 | */ | |
53 | ||
b0643e59 DZ |
/*
 * Delay before a block group queued on a regular discard list becomes
 * eligible, giving freshly freed space some chance to be reused first.
 */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
/* Eligibility delay used when a block group is queued on the unused list */
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

/* Target completion latency of discarding all discardable extents */
#define BTRFS_DISCARD_TARGET_MSEC	(6 * 60 * 60UL * MSEC_PER_SEC)
/* Bounds the base delay is clamped to in btrfs_discard_calc_delay() */
#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
/* iops_limit installed by btrfs_discard_init() */
#define BTRFS_DISCARD_MAX_IOPS		(10U)
63 | ||
7fe6d45e DZ |
64 | /* Montonically decreasing minimum length filters after index 0 */ |
65 | static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { | |
66 | 0, | |
67 | BTRFS_ASYNC_DISCARD_MAX_FILTER, | |
68 | BTRFS_ASYNC_DISCARD_MIN_FILTER | |
69 | }; | |
70 | ||
b0643e59 DZ |
71 | static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, |
72 | struct btrfs_block_group *block_group) | |
73 | { | |
74 | return &discard_ctl->discard_list[block_group->discard_index]; | |
75 | } | |
76 | ||
2bee7eb8 DZ |
77 | static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, |
78 | struct btrfs_block_group *block_group) | |
b0643e59 | 79 | { |
2bee7eb8 | 80 | if (!btrfs_run_discard_work(discard_ctl)) |
b0643e59 | 81 | return; |
b0643e59 | 82 | |
6e80d4f8 DZ |
83 | if (list_empty(&block_group->discard_list) || |
84 | block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { | |
85 | if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) | |
86 | block_group->discard_index = BTRFS_DISCARD_INDEX_START; | |
b0643e59 DZ |
87 | block_group->discard_eligible_time = (ktime_get_ns() + |
88 | BTRFS_DISCARD_DELAY); | |
2bee7eb8 | 89 | block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; |
6e80d4f8 | 90 | } |
b0643e59 DZ |
91 | |
92 | list_move_tail(&block_group->discard_list, | |
93 | get_discard_list(discard_ctl, block_group)); | |
2bee7eb8 | 94 | } |
b0643e59 | 95 | |
2bee7eb8 DZ |
96 | static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, |
97 | struct btrfs_block_group *block_group) | |
98 | { | |
5cb0724e DZ |
99 | if (!btrfs_is_block_group_data_only(block_group)) |
100 | return; | |
101 | ||
2bee7eb8 DZ |
102 | spin_lock(&discard_ctl->lock); |
103 | __add_to_discard_list(discard_ctl, block_group); | |
b0643e59 DZ |
104 | spin_unlock(&discard_ctl->lock); |
105 | } | |
106 | ||
6e80d4f8 DZ |
107 | static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, |
108 | struct btrfs_block_group *block_group) | |
109 | { | |
110 | spin_lock(&discard_ctl->lock); | |
111 | ||
112 | if (!btrfs_run_discard_work(discard_ctl)) { | |
113 | spin_unlock(&discard_ctl->lock); | |
114 | return; | |
115 | } | |
116 | ||
117 | list_del_init(&block_group->discard_list); | |
118 | ||
119 | block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED; | |
120 | block_group->discard_eligible_time = (ktime_get_ns() + | |
121 | BTRFS_DISCARD_UNUSED_DELAY); | |
2bee7eb8 | 122 | block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; |
6e80d4f8 DZ |
123 | list_add_tail(&block_group->discard_list, |
124 | &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); | |
125 | ||
126 | spin_unlock(&discard_ctl->lock); | |
127 | } | |
128 | ||
b0643e59 DZ |
129 | static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, |
130 | struct btrfs_block_group *block_group) | |
131 | { | |
132 | bool running = false; | |
133 | ||
134 | spin_lock(&discard_ctl->lock); | |
135 | ||
136 | if (block_group == discard_ctl->block_group) { | |
137 | running = true; | |
138 | discard_ctl->block_group = NULL; | |
139 | } | |
140 | ||
141 | block_group->discard_eligible_time = 0; | |
142 | list_del_init(&block_group->discard_list); | |
143 | ||
144 | spin_unlock(&discard_ctl->lock); | |
145 | ||
146 | return running; | |
147 | } | |
148 | ||
149 | /** | |
150 | * find_next_block_group - find block_group that's up next for discarding | |
151 | * @discard_ctl: discard control | |
152 | * @now: current time | |
153 | * | |
154 | * Iterate over the discard lists to find the next block_group up for | |
155 | * discarding checking the discard_eligible_time of block_group. | |
156 | */ | |
157 | static struct btrfs_block_group *find_next_block_group( | |
158 | struct btrfs_discard_ctl *discard_ctl, | |
159 | u64 now) | |
160 | { | |
161 | struct btrfs_block_group *ret_block_group = NULL, *block_group; | |
162 | int i; | |
163 | ||
164 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { | |
165 | struct list_head *discard_list = &discard_ctl->discard_list[i]; | |
166 | ||
167 | if (!list_empty(discard_list)) { | |
168 | block_group = list_first_entry(discard_list, | |
169 | struct btrfs_block_group, | |
170 | discard_list); | |
171 | ||
172 | if (!ret_block_group) | |
173 | ret_block_group = block_group; | |
174 | ||
175 | if (ret_block_group->discard_eligible_time < now) | |
176 | break; | |
177 | ||
178 | if (ret_block_group->discard_eligible_time > | |
179 | block_group->discard_eligible_time) | |
180 | ret_block_group = block_group; | |
181 | } | |
182 | } | |
183 | ||
184 | return ret_block_group; | |
185 | } | |
186 | ||
187 | /** | |
92419695 NB |
188 | * Wrap find_next_block_group() |
189 | * | |
190 | * @discard_ctl: discard control | |
2bee7eb8 | 191 | * @discard_state: the discard_state of the block_group after state management |
7fe6d45e | 192 | * @discard_index: the discard_index of the block_group after state management |
92419695 | 193 | * @now: time when discard was invoked, in ns |
b0643e59 DZ |
194 | * |
195 | * This wraps find_next_block_group() and sets the block_group to be in use. | |
2bee7eb8 | 196 | * discard_state's control flow is managed here. Variables related to |
7fe6d45e DZ |
197 | * discard_state are reset here as needed (eg discard_cursor). @discard_state |
198 | * and @discard_index are remembered as it may change while we're discarding, | |
199 | * but we want the discard to execute in the context determined here. | |
b0643e59 DZ |
200 | */ |
201 | static struct btrfs_block_group *peek_discard_list( | |
2bee7eb8 | 202 | struct btrfs_discard_ctl *discard_ctl, |
7fe6d45e | 203 | enum btrfs_discard_state *discard_state, |
ea9ed87c | 204 | int *discard_index, u64 now) |
b0643e59 DZ |
205 | { |
206 | struct btrfs_block_group *block_group; | |
b0643e59 DZ |
207 | |
208 | spin_lock(&discard_ctl->lock); | |
2bee7eb8 | 209 | again: |
b0643e59 DZ |
210 | block_group = find_next_block_group(discard_ctl, now); |
211 | ||
ea9ed87c | 212 | if (block_group && now >= block_group->discard_eligible_time) { |
2bee7eb8 DZ |
213 | if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && |
214 | block_group->used != 0) { | |
5cb0724e DZ |
215 | if (btrfs_is_block_group_data_only(block_group)) |
216 | __add_to_discard_list(discard_ctl, block_group); | |
217 | else | |
218 | list_del_init(&block_group->discard_list); | |
2bee7eb8 DZ |
219 | goto again; |
220 | } | |
221 | if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { | |
222 | block_group->discard_cursor = block_group->start; | |
223 | block_group->discard_state = BTRFS_DISCARD_EXTENTS; | |
224 | } | |
225 | discard_ctl->block_group = block_group; | |
ea9ed87c PB |
226 | } |
227 | if (block_group) { | |
2bee7eb8 | 228 | *discard_state = block_group->discard_state; |
7fe6d45e | 229 | *discard_index = block_group->discard_index; |
2bee7eb8 | 230 | } |
b0643e59 DZ |
231 | spin_unlock(&discard_ctl->lock); |
232 | ||
233 | return block_group; | |
234 | } | |
235 | ||
7fe6d45e DZ |
236 | /** |
237 | * btrfs_discard_check_filter - updates a block groups filters | |
238 | * @block_group: block group of interest | |
239 | * @bytes: recently freed region size after coalescing | |
240 | * | |
241 | * Async discard maintains multiple lists with progressively smaller filters | |
242 | * to prioritize discarding based on size. Should a free space that matches | |
243 | * a larger filter be returned to the free_space_cache, prioritize that discard | |
244 | * by moving @block_group to the proper filter. | |
245 | */ | |
246 | void btrfs_discard_check_filter(struct btrfs_block_group *block_group, | |
247 | u64 bytes) | |
248 | { | |
249 | struct btrfs_discard_ctl *discard_ctl; | |
250 | ||
251 | if (!block_group || | |
252 | !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) | |
253 | return; | |
254 | ||
255 | discard_ctl = &block_group->fs_info->discard_ctl; | |
256 | ||
257 | if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && | |
258 | bytes >= discard_minlen[block_group->discard_index - 1]) { | |
259 | int i; | |
260 | ||
261 | remove_from_discard_list(discard_ctl, block_group); | |
262 | ||
263 | for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; | |
264 | i++) { | |
265 | if (bytes >= discard_minlen[i]) { | |
266 | block_group->discard_index = i; | |
267 | add_to_discard_list(discard_ctl, block_group); | |
268 | break; | |
269 | } | |
270 | } | |
271 | } | |
272 | } | |
273 | ||
274 | /** | |
275 | * btrfs_update_discard_index - moves a block group along the discard lists | |
276 | * @discard_ctl: discard control | |
277 | * @block_group: block_group of interest | |
278 | * | |
279 | * Increment @block_group's discard_index. If it falls of the list, let it be. | |
280 | * Otherwise add it back to the appropriate list. | |
281 | */ | |
282 | static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, | |
283 | struct btrfs_block_group *block_group) | |
284 | { | |
285 | block_group->discard_index++; | |
286 | if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { | |
287 | block_group->discard_index = 1; | |
288 | return; | |
289 | } | |
290 | ||
291 | add_to_discard_list(discard_ctl, block_group); | |
292 | } | |
293 | ||
b0643e59 DZ |
294 | /** |
295 | * btrfs_discard_cancel_work - remove a block_group from the discard lists | |
296 | * @discard_ctl: discard control | |
297 | * @block_group: block_group of interest | |
298 | * | |
299 | * This removes @block_group from the discard lists. If necessary, it waits on | |
300 | * the current work and then reschedules the delayed work. | |
301 | */ | |
302 | void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, | |
303 | struct btrfs_block_group *block_group) | |
304 | { | |
305 | if (remove_from_discard_list(discard_ctl, block_group)) { | |
306 | cancel_delayed_work_sync(&discard_ctl->work); | |
307 | btrfs_discard_schedule_work(discard_ctl, true); | |
308 | } | |
309 | } | |
310 | ||
311 | /** | |
312 | * btrfs_discard_queue_work - handles queuing the block_groups | |
313 | * @discard_ctl: discard control | |
314 | * @block_group: block_group of interest | |
315 | * | |
316 | * This maintains the LRU order of the discard lists. | |
317 | */ | |
318 | void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, | |
319 | struct btrfs_block_group *block_group) | |
320 | { | |
321 | if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) | |
322 | return; | |
323 | ||
6e80d4f8 DZ |
324 | if (block_group->used == 0) |
325 | add_to_discard_unused_list(discard_ctl, block_group); | |
326 | else | |
327 | add_to_discard_list(discard_ctl, block_group); | |
b0643e59 DZ |
328 | |
329 | if (!delayed_work_pending(&discard_ctl->work)) | |
330 | btrfs_discard_schedule_work(discard_ctl, false); | |
331 | } | |
332 | ||
8fc05859 PB |
333 | static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, |
334 | u64 now, bool override) | |
b0643e59 DZ |
335 | { |
336 | struct btrfs_block_group *block_group; | |
b0643e59 DZ |
337 | |
338 | if (!btrfs_run_discard_work(discard_ctl)) | |
8fc05859 | 339 | return; |
b0643e59 | 340 | if (!override && delayed_work_pending(&discard_ctl->work)) |
8fc05859 | 341 | return; |
b0643e59 DZ |
342 | |
343 | block_group = find_next_block_group(discard_ctl, now); | |
344 | if (block_group) { | |
6e88f116 | 345 | u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC; |
e93591bb DZ |
346 | u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); |
347 | ||
348 | /* | |
349 | * A single delayed workqueue item is responsible for | |
350 | * discarding, so we can manage the bytes rate limit by keeping | |
351 | * track of the previous discard. | |
352 | */ | |
353 | if (kbps_limit && discard_ctl->prev_discard) { | |
354 | u64 bps_limit = ((u64)kbps_limit) * SZ_1K; | |
355 | u64 bps_delay = div64_u64(discard_ctl->prev_discard * | |
6e88f116 | 356 | NSEC_PER_SEC, bps_limit); |
e93591bb | 357 | |
6e88f116 | 358 | delay = max(delay, bps_delay); |
e93591bb | 359 | } |
a2309300 DZ |
360 | |
361 | /* | |
362 | * This timeout is to hopefully prevent immediate discarding | |
363 | * in a recently allocated block group. | |
364 | */ | |
365 | if (now < block_group->discard_eligible_time) { | |
366 | u64 bg_timeout = block_group->discard_eligible_time - now; | |
b0643e59 | 367 | |
6e88f116 | 368 | delay = max(delay, bg_timeout); |
a2309300 | 369 | } |
b0643e59 | 370 | |
df903e5d PB |
371 | if (override && discard_ctl->prev_discard) { |
372 | u64 elapsed = now - discard_ctl->prev_discard_time; | |
373 | ||
374 | if (delay > elapsed) | |
375 | delay -= elapsed; | |
376 | else | |
377 | delay = 0; | |
378 | } | |
379 | ||
b0643e59 | 380 | mod_delayed_work(discard_ctl->discard_workers, |
6e88f116 | 381 | &discard_ctl->work, nsecs_to_jiffies(delay)); |
b0643e59 | 382 | } |
8fc05859 PB |
383 | } |
384 | ||
385 | /* | |
386 | * btrfs_discard_schedule_work - responsible for scheduling the discard work | |
387 | * @discard_ctl: discard control | |
388 | * @override: override the current timer | |
389 | * | |
390 | * Discards are issued by a delayed workqueue item. @override is used to | |
391 | * update the current delay as the baseline delay interval is reevaluated on | |
392 | * transaction commit. This is also maxed with any other rate limit. | |
393 | */ | |
394 | void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, | |
395 | bool override) | |
396 | { | |
397 | const u64 now = ktime_get_ns(); | |
398 | ||
399 | spin_lock(&discard_ctl->lock); | |
400 | __btrfs_discard_schedule_work(discard_ctl, now, override); | |
b0643e59 DZ |
401 | spin_unlock(&discard_ctl->lock); |
402 | } | |
403 | ||
6e80d4f8 DZ |
404 | /** |
405 | * btrfs_finish_discard_pass - determine next step of a block_group | |
406 | * @discard_ctl: discard control | |
407 | * @block_group: block_group of interest | |
408 | * | |
409 | * This determines the next step for a block group after it's finished going | |
410 | * through a pass on a discard list. If it is unused and fully trimmed, we can | |
411 | * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto | |
412 | * the appropriate filter list or let it fall off. | |
413 | */ | |
414 | static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, | |
415 | struct btrfs_block_group *block_group) | |
416 | { | |
417 | remove_from_discard_list(discard_ctl, block_group); | |
418 | ||
419 | if (block_group->used == 0) { | |
420 | if (btrfs_is_free_space_trimmed(block_group)) | |
421 | btrfs_mark_bg_unused(block_group); | |
422 | else | |
423 | add_to_discard_unused_list(discard_ctl, block_group); | |
7fe6d45e DZ |
424 | } else { |
425 | btrfs_update_discard_index(discard_ctl, block_group); | |
6e80d4f8 DZ |
426 | } |
427 | } | |
428 | ||
b0643e59 DZ |
429 | /** |
430 | * btrfs_discard_workfn - discard work function | |
431 | * @work: work | |
432 | * | |
2bee7eb8 DZ |
433 | * This finds the next block_group to start discarding and then discards a |
434 | * single region. It does this in a two-pass fashion: first extents and second | |
435 | * bitmaps. Completely discarded block groups are sent to the unused_bgs path. | |
b0643e59 DZ |
436 | */ |
437 | static void btrfs_discard_workfn(struct work_struct *work) | |
438 | { | |
439 | struct btrfs_discard_ctl *discard_ctl; | |
440 | struct btrfs_block_group *block_group; | |
2bee7eb8 | 441 | enum btrfs_discard_state discard_state; |
7fe6d45e | 442 | int discard_index = 0; |
b0643e59 | 443 | u64 trimmed = 0; |
7fe6d45e | 444 | u64 minlen = 0; |
ea9ed87c | 445 | u64 now = ktime_get_ns(); |
b0643e59 DZ |
446 | |
447 | discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); | |
448 | ||
7fe6d45e | 449 | block_group = peek_discard_list(discard_ctl, &discard_state, |
ea9ed87c | 450 | &discard_index, now); |
b0643e59 DZ |
451 | if (!block_group || !btrfs_run_discard_work(discard_ctl)) |
452 | return; | |
ea9ed87c PB |
453 | if (now < block_group->discard_eligible_time) { |
454 | btrfs_discard_schedule_work(discard_ctl, false); | |
455 | return; | |
456 | } | |
b0643e59 | 457 | |
2bee7eb8 | 458 | /* Perform discarding */ |
7fe6d45e DZ |
459 | minlen = discard_minlen[discard_index]; |
460 | ||
461 | if (discard_state == BTRFS_DISCARD_BITMAPS) { | |
462 | u64 maxlen = 0; | |
463 | ||
464 | /* | |
465 | * Use the previous levels minimum discard length as the max | |
466 | * length filter. In the case something is added to make a | |
467 | * region go beyond the max filter, the entire bitmap is set | |
468 | * back to BTRFS_TRIM_STATE_UNTRIMMED. | |
469 | */ | |
470 | if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) | |
471 | maxlen = discard_minlen[discard_index - 1]; | |
472 | ||
2bee7eb8 DZ |
473 | btrfs_trim_block_group_bitmaps(block_group, &trimmed, |
474 | block_group->discard_cursor, | |
475 | btrfs_block_group_end(block_group), | |
7fe6d45e | 476 | minlen, maxlen, true); |
9ddf648f | 477 | discard_ctl->discard_bitmap_bytes += trimmed; |
7fe6d45e | 478 | } else { |
2bee7eb8 DZ |
479 | btrfs_trim_block_group_extents(block_group, &trimmed, |
480 | block_group->discard_cursor, | |
481 | btrfs_block_group_end(block_group), | |
7fe6d45e | 482 | minlen, true); |
9ddf648f | 483 | discard_ctl->discard_extent_bytes += trimmed; |
7fe6d45e | 484 | } |
2bee7eb8 DZ |
485 | |
486 | /* Determine next steps for a block_group */ | |
487 | if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { | |
488 | if (discard_state == BTRFS_DISCARD_BITMAPS) { | |
489 | btrfs_finish_discard_pass(discard_ctl, block_group); | |
490 | } else { | |
491 | block_group->discard_cursor = block_group->start; | |
492 | spin_lock(&discard_ctl->lock); | |
493 | if (block_group->discard_state != | |
494 | BTRFS_DISCARD_RESET_CURSOR) | |
495 | block_group->discard_state = | |
496 | BTRFS_DISCARD_BITMAPS; | |
497 | spin_unlock(&discard_ctl->lock); | |
498 | } | |
499 | } | |
500 | ||
1ea2872f | 501 | now = ktime_get_ns(); |
2bee7eb8 | 502 | spin_lock(&discard_ctl->lock); |
1ea2872f PB |
503 | discard_ctl->prev_discard = trimmed; |
504 | discard_ctl->prev_discard_time = now; | |
2bee7eb8 | 505 | discard_ctl->block_group = NULL; |
8fc05859 | 506 | __btrfs_discard_schedule_work(discard_ctl, now, false); |
2bee7eb8 | 507 | spin_unlock(&discard_ctl->lock); |
b0643e59 DZ |
508 | } |
509 | ||
510 | /** | |
511 | * btrfs_run_discard_work - determines if async discard should be running | |
512 | * @discard_ctl: discard control | |
513 | * | |
514 | * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. | |
515 | */ | |
516 | bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) | |
517 | { | |
518 | struct btrfs_fs_info *fs_info = container_of(discard_ctl, | |
519 | struct btrfs_fs_info, | |
520 | discard_ctl); | |
521 | ||
522 | return (!(fs_info->sb->s_flags & SB_RDONLY) && | |
523 | test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); | |
524 | } | |
525 | ||
a2309300 DZ |
526 | /** |
527 | * btrfs_discard_calc_delay - recalculate the base delay | |
528 | * @discard_ctl: discard control | |
529 | * | |
530 | * Recalculate the base delay which is based off the total number of | |
531 | * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms) | |
532 | * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC). | |
533 | */ | |
534 | void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) | |
535 | { | |
536 | s32 discardable_extents; | |
81b29a3b | 537 | s64 discardable_bytes; |
a2309300 DZ |
538 | u32 iops_limit; |
539 | unsigned long delay; | |
a2309300 DZ |
540 | |
541 | discardable_extents = atomic_read(&discard_ctl->discardable_extents); | |
542 | if (!discardable_extents) | |
543 | return; | |
544 | ||
545 | spin_lock(&discard_ctl->lock); | |
546 | ||
81b29a3b DZ |
547 | /* |
548 | * The following is to fix a potential -1 discrepenancy that we're not | |
549 | * sure how to reproduce. But given that this is the only place that | |
550 | * utilizes these numbers and this is only called by from | |
551 | * btrfs_finish_extent_commit() which is synchronized, we can correct | |
552 | * here. | |
553 | */ | |
554 | if (discardable_extents < 0) | |
555 | atomic_add(-discardable_extents, | |
556 | &discard_ctl->discardable_extents); | |
557 | ||
558 | discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes); | |
559 | if (discardable_bytes < 0) | |
560 | atomic64_add(-discardable_bytes, | |
561 | &discard_ctl->discardable_bytes); | |
562 | ||
563 | if (discardable_extents <= 0) { | |
564 | spin_unlock(&discard_ctl->lock); | |
565 | return; | |
566 | } | |
567 | ||
a2309300 DZ |
568 | iops_limit = READ_ONCE(discard_ctl->iops_limit); |
569 | if (iops_limit) | |
e50404a8 PB |
570 | delay = MSEC_PER_SEC / iops_limit; |
571 | else | |
572 | delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; | |
a2309300 | 573 | |
e50404a8 PB |
574 | delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC, |
575 | BTRFS_DISCARD_MAX_DELAY_MSEC); | |
6e88f116 | 576 | discard_ctl->delay_ms = delay; |
a2309300 DZ |
577 | |
578 | spin_unlock(&discard_ctl->lock); | |
579 | } | |
580 | ||
dfb79ddb DZ |
581 | /** |
582 | * btrfs_discard_update_discardable - propagate discard counters | |
583 | * @block_group: block_group of interest | |
dfb79ddb DZ |
584 | * |
585 | * This propagates deltas of counters up to the discard_ctl. It maintains a | |
586 | * current counter and a previous counter passing the delta up to the global | |
587 | * stat. Then the current counter value becomes the previous counter value. | |
588 | */ | |
66b53bae | 589 | void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) |
dfb79ddb | 590 | { |
66b53bae | 591 | struct btrfs_free_space_ctl *ctl; |
dfb79ddb DZ |
592 | struct btrfs_discard_ctl *discard_ctl; |
593 | s32 extents_delta; | |
5dc7c10b | 594 | s64 bytes_delta; |
dfb79ddb | 595 | |
5cb0724e DZ |
596 | if (!block_group || |
597 | !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || | |
598 | !btrfs_is_block_group_data_only(block_group)) | |
dfb79ddb DZ |
599 | return; |
600 | ||
66b53bae | 601 | ctl = block_group->free_space_ctl; |
dfb79ddb DZ |
602 | discard_ctl = &block_group->fs_info->discard_ctl; |
603 | ||
66b53bae | 604 | lockdep_assert_held(&ctl->tree_lock); |
dfb79ddb DZ |
605 | extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - |
606 | ctl->discardable_extents[BTRFS_STAT_PREV]; | |
607 | if (extents_delta) { | |
608 | atomic_add(extents_delta, &discard_ctl->discardable_extents); | |
609 | ctl->discardable_extents[BTRFS_STAT_PREV] = | |
610 | ctl->discardable_extents[BTRFS_STAT_CURR]; | |
611 | } | |
5dc7c10b DZ |
612 | |
613 | bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - | |
614 | ctl->discardable_bytes[BTRFS_STAT_PREV]; | |
615 | if (bytes_delta) { | |
616 | atomic64_add(bytes_delta, &discard_ctl->discardable_bytes); | |
617 | ctl->discardable_bytes[BTRFS_STAT_PREV] = | |
618 | ctl->discardable_bytes[BTRFS_STAT_CURR]; | |
619 | } | |
dfb79ddb DZ |
620 | } |
621 | ||
6e80d4f8 DZ |
622 | /** |
623 | * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists | |
624 | * @fs_info: fs_info of interest | |
625 | * | |
626 | * The unused_bgs list needs to be punted to the discard lists because the | |
627 | * order of operations is changed. In the normal sychronous discard path, the | |
628 | * block groups are trimmed via a single large trim in transaction commit. This | |
629 | * is ultimately what we are trying to avoid with asynchronous discard. Thus, | |
630 | * it must be done before going down the unused_bgs path. | |
631 | */ | |
632 | void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) | |
633 | { | |
634 | struct btrfs_block_group *block_group, *next; | |
635 | ||
636 | spin_lock(&fs_info->unused_bgs_lock); | |
637 | /* We enabled async discard, so punt all to the queue */ | |
638 | list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, | |
639 | bg_list) { | |
640 | list_del_init(&block_group->bg_list); | |
04e484c5 | 641 | btrfs_put_block_group(block_group); |
6e80d4f8 DZ |
642 | btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); |
643 | } | |
644 | spin_unlock(&fs_info->unused_bgs_lock); | |
645 | } | |
646 | ||
647 | /** | |
648 | * btrfs_discard_purge_list - purge discard lists | |
649 | * @discard_ctl: discard control | |
650 | * | |
651 | * If we are disabling async discard, we may have intercepted block groups that | |
652 | * are completely free and ready for the unused_bgs path. As discarding will | |
653 | * now happen in transaction commit or not at all, we can safely mark the | |
654 | * corresponding block groups as unused and they will be sent on their merry | |
655 | * way to the unused_bgs list. | |
656 | */ | |
657 | static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) | |
658 | { | |
659 | struct btrfs_block_group *block_group, *next; | |
660 | int i; | |
661 | ||
662 | spin_lock(&discard_ctl->lock); | |
663 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { | |
664 | list_for_each_entry_safe(block_group, next, | |
665 | &discard_ctl->discard_list[i], | |
666 | discard_list) { | |
667 | list_del_init(&block_group->discard_list); | |
668 | spin_unlock(&discard_ctl->lock); | |
669 | if (block_group->used == 0) | |
670 | btrfs_mark_bg_unused(block_group); | |
671 | spin_lock(&discard_ctl->lock); | |
672 | } | |
673 | } | |
674 | spin_unlock(&discard_ctl->lock); | |
675 | } | |
676 | ||
b0643e59 DZ |
677 | void btrfs_discard_resume(struct btrfs_fs_info *fs_info) |
678 | { | |
679 | if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { | |
680 | btrfs_discard_cleanup(fs_info); | |
681 | return; | |
682 | } | |
683 | ||
6e80d4f8 DZ |
684 | btrfs_discard_punt_unused_bgs_list(fs_info); |
685 | ||
b0643e59 DZ |
686 | set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); |
687 | } | |
688 | ||
689 | void btrfs_discard_stop(struct btrfs_fs_info *fs_info) | |
690 | { | |
691 | clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); | |
692 | } | |
693 | ||
694 | void btrfs_discard_init(struct btrfs_fs_info *fs_info) | |
695 | { | |
696 | struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; | |
697 | int i; | |
698 | ||
699 | spin_lock_init(&discard_ctl->lock); | |
700 | INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); | |
701 | ||
702 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) | |
703 | INIT_LIST_HEAD(&discard_ctl->discard_list[i]); | |
dfb79ddb | 704 | |
e93591bb | 705 | discard_ctl->prev_discard = 0; |
df903e5d | 706 | discard_ctl->prev_discard_time = 0; |
dfb79ddb | 707 | atomic_set(&discard_ctl->discardable_extents, 0); |
5dc7c10b | 708 | atomic64_set(&discard_ctl->discardable_bytes, 0); |
19b2a2c7 | 709 | discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; |
6e88f116 | 710 | discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC; |
a2309300 | 711 | discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; |
e93591bb | 712 | discard_ctl->kbps_limit = 0; |
9ddf648f DZ |
713 | discard_ctl->discard_extent_bytes = 0; |
714 | discard_ctl->discard_bitmap_bytes = 0; | |
715 | atomic64_set(&discard_ctl->discard_bytes_saved, 0); | |
b0643e59 DZ |
716 | } |
717 | ||
718 | void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) | |
719 | { | |
720 | btrfs_discard_stop(fs_info); | |
721 | cancel_delayed_work_sync(&fs_info->discard_ctl.work); | |
6e80d4f8 | 722 | btrfs_discard_purge_list(&fs_info->discard_ctl); |
b0643e59 | 723 | } |