Commit | Line | Data |
---|---|---|
c1d7c514 | 1 | // SPDX-License-Identifier: GPL-2.0 |
6702ed49 CM |
2 | /* |
3 | * Copyright (C) 2007 Oracle. All rights reserved. | |
6702ed49 CM |
4 | */ |
5 | ||
6 | #include <linux/sched.h> | |
7 | #include "ctree.h" | |
8 | #include "disk-io.h" | |
9 | #include "print-tree.h" | |
10 | #include "transaction.h" | |
e7a84565 | 11 | #include "locking.h" |
07e81dc9 | 12 | #include "accessors.h" |
6702ed49 | 13 | |
6e3df18b JB |
/* Slab cache for struct inode_defrag records (see btrfs_auto_defrag_init()). */
static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 *
 * Records live in fs_info->defrag_inodes, an rbtree keyed by (root, ino),
 * protected by fs_info->defrag_inodes_lock.
 */
struct inode_defrag {
	/* Linkage into fs_info->defrag_inodes. */
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};
42 | ||
43 | static int __compare_inode_defrag(struct inode_defrag *defrag1, | |
44 | struct inode_defrag *defrag2) | |
45 | { | |
46 | if (defrag1->root > defrag2->root) | |
47 | return 1; | |
48 | else if (defrag1->root < defrag2->root) | |
49 | return -1; | |
50 | else if (defrag1->ino > defrag2->ino) | |
51 | return 1; | |
52 | else if (defrag1->ino < defrag2->ino) | |
53 | return -1; | |
54 | else | |
55 | return 0; | |
56 | } | |
57 | ||
/*
 * Insert a record for an inode into the defrag tree.  The caller must
 * already hold fs_info->defrag_inodes_lock.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered (and the smaller extent_thresh
 * is kept).
 *
 * Returns 0 on successful insertion (the tree takes ownership of @defrag),
 * or -EEXIST when a record for this (root, ino) already exists; in that
 * case the caller is responsible for freeing @defrag.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	/* Standard rbtree descent keyed by (root, ino). */
	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	/* Mark the inode as queued so callers can skip a redundant alloc. */
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}
104 | ||
105 | static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) | |
106 | { | |
107 | if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) | |
108 | return 0; | |
109 | ||
110 | if (btrfs_fs_closing(fs_info)) | |
111 | return 0; | |
112 | ||
113 | return 1; | |
114 | } | |
115 | ||
116 | /* | |
117 | * Insert a defrag record for this inode if auto defrag is enabled. | |
118 | */ | |
119 | int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, | |
120 | struct btrfs_inode *inode, u32 extent_thresh) | |
121 | { | |
122 | struct btrfs_root *root = inode->root; | |
123 | struct btrfs_fs_info *fs_info = root->fs_info; | |
124 | struct inode_defrag *defrag; | |
125 | u64 transid; | |
126 | int ret; | |
127 | ||
128 | if (!__need_auto_defrag(fs_info)) | |
129 | return 0; | |
130 | ||
131 | if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) | |
132 | return 0; | |
133 | ||
134 | if (trans) | |
135 | transid = trans->transid; | |
136 | else | |
137 | transid = inode->root->last_trans; | |
138 | ||
139 | defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); | |
140 | if (!defrag) | |
141 | return -ENOMEM; | |
142 | ||
143 | defrag->ino = btrfs_ino(inode); | |
144 | defrag->transid = transid; | |
145 | defrag->root = root->root_key.objectid; | |
146 | defrag->extent_thresh = extent_thresh; | |
147 | ||
148 | spin_lock(&fs_info->defrag_inodes_lock); | |
149 | if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { | |
150 | /* | |
151 | * If we set IN_DEFRAG flag and evict the inode from memory, | |
152 | * and then re-read this inode, this new inode doesn't have | |
153 | * IN_DEFRAG flag. At the case, we may find the existed defrag. | |
154 | */ | |
155 | ret = __btrfs_add_inode_defrag(inode, defrag); | |
156 | if (ret) | |
157 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | |
158 | } else { | |
159 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | |
160 | } | |
161 | spin_unlock(&fs_info->defrag_inodes_lock); | |
162 | return 0; | |
163 | } | |
164 | ||
/*
 * Pick the defragable inode that we want, if it doesn't exist, we will get the
 * next one.
 *
 * Searches the tree for (@root, @ino); on an exact miss, falls forward to
 * the record with the next higher key.  The returned record is erased from
 * the tree — the caller takes ownership and must free it.  Returns NULL
 * when no record at or after the requested key exists.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	/* Stack key used only for comparisons; other fields stay unset. */
	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	/*
	 * No exact match.  If the last node visited sorts below our key its
	 * in-order successor (if any) is the next record; otherwise the last
	 * visited node itself is already the first record past our key.
	 */
	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	/* Detach the record; the caller processes and frees it. */
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}
209 | ||
210 | void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) | |
211 | { | |
212 | struct inode_defrag *defrag; | |
213 | struct rb_node *node; | |
214 | ||
215 | spin_lock(&fs_info->defrag_inodes_lock); | |
216 | node = rb_first(&fs_info->defrag_inodes); | |
217 | while (node) { | |
218 | rb_erase(node, &fs_info->defrag_inodes); | |
219 | defrag = rb_entry(node, struct inode_defrag, rb_node); | |
220 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | |
221 | ||
222 | cond_resched_lock(&fs_info->defrag_inodes_lock); | |
223 | ||
224 | node = rb_first(&fs_info->defrag_inodes); | |
225 | } | |
226 | spin_unlock(&fs_info->defrag_inodes_lock); | |
227 | } | |
228 | ||
/* Maximum number of extents to defrag per btrfs_defrag_file() call. */
#define BTRFS_DEFRAG_BATCH 1024

/*
 * Defrag the inode described by @defrag in chunks of up to
 * BTRFS_DEFRAG_BATCH extents, looping until the whole file (up to i_size)
 * has been visited, an error occurs, or autodefrag is shut off.
 *
 * Always consumes @defrag (freed before returning).  Returns 0 on success
 * or a negative errno.
 */
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;	/* File offset where the next chunk starts. */

again:
	/* Stop early on remount or when autodefrag has been disabled. */
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	/* Done once the cursor reaches (or passes) the end of the file. */
	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	/*
	 * Advance the cursor.  range.start is presumably updated by
	 * btrfs_defrag_file() to where it stopped (TODO confirm against its
	 * implementation); the max() guarantees we always move forward by at
	 * least one sector so the loop cannot spin in place.
	 */
	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}
288 | ||
/*
 * Run through the list of inodes in the FS that need defragging.
 *
 * Repeatedly picks the next queued record in (root, ino) order and runs a
 * full defrag pass on that inode, until the tree is drained, the fs starts
 * remounting, or autodefrag is disabled.  Always returns 0.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	/* Scan cursor: next record picked is the first at or after this key. */
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			/*
			 * Nothing at or after the cursor.  Wrap around once to
			 * catch records queued behind it; if even a scan from
			 * the very start finds nothing, we are done.
			 */
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		/* Resume the scan just past this record's key. */
		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}
333 | ||
de78b51a ES |
/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to
 * better reflect disk order
 *
 * Works incrementally: each call reallocates the children of one level-1
 * node and records how far it got in root->defrag_progress.  Returns
 * -EAGAIN while more work remains (caller should call again), 0 once the
 * whole tree — up to the highest key sampled into root->defrag_max on the
 * first pass — has been processed, or a negative errno on error.
 */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	/* Only trees marked shareable get defragged this way. */
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(root->node);

	/* A root that is itself a leaf has no child order to improve. */
	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		/*
		 * First pass for this tree: remember the tree's highest key
		 * so later calls can tell when everything has been covered.
		 */
		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		/* Resume from where the previous call left off. */
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* No key at or after ours: nothing left to do. */
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
	 * a deadlock (attempting to write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		/* More work remains; save the resume point and say so. */
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		/*
		 * Keep -EAGAIN only while our progress key is still below the
		 * maximum key sampled on the first pass; past it, report done.
		 */
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	/* Reset the cursor for the next full run unless we're mid-tree. */
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}
6e3df18b JB |
454 | |
/* Destroy the inode_defrag slab cache (module/subsystem teardown). */
void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
459 | ||
460 | int __init btrfs_auto_defrag_init(void) | |
461 | { | |
462 | btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", | |
463 | sizeof(struct inode_defrag), 0, | |
464 | SLAB_MEM_SPREAD, | |
465 | NULL); | |
466 | if (!btrfs_inode_defrag_cachep) | |
467 | return -ENOMEM; | |
468 | ||
469 | return 0; | |
470 | } |