Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
c1d7c514 | 2 | |
d1310b2e | 3 | #include <linux/err.h> |
d1310b2e | 4 | #include <linux/slab.h> |
a52d9a80 | 5 | #include <linux/spinlock.h> |
9b569ea0 | 6 | #include "messages.h" |
261507a0 | 7 | #include "ctree.h" |
a52d9a80 | 8 | #include "extent_map.h" |
ebb8765b | 9 | #include "compression.h" |
4c0c8cfc | 10 | #include "btrfs_inode.h" |
956a17d9 | 11 | #include "disk-io.h" |
a52d9a80 | 12 | |
86479a04 | 13 | |
a52d9a80 | 14 | static struct kmem_cache *extent_map_cache; |
ca664626 | 15 | |
d846a6d3 | 16 | int __init btrfs_extent_map_init(void) |
a52d9a80 | 17 | { |
837e1972 | 18 | extent_map_cache = kmem_cache_create("btrfs_extent_map", |
ef5a05c5 | 19 | sizeof(struct extent_map), 0, 0, NULL); |
2f4cbe64 WB |
20 | if (!extent_map_cache) |
21 | return -ENOMEM; | |
2f4cbe64 | 22 | return 0; |
a52d9a80 CM |
23 | } |
24 | ||
d846a6d3 | 25 | void __cold btrfs_extent_map_exit(void) |
a52d9a80 | 26 | { |
5598e900 | 27 | kmem_cache_destroy(extent_map_cache); |
a52d9a80 CM |
28 | } |
29 | ||
43dd529a DS |
30 | /* |
31 | * Initialize the extent tree @tree. Should be called for each new inode or | |
32 | * other user of the extent_map interface. | |
9d2423c5 | 33 | */ |
d846a6d3 | 34 | void btrfs_extent_map_tree_init(struct extent_map_tree *tree) |
a52d9a80 | 35 | { |
4e660ca3 | 36 | tree->root = RB_ROOT; |
5dc562c5 | 37 | INIT_LIST_HEAD(&tree->modified_extents); |
890871be | 38 | rwlock_init(&tree->lock); |
a52d9a80 | 39 | } |
a52d9a80 | 40 | |
43dd529a DS |
41 | /* |
42 | * Allocate a new extent_map structure. The new structure is returned with a | |
43 | * reference count of one and needs to be freed using free_extent_map() | |
9d2423c5 | 44 | */ |
ae98ae2a | 45 | struct extent_map *btrfs_alloc_extent_map(void) |
a52d9a80 CM |
46 | { |
47 | struct extent_map *em; | |
70c8a91c | 48 | em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); |
c26a9203 TI |
49 | if (!em) |
50 | return NULL; | |
cbc0e928 | 51 | RB_CLEAR_NODE(&em->rb_node); |
490b54d6 | 52 | refcount_set(&em->refs, 1); |
5dc562c5 | 53 | INIT_LIST_HEAD(&em->list); |
a52d9a80 CM |
54 | return em; |
55 | } | |
a52d9a80 | 56 | |
43dd529a DS |
57 | /* |
58 | * Drop the reference out on @em by one and free the structure if the reference | |
59 | * count hits zero. | |
9d2423c5 | 60 | */ |
ae98ae2a | 61 | void btrfs_free_extent_map(struct extent_map *em) |
a52d9a80 | 62 | { |
2bf5a725 CM |
63 | if (!em) |
64 | return; | |
490b54d6 | 65 | if (refcount_dec_and_test(&em->refs)) { |
2e871330 | 66 | WARN_ON(btrfs_extent_map_in_tree(em)); |
5dc562c5 | 67 | WARN_ON(!list_empty(&em->list)); |
a52d9a80 CM |
68 | kmem_cache_free(extent_map_cache, em); |
69 | } | |
70 | } | |
a52d9a80 | 71 | |
43dd529a | 72 | /* Do the math around the end of an extent, handling wrapping. */ |
32193c14 FDBM |
73 | static u64 range_end(u64 start, u64 len) |
74 | { | |
75 | if (start + len < start) | |
76 | return (u64)-1; | |
77 | return start + len; | |
78 | } | |
79 | ||
03ba0505 | 80 | static void remove_em(struct btrfs_inode *inode, struct extent_map *em) |
f1d97e76 FM |
81 | { |
82 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
83 | ||
03ba0505 FM |
84 | rb_erase(&em->rb_node, &inode->extent_tree.root); |
85 | RB_CLEAR_NODE(&em->rb_node); | |
86 | ||
f1d97e76 FM |
87 | if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) |
88 | percpu_counter_dec(&fs_info->evictable_extent_maps); | |
89 | } | |
90 | ||
4e660ca3 | 91 | static int tree_insert(struct rb_root *root, struct extent_map *em) |
a52d9a80 | 92 | { |
4e660ca3 | 93 | struct rb_node **p = &root->rb_node; |
d397712b | 94 | struct rb_node *parent = NULL; |
32193c14 FDBM |
95 | struct extent_map *entry = NULL; |
96 | struct rb_node *orig_parent = NULL; | |
97 | u64 end = range_end(em->start, em->len); | |
a52d9a80 | 98 | |
d397712b | 99 | while (*p) { |
a52d9a80 | 100 | parent = *p; |
d1310b2e CM |
101 | entry = rb_entry(parent, struct extent_map, rb_node); |
102 | ||
4e660ca3 | 103 | if (em->start < entry->start) |
a52d9a80 | 104 | p = &(*p)->rb_left; |
2e871330 | 105 | else if (em->start >= btrfs_extent_map_end(entry)) |
a52d9a80 | 106 | p = &(*p)->rb_right; |
4e660ca3 | 107 | else |
32193c14 | 108 | return -EEXIST; |
a52d9a80 CM |
109 | } |
110 | ||
32193c14 | 111 | orig_parent = parent; |
2e871330 | 112 | while (parent && em->start >= btrfs_extent_map_end(entry)) { |
32193c14 FDBM |
113 | parent = rb_next(parent); |
114 | entry = rb_entry(parent, struct extent_map, rb_node); | |
115 | } | |
116 | if (parent) | |
2e871330 | 117 | if (end > entry->start && em->start < btrfs_extent_map_end(entry)) |
32193c14 FDBM |
118 | return -EEXIST; |
119 | ||
120 | parent = orig_parent; | |
121 | entry = rb_entry(parent, struct extent_map, rb_node); | |
122 | while (parent && em->start < entry->start) { | |
123 | parent = rb_prev(parent); | |
124 | entry = rb_entry(parent, struct extent_map, rb_node); | |
125 | } | |
126 | if (parent) | |
2e871330 | 127 | if (end > entry->start && em->start < btrfs_extent_map_end(entry)) |
32193c14 FDBM |
128 | return -EEXIST; |
129 | ||
32193c14 | 130 | rb_link_node(&em->rb_node, orig_parent, p); |
4e660ca3 | 131 | rb_insert_color(&em->rb_node, root); |
32193c14 | 132 | return 0; |
a52d9a80 CM |
133 | } |
134 | ||
d352ac68 | 135 | /* |
43dd529a DS |
136 | * Search through the tree for an extent_map with a given offset. If it can't |
137 | * be found, try to find some neighboring extents | |
d352ac68 | 138 | */ |
9a36bad6 FM |
139 | static struct rb_node *tree_search(struct rb_root *root, u64 offset, |
140 | struct rb_node **prev_or_next_ret) | |
a52d9a80 | 141 | { |
d397712b | 142 | struct rb_node *n = root->rb_node; |
a52d9a80 | 143 | struct rb_node *prev = NULL; |
5f56406a | 144 | struct rb_node *orig_prev = NULL; |
d1310b2e CM |
145 | struct extent_map *entry; |
146 | struct extent_map *prev_entry = NULL; | |
a52d9a80 | 147 | |
6c05813e | 148 | ASSERT(prev_or_next_ret); |
08f088dd | 149 | |
d397712b | 150 | while (n) { |
d1310b2e | 151 | entry = rb_entry(n, struct extent_map, rb_node); |
a52d9a80 CM |
152 | prev = n; |
153 | prev_entry = entry; | |
154 | ||
155 | if (offset < entry->start) | |
156 | n = n->rb_left; | |
2e871330 | 157 | else if (offset >= btrfs_extent_map_end(entry)) |
a52d9a80 CM |
158 | n = n->rb_right; |
159 | else | |
160 | return n; | |
161 | } | |
5f56406a | 162 | |
08f088dd | 163 | orig_prev = prev; |
2e871330 | 164 | while (prev && offset >= btrfs_extent_map_end(prev_entry)) { |
08f088dd FM |
165 | prev = rb_next(prev); |
166 | prev_entry = rb_entry(prev, struct extent_map, rb_node); | |
5f56406a CM |
167 | } |
168 | ||
6c05813e FM |
169 | /* |
170 | * Previous extent map found, return as in this case the caller does not | |
171 | * care about the next one. | |
172 | */ | |
173 | if (prev) { | |
174 | *prev_or_next_ret = prev; | |
175 | return NULL; | |
176 | } | |
177 | ||
178 | prev = orig_prev; | |
08f088dd FM |
179 | prev_entry = rb_entry(prev, struct extent_map, rb_node); |
180 | while (prev && offset < prev_entry->start) { | |
181 | prev = rb_prev(prev); | |
d1310b2e | 182 | prev_entry = rb_entry(prev, struct extent_map, rb_node); |
a52d9a80 | 183 | } |
6c05813e | 184 | *prev_or_next_ret = prev; |
08f088dd | 185 | |
a52d9a80 CM |
186 | return NULL; |
187 | } | |
188 | ||
e28b851e QW |
189 | static inline u64 extent_map_block_len(const struct extent_map *em) |
190 | { | |
962162ff | 191 | if (btrfs_extent_map_is_compressed(em)) |
e28b851e QW |
192 | return em->disk_num_bytes; |
193 | return em->len; | |
194 | } | |
195 | ||
2ecec0d6 FM |
196 | static inline u64 extent_map_block_end(const struct extent_map *em) |
197 | { | |
2e871330 | 198 | const u64 block_start = btrfs_extent_map_block_start(em); |
ab094670 FM |
199 | const u64 block_end = block_start + extent_map_block_len(em); |
200 | ||
201 | if (block_end < block_start) | |
2ecec0d6 | 202 | return (u64)-1; |
ab094670 FM |
203 | |
204 | return block_end; | |
2ecec0d6 FM |
205 | } |
206 | ||
1a9fb16c | 207 | static bool can_merge_extent_map(const struct extent_map *em) |
a52d9a80 | 208 | { |
f86f7a75 | 209 | if (em->flags & EXTENT_FLAG_PINNED) |
1a9fb16c | 210 | return false; |
7f3c74fb | 211 | |
1a9fb16c | 212 | /* Don't merge compressed extents, we need to know their actual size. */ |
962162ff | 213 | if (btrfs_extent_map_is_compressed(em)) |
1a9fb16c | 214 | return false; |
c8b97818 | 215 | |
f86f7a75 | 216 | if (em->flags & EXTENT_FLAG_LOGGING) |
1a9fb16c | 217 | return false; |
201a9038 | 218 | |
09a2a8f9 JB |
219 | /* |
220 | * We don't want to merge stuff that hasn't been written to the log yet | |
221 | * since it may not reflect exactly what is on disk, and that would be | |
222 | * bad. | |
223 | */ | |
1a9fb16c FM |
224 | if (!list_empty(&em->list)) |
225 | return false; | |
226 | ||
227 | return true; | |
228 | } | |
09a2a8f9 | 229 | |
1a9fb16c | 230 | /* Check to see if two extent_map structs are adjacent and safe to merge. */ |
27f0d9c9 | 231 | static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) |
1a9fb16c | 232 | { |
2e871330 | 233 | if (btrfs_extent_map_end(prev) != next->start) |
27f0d9c9 FM |
234 | return false; |
235 | ||
a0f06253 FM |
236 | /* |
237 | * The merged flag is not an on-disk flag, it just indicates we had the | |
238 | * extent maps of 2 (or more) adjacent extents merged, so factor it out. | |
239 | */ | |
240 | if ((prev->flags & ~EXTENT_FLAG_MERGED) != | |
241 | (next->flags & ~EXTENT_FLAG_MERGED)) | |
27f0d9c9 FM |
242 | return false; |
243 | ||
c77a8c61 | 244 | if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) |
2e871330 | 245 | return btrfs_extent_map_block_start(next) == extent_map_block_end(prev); |
27f0d9c9 FM |
246 | |
247 | /* HOLES and INLINE extents. */ | |
c77a8c61 | 248 | return next->disk_bytenr == prev->disk_bytenr; |
a52d9a80 CM |
249 | } |
250 | ||
3d2ac992 QW |
251 | /* |
252 | * Handle the on-disk data extents merge for @prev and @next. | |
253 | * | |
7a233905 BB |
254 | * @prev: left extent to merge |
255 | * @next: right extent to merge | |
256 | * @merged: the extent we will not discard after the merge; updated with new values | |
257 | * | |
258 | * After this, one of the two extents is the new merged extent and the other is | |
259 | * removed from the tree and likely freed. Note that @merged is one of @prev/@next | |
260 | * so there is const/non-const aliasing occurring here. | |
261 | * | |
3d2ac992 QW |
262 | * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. |
263 | * For now only uncompressed regular extent can be merged. | |
3d2ac992 | 264 | */ |
7a233905 BB |
265 | static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next, |
266 | struct extent_map *merged) | |
3d2ac992 QW |
267 | { |
268 | u64 new_disk_bytenr; | |
269 | u64 new_disk_num_bytes; | |
270 | u64 new_offset; | |
271 | ||
272 | /* @prev and @next should not be compressed. */ | |
962162ff FM |
273 | ASSERT(!btrfs_extent_map_is_compressed(prev)); |
274 | ASSERT(!btrfs_extent_map_is_compressed(next)); | |
3d2ac992 QW |
275 | |
276 | /* | |
277 | * There are two different cases where @prev and @next can be merged. | |
278 | * | |
279 | * 1) They are referring to the same data extent: | |
280 | * | |
281 | * |<----- data extent A ----->| | |
282 | * |<- prev ->|<- next ->| | |
283 | * | |
284 | * 2) They are referring to different data extents but still adjacent: | |
285 | * | |
286 | * |<-- data extent A -->|<-- data extent B -->| | |
287 | * |<- prev ->|<- next ->| | |
288 | * | |
289 | * The calculation here always merges the data extents first, then updates | |
290 | * @offset using the new data extents. | |
291 | * | |
292 | * For case 1), the merged data extent would be the same. | |
293 | * For case 2), we just merge the two data extents into one. | |
294 | */ | |
295 | new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr); | |
296 | new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes, | |
297 | next->disk_bytenr + next->disk_num_bytes) - | |
298 | new_disk_bytenr; | |
299 | new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; | |
300 | ||
7a233905 BB |
301 | merged->disk_bytenr = new_disk_bytenr; |
302 | merged->disk_num_bytes = new_disk_num_bytes; | |
303 | merged->ram_bytes = new_disk_num_bytes; | |
304 | merged->offset = new_offset; | |
3d2ac992 QW |
305 | } |
306 | ||
3f255ece QW |
307 | static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, |
308 | struct extent_map *em) | |
309 | { | |
310 | if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) | |
311 | return; | |
312 | btrfs_crit(fs_info, | |
c77a8c61 | 313 | "%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x", |
3f255ece | 314 | prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes, |
c77a8c61 | 315 | em->ram_bytes, em->offset, em->flags); |
3f255ece QW |
316 | ASSERT(0); |
317 | } | |
318 | ||
319 | /* Internal sanity checks for btrfs debug builds. */ | |
320 | static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em) | |
321 | { | |
322 | if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) | |
323 | return; | |
324 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { | |
325 | if (em->disk_num_bytes == 0) | |
326 | dump_extent_map(fs_info, "zero disk_num_bytes", em); | |
327 | if (em->offset + em->len > em->ram_bytes) | |
328 | dump_extent_map(fs_info, "ram_bytes too small", em); | |
329 | if (em->offset + em->len > em->disk_num_bytes && | |
962162ff | 330 | !btrfs_extent_map_is_compressed(em)) |
3f255ece | 331 | dump_extent_map(fs_info, "disk_num_bytes too small", em); |
962162ff | 332 | if (!btrfs_extent_map_is_compressed(em) && |
1b87d26a QW |
333 | em->ram_bytes != em->disk_num_bytes) |
334 | dump_extent_map(fs_info, | |
335 | "ram_bytes mismatch with disk_num_bytes for non-compressed em", | |
336 | em); | |
3f255ece QW |
337 | } else if (em->offset) { |
338 | dump_extent_map(fs_info, "non-zero offset for hole/inline", em); | |
339 | } | |
a52d9a80 CM |
340 | } |
341 | ||
5fa8a6ba | 342 | static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) |
a1ed835e | 343 | { |
3f255ece | 344 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
a1ed835e CM |
345 | struct extent_map *merge = NULL; |
346 | struct rb_node *rb; | |
a1ed835e | 347 | |
ac05ca91 FM |
348 | /* |
349 | * We can't modify an extent map that is in the tree and that is being | |
350 | * used by another task, as it can cause that other task to see it in | |
351 | * inconsistent state during the merging. We always have 1 reference for | |
352 | * the tree and 1 for this task (which is unpinning the extent map or | |
353 | * clearing the logging flag), so anything > 2 means it's being used by | |
354 | * other tasks too. | |
355 | */ | |
356 | if (refcount_read(&em->refs) > 2) | |
357 | return; | |
358 | ||
1a9fb16c FM |
359 | if (!can_merge_extent_map(em)) |
360 | return; | |
361 | ||
a1ed835e CM |
362 | if (em->start != 0) { |
363 | rb = rb_prev(&em->rb_node); | |
6aa79c4f DS |
364 | merge = rb_entry_safe(rb, struct extent_map, rb_node); |
365 | ||
27f0d9c9 | 366 | if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { |
a1ed835e CM |
367 | em->start = merge->start; |
368 | em->len += merge->len; | |
70c8a91c | 369 | em->generation = max(em->generation, merge->generation); |
3d2ac992 QW |
370 | |
371 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) | |
7a233905 | 372 | merge_ondisk_extents(merge, em, em); |
f86f7a75 | 373 | em->flags |= EXTENT_FLAG_MERGED; |
5dc562c5 | 374 | |
3f255ece | 375 | validate_extent_map(fs_info, em); |
03ba0505 | 376 | remove_em(inode, merge); |
ae98ae2a | 377 | btrfs_free_extent_map(merge); |
a1ed835e CM |
378 | } |
379 | } | |
380 | ||
381 | rb = rb_next(&em->rb_node); | |
6aa79c4f DS |
382 | merge = rb_entry_safe(rb, struct extent_map, rb_node); |
383 | ||
27f0d9c9 | 384 | if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { |
a1ed835e | 385 | em->len += merge->len; |
3d2ac992 | 386 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) |
7a233905 | 387 | merge_ondisk_extents(em, merge, em); |
3f255ece | 388 | validate_extent_map(fs_info, em); |
70c8a91c | 389 | em->generation = max(em->generation, merge->generation); |
f86f7a75 | 390 | em->flags |= EXTENT_FLAG_MERGED; |
03ba0505 | 391 | remove_em(inode, merge); |
ae98ae2a | 392 | btrfs_free_extent_map(merge); |
a1ed835e | 393 | } |
4d2c8f62 LZ |
394 | } |
395 | ||
43dd529a DS |
396 | /* |
397 | * Unpin an extent from the cache. | |
398 | * | |
00deaf04 | 399 | * @inode: the inode from which we are unpinning an extent range |
5dc562c5 JB |
400 | * @start: logical offset in the file |
401 | * @len: length of the extent | |
402 | * @gen: generation that this extent has been modified in | |
5dc562c5 JB |
403 | * |
404 | * Called after an extent has been written to disk properly. Set the generation | |
405 | * to the generation that actually added the file item to the inode so we know | |
406 | * we need to sync this extent when we call fsync(). | |
c03c89f8 DS |
407 | * |
408 | * Returns: 0 on success | |
409 | * -ENOENT when the extent is not found in the tree | |
410 | * -EUCLEAN if the found extent does not match the expected start | |
5dc562c5 | 411 | */ |
d846a6d3 | 412 | int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) |
4d2c8f62 | 413 | { |
00deaf04 FM |
414 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
415 | struct extent_map_tree *tree = &inode->extent_tree; | |
4d2c8f62 LZ |
416 | int ret = 0; |
417 | struct extent_map *em; | |
418 | ||
419 | write_lock(&tree->lock); | |
d846a6d3 | 420 | em = btrfs_lookup_extent_mapping(tree, start, len); |
4d2c8f62 | 421 | |
00deaf04 FM |
422 | if (WARN_ON(!em)) { |
423 | btrfs_warn(fs_info, | |
424 | "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", | |
425 | btrfs_ino(inode), btrfs_root_id(inode->root), | |
4dc1d69c | 426 | start, start + len, gen); |
c03c89f8 | 427 | ret = -ENOENT; |
4d2c8f62 | 428 | goto out; |
00deaf04 FM |
429 | } |
430 | ||
c03c89f8 | 431 | if (WARN_ON(em->start != start)) { |
00deaf04 FM |
432 | btrfs_warn(fs_info, |
433 | "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", | |
434 | btrfs_ino(inode), btrfs_root_id(inode->root), | |
4dc1d69c | 435 | em->start, start, start + len, gen); |
c03c89f8 DS |
436 | ret = -EUCLEAN; |
437 | goto out; | |
438 | } | |
4d2c8f62 | 439 | |
5dc562c5 | 440 | em->generation = gen; |
f86f7a75 | 441 | em->flags &= ~EXTENT_FLAG_PINNED; |
4d2c8f62 | 442 | |
5fa8a6ba | 443 | try_merge_map(inode, em); |
4e2f84e6 | 444 | |
a1ed835e CM |
445 | out: |
446 | write_unlock(&tree->lock); | |
ae98ae2a | 447 | btrfs_free_extent_map(em); |
a1ed835e CM |
448 | return ret; |
449 | ||
450 | } | |
451 | ||
d846a6d3 | 452 | void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) |
201a9038 | 453 | { |
5fa8a6ba | 454 | lockdep_assert_held_write(&inode->extent_tree.lock); |
74333c7d | 455 | |
f86f7a75 | 456 | em->flags &= ~EXTENT_FLAG_LOGGING; |
2e871330 | 457 | if (btrfs_extent_map_in_tree(em)) |
5fa8a6ba | 458 | try_merge_map(inode, em); |
201a9038 JB |
459 | } |
460 | ||
e778724a | 461 | static inline void setup_extent_mapping(struct btrfs_inode *inode, |
176840b3 FM |
462 | struct extent_map *em, |
463 | int modified) | |
464 | { | |
490b54d6 | 465 | refcount_inc(&em->refs); |
176840b3 | 466 | |
32d53f6f FM |
467 | ASSERT(list_empty(&em->list)); |
468 | ||
176840b3 | 469 | if (modified) |
e778724a | 470 | list_add(&em->list, &inode->extent_tree.modified_extents); |
176840b3 | 471 | else |
5fa8a6ba | 472 | try_merge_map(inode, em); |
176840b3 FM |
473 | } |
474 | ||
43dd529a | 475 | /* |
6c566def | 476 | * Add a new extent map to an inode's extent map tree. |
401bd2dd | 477 | * |
6c566def | 478 | * @inode: the target inode |
9d2423c5 | 479 | * @em: map to insert |
401bd2dd NB |
480 | * @modified: indicate whether the given @em should be added to the |
481 | * modified list, which indicates the extent needs to be logged | |
9d2423c5 | 482 | * |
6c566def FM |
483 | * Insert @em into the @inode's extent map tree or perform a simple |
484 | * forward/backward merge with existing mappings. The extent_map struct passed | |
485 | * in will be inserted into the tree directly, with an additional reference | |
486 | * taken, or a reference dropped if the merge attempt was successful. | |
a52d9a80 | 487 | */ |
6c566def | 488 | static int add_extent_mapping(struct btrfs_inode *inode, |
db9d9446 | 489 | struct extent_map *em, int modified) |
a52d9a80 | 490 | { |
6c566def | 491 | struct extent_map_tree *tree = &inode->extent_tree; |
f1d97e76 FM |
492 | struct btrfs_root *root = inode->root; |
493 | struct btrfs_fs_info *fs_info = root->fs_info; | |
ed48adf8 | 494 | int ret; |
a52d9a80 | 495 | |
d23ea3fa DS |
496 | lockdep_assert_held_write(&tree->lock); |
497 | ||
3f255ece | 498 | validate_extent_map(fs_info, em); |
7f5830bc | 499 | ret = tree_insert(&tree->root, em); |
32193c14 | 500 | if (ret) |
ed48adf8 | 501 | return ret; |
32193c14 | 502 | |
e778724a | 503 | setup_extent_mapping(inode, em, modified); |
ed48adf8 | 504 | |
f1d97e76 FM |
505 | if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) |
506 | percpu_counter_inc(&fs_info->evictable_extent_maps); | |
507 | ||
ed48adf8 | 508 | return 0; |
a52d9a80 | 509 | } |
a52d9a80 | 510 | |
7e886690 FM |
511 | static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, |
512 | u64 start, u64 len, int strict) | |
a52d9a80 CM |
513 | { |
514 | struct extent_map *em; | |
515 | struct rb_node *rb_node; | |
6c05813e | 516 | struct rb_node *prev_or_next = NULL; |
306929f3 CH |
517 | u64 end = range_end(start, len); |
518 | ||
9a36bad6 | 519 | rb_node = tree_search(&tree->root, start, &prev_or_next); |
a52d9a80 | 520 | if (!rb_node) { |
6c05813e FM |
521 | if (prev_or_next) |
522 | rb_node = prev_or_next; | |
ed64f066 LZ |
523 | else |
524 | return NULL; | |
a52d9a80 | 525 | } |
ed64f066 | 526 | |
a52d9a80 | 527 | em = rb_entry(rb_node, struct extent_map, rb_node); |
d1310b2e | 528 | |
2e871330 | 529 | if (strict && !(end > em->start && start < btrfs_extent_map_end(em))) |
ed64f066 | 530 | return NULL; |
d1310b2e | 531 | |
490b54d6 | 532 | refcount_inc(&em->refs); |
a52d9a80 CM |
533 | return em; |
534 | } | |
a52d9a80 | 535 | |
43dd529a DS |
536 | /* |
537 | * Lookup extent_map that intersects @start + @len range. | |
538 | * | |
ed64f066 LZ |
539 | * @tree: tree to lookup in |
540 | * @start: byte offset to start the search | |
541 | * @len: length of the lookup range | |
542 | * | |
543 | * Find and return the first extent_map struct in @tree that intersects the | |
544 | * [start, len] range. There may be additional objects in the tree that | |
545 | * intersect, so check the object returned carefully to make sure that no | |
546 | * additional lookups are needed. | |
547 | */ | |
d846a6d3 FM |
548 | struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, |
549 | u64 start, u64 len) | |
ed64f066 | 550 | { |
7e886690 | 551 | return lookup_extent_mapping(tree, start, len, 1); |
ed64f066 LZ |
552 | } |
553 | ||
43dd529a DS |
554 | /* |
555 | * Find a nearby extent map intersecting @start + @len (not an exact search). | |
556 | * | |
b917b7c3 CM |
557 | * @tree: tree to lookup in |
558 | * @start: byte offset to start the search | |
559 | * @len: length of the lookup range | |
560 | * | |
561 | * Find and return the first extent_map struct in @tree that intersects the | |
562 | * [start, len] range. | |
563 | * | |
564 | * If one can't be found, any nearby extent may be returned | |
565 | */ | |
d846a6d3 FM |
566 | struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, |
567 | u64 start, u64 len) | |
b917b7c3 | 568 | { |
7e886690 | 569 | return lookup_extent_mapping(tree, start, len, 0); |
b917b7c3 CM |
570 | } |
571 | ||
43dd529a | 572 | /* |
c2fbd812 | 573 | * Remove an extent_map from its inode's extent tree. |
43dd529a | 574 | * |
c2fbd812 | 575 | * @inode: the inode the extent map belongs to |
bb7ab3b9 | 576 | * @em: extent map being removed |
9d2423c5 | 577 | * |
c2fbd812 FM |
578 | * Remove @em from the extent tree of @inode. No reference counts are dropped, |
579 | * and no checks are done to see if the range is in use. | |
a52d9a80 | 580 | */ |
d846a6d3 | 581 | void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) |
a52d9a80 | 582 | { |
c2fbd812 FM |
583 | struct extent_map_tree *tree = &inode->extent_tree; |
584 | ||
6d3b050e FM |
585 | lockdep_assert_held_write(&tree->lock); |
586 | ||
f86f7a75 | 587 | WARN_ON(em->flags & EXTENT_FLAG_PINNED); |
f86f7a75 | 588 | if (!(em->flags & EXTENT_FLAG_LOGGING)) |
ff44c6e3 | 589 | list_del_init(&em->list); |
f1d97e76 | 590 | |
03ba0505 | 591 | remove_em(inode, em); |
a52d9a80 | 592 | } |
176840b3 | 593 | |
6a3a9113 | 594 | static void replace_extent_mapping(struct btrfs_inode *inode, |
a6f3e205 CH |
595 | struct extent_map *cur, |
596 | struct extent_map *new, | |
597 | int modified) | |
176840b3 | 598 | { |
3f255ece | 599 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
6a3a9113 FM |
600 | struct extent_map_tree *tree = &inode->extent_tree; |
601 | ||
6d3b050e FM |
602 | lockdep_assert_held_write(&tree->lock); |
603 | ||
3f255ece QW |
604 | validate_extent_map(fs_info, new); |
605 | ||
f86f7a75 | 606 | WARN_ON(cur->flags & EXTENT_FLAG_PINNED); |
2e871330 | 607 | ASSERT(btrfs_extent_map_in_tree(cur)); |
f86f7a75 | 608 | if (!(cur->flags & EXTENT_FLAG_LOGGING)) |
176840b3 | 609 | list_del_init(&cur->list); |
4e660ca3 | 610 | rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); |
176840b3 FM |
611 | RB_CLEAR_NODE(&cur->rb_node); |
612 | ||
e778724a | 613 | setup_extent_mapping(inode, new, modified); |
176840b3 | 614 | } |
c04e61b5 | 615 | |
d47704bd | 616 | static struct extent_map *next_extent_map(const struct extent_map *em) |
c04e61b5 LB |
617 | { |
618 | struct rb_node *next; | |
619 | ||
620 | next = rb_next(&em->rb_node); | |
621 | if (!next) | |
622 | return NULL; | |
623 | return container_of(next, struct extent_map, rb_node); | |
624 | } | |
625 | ||
626 | static struct extent_map *prev_extent_map(struct extent_map *em) | |
627 | { | |
628 | struct rb_node *prev; | |
629 | ||
630 | prev = rb_prev(&em->rb_node); | |
631 | if (!prev) | |
632 | return NULL; | |
633 | return container_of(prev, struct extent_map, rb_node); | |
634 | } | |
635 | ||
52042d8e AG |
636 | /* |
637 | * Helper for btrfs_get_extent. Given an existing extent in the tree, | |
c04e61b5 LB |
638 | * the existing extent is the nearest extent to map_start, |
639 | * and an extent that you want to insert, deal with overlap and insert | |
640 | * the best fitted new extent into the tree. | |
641 | */ | |
6c566def | 642 | static noinline int merge_extent_mapping(struct btrfs_inode *inode, |
5f4791f4 LB |
643 | struct extent_map *existing, |
644 | struct extent_map *em, | |
645 | u64 map_start) | |
c04e61b5 LB |
646 | { |
647 | struct extent_map *prev; | |
648 | struct extent_map *next; | |
649 | u64 start; | |
650 | u64 end; | |
651 | u64 start_diff; | |
652 | ||
2e871330 | 653 | if (map_start < em->start || map_start >= btrfs_extent_map_end(em)) |
c093bf30 | 654 | return -EINVAL; |
c04e61b5 LB |
655 | |
656 | if (existing->start > map_start) { | |
657 | next = existing; | |
658 | prev = prev_extent_map(next); | |
659 | } else { | |
660 | prev = existing; | |
661 | next = next_extent_map(prev); | |
662 | } | |
663 | ||
2e871330 | 664 | start = prev ? btrfs_extent_map_end(prev) : em->start; |
c04e61b5 | 665 | start = max_t(u64, start, em->start); |
2e871330 FM |
666 | end = next ? next->start : btrfs_extent_map_end(em); |
667 | end = min_t(u64, end, btrfs_extent_map_end(em)); | |
c04e61b5 LB |
668 | start_diff = start - em->start; |
669 | em->start = start; | |
670 | em->len = end - start; | |
de9f46cb | 671 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) |
3d2ac992 | 672 | em->offset += start_diff; |
6c566def | 673 | return add_extent_mapping(inode, em, 0); |
c04e61b5 LB |
674 | } |
675 | ||
43dd529a | 676 | /* |
0a308f80 | 677 | * Add extent mapping into an inode's extent map tree. |
9ad37bb3 | 678 | * |
0a308f80 | 679 | * @inode: target inode |
9ad37bb3 NB |
680 | * @em_in: extent we are inserting |
681 | * @start: start of the logical range btrfs_get_extent() is requesting | |
682 | * @len: length of the logical range btrfs_get_extent() is requesting | |
c04e61b5 LB |
683 | * |
684 | * Note that @em_in's range may be different from [start, start+len), | |
685 | * but they must be overlapped. | |
686 | * | |
0a308f80 FM |
687 | * Insert @em_in into the inode's extent map tree. In case there is an |
688 | * overlapping range, handle the -EEXIST by either: | |
c04e61b5 LB |
689 | * a) Returning the existing extent in @em_in if @start is within the |
690 | * existing em. | |
691 | * b) Merge the existing extent with @em_in passed in. | |
692 | * | |
693 | * Return 0 on success, otherwise -EEXIST. | |
694 | * | |
695 | */ | |
0a308f80 | 696 | int btrfs_add_extent_mapping(struct btrfs_inode *inode, |
c04e61b5 LB |
697 | struct extent_map **em_in, u64 start, u64 len) |
698 | { | |
699 | int ret; | |
700 | struct extent_map *em = *em_in; | |
0a308f80 | 701 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
c04e61b5 | 702 | |
d52a1365 QW |
703 | /* |
704 | * Tree-checker should have rejected any inline extent with non-zero | |
705 | * file offset. Here just do a sanity check. | |
706 | */ | |
c77a8c61 | 707 | if (em->disk_bytenr == EXTENT_MAP_INLINE) |
d52a1365 QW |
708 | ASSERT(em->start == 0); |
709 | ||
6c566def | 710 | ret = add_extent_mapping(inode, em, 0); |
c04e61b5 LB |
711 | /* it is possible that someone inserted the extent into the tree |
712 | * while we had the lock dropped. It is also possible that | |
713 | * an overlapping map exists in the tree | |
714 | */ | |
715 | if (ret == -EEXIST) { | |
716 | struct extent_map *existing; | |
717 | ||
d846a6d3 | 718 | existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len); |
393da918 | 719 | |
f46b24c9 | 720 | trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); |
393da918 | 721 | |
c04e61b5 LB |
722 | /* |
723 | * existing will always be non-NULL, since there must be | |
724 | * extent causing the -EEXIST. | |
725 | */ | |
726 | if (start >= existing->start && | |
2e871330 | 727 | start < btrfs_extent_map_end(existing)) { |
ae98ae2a | 728 | btrfs_free_extent_map(em); |
c04e61b5 LB |
729 | *em_in = existing; |
730 | ret = 0; | |
731 | } else { | |
9a7e10e7 LB |
732 | u64 orig_start = em->start; |
733 | u64 orig_len = em->len; | |
734 | ||
c04e61b5 LB |
735 | /* |
736 | * The existing extent map is the one nearest to | |
737 | * the [start, start + len) range which overlaps | |
738 | */ | |
6c566def | 739 | ret = merge_extent_mapping(inode, existing, em, start); |
21334600 | 740 | if (WARN_ON(ret)) { |
ae98ae2a | 741 | btrfs_free_extent_map(em); |
c04e61b5 | 742 | *em_in = NULL; |
21334600 FM |
743 | btrfs_warn(fs_info, |
744 | "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", | |
2e871330 | 745 | existing->start, btrfs_extent_map_end(existing), |
21334600 | 746 | orig_start, orig_start + orig_len, start); |
c04e61b5 | 747 | } |
ae98ae2a | 748 | btrfs_free_extent_map(existing); |
c04e61b5 LB |
749 | } |
750 | } | |
751 | ||
752 | ASSERT(ret == 0 || ret == -EEXIST); | |
753 | return ret; | |
754 | } | |
4c0c8cfc | 755 | |
9c9d1b4f FM |
756 | /* |
757 | * Drop all extent maps from a tree in the fastest possible way, rescheduling | |
758 | * if needed. This avoids searching the tree, from the root down to the first | |
759 | * extent map, before each deletion. | |
760 | */ | |
c2fbd812 | 761 | static void drop_all_extent_maps_fast(struct btrfs_inode *inode) |
9c9d1b4f | 762 | { |
c2fbd812 | 763 | struct extent_map_tree *tree = &inode->extent_tree; |
4e660ca3 | 764 | struct rb_node *node; |
c2fbd812 | 765 | |
9c9d1b4f | 766 | write_lock(&tree->lock); |
4e660ca3 FM |
767 | node = rb_first(&tree->root); |
768 | while (node) { | |
9c9d1b4f | 769 | struct extent_map *em; |
4e660ca3 | 770 | struct rb_node *next = rb_next(node); |
9c9d1b4f | 771 | |
9c9d1b4f | 772 | em = rb_entry(node, struct extent_map, rb_node); |
f86f7a75 | 773 | em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); |
d846a6d3 | 774 | btrfs_remove_extent_mapping(inode, em); |
ae98ae2a | 775 | btrfs_free_extent_map(em); |
4e660ca3 FM |
776 | |
777 | if (cond_resched_rwlock_write(&tree->lock)) | |
778 | node = rb_first(&tree->root); | |
779 | else | |
780 | node = next; | |
9c9d1b4f FM |
781 | } |
782 | write_unlock(&tree->lock); | |
783 | } | |
784 | ||
4c0c8cfc FM |
785 | /* |
786 | * Drop all extent maps in a given range. | |
787 | * | |
788 | * @inode: The target inode. | |
789 | * @start: Start offset of the range. | |
790 | * @end: End offset of the range (inclusive value). | |
791 | * @skip_pinned: Indicate if pinned extent maps should be ignored or not. | |
792 | * | |
793 | * This drops all the extent maps that intersect the given range [@start, @end]. | |
794 | * Extent maps that partially overlap the range and extend behind or beyond it, | |
795 | * are split. | |
796 | * The caller should have locked an appropriate file range in the inode's io | |
797 | * tree before calling this function. | |
798 | */ | |
799 | void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, | |
800 | bool skip_pinned) | |
801 | { | |
db21370b FM |
802 | struct extent_map *split; |
803 | struct extent_map *split2; | |
804 | struct extent_map *em; | |
4c0c8cfc FM |
805 | struct extent_map_tree *em_tree = &inode->extent_tree; |
806 | u64 len = end - start + 1; | |
4c0c8cfc FM |
807 | |
808 | WARN_ON(end < start); | |
809 | if (end == (u64)-1) { | |
9c9d1b4f | 810 | if (start == 0 && !skip_pinned) { |
c2fbd812 | 811 | drop_all_extent_maps_fast(inode); |
9c9d1b4f FM |
812 | return; |
813 | } | |
4c0c8cfc | 814 | len = (u64)-1; |
db21370b FM |
815 | } else { |
816 | /* Make end offset exclusive for use in the loop below. */ | |
817 | end++; | |
4c0c8cfc | 818 | } |
db21370b FM |
819 | |
820 | /* | |
821 | * It's ok if we fail to allocate the extent maps, see the comment near | |
822 | * the bottom of the loop below. We only need two spare extent maps in | |
823 | * the worst case, where the first extent map that intersects our range | |
824 | * starts before the range and the last extent map that intersects our | |
825 | * range ends after our range (and they might be the same extent map), | |
826 | * because we need to split those two extent maps at the boundaries. | |
827 | */ | |
ae98ae2a FM |
828 | split = btrfs_alloc_extent_map(); |
829 | split2 = btrfs_alloc_extent_map(); | |
db21370b FM |
830 | |
831 | write_lock(&em_tree->lock); | |
d846a6d3 | 832 | em = btrfs_lookup_extent_mapping(em_tree, start, len); |
db21370b FM |
833 | |
834 | while (em) { | |
835 | /* extent_map_end() returns exclusive value (last byte + 1). */ | |
2e871330 | 836 | const u64 em_end = btrfs_extent_map_end(em); |
db21370b | 837 | struct extent_map *next_em = NULL; |
4c0c8cfc FM |
838 | u64 gen; |
839 | unsigned long flags; | |
4c0c8cfc | 840 | bool modified; |
4c0c8cfc | 841 | |
db21370b FM |
842 | if (em_end < end) { |
843 | next_em = next_extent_map(em); | |
844 | if (next_em) { | |
845 | if (next_em->start < end) | |
846 | refcount_inc(&next_em->refs); | |
847 | else | |
848 | next_em = NULL; | |
849 | } | |
4c0c8cfc | 850 | } |
db21370b | 851 | |
f86f7a75 | 852 | if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) { |
f3109e33 | 853 | start = em_end; |
db21370b | 854 | goto next; |
4c0c8cfc | 855 | } |
db21370b | 856 | |
e4cc1483 | 857 | flags = em->flags; |
e4cc1483 FM |
858 | /* |
859 | * In case we split the extent map, we want to preserve the | |
860 | * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want | |
861 | * it on the new extent maps. | |
862 | */ | |
f86f7a75 | 863 | em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); |
4c0c8cfc | 864 | modified = !list_empty(&em->list); |
db21370b FM |
865 | |
866 | /* | |
867 | * The extent map does not cross our target range, so no need to | |
868 | * split it, we can remove it directly. | |
869 | */ | |
870 | if (em->start >= start && em_end <= end) | |
871 | goto remove_em; | |
872 | ||
db21370b | 873 | gen = em->generation; |
4c0c8cfc FM |
874 | |
875 | if (em->start < start) { | |
db21370b FM |
876 | if (!split) { |
877 | split = split2; | |
878 | split2 = NULL; | |
879 | if (!split) | |
880 | goto remove_em; | |
881 | } | |
4c0c8cfc FM |
882 | split->start = em->start; |
883 | split->len = start - em->start; | |
884 | ||
c77a8c61 | 885 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { |
3d2ac992 | 886 | split->disk_bytenr = em->disk_bytenr; |
e28b851e | 887 | split->disk_num_bytes = em->disk_num_bytes; |
3d2ac992 | 888 | split->offset = em->offset; |
4c0c8cfc FM |
889 | split->ram_bytes = em->ram_bytes; |
890 | } else { | |
3d2ac992 | 891 | split->disk_bytenr = em->disk_bytenr; |
e8fe524d | 892 | split->disk_num_bytes = 0; |
3d2ac992 | 893 | split->offset = 0; |
4c0c8cfc FM |
894 | split->ram_bytes = split->len; |
895 | } | |
896 | ||
897 | split->generation = gen; | |
898 | split->flags = flags; | |
6a3a9113 | 899 | replace_extent_mapping(inode, em, split, modified); |
ae98ae2a | 900 | btrfs_free_extent_map(split); |
4c0c8cfc FM |
901 | split = split2; |
902 | split2 = NULL; | |
903 | } | |
db21370b FM |
904 | if (em_end > end) { |
905 | if (!split) { | |
906 | split = split2; | |
907 | split2 = NULL; | |
908 | if (!split) | |
909 | goto remove_em; | |
910 | } | |
c962098c JB |
911 | split->start = end; |
912 | split->len = em_end - end; | |
3d2ac992 | 913 | split->disk_bytenr = em->disk_bytenr; |
4c0c8cfc | 914 | split->flags = flags; |
4c0c8cfc FM |
915 | split->generation = gen; |
916 | ||
c77a8c61 | 917 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { |
e28b851e | 918 | split->disk_num_bytes = em->disk_num_bytes; |
3d2ac992 | 919 | split->offset = em->offset + end - em->start; |
4c0c8cfc | 920 | split->ram_bytes = em->ram_bytes; |
4c0c8cfc | 921 | } else { |
3d2ac992 QW |
922 | split->disk_num_bytes = 0; |
923 | split->offset = 0; | |
4c0c8cfc | 924 | split->ram_bytes = split->len; |
4c0c8cfc FM |
925 | } |
926 | ||
2e871330 | 927 | if (btrfs_extent_map_in_tree(em)) { |
6a3a9113 | 928 | replace_extent_mapping(inode, em, split, modified); |
4c0c8cfc FM |
929 | } else { |
930 | int ret; | |
931 | ||
6c566def | 932 | ret = add_extent_mapping(inode, split, modified); |
4c0c8cfc FM |
933 | /* Logic error, shouldn't happen. */ |
934 | ASSERT(ret == 0); | |
935 | if (WARN_ON(ret != 0) && modified) | |
936 | btrfs_set_inode_full_sync(inode); | |
937 | } | |
ae98ae2a | 938 | btrfs_free_extent_map(split); |
4c0c8cfc FM |
939 | split = NULL; |
940 | } | |
db21370b | 941 | remove_em: |
2e871330 | 942 | if (btrfs_extent_map_in_tree(em)) { |
4c0c8cfc FM |
943 | /* |
944 | * If the extent map is still in the tree it means that | |
945 | * either of the following is true: | |
946 | * | |
947 | * 1) It fits entirely in our range (doesn't end beyond | |
948 | * it or starts before it); | |
949 | * | |
950 | * 2) It starts before our range and/or ends after our | |
951 | * range, and we were not able to allocate the extent | |
952 | * maps for split operations, @split and @split2. | |
953 | * | |
954 | * If we are at case 2) then we just remove the entire | |
955 | * extent map - this is fine since if anyone needs it to | |
956 | * access the subranges outside our range, will just | |
957 | * load it again from the subvolume tree's file extent | |
958 | * item. However if the extent map was in the list of | |
959 | * modified extents, then we must mark the inode for a | |
960 | * full fsync, otherwise a fast fsync will miss this | |
961 | * extent if it's new and needs to be logged. | |
962 | */ | |
db21370b FM |
963 | if ((em->start < start || em_end > end) && modified) { |
964 | ASSERT(!split); | |
4c0c8cfc FM |
965 | btrfs_set_inode_full_sync(inode); |
966 | } | |
d846a6d3 | 967 | btrfs_remove_extent_mapping(inode, em); |
4c0c8cfc | 968 | } |
4c0c8cfc | 969 | |
db21370b FM |
970 | /* |
971 | * Once for the tree reference (we replaced or removed the | |
972 | * extent map from the tree). | |
973 | */ | |
ae98ae2a | 974 | btrfs_free_extent_map(em); |
db21370b FM |
975 | next: |
976 | /* Once for us (for our lookup reference). */ | |
ae98ae2a | 977 | btrfs_free_extent_map(em); |
db21370b FM |
978 | |
979 | em = next_em; | |
4c0c8cfc FM |
980 | } |
981 | ||
db21370b FM |
982 | write_unlock(&em_tree->lock); |
983 | ||
ae98ae2a FM |
984 | btrfs_free_extent_map(split); |
985 | btrfs_free_extent_map(split2); | |
4c0c8cfc | 986 | } |
a1ba4c08 FM |
987 | |
988 | /* | |
989 | * Replace a range in the inode's extent map tree with a new extent map. | |
990 | * | |
991 | * @inode: The target inode. | |
992 | * @new_em: The new extent map to add to the inode's extent map tree. | |
993 | * @modified: Indicate if the new extent map should be added to the list of | |
994 | * modified extents (for fast fsync tracking). | |
995 | * | |
996 | * Drops all the extent maps in the inode's extent map tree that intersect the | |
997 | * range of the new extent map and adds the new extent map to the tree. | |
998 | * The caller should have locked an appropriate file range in the inode's io | |
999 | * tree before calling this function. | |
1000 | */ | |
1001 | int btrfs_replace_extent_map_range(struct btrfs_inode *inode, | |
1002 | struct extent_map *new_em, | |
1003 | bool modified) | |
1004 | { | |
1005 | const u64 end = new_em->start + new_em->len - 1; | |
1006 | struct extent_map_tree *tree = &inode->extent_tree; | |
1007 | int ret; | |
1008 | ||
2e871330 | 1009 | ASSERT(!btrfs_extent_map_in_tree(new_em)); |
a1ba4c08 FM |
1010 | |
1011 | /* | |
1012 | * The caller has locked an appropriate file range in the inode's io | |
1013 | * tree, but getting -EEXIST when adding the new extent map can still | |
1014 | * happen in case there are extents that partially cover the range, and | |
1015 | * this is due to two tasks operating on different parts of the extent. | |
1016 | * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from | |
1017 | * btrfs_get_extent") for an example and details. | |
1018 | */ | |
1019 | do { | |
1020 | btrfs_drop_extent_map_range(inode, new_em->start, end, false); | |
1021 | write_lock(&tree->lock); | |
6c566def | 1022 | ret = add_extent_mapping(inode, new_em, modified); |
a1ba4c08 FM |
1023 | write_unlock(&tree->lock); |
1024 | } while (ret == -EEXIST); | |
1025 | ||
1026 | return ret; | |
1027 | } | |
a6f3e205 CH |
1028 | |
1029 | /* | |
f000bc6f CH |
1030 | * Split off the first pre bytes from the extent_map at [start, start + len], |
1031 | * and set the block_start for it to new_logical. | |
a6f3e205 CH |
1032 | * |
1033 | * This function is used when an ordered_extent needs to be split. | |
1034 | */ | |
d846a6d3 FM |
1035 | int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, |
1036 | u64 new_logical) | |
a6f3e205 CH |
1037 | { |
1038 | struct extent_map_tree *em_tree = &inode->extent_tree; | |
1039 | struct extent_map *em; | |
1040 | struct extent_map *split_pre = NULL; | |
1041 | struct extent_map *split_mid = NULL; | |
1042 | int ret = 0; | |
1043 | unsigned long flags; | |
1044 | ||
1045 | ASSERT(pre != 0); | |
1046 | ASSERT(pre < len); | |
1047 | ||
ae98ae2a | 1048 | split_pre = btrfs_alloc_extent_map(); |
a6f3e205 CH |
1049 | if (!split_pre) |
1050 | return -ENOMEM; | |
ae98ae2a | 1051 | split_mid = btrfs_alloc_extent_map(); |
a6f3e205 CH |
1052 | if (!split_mid) { |
1053 | ret = -ENOMEM; | |
1054 | goto out_free_pre; | |
1055 | } | |
1056 | ||
242570e8 | 1057 | btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); |
a6f3e205 | 1058 | write_lock(&em_tree->lock); |
d846a6d3 | 1059 | em = btrfs_lookup_extent_mapping(em_tree, start, len); |
a6f3e205 CH |
1060 | if (!em) { |
1061 | ret = -EIO; | |
1062 | goto out_unlock; | |
1063 | } | |
1064 | ||
1065 | ASSERT(em->len == len); | |
962162ff | 1066 | ASSERT(!btrfs_extent_map_is_compressed(em)); |
c77a8c61 | 1067 | ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); |
f86f7a75 FM |
1068 | ASSERT(em->flags & EXTENT_FLAG_PINNED); |
1069 | ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); | |
a6f3e205 CH |
1070 | ASSERT(!list_empty(&em->list)); |
1071 | ||
1072 | flags = em->flags; | |
f86f7a75 | 1073 | em->flags &= ~EXTENT_FLAG_PINNED; |
a6f3e205 CH |
1074 | |
1075 | /* First, replace the em with a new extent_map starting from * em->start */ | |
1076 | split_pre->start = em->start; | |
1077 | split_pre->len = pre; | |
3d2ac992 QW |
1078 | split_pre->disk_bytenr = new_logical; |
1079 | split_pre->disk_num_bytes = split_pre->len; | |
1080 | split_pre->offset = 0; | |
a6f3e205 CH |
1081 | split_pre->ram_bytes = split_pre->len; |
1082 | split_pre->flags = flags; | |
a6f3e205 CH |
1083 | split_pre->generation = em->generation; |
1084 | ||
6a3a9113 | 1085 | replace_extent_mapping(inode, em, split_pre, 1); |
a6f3e205 CH |
1086 | |
1087 | /* | |
1088 | * Now we only have an extent_map at: | |
1089 | * [em->start, em->start + pre] | |
1090 | */ | |
1091 | ||
1092 | /* Insert the middle extent_map. */ | |
1093 | split_mid->start = em->start + pre; | |
1094 | split_mid->len = em->len - pre; | |
2e871330 | 1095 | split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre; |
3d2ac992 QW |
1096 | split_mid->disk_num_bytes = split_mid->len; |
1097 | split_mid->offset = 0; | |
a6f3e205 CH |
1098 | split_mid->ram_bytes = split_mid->len; |
1099 | split_mid->flags = flags; | |
a6f3e205 | 1100 | split_mid->generation = em->generation; |
6c566def | 1101 | add_extent_mapping(inode, split_mid, 1); |
a6f3e205 CH |
1102 | |
1103 | /* Once for us */ | |
ae98ae2a | 1104 | btrfs_free_extent_map(em); |
a6f3e205 | 1105 | /* Once for the tree */ |
ae98ae2a | 1106 | btrfs_free_extent_map(em); |
a6f3e205 CH |
1107 | |
1108 | out_unlock: | |
1109 | write_unlock(&em_tree->lock); | |
242570e8 | 1110 | btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL); |
ae98ae2a | 1111 | btrfs_free_extent_map(split_mid); |
a6f3e205 | 1112 | out_free_pre: |
ae98ae2a | 1113 | btrfs_free_extent_map(split_pre); |
a6f3e205 CH |
1114 | return ret; |
1115 | } | |
956a17d9 | 1116 | |
44849405 FM |
1117 | struct btrfs_em_shrink_ctx { |
1118 | long nr_to_scan; | |
1119 | long scanned; | |
44849405 FM |
1120 | }; |
1121 | ||
1122 | static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) | |
956a17d9 | 1123 | { |
10204438 FM |
1124 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
1125 | const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); | |
956a17d9 FM |
1126 | struct extent_map_tree *tree = &inode->extent_tree; |
1127 | long nr_dropped = 0; | |
1128 | struct rb_node *node; | |
1129 | ||
c6c9c4d5 FM |
1130 | lockdep_assert_held_write(&tree->lock); |
1131 | ||
956a17d9 FM |
1132 | /* |
1133 | * Take the mmap lock so that we serialize with the inode logging phase | |
1134 | * of fsync because we may need to set the full sync flag on the inode, | |
1135 | * in case we have to remove extent maps in the tree's list of modified | |
1136 | * extents. If we set the full sync flag in the inode while an fsync is | |
1137 | * in progress, we may risk missing new extents because before the flag | |
1138 | * is set, fsync decides to only wait for writeback to complete and then | |
1139 | * during inode logging it sees the flag set and uses the subvolume tree | |
1140 | * to find new extents, which may not be there yet because ordered | |
1141 | * extents haven't completed yet. | |
1142 | * | |
c6c9c4d5 FM |
1143 | * We also do a try lock because we don't want to block for too long and |
1144 | * we are holding the extent map tree's lock in write mode. | |
956a17d9 FM |
1145 | */ |
1146 | if (!down_read_trylock(&inode->i_mmap_lock)) | |
1147 | return 0; | |
1148 | ||
4e660ca3 | 1149 | node = rb_first(&tree->root); |
956a17d9 | 1150 | while (node) { |
4e660ca3 | 1151 | struct rb_node *next = rb_next(node); |
956a17d9 FM |
1152 | struct extent_map *em; |
1153 | ||
1154 | em = rb_entry(node, struct extent_map, rb_node); | |
44849405 | 1155 | ctx->scanned++; |
956a17d9 FM |
1156 | |
1157 | if (em->flags & EXTENT_FLAG_PINNED) | |
1158 | goto next; | |
1159 | ||
1160 | /* | |
1161 | * If the inode is in the list of modified extents (new) and its | |
1162 | * generation is the same (or is greater than) the current fs | |
1163 | * generation, it means it was not yet persisted so we have to | |
1164 | * set the full sync flag so that the next fsync will not miss | |
1165 | * it. | |
1166 | */ | |
1167 | if (!list_empty(&em->list) && em->generation >= cur_fs_gen) | |
1168 | btrfs_set_inode_full_sync(inode); | |
1169 | ||
d846a6d3 | 1170 | btrfs_remove_extent_mapping(inode, em); |
0d89a15e | 1171 | trace_btrfs_extent_map_shrinker_remove_em(inode, em); |
956a17d9 | 1172 | /* Drop the reference for the tree. */ |
ae98ae2a | 1173 | btrfs_free_extent_map(em); |
956a17d9 FM |
1174 | nr_dropped++; |
1175 | next: | |
44849405 | 1176 | if (ctx->scanned >= ctx->nr_to_scan) |
956a17d9 FM |
1177 | break; |
1178 | ||
1179 | /* | |
b3ebb9b7 FM |
1180 | * Stop if we need to reschedule or there's contention on the |
1181 | * lock. This is to avoid slowing other tasks trying to take the | |
ae1e766f | 1182 | * lock. |
956a17d9 | 1183 | */ |
10204438 FM |
1184 | if (need_resched() || rwlock_needbreak(&tree->lock) || |
1185 | btrfs_fs_closing(fs_info)) | |
b3ebb9b7 | 1186 | break; |
a1b547f0 | 1187 | node = next; |
956a17d9 | 1188 | } |
956a17d9 FM |
1189 | up_read(&inode->i_mmap_lock); |
1190 | ||
1191 | return nr_dropped; | |
1192 | } | |
1193 | ||
c6c9c4d5 FM |
1194 | static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, |
1195 | u64 min_ino) | |
1196 | { | |
1197 | struct btrfs_inode *inode; | |
1198 | unsigned long from = min_ino; | |
1199 | ||
1200 | xa_lock(&root->inodes); | |
1201 | while (true) { | |
1202 | struct extent_map_tree *tree; | |
1203 | ||
1204 | inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); | |
1205 | if (!inode) | |
1206 | break; | |
1207 | ||
1208 | tree = &inode->extent_tree; | |
1209 | ||
1210 | /* | |
1211 | * We want to be fast so if the lock is busy we don't want to | |
1212 | * spend time waiting for it (some task is about to do IO for | |
1213 | * the inode). | |
1214 | */ | |
1215 | if (!write_trylock(&tree->lock)) | |
1216 | goto next; | |
1217 | ||
1218 | /* | |
1219 | * Skip inode if it doesn't have loaded extent maps, so we avoid | |
1220 | * getting a reference and doing an iput later. This includes | |
1221 | * cases like files that were opened for things like stat(2), or | |
1222 | * files with all extent maps previously released through the | |
1223 | * release folio callback (btrfs_release_folio()) or released in | |
1224 | * a previous run, or directories which never have extent maps. | |
1225 | */ | |
1226 | if (RB_EMPTY_ROOT(&tree->root)) { | |
1227 | write_unlock(&tree->lock); | |
1228 | goto next; | |
1229 | } | |
1230 | ||
1231 | if (igrab(&inode->vfs_inode)) | |
1232 | break; | |
1233 | ||
1234 | write_unlock(&tree->lock); | |
1235 | next: | |
1236 | from = btrfs_ino(inode) + 1; | |
1237 | cond_resched_lock(&root->inodes.xa_lock); | |
1238 | } | |
1239 | xa_unlock(&root->inodes); | |
1240 | ||
1241 | return inode; | |
1242 | } | |
1243 | ||
44849405 | 1244 | static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) |
956a17d9 | 1245 | { |
70a5f9e2 | 1246 | struct btrfs_fs_info *fs_info = root->fs_info; |
956a17d9 FM |
1247 | struct btrfs_inode *inode; |
1248 | long nr_dropped = 0; | |
e7fa8450 | 1249 | u64 min_ino = fs_info->em_shrinker_last_ino + 1; |
956a17d9 | 1250 | |
c6c9c4d5 | 1251 | inode = find_first_inode_to_shrink(root, min_ino); |
956a17d9 | 1252 | while (inode) { |
44849405 | 1253 | nr_dropped += btrfs_scan_inode(inode, ctx); |
c6c9c4d5 | 1254 | write_unlock(&inode->extent_tree.lock); |
956a17d9 FM |
1255 | |
1256 | min_ino = btrfs_ino(inode) + 1; | |
e7fa8450 | 1257 | fs_info->em_shrinker_last_ino = btrfs_ino(inode); |
15b3b325 | 1258 | iput(&inode->vfs_inode); |
956a17d9 | 1259 | |
59f37036 | 1260 | if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) |
956a17d9 FM |
1261 | break; |
1262 | ||
ae1e766f | 1263 | cond_resched(); |
b3ebb9b7 | 1264 | |
c6c9c4d5 | 1265 | inode = find_first_inode_to_shrink(root, min_ino); |
956a17d9 FM |
1266 | } |
1267 | ||
1268 | if (inode) { | |
1269 | /* | |
1270 | * There are still inodes in this root or we happened to process | |
1271 | * the last one and reached the scan limit. In either case set | |
1272 | * the current root to this one, so we'll resume from the next | |
1273 | * inode if there is one or we will find out this was the last | |
1274 | * one and move to the next root. | |
1275 | */ | |
e7fa8450 | 1276 | fs_info->em_shrinker_last_root = btrfs_root_id(root); |
956a17d9 FM |
1277 | } else { |
1278 | /* | |
1279 | * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so | |
1280 | * that when processing the next root we start from its first inode. | |
1281 | */ | |
e7fa8450 FM |
1282 | fs_info->em_shrinker_last_ino = 0; |
1283 | fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; | |
956a17d9 FM |
1284 | } |
1285 | ||
1286 | return nr_dropped; | |
1287 | } | |
1288 | ||
10204438 | 1289 | static void btrfs_extent_map_shrinker_worker(struct work_struct *work) |
956a17d9 | 1290 | { |
10204438 | 1291 | struct btrfs_fs_info *fs_info; |
44849405 FM |
1292 | struct btrfs_em_shrink_ctx ctx; |
1293 | u64 start_root_id; | |
1294 | u64 next_root_id; | |
956a17d9 FM |
1295 | bool cycled = false; |
1296 | long nr_dropped = 0; | |
44849405 | 1297 | |
e7fa8450 | 1298 | fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); |
10204438 | 1299 | |
44849405 | 1300 | ctx.scanned = 0; |
e7fa8450 | 1301 | ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); |
44849405 | 1302 | |
e7fa8450 FM |
1303 | start_root_id = fs_info->em_shrinker_last_root; |
1304 | next_root_id = fs_info->em_shrinker_last_root; | |
956a17d9 | 1305 | |
0d89a15e FM |
1306 | if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { |
1307 | s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); | |
1308 | ||
70a5f9e2 | 1309 | trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); |
0d89a15e FM |
1310 | } |
1311 | ||
10204438 | 1312 | while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { |
956a17d9 FM |
1313 | struct btrfs_root *root; |
1314 | unsigned long count; | |
1315 | ||
ae1e766f FM |
1316 | cond_resched(); |
1317 | ||
956a17d9 FM |
1318 | spin_lock(&fs_info->fs_roots_radix_lock); |
1319 | count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, | |
1320 | (void **)&root, | |
1321 | (unsigned long)next_root_id, 1); | |
1322 | if (count == 0) { | |
1323 | spin_unlock(&fs_info->fs_roots_radix_lock); | |
1324 | if (start_root_id > 0 && !cycled) { | |
1325 | next_root_id = 0; | |
e7fa8450 FM |
1326 | fs_info->em_shrinker_last_root = 0; |
1327 | fs_info->em_shrinker_last_ino = 0; | |
956a17d9 FM |
1328 | cycled = true; |
1329 | continue; | |
1330 | } | |
1331 | break; | |
1332 | } | |
1333 | next_root_id = btrfs_root_id(root) + 1; | |
1334 | root = btrfs_grab_root(root); | |
1335 | spin_unlock(&fs_info->fs_roots_radix_lock); | |
1336 | ||
1337 | if (!root) | |
1338 | continue; | |
1339 | ||
1340 | if (is_fstree(btrfs_root_id(root))) | |
44849405 | 1341 | nr_dropped += btrfs_scan_root(root, &ctx); |
956a17d9 FM |
1342 | |
1343 | btrfs_put_root(root); | |
1344 | } | |
1345 | ||
0d89a15e FM |
1346 | if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { |
1347 | s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); | |
1348 | ||
70a5f9e2 | 1349 | trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); |
0d89a15e FM |
1350 | } |
1351 | ||
e7fa8450 | 1352 | atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); |
10204438 FM |
1353 | } |
1354 | ||
1355 | void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) | |
1356 | { | |
1357 | /* | |
1358 | * Do nothing if the shrinker is already running. In case of high memory | |
1359 | * pressure we can have a lot of tasks calling us and all passing the | |
1360 | * same nr_to_scan value, but in reality we may need only to free | |
1361 | * nr_to_scan extent maps (or less). In case we need to free more than | |
1362 | * that, we will be called again by the fs shrinker, so no worries about | |
1363 | * not doing enough work to reclaim memory from extent maps. | |
1364 | * We can also be repeatedly called with the same nr_to_scan value | |
1365 | * simply because the shrinker runs asynchronously and multiple calls | |
1366 | * to this function are made before the shrinker does enough progress. | |
1367 | * | |
1368 | * That's why we set the atomic counter to nr_to_scan only if its | |
1369 | * current value is zero, instead of incrementing the counter by | |
1370 | * nr_to_scan. | |
1371 | */ | |
e7fa8450 | 1372 | if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) |
10204438 FM |
1373 | return; |
1374 | ||
e7fa8450 | 1375 | queue_work(system_unbound_wq, &fs_info->em_shrinker_work); |
10204438 FM |
1376 | } |
1377 | ||
1378 | void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) | |
1379 | { | |
e7fa8450 FM |
1380 | atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); |
1381 | INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); | |
956a17d9 | 1382 | } |