Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
c1d7c514 | 2 | |
d1310b2e | 3 | #include <linux/err.h> |
d1310b2e | 4 | #include <linux/slab.h> |
a52d9a80 | 5 | #include <linux/spinlock.h> |
9b569ea0 | 6 | #include "messages.h" |
261507a0 | 7 | #include "ctree.h" |
a52d9a80 | 8 | #include "extent_map.h" |
ebb8765b | 9 | #include "compression.h" |
4c0c8cfc | 10 | #include "btrfs_inode.h" |
956a17d9 | 11 | #include "disk-io.h" |
a52d9a80 | 12 | |
86479a04 | 13 | |
a52d9a80 | 14 | static struct kmem_cache *extent_map_cache; |
ca664626 | 15 | |
2f4cbe64 | 16 | int __init extent_map_init(void) |
a52d9a80 | 17 | { |
837e1972 | 18 | extent_map_cache = kmem_cache_create("btrfs_extent_map", |
ef5a05c5 | 19 | sizeof(struct extent_map), 0, 0, NULL); |
2f4cbe64 WB |
20 | if (!extent_map_cache) |
21 | return -ENOMEM; | |
2f4cbe64 | 22 | return 0; |
a52d9a80 CM |
23 | } |
24 | ||
e67c718b | 25 | void __cold extent_map_exit(void) |
a52d9a80 | 26 | { |
5598e900 | 27 | kmem_cache_destroy(extent_map_cache); |
a52d9a80 CM |
28 | } |
29 | ||
43dd529a DS |
30 | /* |
31 | * Initialize the extent tree @tree. Should be called for each new inode or | |
32 | * other user of the extent_map interface. | |
9d2423c5 | 33 | */ |
a8067e02 | 34 | void extent_map_tree_init(struct extent_map_tree *tree) |
a52d9a80 | 35 | { |
4e660ca3 | 36 | tree->root = RB_ROOT; |
5dc562c5 | 37 | INIT_LIST_HEAD(&tree->modified_extents); |
890871be | 38 | rwlock_init(&tree->lock); |
a52d9a80 | 39 | } |
a52d9a80 | 40 | |
43dd529a DS |
41 | /* |
42 | * Allocate a new extent_map structure. The new structure is returned with a | |
43 | * reference count of one and needs to be freed using free_extent_map() | |
9d2423c5 | 44 | */ |
172ddd60 | 45 | struct extent_map *alloc_extent_map(void) |
a52d9a80 CM |
46 | { |
47 | struct extent_map *em; | |
70c8a91c | 48 | em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); |
c26a9203 TI |
49 | if (!em) |
50 | return NULL; | |
cbc0e928 | 51 | RB_CLEAR_NODE(&em->rb_node); |
490b54d6 | 52 | refcount_set(&em->refs, 1); |
5dc562c5 | 53 | INIT_LIST_HEAD(&em->list); |
a52d9a80 CM |
54 | return em; |
55 | } | |
a52d9a80 | 56 | |
43dd529a DS |
57 | /* |
58 | * Drop the reference out on @em by one and free the structure if the reference | |
59 | * count hits zero. | |
9d2423c5 | 60 | */ |
a52d9a80 CM |
61 | void free_extent_map(struct extent_map *em) |
62 | { | |
2bf5a725 CM |
63 | if (!em) |
64 | return; | |
490b54d6 | 65 | if (refcount_dec_and_test(&em->refs)) { |
cbc0e928 | 66 | WARN_ON(extent_map_in_tree(em)); |
5dc562c5 | 67 | WARN_ON(!list_empty(&em->list)); |
a52d9a80 CM |
68 | kmem_cache_free(extent_map_cache, em); |
69 | } | |
70 | } | |
a52d9a80 | 71 | |
/*
 * Compute the end offset of the range starting at @start with length @len.
 * If the addition wraps around, the result saturates at the maximum u64 value.
 */
static u64 range_end(u64 start, u64 len)
{
	const u64 end = start + len;

	return (end < start) ? (u64)-1 : end;
}
79 | ||
f1d97e76 FM |
80 | static void dec_evictable_extent_maps(struct btrfs_inode *inode) |
81 | { | |
82 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | |
83 | ||
84 | if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) | |
85 | percpu_counter_dec(&fs_info->evictable_extent_maps); | |
86 | } | |
87 | ||
4e660ca3 | 88 | static int tree_insert(struct rb_root *root, struct extent_map *em) |
a52d9a80 | 89 | { |
4e660ca3 | 90 | struct rb_node **p = &root->rb_node; |
d397712b | 91 | struct rb_node *parent = NULL; |
32193c14 FDBM |
92 | struct extent_map *entry = NULL; |
93 | struct rb_node *orig_parent = NULL; | |
94 | u64 end = range_end(em->start, em->len); | |
a52d9a80 | 95 | |
d397712b | 96 | while (*p) { |
a52d9a80 | 97 | parent = *p; |
d1310b2e CM |
98 | entry = rb_entry(parent, struct extent_map, rb_node); |
99 | ||
4e660ca3 | 100 | if (em->start < entry->start) |
a52d9a80 | 101 | p = &(*p)->rb_left; |
4e660ca3 | 102 | else if (em->start >= extent_map_end(entry)) |
a52d9a80 | 103 | p = &(*p)->rb_right; |
4e660ca3 | 104 | else |
32193c14 | 105 | return -EEXIST; |
a52d9a80 CM |
106 | } |
107 | ||
32193c14 FDBM |
108 | orig_parent = parent; |
109 | while (parent && em->start >= extent_map_end(entry)) { | |
110 | parent = rb_next(parent); | |
111 | entry = rb_entry(parent, struct extent_map, rb_node); | |
112 | } | |
113 | if (parent) | |
114 | if (end > entry->start && em->start < extent_map_end(entry)) | |
115 | return -EEXIST; | |
116 | ||
117 | parent = orig_parent; | |
118 | entry = rb_entry(parent, struct extent_map, rb_node); | |
119 | while (parent && em->start < entry->start) { | |
120 | parent = rb_prev(parent); | |
121 | entry = rb_entry(parent, struct extent_map, rb_node); | |
122 | } | |
123 | if (parent) | |
124 | if (end > entry->start && em->start < extent_map_end(entry)) | |
125 | return -EEXIST; | |
126 | ||
32193c14 | 127 | rb_link_node(&em->rb_node, orig_parent, p); |
4e660ca3 | 128 | rb_insert_color(&em->rb_node, root); |
32193c14 | 129 | return 0; |
a52d9a80 CM |
130 | } |
131 | ||
d352ac68 | 132 | /* |
43dd529a DS |
133 | * Search through the tree for an extent_map with a given offset. If it can't |
134 | * be found, try to find some neighboring extents | |
d352ac68 | 135 | */ |
a52d9a80 | 136 | static struct rb_node *__tree_search(struct rb_root *root, u64 offset, |
6c05813e | 137 | struct rb_node **prev_or_next_ret) |
a52d9a80 | 138 | { |
d397712b | 139 | struct rb_node *n = root->rb_node; |
a52d9a80 | 140 | struct rb_node *prev = NULL; |
5f56406a | 141 | struct rb_node *orig_prev = NULL; |
d1310b2e CM |
142 | struct extent_map *entry; |
143 | struct extent_map *prev_entry = NULL; | |
a52d9a80 | 144 | |
6c05813e | 145 | ASSERT(prev_or_next_ret); |
08f088dd | 146 | |
d397712b | 147 | while (n) { |
d1310b2e | 148 | entry = rb_entry(n, struct extent_map, rb_node); |
a52d9a80 CM |
149 | prev = n; |
150 | prev_entry = entry; | |
151 | ||
152 | if (offset < entry->start) | |
153 | n = n->rb_left; | |
d1310b2e | 154 | else if (offset >= extent_map_end(entry)) |
a52d9a80 CM |
155 | n = n->rb_right; |
156 | else | |
157 | return n; | |
158 | } | |
5f56406a | 159 | |
08f088dd FM |
160 | orig_prev = prev; |
161 | while (prev && offset >= extent_map_end(prev_entry)) { | |
162 | prev = rb_next(prev); | |
163 | prev_entry = rb_entry(prev, struct extent_map, rb_node); | |
5f56406a CM |
164 | } |
165 | ||
6c05813e FM |
166 | /* |
167 | * Previous extent map found, return as in this case the caller does not | |
168 | * care about the next one. | |
169 | */ | |
170 | if (prev) { | |
171 | *prev_or_next_ret = prev; | |
172 | return NULL; | |
173 | } | |
174 | ||
175 | prev = orig_prev; | |
08f088dd FM |
176 | prev_entry = rb_entry(prev, struct extent_map, rb_node); |
177 | while (prev && offset < prev_entry->start) { | |
178 | prev = rb_prev(prev); | |
d1310b2e | 179 | prev_entry = rb_entry(prev, struct extent_map, rb_node); |
a52d9a80 | 180 | } |
6c05813e | 181 | *prev_or_next_ret = prev; |
08f088dd | 182 | |
a52d9a80 CM |
183 | return NULL; |
184 | } | |
185 | ||
e28b851e QW |
186 | static inline u64 extent_map_block_len(const struct extent_map *em) |
187 | { | |
188 | if (extent_map_is_compressed(em)) | |
189 | return em->disk_num_bytes; | |
190 | return em->len; | |
191 | } | |
192 | ||
2ecec0d6 FM |
193 | static inline u64 extent_map_block_end(const struct extent_map *em) |
194 | { | |
ab094670 FM |
195 | const u64 block_start = extent_map_block_start(em); |
196 | const u64 block_end = block_start + extent_map_block_len(em); | |
197 | ||
198 | if (block_end < block_start) | |
2ecec0d6 | 199 | return (u64)-1; |
ab094670 FM |
200 | |
201 | return block_end; | |
2ecec0d6 FM |
202 | } |
203 | ||
1a9fb16c | 204 | static bool can_merge_extent_map(const struct extent_map *em) |
a52d9a80 | 205 | { |
f86f7a75 | 206 | if (em->flags & EXTENT_FLAG_PINNED) |
1a9fb16c | 207 | return false; |
7f3c74fb | 208 | |
1a9fb16c | 209 | /* Don't merge compressed extents, we need to know their actual size. */ |
f86f7a75 | 210 | if (extent_map_is_compressed(em)) |
1a9fb16c | 211 | return false; |
c8b97818 | 212 | |
f86f7a75 | 213 | if (em->flags & EXTENT_FLAG_LOGGING) |
1a9fb16c | 214 | return false; |
201a9038 | 215 | |
09a2a8f9 JB |
216 | /* |
217 | * We don't want to merge stuff that hasn't been written to the log yet | |
218 | * since it may not reflect exactly what is on disk, and that would be | |
219 | * bad. | |
220 | */ | |
1a9fb16c FM |
221 | if (!list_empty(&em->list)) |
222 | return false; | |
223 | ||
224 | return true; | |
225 | } | |
09a2a8f9 | 226 | |
1a9fb16c | 227 | /* Check to see if two extent_map structs are adjacent and safe to merge. */ |
27f0d9c9 | 228 | static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) |
1a9fb16c | 229 | { |
27f0d9c9 FM |
230 | if (extent_map_end(prev) != next->start) |
231 | return false; | |
232 | ||
233 | if (prev->flags != next->flags) | |
234 | return false; | |
235 | ||
c77a8c61 QW |
236 | if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) |
237 | return extent_map_block_start(next) == extent_map_block_end(prev); | |
27f0d9c9 FM |
238 | |
239 | /* HOLES and INLINE extents. */ | |
c77a8c61 | 240 | return next->disk_bytenr == prev->disk_bytenr; |
a52d9a80 CM |
241 | } |
242 | ||
3d2ac992 QW |
243 | /* |
244 | * Handle the on-disk data extents merge for @prev and @next. | |
245 | * | |
246 | * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. | |
247 | * For now only uncompressed regular extent can be merged. | |
248 | * | |
249 | * @prev and @next will be both updated to point to the new merged range. | |
250 | * Thus one of them should be removed by the caller. | |
251 | */ | |
252 | static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next) | |
253 | { | |
254 | u64 new_disk_bytenr; | |
255 | u64 new_disk_num_bytes; | |
256 | u64 new_offset; | |
257 | ||
258 | /* @prev and @next should not be compressed. */ | |
259 | ASSERT(!extent_map_is_compressed(prev)); | |
260 | ASSERT(!extent_map_is_compressed(next)); | |
261 | ||
262 | /* | |
263 | * There are two different cases where @prev and @next can be merged. | |
264 | * | |
265 | * 1) They are referring to the same data extent: | |
266 | * | |
267 | * |<----- data extent A ----->| | |
268 | * |<- prev ->|<- next ->| | |
269 | * | |
270 | * 2) They are referring to different data extents but still adjacent: | |
271 | * | |
272 | * |<-- data extent A -->|<-- data extent B -->| | |
273 | * |<- prev ->|<- next ->| | |
274 | * | |
275 | * The calculation here always merges the data extents first, then updates | |
276 | * @offset using the new data extents. | |
277 | * | |
278 | * For case 1), the merged data extent would be the same. | |
279 | * For case 2), we just merge the two data extents into one. | |
280 | */ | |
281 | new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr); | |
282 | new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes, | |
283 | next->disk_bytenr + next->disk_num_bytes) - | |
284 | new_disk_bytenr; | |
285 | new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; | |
286 | ||
287 | prev->disk_bytenr = new_disk_bytenr; | |
288 | prev->disk_num_bytes = new_disk_num_bytes; | |
289 | prev->ram_bytes = new_disk_num_bytes; | |
290 | prev->offset = new_offset; | |
291 | ||
292 | next->disk_bytenr = new_disk_bytenr; | |
293 | next->disk_num_bytes = new_disk_num_bytes; | |
294 | next->ram_bytes = new_disk_num_bytes; | |
295 | next->offset = new_offset; | |
296 | } | |
297 | ||
3f255ece QW |
298 | static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, |
299 | struct extent_map *em) | |
300 | { | |
301 | if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) | |
302 | return; | |
303 | btrfs_crit(fs_info, | |
c77a8c61 | 304 | "%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x", |
3f255ece | 305 | prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes, |
c77a8c61 | 306 | em->ram_bytes, em->offset, em->flags); |
3f255ece QW |
307 | ASSERT(0); |
308 | } | |
309 | ||
310 | /* Internal sanity checks for btrfs debug builds. */ | |
311 | static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em) | |
312 | { | |
313 | if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) | |
314 | return; | |
315 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { | |
316 | if (em->disk_num_bytes == 0) | |
317 | dump_extent_map(fs_info, "zero disk_num_bytes", em); | |
318 | if (em->offset + em->len > em->ram_bytes) | |
319 | dump_extent_map(fs_info, "ram_bytes too small", em); | |
320 | if (em->offset + em->len > em->disk_num_bytes && | |
321 | !extent_map_is_compressed(em)) | |
322 | dump_extent_map(fs_info, "disk_num_bytes too small", em); | |
1b87d26a QW |
323 | if (!extent_map_is_compressed(em) && |
324 | em->ram_bytes != em->disk_num_bytes) | |
325 | dump_extent_map(fs_info, | |
326 | "ram_bytes mismatch with disk_num_bytes for non-compressed em", | |
327 | em); | |
3f255ece QW |
328 | } else if (em->offset) { |
329 | dump_extent_map(fs_info, "non-zero offset for hole/inline", em); | |
330 | } | |
a52d9a80 CM |
331 | } |
332 | ||
5fa8a6ba | 333 | static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) |
a1ed835e | 334 | { |
3f255ece | 335 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
5fa8a6ba | 336 | struct extent_map_tree *tree = &inode->extent_tree; |
a1ed835e CM |
337 | struct extent_map *merge = NULL; |
338 | struct rb_node *rb; | |
a1ed835e | 339 | |
ac05ca91 FM |
340 | /* |
341 | * We can't modify an extent map that is in the tree and that is being | |
342 | * used by another task, as it can cause that other task to see it in | |
343 | * inconsistent state during the merging. We always have 1 reference for | |
344 | * the tree and 1 for this task (which is unpinning the extent map or | |
345 | * clearing the logging flag), so anything > 2 means it's being used by | |
346 | * other tasks too. | |
347 | */ | |
348 | if (refcount_read(&em->refs) > 2) | |
349 | return; | |
350 | ||
1a9fb16c FM |
351 | if (!can_merge_extent_map(em)) |
352 | return; | |
353 | ||
a1ed835e CM |
354 | if (em->start != 0) { |
355 | rb = rb_prev(&em->rb_node); | |
356 | if (rb) | |
357 | merge = rb_entry(rb, struct extent_map, rb_node); | |
27f0d9c9 | 358 | if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { |
a1ed835e CM |
359 | em->start = merge->start; |
360 | em->len += merge->len; | |
70c8a91c | 361 | em->generation = max(em->generation, merge->generation); |
3d2ac992 QW |
362 | |
363 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) | |
364 | merge_ondisk_extents(merge, em); | |
f86f7a75 | 365 | em->flags |= EXTENT_FLAG_MERGED; |
5dc562c5 | 366 | |
3f255ece | 367 | validate_extent_map(fs_info, em); |
4e660ca3 | 368 | rb_erase(&merge->rb_node, &tree->root); |
cbc0e928 | 369 | RB_CLEAR_NODE(&merge->rb_node); |
a1ed835e | 370 | free_extent_map(merge); |
f1d97e76 | 371 | dec_evictable_extent_maps(inode); |
a1ed835e CM |
372 | } |
373 | } | |
374 | ||
375 | rb = rb_next(&em->rb_node); | |
376 | if (rb) | |
377 | merge = rb_entry(rb, struct extent_map, rb_node); | |
27f0d9c9 | 378 | if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { |
a1ed835e | 379 | em->len += merge->len; |
3d2ac992 QW |
380 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) |
381 | merge_ondisk_extents(em, merge); | |
3f255ece | 382 | validate_extent_map(fs_info, em); |
4e660ca3 | 383 | rb_erase(&merge->rb_node, &tree->root); |
cbc0e928 | 384 | RB_CLEAR_NODE(&merge->rb_node); |
70c8a91c | 385 | em->generation = max(em->generation, merge->generation); |
f86f7a75 | 386 | em->flags |= EXTENT_FLAG_MERGED; |
a1ed835e | 387 | free_extent_map(merge); |
f1d97e76 | 388 | dec_evictable_extent_maps(inode); |
a1ed835e | 389 | } |
4d2c8f62 LZ |
390 | } |
391 | ||
43dd529a DS |
392 | /* |
393 | * Unpin an extent from the cache. | |
394 | * | |
00deaf04 | 395 | * @inode: the inode from which we are unpinning an extent range |
5dc562c5 JB |
396 | * @start: logical offset in the file |
397 | * @len: length of the extent | |
398 | * @gen: generation that this extent has been modified in | |
5dc562c5 JB |
399 | * |
400 | * Called after an extent has been written to disk properly. Set the generation | |
401 | * to the generation that actually added the file item to the inode so we know | |
402 | * we need to sync this extent when we call fsync(). | |
c03c89f8 DS |
403 | * |
404 | * Returns: 0 on success | |
405 | * -ENOENT when the extent is not found in the tree | |
406 | * -EUCLEAN if the found extent does not match the expected start | |
5dc562c5 | 407 | */ |
00deaf04 | 408 | int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) |
4d2c8f62 | 409 | { |
00deaf04 FM |
410 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
411 | struct extent_map_tree *tree = &inode->extent_tree; | |
4d2c8f62 LZ |
412 | int ret = 0; |
413 | struct extent_map *em; | |
414 | ||
415 | write_lock(&tree->lock); | |
416 | em = lookup_extent_mapping(tree, start, len); | |
417 | ||
00deaf04 FM |
418 | if (WARN_ON(!em)) { |
419 | btrfs_warn(fs_info, | |
420 | "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", | |
421 | btrfs_ino(inode), btrfs_root_id(inode->root), | |
4dc1d69c | 422 | start, start + len, gen); |
c03c89f8 | 423 | ret = -ENOENT; |
4d2c8f62 | 424 | goto out; |
00deaf04 FM |
425 | } |
426 | ||
c03c89f8 | 427 | if (WARN_ON(em->start != start)) { |
00deaf04 FM |
428 | btrfs_warn(fs_info, |
429 | "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", | |
430 | btrfs_ino(inode), btrfs_root_id(inode->root), | |
4dc1d69c | 431 | em->start, start, start + len, gen); |
c03c89f8 DS |
432 | ret = -EUCLEAN; |
433 | goto out; | |
434 | } | |
4d2c8f62 | 435 | |
5dc562c5 | 436 | em->generation = gen; |
f86f7a75 | 437 | em->flags &= ~EXTENT_FLAG_PINNED; |
4d2c8f62 | 438 | |
5fa8a6ba | 439 | try_merge_map(inode, em); |
4e2f84e6 | 440 | |
a1ed835e CM |
441 | out: |
442 | write_unlock(&tree->lock); | |
8a565ec0 | 443 | free_extent_map(em); |
a1ed835e CM |
444 | return ret; |
445 | ||
446 | } | |
447 | ||
002f3a2c | 448 | void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) |
201a9038 | 449 | { |
5fa8a6ba | 450 | lockdep_assert_held_write(&inode->extent_tree.lock); |
74333c7d | 451 | |
f86f7a75 | 452 | em->flags &= ~EXTENT_FLAG_LOGGING; |
cbc0e928 | 453 | if (extent_map_in_tree(em)) |
5fa8a6ba | 454 | try_merge_map(inode, em); |
201a9038 JB |
455 | } |
456 | ||
e778724a | 457 | static inline void setup_extent_mapping(struct btrfs_inode *inode, |
176840b3 FM |
458 | struct extent_map *em, |
459 | int modified) | |
460 | { | |
490b54d6 | 461 | refcount_inc(&em->refs); |
176840b3 | 462 | |
32d53f6f FM |
463 | ASSERT(list_empty(&em->list)); |
464 | ||
176840b3 | 465 | if (modified) |
e778724a | 466 | list_add(&em->list, &inode->extent_tree.modified_extents); |
176840b3 | 467 | else |
5fa8a6ba | 468 | try_merge_map(inode, em); |
176840b3 FM |
469 | } |
470 | ||
43dd529a | 471 | /* |
6c566def | 472 | * Add a new extent map to an inode's extent map tree. |
401bd2dd | 473 | * |
6c566def | 474 | * @inode: the target inode |
9d2423c5 | 475 | * @em: map to insert |
401bd2dd NB |
476 | * @modified: indicate whether the given @em should be added to the |
477 | * modified list, which indicates the extent needs to be logged | |
9d2423c5 | 478 | * |
6c566def FM |
479 | * Insert @em into the @inode's extent map tree or perform a simple |
480 | * forward/backward merge with existing mappings. The extent_map struct passed | |
481 | * in will be inserted into the tree directly, with an additional reference | |
482 | * taken, or a reference dropped if the merge attempt was successful. | |
a52d9a80 | 483 | */ |
6c566def | 484 | static int add_extent_mapping(struct btrfs_inode *inode, |
db9d9446 | 485 | struct extent_map *em, int modified) |
a52d9a80 | 486 | { |
6c566def | 487 | struct extent_map_tree *tree = &inode->extent_tree; |
f1d97e76 FM |
488 | struct btrfs_root *root = inode->root; |
489 | struct btrfs_fs_info *fs_info = root->fs_info; | |
ed48adf8 | 490 | int ret; |
a52d9a80 | 491 | |
d23ea3fa DS |
492 | lockdep_assert_held_write(&tree->lock); |
493 | ||
3f255ece | 494 | validate_extent_map(fs_info, em); |
7f5830bc | 495 | ret = tree_insert(&tree->root, em); |
32193c14 | 496 | if (ret) |
ed48adf8 | 497 | return ret; |
32193c14 | 498 | |
e778724a | 499 | setup_extent_mapping(inode, em, modified); |
ed48adf8 | 500 | |
f1d97e76 FM |
501 | if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) |
502 | percpu_counter_inc(&fs_info->evictable_extent_maps); | |
503 | ||
ed48adf8 | 504 | return 0; |
a52d9a80 | 505 | } |
a52d9a80 | 506 | |
48a3b636 ES |
507 | static struct extent_map * |
508 | __lookup_extent_mapping(struct extent_map_tree *tree, | |
509 | u64 start, u64 len, int strict) | |
a52d9a80 CM |
510 | { |
511 | struct extent_map *em; | |
512 | struct rb_node *rb_node; | |
6c05813e | 513 | struct rb_node *prev_or_next = NULL; |
306929f3 CH |
514 | u64 end = range_end(start, len); |
515 | ||
4e660ca3 | 516 | rb_node = __tree_search(&tree->root, start, &prev_or_next); |
a52d9a80 | 517 | if (!rb_node) { |
6c05813e FM |
518 | if (prev_or_next) |
519 | rb_node = prev_or_next; | |
ed64f066 LZ |
520 | else |
521 | return NULL; | |
a52d9a80 | 522 | } |
ed64f066 | 523 | |
a52d9a80 | 524 | em = rb_entry(rb_node, struct extent_map, rb_node); |
d1310b2e | 525 | |
ed64f066 LZ |
526 | if (strict && !(end > em->start && start < extent_map_end(em))) |
527 | return NULL; | |
d1310b2e | 528 | |
490b54d6 | 529 | refcount_inc(&em->refs); |
a52d9a80 CM |
530 | return em; |
531 | } | |
a52d9a80 | 532 | |
43dd529a DS |
533 | /* |
534 | * Lookup extent_map that intersects @start + @len range. | |
535 | * | |
ed64f066 LZ |
536 | * @tree: tree to lookup in |
537 | * @start: byte offset to start the search | |
538 | * @len: length of the lookup range | |
539 | * | |
540 | * Find and return the first extent_map struct in @tree that intersects the | |
541 | * [start, len] range. There may be additional objects in the tree that | |
542 | * intersect, so check the object returned carefully to make sure that no | |
543 | * additional lookups are needed. | |
544 | */ | |
545 | struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, | |
546 | u64 start, u64 len) | |
547 | { | |
548 | return __lookup_extent_mapping(tree, start, len, 1); | |
549 | } | |
550 | ||
43dd529a DS |
551 | /* |
552 | * Find a nearby extent map intersecting @start + @len (not an exact search). | |
553 | * | |
b917b7c3 CM |
554 | * @tree: tree to lookup in |
555 | * @start: byte offset to start the search | |
556 | * @len: length of the lookup range | |
557 | * | |
558 | * Find and return the first extent_map struct in @tree that intersects the | |
559 | * [start, len] range. | |
560 | * | |
561 | * If one can't be found, any nearby extent may be returned | |
562 | */ | |
563 | struct extent_map *search_extent_mapping(struct extent_map_tree *tree, | |
564 | u64 start, u64 len) | |
565 | { | |
ed64f066 | 566 | return __lookup_extent_mapping(tree, start, len, 0); |
b917b7c3 CM |
567 | } |
568 | ||
43dd529a | 569 | /* |
c2fbd812 | 570 | * Remove an extent_map from its inode's extent tree. |
43dd529a | 571 | * |
c2fbd812 | 572 | * @inode: the inode the extent map belongs to |
bb7ab3b9 | 573 | * @em: extent map being removed |
9d2423c5 | 574 | * |
c2fbd812 FM |
575 | * Remove @em from the extent tree of @inode. No reference counts are dropped, |
576 | * and no checks are done to see if the range is in use. | |
a52d9a80 | 577 | */ |
c2fbd812 | 578 | void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) |
a52d9a80 | 579 | { |
c2fbd812 FM |
580 | struct extent_map_tree *tree = &inode->extent_tree; |
581 | ||
6d3b050e FM |
582 | lockdep_assert_held_write(&tree->lock); |
583 | ||
f86f7a75 | 584 | WARN_ON(em->flags & EXTENT_FLAG_PINNED); |
4e660ca3 | 585 | rb_erase(&em->rb_node, &tree->root); |
f86f7a75 | 586 | if (!(em->flags & EXTENT_FLAG_LOGGING)) |
ff44c6e3 | 587 | list_del_init(&em->list); |
cbc0e928 | 588 | RB_CLEAR_NODE(&em->rb_node); |
f1d97e76 FM |
589 | |
590 | dec_evictable_extent_maps(inode); | |
a52d9a80 | 591 | } |
176840b3 | 592 | |
6a3a9113 | 593 | static void replace_extent_mapping(struct btrfs_inode *inode, |
a6f3e205 CH |
594 | struct extent_map *cur, |
595 | struct extent_map *new, | |
596 | int modified) | |
176840b3 | 597 | { |
3f255ece | 598 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
6a3a9113 FM |
599 | struct extent_map_tree *tree = &inode->extent_tree; |
600 | ||
6d3b050e FM |
601 | lockdep_assert_held_write(&tree->lock); |
602 | ||
3f255ece QW |
603 | validate_extent_map(fs_info, new); |
604 | ||
f86f7a75 | 605 | WARN_ON(cur->flags & EXTENT_FLAG_PINNED); |
176840b3 | 606 | ASSERT(extent_map_in_tree(cur)); |
f86f7a75 | 607 | if (!(cur->flags & EXTENT_FLAG_LOGGING)) |
176840b3 | 608 | list_del_init(&cur->list); |
4e660ca3 | 609 | rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); |
176840b3 FM |
610 | RB_CLEAR_NODE(&cur->rb_node); |
611 | ||
e778724a | 612 | setup_extent_mapping(inode, new, modified); |
176840b3 | 613 | } |
c04e61b5 | 614 | |
d47704bd | 615 | static struct extent_map *next_extent_map(const struct extent_map *em) |
c04e61b5 LB |
616 | { |
617 | struct rb_node *next; | |
618 | ||
619 | next = rb_next(&em->rb_node); | |
620 | if (!next) | |
621 | return NULL; | |
622 | return container_of(next, struct extent_map, rb_node); | |
623 | } | |
624 | ||
625 | static struct extent_map *prev_extent_map(struct extent_map *em) | |
626 | { | |
627 | struct rb_node *prev; | |
628 | ||
629 | prev = rb_prev(&em->rb_node); | |
630 | if (!prev) | |
631 | return NULL; | |
632 | return container_of(prev, struct extent_map, rb_node); | |
633 | } | |
634 | ||
52042d8e AG |
635 | /* |
636 | * Helper for btrfs_get_extent. Given an existing extent in the tree, | |
c04e61b5 LB |
637 | * the existing extent is the nearest extent to map_start, |
638 | * and an extent that you want to insert, deal with overlap and insert | |
639 | * the best fitted new extent into the tree. | |
640 | */ | |
6c566def | 641 | static noinline int merge_extent_mapping(struct btrfs_inode *inode, |
5f4791f4 LB |
642 | struct extent_map *existing, |
643 | struct extent_map *em, | |
644 | u64 map_start) | |
c04e61b5 LB |
645 | { |
646 | struct extent_map *prev; | |
647 | struct extent_map *next; | |
648 | u64 start; | |
649 | u64 end; | |
650 | u64 start_diff; | |
651 | ||
c093bf30 DS |
652 | if (map_start < em->start || map_start >= extent_map_end(em)) |
653 | return -EINVAL; | |
c04e61b5 LB |
654 | |
655 | if (existing->start > map_start) { | |
656 | next = existing; | |
657 | prev = prev_extent_map(next); | |
658 | } else { | |
659 | prev = existing; | |
660 | next = next_extent_map(prev); | |
661 | } | |
662 | ||
663 | start = prev ? extent_map_end(prev) : em->start; | |
664 | start = max_t(u64, start, em->start); | |
665 | end = next ? next->start : extent_map_end(em); | |
666 | end = min_t(u64, end, extent_map_end(em)); | |
667 | start_diff = start - em->start; | |
668 | em->start = start; | |
669 | em->len = end - start; | |
de9f46cb | 670 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) |
3d2ac992 | 671 | em->offset += start_diff; |
6c566def | 672 | return add_extent_mapping(inode, em, 0); |
c04e61b5 LB |
673 | } |
674 | ||
43dd529a | 675 | /* |
0a308f80 | 676 | * Add extent mapping into an inode's extent map tree. |
9ad37bb3 | 677 | * |
0a308f80 | 678 | * @inode: target inode |
9ad37bb3 NB |
679 | * @em_in: extent we are inserting |
680 | * @start: start of the logical range btrfs_get_extent() is requesting | |
681 | * @len: length of the logical range btrfs_get_extent() is requesting | |
c04e61b5 LB |
682 | * |
683 | * Note that @em_in's range may be different from [start, start+len), | |
684 | * but they must be overlapped. | |
685 | * | |
0a308f80 FM |
686 | * Insert @em_in into the inode's extent map tree. In case there is an |
687 | * overlapping range, handle the -EEXIST by either: | |
c04e61b5 LB |
688 | * a) Returning the existing extent in @em_in if @start is within the |
689 | * existing em. | |
690 | * b) Merge the existing extent with @em_in passed in. | |
691 | * | |
692 | * Return 0 on success, otherwise -EEXIST. | |
693 | * | |
694 | */ | |
0a308f80 | 695 | int btrfs_add_extent_mapping(struct btrfs_inode *inode, |
c04e61b5 LB |
696 | struct extent_map **em_in, u64 start, u64 len) |
697 | { | |
698 | int ret; | |
699 | struct extent_map *em = *em_in; | |
0a308f80 | 700 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
c04e61b5 | 701 | |
d52a1365 QW |
702 | /* |
703 | * Tree-checker should have rejected any inline extent with non-zero | |
704 | * file offset. Here just do a sanity check. | |
705 | */ | |
c77a8c61 | 706 | if (em->disk_bytenr == EXTENT_MAP_INLINE) |
d52a1365 QW |
707 | ASSERT(em->start == 0); |
708 | ||
6c566def | 709 | ret = add_extent_mapping(inode, em, 0); |
c04e61b5 LB |
710 | /* it is possible that someone inserted the extent into the tree |
711 | * while we had the lock dropped. It is also possible that | |
712 | * an overlapping map exists in the tree | |
713 | */ | |
714 | if (ret == -EEXIST) { | |
715 | struct extent_map *existing; | |
716 | ||
6c566def | 717 | existing = search_extent_mapping(&inode->extent_tree, start, len); |
393da918 | 718 | |
f46b24c9 | 719 | trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); |
393da918 | 720 | |
c04e61b5 LB |
721 | /* |
722 | * existing will always be non-NULL, since there must be | |
723 | * extent causing the -EEXIST. | |
724 | */ | |
725 | if (start >= existing->start && | |
726 | start < extent_map_end(existing)) { | |
727 | free_extent_map(em); | |
728 | *em_in = existing; | |
729 | ret = 0; | |
730 | } else { | |
9a7e10e7 LB |
731 | u64 orig_start = em->start; |
732 | u64 orig_len = em->len; | |
733 | ||
c04e61b5 LB |
734 | /* |
735 | * The existing extent map is the one nearest to | |
736 | * the [start, start + len) range which overlaps | |
737 | */ | |
6c566def | 738 | ret = merge_extent_mapping(inode, existing, em, start); |
21334600 | 739 | if (WARN_ON(ret)) { |
c04e61b5 LB |
740 | free_extent_map(em); |
741 | *em_in = NULL; | |
21334600 FM |
742 | btrfs_warn(fs_info, |
743 | "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", | |
744 | existing->start, extent_map_end(existing), | |
745 | orig_start, orig_start + orig_len, start); | |
c04e61b5 | 746 | } |
9a7e10e7 | 747 | free_extent_map(existing); |
c04e61b5 LB |
748 | } |
749 | } | |
750 | ||
751 | ASSERT(ret == 0 || ret == -EEXIST); | |
752 | return ret; | |
753 | } | |
4c0c8cfc | 754 | |
9c9d1b4f FM |
755 | /* |
756 | * Drop all extent maps from a tree in the fastest possible way, rescheduling | |
757 | * if needed. This avoids searching the tree, from the root down to the first | |
758 | * extent map, before each deletion. | |
759 | */ | |
c2fbd812 | 760 | static void drop_all_extent_maps_fast(struct btrfs_inode *inode) |
9c9d1b4f | 761 | { |
c2fbd812 | 762 | struct extent_map_tree *tree = &inode->extent_tree; |
4e660ca3 | 763 | struct rb_node *node; |
c2fbd812 | 764 | |
9c9d1b4f | 765 | write_lock(&tree->lock); |
4e660ca3 FM |
766 | node = rb_first(&tree->root); |
767 | while (node) { | |
9c9d1b4f | 768 | struct extent_map *em; |
4e660ca3 | 769 | struct rb_node *next = rb_next(node); |
9c9d1b4f | 770 | |
9c9d1b4f | 771 | em = rb_entry(node, struct extent_map, rb_node); |
f86f7a75 | 772 | em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); |
c2fbd812 | 773 | remove_extent_mapping(inode, em); |
9c9d1b4f | 774 | free_extent_map(em); |
4e660ca3 FM |
775 | |
776 | if (cond_resched_rwlock_write(&tree->lock)) | |
777 | node = rb_first(&tree->root); | |
778 | else | |
779 | node = next; | |
9c9d1b4f FM |
780 | } |
781 | write_unlock(&tree->lock); | |
782 | } | |
783 | ||
4c0c8cfc FM |
784 | /* |
785 | * Drop all extent maps in a given range. | |
786 | * | |
787 | * @inode: The target inode. | |
788 | * @start: Start offset of the range. | |
789 | * @end: End offset of the range (inclusive value). | |
790 | * @skip_pinned: Indicate if pinned extent maps should be ignored or not. | |
791 | * | |
792 | * This drops all the extent maps that intersect the given range [@start, @end]. | |
793 | * Extent maps that partially overlap the range and extend behind or beyond it, | |
794 | * are split. | |
795 | * The caller should have locked an appropriate file range in the inode's io | |
796 | * tree before calling this function. | |
797 | */ | |
798 | void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, | |
799 | bool skip_pinned) | |
800 | { | |
db21370b FM |
801 | struct extent_map *split; |
802 | struct extent_map *split2; | |
803 | struct extent_map *em; | |
4c0c8cfc FM |
804 | struct extent_map_tree *em_tree = &inode->extent_tree; |
805 | u64 len = end - start + 1; | |
4c0c8cfc FM |
806 | |
807 | WARN_ON(end < start); | |
808 | if (end == (u64)-1) { | |
9c9d1b4f | 809 | if (start == 0 && !skip_pinned) { |
c2fbd812 | 810 | drop_all_extent_maps_fast(inode); |
9c9d1b4f FM |
811 | return; |
812 | } | |
4c0c8cfc | 813 | len = (u64)-1; |
db21370b FM |
814 | } else { |
815 | /* Make end offset exclusive for use in the loop below. */ | |
816 | end++; | |
4c0c8cfc | 817 | } |
db21370b FM |
818 | |
819 | /* | |
820 | * It's ok if we fail to allocate the extent maps, see the comment near | |
821 | * the bottom of the loop below. We only need two spare extent maps in | |
822 | * the worst case, where the first extent map that intersects our range | |
823 | * starts before the range and the last extent map that intersects our | |
824 | * range ends after our range (and they might be the same extent map), | |
825 | * because we need to split those two extent maps at the boundaries. | |
826 | */ | |
827 | split = alloc_extent_map(); | |
828 | split2 = alloc_extent_map(); | |
829 | ||
830 | write_lock(&em_tree->lock); | |
831 | em = lookup_extent_mapping(em_tree, start, len); | |
832 | ||
833 | while (em) { | |
834 | /* extent_map_end() returns exclusive value (last byte + 1). */ | |
835 | const u64 em_end = extent_map_end(em); | |
836 | struct extent_map *next_em = NULL; | |
4c0c8cfc FM |
837 | u64 gen; |
838 | unsigned long flags; | |
4c0c8cfc | 839 | bool modified; |
4c0c8cfc | 840 | |
db21370b FM |
841 | if (em_end < end) { |
842 | next_em = next_extent_map(em); | |
843 | if (next_em) { | |
844 | if (next_em->start < end) | |
845 | refcount_inc(&next_em->refs); | |
846 | else | |
847 | next_em = NULL; | |
848 | } | |
4c0c8cfc | 849 | } |
db21370b | 850 | |
f86f7a75 | 851 | if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) { |
f3109e33 | 852 | start = em_end; |
db21370b | 853 | goto next; |
4c0c8cfc | 854 | } |
db21370b | 855 | |
e4cc1483 | 856 | flags = em->flags; |
e4cc1483 FM |
857 | /* |
858 | * In case we split the extent map, we want to preserve the | |
859 | * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want | |
860 | * it on the new extent maps. | |
861 | */ | |
f86f7a75 | 862 | em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); |
4c0c8cfc | 863 | modified = !list_empty(&em->list); |
db21370b FM |
864 | |
865 | /* | |
866 | * The extent map does not cross our target range, so no need to | |
867 | * split it, we can remove it directly. | |
868 | */ | |
869 | if (em->start >= start && em_end <= end) | |
870 | goto remove_em; | |
871 | ||
db21370b | 872 | gen = em->generation; |
4c0c8cfc FM |
873 | |
874 | if (em->start < start) { | |
db21370b FM |
875 | if (!split) { |
876 | split = split2; | |
877 | split2 = NULL; | |
878 | if (!split) | |
879 | goto remove_em; | |
880 | } | |
4c0c8cfc FM |
881 | split->start = em->start; |
882 | split->len = start - em->start; | |
883 | ||
c77a8c61 | 884 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { |
3d2ac992 | 885 | split->disk_bytenr = em->disk_bytenr; |
e28b851e | 886 | split->disk_num_bytes = em->disk_num_bytes; |
3d2ac992 | 887 | split->offset = em->offset; |
4c0c8cfc FM |
888 | split->ram_bytes = em->ram_bytes; |
889 | } else { | |
3d2ac992 | 890 | split->disk_bytenr = em->disk_bytenr; |
e8fe524d | 891 | split->disk_num_bytes = 0; |
3d2ac992 | 892 | split->offset = 0; |
4c0c8cfc FM |
893 | split->ram_bytes = split->len; |
894 | } | |
895 | ||
896 | split->generation = gen; | |
897 | split->flags = flags; | |
6a3a9113 | 898 | replace_extent_mapping(inode, em, split, modified); |
4c0c8cfc FM |
899 | free_extent_map(split); |
900 | split = split2; | |
901 | split2 = NULL; | |
902 | } | |
db21370b FM |
903 | if (em_end > end) { |
904 | if (!split) { | |
905 | split = split2; | |
906 | split2 = NULL; | |
907 | if (!split) | |
908 | goto remove_em; | |
909 | } | |
c962098c JB |
910 | split->start = end; |
911 | split->len = em_end - end; | |
3d2ac992 | 912 | split->disk_bytenr = em->disk_bytenr; |
4c0c8cfc | 913 | split->flags = flags; |
4c0c8cfc FM |
914 | split->generation = gen; |
915 | ||
c77a8c61 | 916 | if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { |
e28b851e | 917 | split->disk_num_bytes = em->disk_num_bytes; |
3d2ac992 | 918 | split->offset = em->offset + end - em->start; |
4c0c8cfc | 919 | split->ram_bytes = em->ram_bytes; |
4c0c8cfc | 920 | } else { |
3d2ac992 QW |
921 | split->disk_num_bytes = 0; |
922 | split->offset = 0; | |
4c0c8cfc | 923 | split->ram_bytes = split->len; |
4c0c8cfc FM |
924 | } |
925 | ||
926 | if (extent_map_in_tree(em)) { | |
6a3a9113 | 927 | replace_extent_mapping(inode, em, split, modified); |
4c0c8cfc FM |
928 | } else { |
929 | int ret; | |
930 | ||
6c566def | 931 | ret = add_extent_mapping(inode, split, modified); |
4c0c8cfc FM |
932 | /* Logic error, shouldn't happen. */ |
933 | ASSERT(ret == 0); | |
934 | if (WARN_ON(ret != 0) && modified) | |
935 | btrfs_set_inode_full_sync(inode); | |
936 | } | |
937 | free_extent_map(split); | |
938 | split = NULL; | |
939 | } | |
db21370b | 940 | remove_em: |
4c0c8cfc FM |
941 | if (extent_map_in_tree(em)) { |
942 | /* | |
943 | * If the extent map is still in the tree it means that | |
944 | * either of the following is true: | |
945 | * | |
946 | * 1) It fits entirely in our range (doesn't end beyond | |
947 | * it or starts before it); | |
948 | * | |
949 | * 2) It starts before our range and/or ends after our | |
950 | * range, and we were not able to allocate the extent | |
951 | * maps for split operations, @split and @split2. | |
952 | * | |
953 | * If we are at case 2) then we just remove the entire | |
954 | * extent map - this is fine since if anyone needs it to | |
955 | * access the subranges outside our range, will just | |
956 | * load it again from the subvolume tree's file extent | |
957 | * item. However if the extent map was in the list of | |
958 | * modified extents, then we must mark the inode for a | |
959 | * full fsync, otherwise a fast fsync will miss this | |
960 | * extent if it's new and needs to be logged. | |
961 | */ | |
db21370b FM |
962 | if ((em->start < start || em_end > end) && modified) { |
963 | ASSERT(!split); | |
4c0c8cfc FM |
964 | btrfs_set_inode_full_sync(inode); |
965 | } | |
c2fbd812 | 966 | remove_extent_mapping(inode, em); |
4c0c8cfc | 967 | } |
4c0c8cfc | 968 | |
db21370b FM |
969 | /* |
970 | * Once for the tree reference (we replaced or removed the | |
971 | * extent map from the tree). | |
972 | */ | |
4c0c8cfc | 973 | free_extent_map(em); |
db21370b FM |
974 | next: |
975 | /* Once for us (for our lookup reference). */ | |
4c0c8cfc | 976 | free_extent_map(em); |
db21370b FM |
977 | |
978 | em = next_em; | |
4c0c8cfc FM |
979 | } |
980 | ||
db21370b FM |
981 | write_unlock(&em_tree->lock); |
982 | ||
4c0c8cfc FM |
983 | free_extent_map(split); |
984 | free_extent_map(split2); | |
985 | } | |
a1ba4c08 FM |
986 | |
987 | /* | |
988 | * Replace a range in the inode's extent map tree with a new extent map. | |
989 | * | |
990 | * @inode: The target inode. | |
991 | * @new_em: The new extent map to add to the inode's extent map tree. | |
992 | * @modified: Indicate if the new extent map should be added to the list of | |
993 | * modified extents (for fast fsync tracking). | |
994 | * | |
995 | * Drops all the extent maps in the inode's extent map tree that intersect the | |
996 | * range of the new extent map and adds the new extent map to the tree. | |
997 | * The caller should have locked an appropriate file range in the inode's io | |
998 | * tree before calling this function. | |
999 | */ | |
1000 | int btrfs_replace_extent_map_range(struct btrfs_inode *inode, | |
1001 | struct extent_map *new_em, | |
1002 | bool modified) | |
1003 | { | |
1004 | const u64 end = new_em->start + new_em->len - 1; | |
1005 | struct extent_map_tree *tree = &inode->extent_tree; | |
1006 | int ret; | |
1007 | ||
1008 | ASSERT(!extent_map_in_tree(new_em)); | |
1009 | ||
1010 | /* | |
1011 | * The caller has locked an appropriate file range in the inode's io | |
1012 | * tree, but getting -EEXIST when adding the new extent map can still | |
1013 | * happen in case there are extents that partially cover the range, and | |
1014 | * this is due to two tasks operating on different parts of the extent. | |
1015 | * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from | |
1016 | * btrfs_get_extent") for an example and details. | |
1017 | */ | |
1018 | do { | |
1019 | btrfs_drop_extent_map_range(inode, new_em->start, end, false); | |
1020 | write_lock(&tree->lock); | |
6c566def | 1021 | ret = add_extent_mapping(inode, new_em, modified); |
a1ba4c08 FM |
1022 | write_unlock(&tree->lock); |
1023 | } while (ret == -EEXIST); | |
1024 | ||
1025 | return ret; | |
1026 | } | |
a6f3e205 CH |
1027 | |
1028 | /* | |
f000bc6f CH |
1029 | * Split off the first pre bytes from the extent_map at [start, start + len], |
1030 | * and set the block_start for it to new_logical. | |
a6f3e205 CH |
1031 | * |
1032 | * This function is used when an ordered_extent needs to be split. | |
1033 | */ | |
f000bc6f CH |
1034 | int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, |
1035 | u64 new_logical) | |
a6f3e205 CH |
1036 | { |
1037 | struct extent_map_tree *em_tree = &inode->extent_tree; | |
1038 | struct extent_map *em; | |
1039 | struct extent_map *split_pre = NULL; | |
1040 | struct extent_map *split_mid = NULL; | |
1041 | int ret = 0; | |
1042 | unsigned long flags; | |
1043 | ||
1044 | ASSERT(pre != 0); | |
1045 | ASSERT(pre < len); | |
1046 | ||
1047 | split_pre = alloc_extent_map(); | |
1048 | if (!split_pre) | |
1049 | return -ENOMEM; | |
1050 | split_mid = alloc_extent_map(); | |
1051 | if (!split_mid) { | |
1052 | ret = -ENOMEM; | |
1053 | goto out_free_pre; | |
1054 | } | |
1055 | ||
1056 | lock_extent(&inode->io_tree, start, start + len - 1, NULL); | |
1057 | write_lock(&em_tree->lock); | |
1058 | em = lookup_extent_mapping(em_tree, start, len); | |
1059 | if (!em) { | |
1060 | ret = -EIO; | |
1061 | goto out_unlock; | |
1062 | } | |
1063 | ||
1064 | ASSERT(em->len == len); | |
f86f7a75 | 1065 | ASSERT(!extent_map_is_compressed(em)); |
c77a8c61 | 1066 | ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); |
f86f7a75 FM |
1067 | ASSERT(em->flags & EXTENT_FLAG_PINNED); |
1068 | ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); | |
a6f3e205 CH |
1069 | ASSERT(!list_empty(&em->list)); |
1070 | ||
1071 | flags = em->flags; | |
f86f7a75 | 1072 | em->flags &= ~EXTENT_FLAG_PINNED; |
a6f3e205 CH |
1073 | |
1074 | /* First, replace the em with a new extent_map starting from * em->start */ | |
1075 | split_pre->start = em->start; | |
1076 | split_pre->len = pre; | |
3d2ac992 QW |
1077 | split_pre->disk_bytenr = new_logical; |
1078 | split_pre->disk_num_bytes = split_pre->len; | |
1079 | split_pre->offset = 0; | |
a6f3e205 CH |
1080 | split_pre->ram_bytes = split_pre->len; |
1081 | split_pre->flags = flags; | |
a6f3e205 CH |
1082 | split_pre->generation = em->generation; |
1083 | ||
6a3a9113 | 1084 | replace_extent_mapping(inode, em, split_pre, 1); |
a6f3e205 CH |
1085 | |
1086 | /* | |
1087 | * Now we only have an extent_map at: | |
1088 | * [em->start, em->start + pre] | |
1089 | */ | |
1090 | ||
1091 | /* Insert the middle extent_map. */ | |
1092 | split_mid->start = em->start + pre; | |
1093 | split_mid->len = em->len - pre; | |
c77a8c61 | 1094 | split_mid->disk_bytenr = extent_map_block_start(em) + pre; |
3d2ac992 QW |
1095 | split_mid->disk_num_bytes = split_mid->len; |
1096 | split_mid->offset = 0; | |
a6f3e205 CH |
1097 | split_mid->ram_bytes = split_mid->len; |
1098 | split_mid->flags = flags; | |
a6f3e205 | 1099 | split_mid->generation = em->generation; |
6c566def | 1100 | add_extent_mapping(inode, split_mid, 1); |
a6f3e205 CH |
1101 | |
1102 | /* Once for us */ | |
1103 | free_extent_map(em); | |
1104 | /* Once for the tree */ | |
1105 | free_extent_map(em); | |
1106 | ||
1107 | out_unlock: | |
1108 | write_unlock(&em_tree->lock); | |
1109 | unlock_extent(&inode->io_tree, start, start + len - 1, NULL); | |
1110 | free_extent_map(split_mid); | |
1111 | out_free_pre: | |
1112 | free_extent_map(split_pre); | |
1113 | return ret; | |
1114 | } | |
956a17d9 | 1115 | |
/*
 * State of a single run of the extent map shrinker, set up by
 * btrfs_free_extent_maps() and passed down to the per root and per inode scan
 * functions.
 */
struct btrfs_em_shrink_ctx {
	/* Maximum number of extent maps to visit during this run. */
	long nr_to_scan;
	/* Number of extent maps visited so far, whether dropped or skipped. */
	long scanned;
	/*
	 * Number of the last inode that was scanned, the scan of a root resumes
	 * at the inode number following it. Reset to 0 when moving to the next
	 * root.
	 */
	u64 last_ino;
	/* ID of the root from which the next shrinker run should resume. */
	u64 last_root;
};
1122 | ||
1123 | static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) | |
956a17d9 FM |
1124 | { |
1125 | const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); | |
1126 | struct extent_map_tree *tree = &inode->extent_tree; | |
1127 | long nr_dropped = 0; | |
1128 | struct rb_node *node; | |
1129 | ||
1130 | /* | |
1131 | * Take the mmap lock so that we serialize with the inode logging phase | |
1132 | * of fsync because we may need to set the full sync flag on the inode, | |
1133 | * in case we have to remove extent maps in the tree's list of modified | |
1134 | * extents. If we set the full sync flag in the inode while an fsync is | |
1135 | * in progress, we may risk missing new extents because before the flag | |
1136 | * is set, fsync decides to only wait for writeback to complete and then | |
1137 | * during inode logging it sees the flag set and uses the subvolume tree | |
1138 | * to find new extents, which may not be there yet because ordered | |
1139 | * extents haven't completed yet. | |
1140 | * | |
1141 | * We also do a try lock because otherwise we could deadlock. This is | |
1142 | * because the shrinker for this filesystem may be invoked while we are | |
1143 | * in a path that is holding the mmap lock in write mode. For example in | |
1144 | * a reflink operation while COWing an extent buffer, when allocating | |
1145 | * pages for a new extent buffer and under memory pressure, the shrinker | |
1146 | * may be invoked, and therefore we would deadlock by attempting to read | |
1147 | * lock the mmap lock while we are holding already a write lock on it. | |
1148 | */ | |
1149 | if (!down_read_trylock(&inode->i_mmap_lock)) | |
1150 | return 0; | |
1151 | ||
b3ebb9b7 | 1152 | /* |
ae1e766f | 1153 | * We want to be fast so if the lock is busy we don't want to spend time |
b3ebb9b7 FM |
1154 | * waiting for it - either some task is about to do IO for the inode or |
1155 | * we may have another task shrinking extent maps, here in this code, so | |
1156 | * skip this inode. | |
1157 | */ | |
1158 | if (!write_trylock(&tree->lock)) { | |
1159 | up_read(&inode->i_mmap_lock); | |
1160 | return 0; | |
1161 | } | |
1162 | ||
4e660ca3 | 1163 | node = rb_first(&tree->root); |
956a17d9 | 1164 | while (node) { |
4e660ca3 | 1165 | struct rb_node *next = rb_next(node); |
956a17d9 FM |
1166 | struct extent_map *em; |
1167 | ||
1168 | em = rb_entry(node, struct extent_map, rb_node); | |
44849405 | 1169 | ctx->scanned++; |
956a17d9 FM |
1170 | |
1171 | if (em->flags & EXTENT_FLAG_PINNED) | |
1172 | goto next; | |
1173 | ||
1174 | /* | |
1175 | * If the inode is in the list of modified extents (new) and its | |
1176 | * generation is the same (or is greater than) the current fs | |
1177 | * generation, it means it was not yet persisted so we have to | |
1178 | * set the full sync flag so that the next fsync will not miss | |
1179 | * it. | |
1180 | */ | |
1181 | if (!list_empty(&em->list) && em->generation >= cur_fs_gen) | |
1182 | btrfs_set_inode_full_sync(inode); | |
1183 | ||
1184 | remove_extent_mapping(inode, em); | |
0d89a15e | 1185 | trace_btrfs_extent_map_shrinker_remove_em(inode, em); |
956a17d9 FM |
1186 | /* Drop the reference for the tree. */ |
1187 | free_extent_map(em); | |
1188 | nr_dropped++; | |
1189 | next: | |
44849405 | 1190 | if (ctx->scanned >= ctx->nr_to_scan) |
956a17d9 FM |
1191 | break; |
1192 | ||
1193 | /* | |
b3ebb9b7 FM |
1194 | * Stop if we need to reschedule or there's contention on the |
1195 | * lock. This is to avoid slowing other tasks trying to take the | |
ae1e766f | 1196 | * lock. |
956a17d9 | 1197 | */ |
b3ebb9b7 FM |
1198 | if (need_resched() || rwlock_needbreak(&tree->lock)) |
1199 | break; | |
a1b547f0 | 1200 | node = next; |
956a17d9 FM |
1201 | } |
1202 | write_unlock(&tree->lock); | |
1203 | up_read(&inode->i_mmap_lock); | |
1204 | ||
1205 | return nr_dropped; | |
1206 | } | |
1207 | ||
44849405 | 1208 | static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) |
956a17d9 | 1209 | { |
956a17d9 FM |
1210 | struct btrfs_inode *inode; |
1211 | long nr_dropped = 0; | |
44849405 | 1212 | u64 min_ino = ctx->last_ino + 1; |
956a17d9 FM |
1213 | |
1214 | inode = btrfs_find_first_inode(root, min_ino); | |
1215 | while (inode) { | |
44849405 | 1216 | nr_dropped += btrfs_scan_inode(inode, ctx); |
956a17d9 FM |
1217 | |
1218 | min_ino = btrfs_ino(inode) + 1; | |
44849405 | 1219 | ctx->last_ino = btrfs_ino(inode); |
68a3ebd1 | 1220 | btrfs_add_delayed_iput(inode); |
956a17d9 | 1221 | |
44849405 | 1222 | if (ctx->scanned >= ctx->nr_to_scan) |
956a17d9 FM |
1223 | break; |
1224 | ||
ae1e766f | 1225 | cond_resched(); |
b3ebb9b7 | 1226 | |
956a17d9 FM |
1227 | inode = btrfs_find_first_inode(root, min_ino); |
1228 | } | |
1229 | ||
1230 | if (inode) { | |
1231 | /* | |
1232 | * There are still inodes in this root or we happened to process | |
1233 | * the last one and reached the scan limit. In either case set | |
1234 | * the current root to this one, so we'll resume from the next | |
1235 | * inode if there is one or we will find out this was the last | |
1236 | * one and move to the next root. | |
1237 | */ | |
44849405 | 1238 | ctx->last_root = btrfs_root_id(root); |
956a17d9 FM |
1239 | } else { |
1240 | /* | |
1241 | * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so | |
1242 | * that when processing the next root we start from its first inode. | |
1243 | */ | |
44849405 FM |
1244 | ctx->last_ino = 0; |
1245 | ctx->last_root = btrfs_root_id(root) + 1; | |
956a17d9 FM |
1246 | } |
1247 | ||
1248 | return nr_dropped; | |
1249 | } | |
1250 | ||
1251 | long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) | |
1252 | { | |
44849405 FM |
1253 | struct btrfs_em_shrink_ctx ctx; |
1254 | u64 start_root_id; | |
1255 | u64 next_root_id; | |
956a17d9 FM |
1256 | bool cycled = false; |
1257 | long nr_dropped = 0; | |
44849405 FM |
1258 | |
1259 | ctx.scanned = 0; | |
1260 | ctx.nr_to_scan = nr_to_scan; | |
1261 | ||
1262 | /* | |
1263 | * In case we have multiple tasks running this shrinker, make the next | |
1264 | * one start from the next inode in case it starts before we finish. | |
1265 | */ | |
1266 | spin_lock(&fs_info->extent_map_shrinker_lock); | |
1267 | ctx.last_ino = fs_info->extent_map_shrinker_last_ino; | |
1268 | fs_info->extent_map_shrinker_last_ino++; | |
1269 | ctx.last_root = fs_info->extent_map_shrinker_last_root; | |
1270 | spin_unlock(&fs_info->extent_map_shrinker_lock); | |
1271 | ||
1272 | start_root_id = ctx.last_root; | |
1273 | next_root_id = ctx.last_root; | |
956a17d9 | 1274 | |
0d89a15e FM |
1275 | if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { |
1276 | s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); | |
1277 | ||
44849405 FM |
1278 | trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, |
1279 | nr, ctx.last_root, | |
1280 | ctx.last_ino); | |
0d89a15e FM |
1281 | } |
1282 | ||
ae1e766f | 1283 | while (ctx.scanned < ctx.nr_to_scan) { |
956a17d9 FM |
1284 | struct btrfs_root *root; |
1285 | unsigned long count; | |
1286 | ||
ae1e766f FM |
1287 | cond_resched(); |
1288 | ||
956a17d9 FM |
1289 | spin_lock(&fs_info->fs_roots_radix_lock); |
1290 | count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, | |
1291 | (void **)&root, | |
1292 | (unsigned long)next_root_id, 1); | |
1293 | if (count == 0) { | |
1294 | spin_unlock(&fs_info->fs_roots_radix_lock); | |
1295 | if (start_root_id > 0 && !cycled) { | |
1296 | next_root_id = 0; | |
44849405 FM |
1297 | ctx.last_root = 0; |
1298 | ctx.last_ino = 0; | |
956a17d9 FM |
1299 | cycled = true; |
1300 | continue; | |
1301 | } | |
1302 | break; | |
1303 | } | |
1304 | next_root_id = btrfs_root_id(root) + 1; | |
1305 | root = btrfs_grab_root(root); | |
1306 | spin_unlock(&fs_info->fs_roots_radix_lock); | |
1307 | ||
1308 | if (!root) | |
1309 | continue; | |
1310 | ||
1311 | if (is_fstree(btrfs_root_id(root))) | |
44849405 | 1312 | nr_dropped += btrfs_scan_root(root, &ctx); |
956a17d9 FM |
1313 | |
1314 | btrfs_put_root(root); | |
1315 | } | |
1316 | ||
44849405 FM |
1317 | /* |
1318 | * In case of multiple tasks running this extent map shrinking code this | |
1319 | * isn't perfect but it's simple and silences things like KCSAN. It's | |
1320 | * not possible to know which task made more progress because we can | |
1321 | * cycle back to the first root and first inode if it's not the first | |
1322 | * time the shrinker ran, see the above logic. Also a task that started | |
1323 | * later may finish ealier than another task and made less progress. So | |
1324 | * make this simple and update to the progress of the last task that | |
1325 | * finished, with the occasional possiblity of having two consecutive | |
1326 | * runs of the shrinker process the same inodes. | |
1327 | */ | |
1328 | spin_lock(&fs_info->extent_map_shrinker_lock); | |
1329 | fs_info->extent_map_shrinker_last_ino = ctx.last_ino; | |
1330 | fs_info->extent_map_shrinker_last_root = ctx.last_root; | |
1331 | spin_unlock(&fs_info->extent_map_shrinker_lock); | |
1332 | ||
0d89a15e FM |
1333 | if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { |
1334 | s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); | |
1335 | ||
44849405 FM |
1336 | trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, |
1337 | nr, ctx.last_root, | |
1338 | ctx.last_ino); | |
0d89a15e FM |
1339 | } |
1340 | ||
956a17d9 FM |
1341 | return nr_dropped; |
1342 | } |