Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
c1d7c514 | 2 | |
d1310b2e | 3 | #include <linux/err.h> |
d1310b2e | 4 | #include <linux/slab.h> |
a52d9a80 | 5 | #include <linux/spinlock.h> |
9b569ea0 | 6 | #include "messages.h" |
261507a0 | 7 | #include "ctree.h" |
1c11b63e | 8 | #include "volumes.h" |
a52d9a80 | 9 | #include "extent_map.h" |
ebb8765b | 10 | #include "compression.h" |
4c0c8cfc | 11 | #include "btrfs_inode.h" |
a52d9a80 | 12 | |
86479a04 | 13 | |
a52d9a80 | 14 | static struct kmem_cache *extent_map_cache; |
ca664626 | 15 | |
2f4cbe64 | 16 | int __init extent_map_init(void) |
a52d9a80 | 17 | { |
837e1972 | 18 | extent_map_cache = kmem_cache_create("btrfs_extent_map", |
9601e3f6 | 19 | sizeof(struct extent_map), 0, |
fba4b697 | 20 | SLAB_MEM_SPREAD, NULL); |
2f4cbe64 WB |
21 | if (!extent_map_cache) |
22 | return -ENOMEM; | |
2f4cbe64 | 23 | return 0; |
a52d9a80 CM |
24 | } |
25 | ||
/* Destroy the extent_map slab cache on module unload. */
void __cold extent_map_exit(void)
{
	kmem_cache_destroy(extent_map_cache);
}
30 | ||
43dd529a DS |
31 | /* |
32 | * Initialize the extent tree @tree. Should be called for each new inode or | |
33 | * other user of the extent_map interface. | |
9d2423c5 | 34 | */ |
a8067e02 | 35 | void extent_map_tree_init(struct extent_map_tree *tree) |
a52d9a80 | 36 | { |
07e1ce09 | 37 | tree->map = RB_ROOT_CACHED; |
5dc562c5 | 38 | INIT_LIST_HEAD(&tree->modified_extents); |
890871be | 39 | rwlock_init(&tree->lock); |
a52d9a80 | 40 | } |
a52d9a80 | 41 | |
43dd529a DS |
42 | /* |
43 | * Allocate a new extent_map structure. The new structure is returned with a | |
44 | * reference count of one and needs to be freed using free_extent_map() | |
9d2423c5 | 45 | */ |
172ddd60 | 46 | struct extent_map *alloc_extent_map(void) |
a52d9a80 CM |
47 | { |
48 | struct extent_map *em; | |
70c8a91c | 49 | em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); |
c26a9203 TI |
50 | if (!em) |
51 | return NULL; | |
cbc0e928 | 52 | RB_CLEAR_NODE(&em->rb_node); |
261507a0 | 53 | em->compress_type = BTRFS_COMPRESS_NONE; |
490b54d6 | 54 | refcount_set(&em->refs, 1); |
5dc562c5 | 55 | INIT_LIST_HEAD(&em->list); |
a52d9a80 CM |
56 | return em; |
57 | } | |
a52d9a80 | 58 | |
43dd529a DS |
59 | /* |
60 | * Drop the reference out on @em by one and free the structure if the reference | |
61 | * count hits zero. | |
9d2423c5 | 62 | */ |
a52d9a80 CM |
63 | void free_extent_map(struct extent_map *em) |
64 | { | |
2bf5a725 CM |
65 | if (!em) |
66 | return; | |
490b54d6 | 67 | if (refcount_dec_and_test(&em->refs)) { |
cbc0e928 | 68 | WARN_ON(extent_map_in_tree(em)); |
5dc562c5 | 69 | WARN_ON(!list_empty(&em->list)); |
298a8f9c | 70 | if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) |
95617d69 | 71 | kfree(em->map_lookup); |
a52d9a80 CM |
72 | kmem_cache_free(extent_map_cache, em); |
73 | } | |
74 | } | |
a52d9a80 | 75 | |
43dd529a | 76 | /* Do the math around the end of an extent, handling wrapping. */ |
32193c14 FDBM |
77 | static u64 range_end(u64 start, u64 len) |
78 | { | |
79 | if (start + len < start) | |
80 | return (u64)-1; | |
81 | return start + len; | |
82 | } | |
83 | ||
/*
 * Insert @em into the cached rbtree @root, keyed by em->start.
 *
 * Returns 0 on success, or -EEXIST if @em overlaps any extent map already
 * in the tree: either one found during the descent, or a neighbor on
 * either side of the would-be insertion point.
 */
static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_map *entry = NULL;
	struct rb_node *orig_parent = NULL;
	u64 end = range_end(em->start, em->len);
	bool leftmost = true;

	/* Standard rbtree descent; any entry containing em->start is overlap. */
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct extent_map, rb_node);

		if (em->start < entry->start) {
			p = &(*p)->rb_left;
		} else if (em->start >= extent_map_end(entry)) {
			p = &(*p)->rb_right;
			/* Went right at least once: not the new leftmost node. */
			leftmost = false;
		} else {
			return -EEXIST;
		}
	}

	/* Walk forward to the first entry ending beyond em->start and check it. */
	orig_parent = parent;
	while (parent && em->start >= extent_map_end(entry)) {
		parent = rb_next(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	/* Walk backward to the first entry starting at/before em->start. */
	parent = orig_parent;
	entry = rb_entry(parent, struct extent_map, rb_node);
	while (parent && em->start < entry->start) {
		parent = rb_prev(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	rb_link_node(&em->rb_node, orig_parent, p);
	rb_insert_color_cached(&em->rb_node, root, leftmost);
	return 0;
}
130 | ||
d352ac68 | 131 | /* |
43dd529a DS |
132 | * Search through the tree for an extent_map with a given offset. If it can't |
133 | * be found, try to find some neighboring extents | |
d352ac68 | 134 | */ |
/*
 * Search @root for an extent map containing @offset.
 *
 * Returns the matching rb_node if one exists, leaving *prev_or_next_ret
 * untouched.  Otherwise returns NULL and stores in *prev_or_next_ret the
 * nearest neighbor: the next extent map after @offset if there is one,
 * else the closest previous one (may be NULL for an empty tree).
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_or_next_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct extent_map *entry;
	struct extent_map *prev_entry = NULL;

	ASSERT(prev_or_next_ret);

	/* Descend, remembering the last node visited as a starting neighbor. */
	while (n) {
		entry = rb_entry(n, struct extent_map, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset >= extent_map_end(entry))
			n = n->rb_right;
		else
			return n;
	}

	/* Advance to the first entry ending after @offset (the "next" one). */
	orig_prev = prev;
	while (prev && offset >= extent_map_end(prev_entry)) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}

	/*
	 * Previous extent map found, return as in this case the caller does not
	 * care about the next one.
	 */
	if (prev) {
		*prev_or_next_ret = prev;
		return NULL;
	}

	/* No entry after @offset; fall back to the closest previous one. */
	prev = orig_prev;
	prev_entry = rb_entry(prev, struct extent_map, rb_node);
	while (prev && offset < prev_entry->start) {
		prev = rb_prev(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}
	*prev_or_next_ret = prev;

	return NULL;
}
184 | ||
43dd529a | 185 | /* Check to see if two extent_map structs are adjacent and safe to merge. */ |
/*
 * Decide whether @prev and @next can be merged into one extent map.
 *
 * Returns 1 only when they are contiguous in the file, share flags and
 * map_lookup, and are both holes, both inline, or physically contiguous
 * on disk.  Pinned, compressed, logging or modified-list extents never
 * merge.
 */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
		return 0;

	/*
	 * don't merge compressed extents, we need to know their
	 * actual size
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
		return 0;

	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
		return 0;

	/*
	 * We don't want to merge stuff that hasn't been written to the log yet
	 * since it may not reflect exactly what is on disk, and that would be
	 * bad.
	 */
	if (!list_empty(&prev->list) || !list_empty(&next->list))
		return 0;

	/* Delalloc markers must never reach a tree where merging happens. */
	ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
	       prev->block_start != EXTENT_MAP_DELALLOC);

	/* map_lookup is only valid for chunk (FS_MAPPING) extent maps. */
	if (prev->map_lookup || next->map_lookup)
		ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
		       test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));

	if (extent_map_end(prev) == next->start &&
	    prev->flags == next->flags &&
	    prev->map_lookup == next->map_lookup &&
	    ((next->block_start == EXTENT_MAP_HOLE &&
	      prev->block_start == EXTENT_MAP_HOLE) ||
	     (next->block_start == EXTENT_MAP_INLINE &&
	      prev->block_start == EXTENT_MAP_INLINE) ||
	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
	      next->block_start == extent_map_block_end(prev)))) {
		return 1;
	}
	return 0;
}
230 | ||
/*
 * Try to merge @em with its previous and next neighbors in @tree.  A
 * merged neighbor is unlinked from the tree and its reference dropped,
 * while @em absorbs its range, mod range and generation.  Callers must
 * hold the tree's write lock.
 */
static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
	struct extent_map *merge = NULL;
	struct rb_node *rb;

	/*
	 * We can't modify an extent map that is in the tree and that is being
	 * used by another task, as it can cause that other task to see it in
	 * inconsistent state during the merging. We always have 1 reference for
	 * the tree and 1 for this task (which is unpinning the extent map or
	 * clearing the logging flag), so anything > 2 means it's being used by
	 * other tasks too.
	 */
	if (refcount_read(&em->refs) > 2)
		return;

	/* Merge with the previous extent map, extending @em backwards. */
	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			merge = rb_entry(rb, struct extent_map, rb_node);
		if (rb && mergable_maps(merge, em)) {
			em->start = merge->start;
			em->orig_start = merge->orig_start;
			em->len += merge->len;
			em->block_len += merge->block_len;
			em->block_start = merge->block_start;
			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
			em->mod_start = merge->mod_start;
			em->generation = max(em->generation, merge->generation);
			set_bit(EXTENT_FLAG_MERGED, &em->flags);

			rb_erase_cached(&merge->rb_node, &tree->map);
			RB_CLEAR_NODE(&merge->rb_node);
			free_extent_map(merge);
		}
	}

	/* Merge with the next extent map, extending @em forwards. */
	rb = rb_next(&em->rb_node);
	if (rb)
		merge = rb_entry(rb, struct extent_map, rb_node);
	if (rb && mergable_maps(em, merge)) {
		em->len += merge->len;
		em->block_len += merge->block_len;
		rb_erase_cached(&merge->rb_node, &tree->map);
		RB_CLEAR_NODE(&merge->rb_node);
		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
		em->generation = max(em->generation, merge->generation);
		set_bit(EXTENT_FLAG_MERGED, &em->flags);
		free_extent_map(merge);
	}
}
282 | ||
43dd529a DS |
283 | /* |
284 | * Unpin an extent from the cache. | |
285 | * | |
5dc562c5 JB |
286 | * @tree: tree to unpin the extent in |
287 | * @start: logical offset in the file | |
288 | * @len: length of the extent | |
289 | * @gen: generation that this extent has been modified in | |
5dc562c5 JB |
290 | * |
291 | * Called after an extent has been written to disk properly. Set the generation | |
292 | * to the generation that actually added the file item to the inode so we know | |
293 | * we need to sync this extent when we call fsync(). | |
294 | */ | |
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
		       u64 gen)
{
	/* NOTE(review): ret is never set after init, so this always returns 0. */
	int ret = 0;
	struct extent_map *em;
	bool prealloc = false;

	write_lock(&tree->lock);
	em = lookup_extent_mapping(tree, start, len);

	/* The caller is expected to unpin an extent it previously pinned. */
	WARN_ON(!em || em->start != start);

	if (!em)
		goto out;

	em->generation = gen;
	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
		prealloc = true;
		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
	}

	try_merge_map(tree, em);

	/*
	 * For a prealloc (FILLING) extent, reset the mod range to cover the
	 * whole (possibly merged) extent map.
	 */
	if (prealloc) {
		em->mod_start = em->start;
		em->mod_len = em->len;
	}

	/* Drop the reference taken by lookup_extent_mapping(). */
	free_extent_map(em);
out:
	write_unlock(&tree->lock);
	return ret;
}
333 | ||
/*
 * Clear the LOGGING bit on @em and retry the merge that mergable_maps()
 * refused while the extent was being logged.  Caller must hold the
 * tree's write lock.
 */
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
	if (extent_map_in_tree(em))
		try_merge_map(tree, em);
}
342 | ||
176840b3 FM |
/*
 * Finish inserting @em: take the tree's reference, reset the mod range,
 * and either queue it on the modified list (to be logged later) or try
 * to merge it with its neighbors.
 */
static inline void setup_extent_mapping(struct extent_map_tree *tree,
					struct extent_map *em,
					int modified)
{
	refcount_inc(&em->refs);
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (modified)
		list_move(&em->list, &tree->modified_extents);
	else
		try_merge_map(tree, em);
}
356 | ||
1c11b63e JM |
/*
 * For a chunk extent map, set @bits in each stripe device's alloc_state
 * tree over the stripe's physical range (stripe size taken from
 * em->orig_block_len).
 */
static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
{
	struct map_lookup *map = em->map_lookup;
	u64 stripe_size = em->orig_block_len;
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		set_extent_bits_nowait(&device->alloc_state, stripe->physical,
				       stripe->physical + stripe_size - 1, bits);
	}
}
371 | ||
/*
 * Counterpart of extent_map_device_set_bits(): clear @bits from each
 * stripe device's alloc_state tree over the stripe's physical range.
 */
static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
{
	struct map_lookup *map = em->map_lookup;
	u64 stripe_size = em->orig_block_len;
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		__clear_extent_bit(&device->alloc_state, stripe->physical,
				   stripe->physical + stripe_size - 1, bits,
				   NULL, GFP_NOWAIT, NULL);
	}
}
387 | ||
43dd529a | 388 | /* |
401bd2dd NB |
389 | * Add new extent map to the extent tree |
390 | * | |
9d2423c5 CH |
391 | * @tree: tree to insert new map in |
392 | * @em: map to insert | |
401bd2dd NB |
393 | * @modified: indicate whether the given @em should be added to the |
394 | * modified list, which indicates the extent needs to be logged | |
9d2423c5 CH |
395 | * |
396 | * Insert @em into @tree or perform a simple forward/backward merge with | |
397 | * existing mappings. The extent_map struct passed in will be inserted | |
398 | * into the tree directly, with an additional reference taken, or a | |
25985edc | 399 | * reference dropped if the merge attempt was successful. |
a52d9a80 CM |
400 | */ |
int add_extent_mapping(struct extent_map_tree *tree,
		       struct extent_map *em, int modified)
{
	int ret = 0;

	lockdep_assert_held_write(&tree->lock);

	/* tree_insert() returns -EEXIST on any overlap with an existing map. */
	ret = tree_insert(&tree->map, em);
	if (ret)
		goto out;

	setup_extent_mapping(tree, em, modified);
	/* Chunk mappings also track allocated/trimmed state per device. */
	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
		extent_map_device_set_bits(em, CHUNK_ALLOCATED);
		extent_map_device_clear_bits(em, CHUNK_TRIMMED);
	}
out:
	return ret;
}
a52d9a80 | 420 | |
48a3b636 ES |
/*
 * Common lookup helper.  Find an extent map intersecting [start, start+len)
 * and return it with an extra reference, or NULL.
 *
 * With @strict set, only a true intersection is returned; otherwise the
 * nearest neighbor found by __tree_search() is acceptable too.
 */
static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
			u64 start, u64 len, int strict)
{
	struct extent_map *em;
	struct rb_node *rb_node;
	struct rb_node *prev_or_next = NULL;
	u64 end = range_end(start, len);

	rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
	if (!rb_node) {
		/* No exact hit; fall back to the nearest neighbor, if any. */
		if (prev_or_next)
			rb_node = prev_or_next;
		else
			return NULL;
	}

	em = rb_entry(rb_node, struct extent_map, rb_node);

	if (strict && !(end > em->start && start < extent_map_end(em)))
		return NULL;

	/* Reference for the caller; dropped via free_extent_map(). */
	refcount_inc(&em->refs);
	return em;
}
a52d9a80 | 446 | |
43dd529a DS |
447 | /* |
448 | * Lookup extent_map that intersects @start + @len range. | |
449 | * | |
ed64f066 LZ |
450 | * @tree: tree to lookup in |
451 | * @start: byte offset to start the search | |
452 | * @len: length of the lookup range | |
453 | * | |
454 | * Find and return the first extent_map struct in @tree that intersects the | |
455 | * [start, len] range. There may be additional objects in the tree that | |
456 | * intersect, so check the object returned carefully to make sure that no | |
457 | * additional lookups are needed. | |
458 | */ | |
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	/* Strict lookup: only return a map that actually intersects the range. */
	return __lookup_extent_mapping(tree, start, len, 1);
}
464 | ||
43dd529a DS |
465 | /* |
466 | * Find a nearby extent map intersecting @start + @len (not an exact search). | |
467 | * | |
b917b7c3 CM |
468 | * @tree: tree to lookup in |
469 | * @start: byte offset to start the search | |
470 | * @len: length of the lookup range | |
471 | * | |
472 | * Find and return the first extent_map struct in @tree that intersects the | |
473 | * [start, len] range. | |
474 | * | |
475 | * If one can't be found, any nearby extent may be returned | |
476 | */ | |
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	/* Non-strict lookup: a nearby, non-intersecting map may be returned. */
	return __lookup_extent_mapping(tree, start, len, 0);
}
482 | ||
43dd529a DS |
483 | /* |
484 | * Remove an extent_map from the extent tree. | |
485 | * | |
9d2423c5 | 486 | * @tree: extent tree to remove from |
bb7ab3b9 | 487 | * @em: extent map being removed |
9d2423c5 | 488 | * |
43dd529a DS |
489 | * Remove @em from @tree. No reference counts are dropped, and no checks |
490 | * are done to see if the range is in use. | |
a52d9a80 | 491 | */ |
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	/* Pinned extents must be unpinned before removal. */
	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
	rb_erase_cached(&em->rb_node, &tree->map);
	/* Extents being logged stay on the modified list until logging ends. */
	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
		list_del_init(&em->list);
	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
		extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
	/* Mark the node unlinked so extent_map_in_tree() returns false. */
	RB_CLEAR_NODE(&em->rb_node);
}
176840b3 FM |
504 | |
/*
 * Replace @cur with @new at the same position in @tree, then run the
 * usual post-insert setup (refcount, mod range, merge or modified list)
 * on @new.  Caller must hold the tree's write lock; @cur's reference is
 * not dropped here.
 */
void replace_extent_mapping(struct extent_map_tree *tree,
			    struct extent_map *cur,
			    struct extent_map *new,
			    int modified)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
	ASSERT(extent_map_in_tree(cur));
	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
		list_del_init(&cur->list);
	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
	RB_CLEAR_NODE(&cur->rb_node);

	setup_extent_mapping(tree, new, modified);
}
c04e61b5 | 521 | |
d47704bd | 522 | static struct extent_map *next_extent_map(const struct extent_map *em) |
c04e61b5 LB |
523 | { |
524 | struct rb_node *next; | |
525 | ||
526 | next = rb_next(&em->rb_node); | |
527 | if (!next) | |
528 | return NULL; | |
529 | return container_of(next, struct extent_map, rb_node); | |
530 | } | |
531 | ||
532 | static struct extent_map *prev_extent_map(struct extent_map *em) | |
533 | { | |
534 | struct rb_node *prev; | |
535 | ||
536 | prev = rb_prev(&em->rb_node); | |
537 | if (!prev) | |
538 | return NULL; | |
539 | return container_of(prev, struct extent_map, rb_node); | |
540 | } | |
541 | ||
52042d8e AG |
542 | /* |
543 | * Helper for btrfs_get_extent. Given an existing extent in the tree, | |
c04e61b5 LB |
544 | * the existing extent is the nearest extent to map_start, |
545 | * and an extent that you want to insert, deal with overlap and insert | |
546 | * the best fitted new extent into the tree. | |
547 | */ | |
5f4791f4 LB |
static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
					 struct extent_map *existing,
					 struct extent_map *em,
					 u64 map_start)
{
	struct extent_map *prev;
	struct extent_map *next;
	u64 start;
	u64 end;
	u64 start_diff;

	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));

	/* Pick the neighbors of the gap around map_start. */
	if (existing->start > map_start) {
		next = existing;
		prev = prev_extent_map(next);
	} else {
		prev = existing;
		next = next_extent_map(prev);
	}

	/* Clamp [start, end) to the gap between prev and next, within em. */
	start = prev ? extent_map_end(prev) : em->start;
	start = max_t(u64, start, em->start);
	end = next ? next->start : extent_map_end(em);
	end = min_t(u64, end, extent_map_end(em));
	start_diff = start - em->start;
	em->start = start;
	em->len = end - start;
	/* Real (on-disk, non-compressed) extents shift their block range too. */
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += start_diff;
		em->block_len = em->len;
	}
	return add_extent_mapping(em_tree, em, 0);
}
583 | ||
43dd529a DS |
584 | /* |
585 | * Add extent mapping into em_tree. | |
9ad37bb3 NB |
586 | * |
587 | * @fs_info: the filesystem | |
588 | * @em_tree: extent tree into which we want to insert the extent mapping | |
589 | * @em_in: extent we are inserting | |
590 | * @start: start of the logical range btrfs_get_extent() is requesting | |
591 | * @len: length of the logical range btrfs_get_extent() is requesting | |
c04e61b5 LB |
592 | * |
593 | * Note that @em_in's range may be different from [start, start+len), | |
594 | * but they must be overlapped. | |
595 | * | |
596 | * Insert @em_in into @em_tree. In case there is an overlapping range, handle | |
597 | * the -EEXIST by either: | |
598 | * a) Returning the existing extent in @em_in if @start is within the | |
599 | * existing em. | |
600 | * b) Merge the existing extent with @em_in passed in. | |
601 | * | |
602 | * Return 0 on success, otherwise -EEXIST. | |
603 | * | |
604 | */ | |
f46b24c9 DS |
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
			     struct extent_map_tree *em_tree,
			     struct extent_map **em_in, u64 start, u64 len)
{
	int ret;
	struct extent_map *em = *em_in;

	/*
	 * Tree-checker should have rejected any inline extent with non-zero
	 * file offset. Here just do a sanity check.
	 */
	if (em->block_start == EXTENT_MAP_INLINE)
		ASSERT(em->start == 0);

	ret = add_extent_mapping(em_tree, em, 0);
	/*
	 * It is possible that someone inserted the extent into the tree while
	 * we had the lock dropped.  It is also possible that an overlapping
	 * map exists in the tree.
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		ret = 0;

		/* Non-strict search: must find the map that caused -EEXIST. */
		existing = search_extent_mapping(em_tree, start, len);

		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

		/*
		 * existing will always be non-NULL, since there must be
		 * extent causing the -EEXIST.
		 */
		if (start >= existing->start &&
		    start < extent_map_end(existing)) {
			/* @start falls inside existing: hand that one back. */
			free_extent_map(em);
			*em_in = existing;
			ret = 0;
		} else {
			u64 orig_start = em->start;
			u64 orig_len = em->len;

			/*
			 * The existing extent map is the one nearest to
			 * the [start, start + len) range which overlaps
			 */
			ret = merge_extent_mapping(em_tree, existing,
						   em, start);
			if (ret) {
				free_extent_map(em);
				*em_in = NULL;
				WARN_ONCE(ret,
"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
					  ret, existing->start, existing->len,
					  orig_start, orig_len);
			}
			/* Drop the reference taken by search_extent_mapping(). */
			free_extent_map(existing);
		}
	}

	ASSERT(ret == 0 || ret == -EEXIST);
	return ret;
}
4c0c8cfc | 667 | |
9c9d1b4f FM |
668 | /* |
669 | * Drop all extent maps from a tree in the fastest possible way, rescheduling | |
670 | * if needed. This avoids searching the tree, from the root down to the first | |
671 | * extent map, before each deletion. | |
672 | */ | |
673 | static void drop_all_extent_maps_fast(struct extent_map_tree *tree) | |
674 | { | |
675 | write_lock(&tree->lock); | |
676 | while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { | |
677 | struct extent_map *em; | |
678 | struct rb_node *node; | |
679 | ||
680 | node = rb_first_cached(&tree->map); | |
681 | em = rb_entry(node, struct extent_map, rb_node); | |
682 | clear_bit(EXTENT_FLAG_PINNED, &em->flags); | |
683 | clear_bit(EXTENT_FLAG_LOGGING, &em->flags); | |
684 | remove_extent_mapping(tree, em); | |
685 | free_extent_map(em); | |
686 | cond_resched_rwlock_write(&tree->lock); | |
687 | } | |
688 | write_unlock(&tree->lock); | |
689 | } | |
690 | ||
4c0c8cfc FM |
691 | /* |
692 | * Drop all extent maps in a given range. | |
693 | * | |
694 | * @inode: The target inode. | |
695 | * @start: Start offset of the range. | |
696 | * @end: End offset of the range (inclusive value). | |
697 | * @skip_pinned: Indicate if pinned extent maps should be ignored or not. | |
698 | * | |
699 | * This drops all the extent maps that intersect the given range [@start, @end]. | |
700 | * Extent maps that partially overlap the range and extend behind or beyond it, | |
701 | * are split. | |
702 | * The caller should have locked an appropriate file range in the inode's io | |
703 | * tree before calling this function. | |
704 | */ | |
705 | void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, | |
706 | bool skip_pinned) | |
707 | { | |
db21370b FM |
708 | struct extent_map *split; |
709 | struct extent_map *split2; | |
710 | struct extent_map *em; | |
4c0c8cfc FM |
711 | struct extent_map_tree *em_tree = &inode->extent_tree; |
712 | u64 len = end - start + 1; | |
4c0c8cfc FM |
713 | |
714 | WARN_ON(end < start); | |
715 | if (end == (u64)-1) { | |
9c9d1b4f FM |
716 | if (start == 0 && !skip_pinned) { |
717 | drop_all_extent_maps_fast(em_tree); | |
718 | return; | |
719 | } | |
4c0c8cfc | 720 | len = (u64)-1; |
db21370b FM |
721 | } else { |
722 | /* Make end offset exclusive for use in the loop below. */ | |
723 | end++; | |
4c0c8cfc | 724 | } |
db21370b FM |
725 | |
726 | /* | |
727 | * It's ok if we fail to allocate the extent maps, see the comment near | |
728 | * the bottom of the loop below. We only need two spare extent maps in | |
729 | * the worst case, where the first extent map that intersects our range | |
730 | * starts before the range and the last extent map that intersects our | |
731 | * range ends after our range (and they might be the same extent map), | |
732 | * because we need to split those two extent maps at the boundaries. | |
733 | */ | |
734 | split = alloc_extent_map(); | |
735 | split2 = alloc_extent_map(); | |
736 | ||
737 | write_lock(&em_tree->lock); | |
738 | em = lookup_extent_mapping(em_tree, start, len); | |
739 | ||
740 | while (em) { | |
741 | /* extent_map_end() returns exclusive value (last byte + 1). */ | |
742 | const u64 em_end = extent_map_end(em); | |
743 | struct extent_map *next_em = NULL; | |
4c0c8cfc FM |
744 | u64 gen; |
745 | unsigned long flags; | |
4c0c8cfc FM |
746 | bool modified; |
747 | bool compressed; | |
748 | ||
db21370b FM |
749 | if (em_end < end) { |
750 | next_em = next_extent_map(em); | |
751 | if (next_em) { | |
752 | if (next_em->start < end) | |
753 | refcount_inc(&next_em->refs); | |
754 | else | |
755 | next_em = NULL; | |
756 | } | |
4c0c8cfc | 757 | } |
db21370b | 758 | |
4c0c8cfc | 759 | if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { |
f3109e33 | 760 | start = em_end; |
db21370b | 761 | if (end != (u64)-1) |
f3109e33 | 762 | len = start + len - em_end; |
db21370b | 763 | goto next; |
4c0c8cfc | 764 | } |
db21370b | 765 | |
e4cc1483 | 766 | flags = em->flags; |
4c0c8cfc | 767 | clear_bit(EXTENT_FLAG_PINNED, &em->flags); |
e4cc1483 FM |
768 | /* |
769 | * In case we split the extent map, we want to preserve the | |
770 | * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want | |
771 | * it on the new extent maps. | |
772 | */ | |
4c0c8cfc FM |
773 | clear_bit(EXTENT_FLAG_LOGGING, &flags); |
774 | modified = !list_empty(&em->list); | |
db21370b FM |
775 | |
776 | /* | |
777 | * The extent map does not cross our target range, so no need to | |
778 | * split it, we can remove it directly. | |
779 | */ | |
780 | if (em->start >= start && em_end <= end) | |
781 | goto remove_em; | |
782 | ||
db21370b FM |
783 | gen = em->generation; |
784 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | |
4c0c8cfc FM |
785 | |
786 | if (em->start < start) { | |
db21370b FM |
787 | if (!split) { |
788 | split = split2; | |
789 | split2 = NULL; | |
790 | if (!split) | |
791 | goto remove_em; | |
792 | } | |
4c0c8cfc FM |
793 | split->start = em->start; |
794 | split->len = start - em->start; | |
795 | ||
796 | if (em->block_start < EXTENT_MAP_LAST_BYTE) { | |
797 | split->orig_start = em->orig_start; | |
798 | split->block_start = em->block_start; | |
799 | ||
800 | if (compressed) | |
801 | split->block_len = em->block_len; | |
802 | else | |
803 | split->block_len = split->len; | |
804 | split->orig_block_len = max(split->block_len, | |
805 | em->orig_block_len); | |
806 | split->ram_bytes = em->ram_bytes; | |
807 | } else { | |
808 | split->orig_start = split->start; | |
809 | split->block_len = 0; | |
810 | split->block_start = em->block_start; | |
811 | split->orig_block_len = 0; | |
812 | split->ram_bytes = split->len; | |
813 | } | |
814 | ||
815 | split->generation = gen; | |
816 | split->flags = flags; | |
817 | split->compress_type = em->compress_type; | |
818 | replace_extent_mapping(em_tree, em, split, modified); | |
819 | free_extent_map(split); | |
820 | split = split2; | |
821 | split2 = NULL; | |
822 | } | |
db21370b FM |
823 | if (em_end > end) { |
824 | if (!split) { | |
825 | split = split2; | |
826 | split2 = NULL; | |
827 | if (!split) | |
828 | goto remove_em; | |
829 | } | |
4c0c8cfc | 830 | split->start = start + len; |
f3109e33 | 831 | split->len = em_end - (start + len); |
4c0c8cfc FM |
832 | split->block_start = em->block_start; |
833 | split->flags = flags; | |
834 | split->compress_type = em->compress_type; | |
835 | split->generation = gen; | |
836 | ||
837 | if (em->block_start < EXTENT_MAP_LAST_BYTE) { | |
838 | split->orig_block_len = max(em->block_len, | |
839 | em->orig_block_len); | |
840 | ||
841 | split->ram_bytes = em->ram_bytes; | |
842 | if (compressed) { | |
843 | split->block_len = em->block_len; | |
844 | split->orig_start = em->orig_start; | |
845 | } else { | |
846 | const u64 diff = start + len - em->start; | |
847 | ||
848 | split->block_len = split->len; | |
849 | split->block_start += diff; | |
850 | split->orig_start = em->orig_start; | |
851 | } | |
852 | } else { | |
853 | split->ram_bytes = split->len; | |
854 | split->orig_start = split->start; | |
855 | split->block_len = 0; | |
856 | split->orig_block_len = 0; | |
857 | } | |
858 | ||
859 | if (extent_map_in_tree(em)) { | |
860 | replace_extent_mapping(em_tree, em, split, | |
861 | modified); | |
862 | } else { | |
863 | int ret; | |
864 | ||
865 | ret = add_extent_mapping(em_tree, split, | |
866 | modified); | |
867 | /* Logic error, shouldn't happen. */ | |
868 | ASSERT(ret == 0); | |
869 | if (WARN_ON(ret != 0) && modified) | |
870 | btrfs_set_inode_full_sync(inode); | |
871 | } | |
872 | free_extent_map(split); | |
873 | split = NULL; | |
874 | } | |
db21370b | 875 | remove_em: |
4c0c8cfc FM |
876 | if (extent_map_in_tree(em)) { |
877 | /* | |
878 | * If the extent map is still in the tree it means that | |
879 | * either of the following is true: | |
880 | * | |
881 | * 1) It fits entirely in our range (doesn't end beyond | |
882 | * it or starts before it); | |
883 | * | |
884 | * 2) It starts before our range and/or ends after our | |
885 | * range, and we were not able to allocate the extent | |
886 | * maps for split operations, @split and @split2. | |
887 | * | |
888 | * If we are at case 2) then we just remove the entire | |
889 | * extent map - this is fine since if anyone needs it to | |
890 | * access the subranges outside our range, will just | |
891 | * load it again from the subvolume tree's file extent | |
892 | * item. However if the extent map was in the list of | |
893 | * modified extents, then we must mark the inode for a | |
894 | * full fsync, otherwise a fast fsync will miss this | |
895 | * extent if it's new and needs to be logged. | |
896 | */ | |
db21370b FM |
897 | if ((em->start < start || em_end > end) && modified) { |
898 | ASSERT(!split); | |
4c0c8cfc FM |
899 | btrfs_set_inode_full_sync(inode); |
900 | } | |
901 | remove_extent_mapping(em_tree, em); | |
902 | } | |
4c0c8cfc | 903 | |
db21370b FM |
904 | /* |
905 | * Once for the tree reference (we replaced or removed the | |
906 | * extent map from the tree). | |
907 | */ | |
4c0c8cfc | 908 | free_extent_map(em); |
db21370b FM |
909 | next: |
910 | /* Once for us (for our lookup reference). */ | |
4c0c8cfc | 911 | free_extent_map(em); |
db21370b FM |
912 | |
913 | em = next_em; | |
4c0c8cfc FM |
914 | } |
915 | ||
db21370b FM |
916 | write_unlock(&em_tree->lock); |
917 | ||
4c0c8cfc FM |
918 | free_extent_map(split); |
919 | free_extent_map(split2); | |
920 | } | |
a1ba4c08 FM |
921 | |
922 | /* | |
923 | * Replace a range in the inode's extent map tree with a new extent map. | |
924 | * | |
925 | * @inode: The target inode. | |
926 | * @new_em: The new extent map to add to the inode's extent map tree. | |
927 | * @modified: Indicate if the new extent map should be added to the list of | |
928 | * modified extents (for fast fsync tracking). | |
929 | * | |
930 | * Drops all the extent maps in the inode's extent map tree that intersect the | |
931 | * range of the new extent map and adds the new extent map to the tree. | |
932 | * The caller should have locked an appropriate file range in the inode's io | |
933 | * tree before calling this function. | |
934 | */ | |
935 | int btrfs_replace_extent_map_range(struct btrfs_inode *inode, | |
936 | struct extent_map *new_em, | |
937 | bool modified) | |
938 | { | |
939 | const u64 end = new_em->start + new_em->len - 1; | |
940 | struct extent_map_tree *tree = &inode->extent_tree; | |
941 | int ret; | |
942 | ||
943 | ASSERT(!extent_map_in_tree(new_em)); | |
944 | ||
945 | /* | |
946 | * The caller has locked an appropriate file range in the inode's io | |
947 | * tree, but getting -EEXIST when adding the new extent map can still | |
948 | * happen in case there are extents that partially cover the range, and | |
949 | * this is due to two tasks operating on different parts of the extent. | |
950 | * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from | |
951 | * btrfs_get_extent") for an example and details. | |
952 | */ | |
953 | do { | |
954 | btrfs_drop_extent_map_range(inode, new_em->start, end, false); | |
955 | write_lock(&tree->lock); | |
956 | ret = add_extent_mapping(tree, new_em, modified); | |
957 | write_unlock(&tree->lock); | |
958 | } while (ret == -EEXIST); | |
959 | ||
960 | return ret; | |
961 | } |