// SPDX-License-Identifier: GPL-2.0

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_map_cache;

int __init extent_map_init(void)
{
	extent_map_cache = kmem_cache_create("btrfs_extent_map",
					     sizeof(struct extent_map), 0,
					     SLAB_MEM_SPREAD, NULL);
	if (!extent_map_cache)
		return -ENOMEM;
	return 0;
}

void __cold extent_map_exit(void)
{
	kmem_cache_destroy(extent_map_cache);
}

/**
 * extent_map_tree_init - initialize extent map tree
 * @tree:	tree to initialize
 *
 * Initialize the extent tree @tree.  Should be called for each new inode
 * or other user of the extent_map interface.
 */
void extent_map_tree_init(struct extent_map_tree *tree)
{
	tree->map = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&tree->modified_extents);
	rwlock_init(&tree->lock);
}

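/*
 * Usage sketch (illustrative, not part of the original file): the tree is
 * typically embedded in a larger structure, e.g. struct btrfs_inode, and
 * initialized once when that structure is set up:
 *
 *	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
 *
 * All later lookups and insertions must hold tree->lock in read or write mode.
 */
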
/**
 * alloc_extent_map - allocate new extent map structure
 *
 * Allocate a new extent_map structure.  The new structure is
 * returned with a reference count of one and needs to be
 * freed using free_extent_map()
 */
struct extent_map *alloc_extent_map(void)
{
	struct extent_map *em;
	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
	if (!em)
		return NULL;
	RB_CLEAR_NODE(&em->rb_node);
	em->compress_type = BTRFS_COMPRESS_NONE;
	refcount_set(&em->refs, 1);
	INIT_LIST_HEAD(&em->list);
	return em;
}

/**
 * free_extent_map - drop reference count of an extent_map
 * @em:	extent map being released
 *
 * Drop the reference count of @em by one and free the structure
 * if the reference count hits zero.
 */
void free_extent_map(struct extent_map *em)
{
	if (!em)
		return;
	if (refcount_dec_and_test(&em->refs)) {
		WARN_ON(extent_map_in_tree(em));
		WARN_ON(!list_empty(&em->list));
		if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
			kfree(em->map_lookup);
		kmem_cache_free(extent_map_cache, em);
	}
}

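/*
 * Reference counting sketch (illustrative): an extent map starts with one
 * reference owned by the allocator, and add_extent_mapping() takes an extra
 * reference for the tree (see setup_extent_mapping() below), so a typical
 * insertion path drops its own reference once the map is in the tree:
 *
 *	em = alloc_extent_map();			// refs == 1, ours
 *	if (em) {
 *		write_lock(&tree->lock);
 *		ret = add_extent_mapping(tree, em, 0);	// refs == 2 on success
 *		write_unlock(&tree->lock);
 *		free_extent_map(em);	// drop ours, the tree keeps its own
 *	}
 */
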
/* simple helper to do math around the end of an extent, handling wrap */
static u64 range_end(u64 start, u64 len)
{
	if (start + len < start)
		return (u64)-1;
	return start + len;
}

static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_map *entry = NULL;
	struct rb_node *orig_parent = NULL;
	u64 end = range_end(em->start, em->len);
	bool leftmost = true;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct extent_map, rb_node);

		if (em->start < entry->start) {
			p = &(*p)->rb_left;
		} else if (em->start >= extent_map_end(entry)) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return -EEXIST;
		}
	}

	orig_parent = parent;
	while (parent && em->start >= extent_map_end(entry)) {
		parent = rb_next(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	parent = orig_parent;
	entry = rb_entry(parent, struct extent_map, rb_node);
	while (parent && em->start < entry->start) {
		parent = rb_prev(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	rb_link_node(&em->rb_node, orig_parent, p);
	rb_insert_color_cached(&em->rb_node, root, leftmost);
	return 0;
}

/*
 * search through the tree for an extent_map with a given offset.  If
 * it can't be found, try to find some neighboring extents
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_or_next_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct extent_map *entry;
	struct extent_map *prev_entry = NULL;

	ASSERT(prev_or_next_ret);

	while (n) {
		entry = rb_entry(n, struct extent_map, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset >= extent_map_end(entry))
			n = n->rb_right;
		else
			return n;
	}

	orig_prev = prev;
	while (prev && offset >= extent_map_end(prev_entry)) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}

	/*
	 * Previous extent map found, return as in this case the caller does not
	 * care about the next one.
	 */
	if (prev) {
		*prev_or_next_ret = prev;
		return NULL;
	}

	prev = orig_prev;
	prev_entry = rb_entry(prev, struct extent_map, rb_node);
	while (prev && offset < prev_entry->start) {
		prev = rb_prev(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}
	*prev_or_next_ret = prev;

	return NULL;
}

/* check to see if two extent_map structs are adjacent and safe to merge */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
		return 0;

	/*
	 * don't merge compressed extents, we need to know their
	 * actual size
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
		return 0;

	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
		return 0;

	/*
	 * We don't want to merge stuff that hasn't been written to the log yet
	 * since it may not reflect exactly what is on disk, and that would be
	 * bad.
	 */
	if (!list_empty(&prev->list) || !list_empty(&next->list))
		return 0;

	ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
	       prev->block_start != EXTENT_MAP_DELALLOC);

	if (prev->map_lookup || next->map_lookup)
		ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
		       test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));

	if (extent_map_end(prev) == next->start &&
	    prev->flags == next->flags &&
	    prev->map_lookup == next->map_lookup &&
	    ((next->block_start == EXTENT_MAP_HOLE &&
	      prev->block_start == EXTENT_MAP_HOLE) ||
	     (next->block_start == EXTENT_MAP_INLINE &&
	      prev->block_start == EXTENT_MAP_INLINE) ||
	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
	      next->block_start == extent_map_block_end(prev)))) {
		return 1;
	}
	return 0;
}

static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
	struct extent_map *merge = NULL;
	struct rb_node *rb;

	/*
	 * We can't modify an extent map that is in the tree and that is being
	 * used by another task, as it can cause that other task to see it in
	 * inconsistent state during the merging. We always have 1 reference for
	 * the tree and 1 for this task (which is unpinning the extent map or
	 * clearing the logging flag), so anything > 2 means it's being used by
	 * other tasks too.
	 */
	if (refcount_read(&em->refs) > 2)
		return;

	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			merge = rb_entry(rb, struct extent_map, rb_node);
		if (rb && mergable_maps(merge, em)) {
			em->start = merge->start;
			em->orig_start = merge->orig_start;
			em->len += merge->len;
			em->block_len += merge->block_len;
			em->block_start = merge->block_start;
			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
			em->mod_start = merge->mod_start;
			em->generation = max(em->generation, merge->generation);
			set_bit(EXTENT_FLAG_MERGED, &em->flags);

			rb_erase_cached(&merge->rb_node, &tree->map);
			RB_CLEAR_NODE(&merge->rb_node);
			free_extent_map(merge);
		}
	}

	rb = rb_next(&em->rb_node);
	if (rb)
		merge = rb_entry(rb, struct extent_map, rb_node);
	if (rb && mergable_maps(em, merge)) {
		em->len += merge->len;
		em->block_len += merge->block_len;
		rb_erase_cached(&merge->rb_node, &tree->map);
		RB_CLEAR_NODE(&merge->rb_node);
		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
		em->generation = max(em->generation, merge->generation);
		set_bit(EXTENT_FLAG_MERGED, &em->flags);
		free_extent_map(merge);
	}
}

/**
 * unpin_extent_cache - unpin an extent from the cache
 * @tree:	tree to unpin the extent in
 * @start:	logical offset in the file
 * @len:	length of the extent
 * @gen:	generation that this extent has been modified in
 *
 * Called after an extent has been written to disk properly.  Set the generation
 * to the generation that actually added the file item to the inode so we know
 * we need to sync this extent when we call fsync().
 */
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
		       u64 gen)
{
	int ret = 0;
	struct extent_map *em;
	bool prealloc = false;

	write_lock(&tree->lock);
	em = lookup_extent_mapping(tree, start, len);

	WARN_ON(!em || em->start != start);

	if (!em)
		goto out;

	em->generation = gen;
	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
		prealloc = true;
		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
	}

	try_merge_map(tree, em);

	if (prealloc) {
		em->mod_start = em->start;
		em->mod_len = em->len;
	}

	free_extent_map(em);
out:
	write_unlock(&tree->lock);
	return ret;
}

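/*
 * Caller sketch (hedged, modelled on ordered extent completion): an extent map
 * is created with EXTENT_FLAG_PINNED while its data is in flight and unpinned
 * once the file extent item is in the subvolume tree, along the lines of:
 *
 *	unpin_extent_cache(&inode->extent_tree, ordered->file_offset,
 *			   ordered->num_bytes, trans->transid);
 *
 * The field names above are indicative, not verbatim from a specific caller.
 */
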
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
	if (extent_map_in_tree(em))
		try_merge_map(tree, em);
}

static inline void setup_extent_mapping(struct extent_map_tree *tree,
					struct extent_map *em,
					int modified)
{
	refcount_inc(&em->refs);
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (modified)
		list_move(&em->list, &tree->modified_extents);
	else
		try_merge_map(tree, em);
}

static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
{
	struct map_lookup *map = em->map_lookup;
	u64 stripe_size = em->orig_block_len;
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		set_extent_bits_nowait(&device->alloc_state, stripe->physical,
				       stripe->physical + stripe_size - 1, bits);
	}
}

static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
{
	struct map_lookup *map = em->map_lookup;
	u64 stripe_size = em->orig_block_len;
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		__clear_extent_bit(&device->alloc_state, stripe->physical,
				   stripe->physical + stripe_size - 1, bits,
				   NULL, GFP_NOWAIT, NULL);
	}
}

/**
 * Add new extent map to the extent tree
 *
 * @tree:	tree to insert new map in
 * @em:		map to insert
 * @modified:	indicate whether the given @em should be added to the
 *		modified list, which indicates the extent needs to be logged
 *
 * Insert @em into @tree or perform a simple forward/backward merge with
 * existing mappings.  The extent_map struct passed in will be inserted
 * into the tree directly, with an additional reference taken, or a
 * reference dropped if the merge attempt was successful.
 */
int add_extent_mapping(struct extent_map_tree *tree,
		       struct extent_map *em, int modified)
{
	int ret = 0;

	lockdep_assert_held_write(&tree->lock);

	ret = tree_insert(&tree->map, em);
	if (ret)
		goto out;

	setup_extent_mapping(tree, em, modified);
	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
		extent_map_device_set_bits(em, CHUNK_ALLOCATED);
		extent_map_device_clear_bits(em, CHUNK_TRIMMED);
	}
out:
	return ret;
}

static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
			u64 start, u64 len, int strict)
{
	struct extent_map *em;
	struct rb_node *rb_node;
	struct rb_node *prev_or_next = NULL;
	u64 end = range_end(start, len);

	rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
	if (!rb_node) {
		if (prev_or_next)
			rb_node = prev_or_next;
		else
			return NULL;
	}

	em = rb_entry(rb_node, struct extent_map, rb_node);

	if (strict && !(end > em->start && start < extent_map_end(em)))
		return NULL;

	refcount_inc(&em->refs);
	return em;
}

/**
 * lookup_extent_mapping - lookup extent_map
 * @tree:	tree to lookup in
 * @start:	byte offset to start the search
 * @len:	length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, start + len) range.  There may be additional objects in the tree
 * that intersect, so check the object returned carefully to make sure that no
 * additional lookups are needed.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 1);
}

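/*
 * Usage sketch: lookups take the tree lock in read mode and the caller must
 * drop the returned reference when done:
 *
 *	read_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	read_unlock(&tree->lock);
 *	if (em) {
 *		... inspect em->start, em->len, em->block_start ...
 *		free_extent_map(em);
 *	}
 */
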
/**
 * search_extent_mapping - find a nearby extent map
 * @tree:	tree to lookup in
 * @start:	byte offset to start the search
 * @len:	length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, start + len) range.
 *
 * If one can't be found, any nearby extent may be returned
 */
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 0);
}

/**
 * remove_extent_mapping - removes an extent_map from the extent tree
 * @tree:	extent tree to remove from
 * @em:		extent map being removed
 *
 * Removes @em from @tree.  No reference counts are dropped, and no checks
 * are done to see if the range is in use
 */
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
	rb_erase_cached(&em->rb_node, &tree->map);
	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
		list_del_init(&em->list);
	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
		extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
	RB_CLEAR_NODE(&em->rb_node);
}

void replace_extent_mapping(struct extent_map_tree *tree,
			    struct extent_map *cur,
			    struct extent_map *new,
			    int modified)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
	ASSERT(extent_map_in_tree(cur));
	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
		list_del_init(&cur->list);
	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
	RB_CLEAR_NODE(&cur->rb_node);

	setup_extent_mapping(tree, new, modified);
}

static struct extent_map *next_extent_map(const struct extent_map *em)
{
	struct rb_node *next;

	next = rb_next(&em->rb_node);
	if (!next)
		return NULL;
	return container_of(next, struct extent_map, rb_node);
}

/*
 * Get the extent map that immediately follows another one.
 *
 * @tree:	The extent map tree that the extent map belongs to.
 *		Holding read or write access on the tree's lock is required.
 * @em:		An extent map from the given tree. The caller must ensure that
 *		between getting @em and calling this function, the extent map
 *		@em is not removed from the tree - for example, by holding the
 *		tree's lock for the duration of those 2 operations.
 *
 * Returns the extent map that immediately follows @em, or NULL if @em is the
 * last extent map in the tree.
 */
struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree,
					 const struct extent_map *em)
{
	struct extent_map *next;

	/* The lock must be acquired either in read mode or write mode. */
	lockdep_assert_held(&tree->lock);
	ASSERT(extent_map_in_tree(em));

	next = next_extent_map(em);
	if (next)
		refcount_inc(&next->refs);

	return next;
}

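/*
 * Iteration sketch: walking extent maps forward under the read lock, dropping
 * the reference on each map after processing it (real callers also check that
 * the next map still intersects the range of interest):
 *
 *	read_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	while (em) {
 *		struct extent_map *next = btrfs_next_extent_map(tree, em);
 *
 *		... process em ...
 *		free_extent_map(em);
 *		em = next;
 *	}
 *	read_unlock(&tree->lock);
 */
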
static struct extent_map *prev_extent_map(struct extent_map *em)
{
	struct rb_node *prev;

	prev = rb_prev(&em->rb_node);
	if (!prev)
		return NULL;
	return container_of(prev, struct extent_map, rb_node);
}

/*
 * Helper for btrfs_get_extent.  Given an existing extent in the tree (the
 * nearest one to map_start) and a new extent to insert, deal with overlap
 * and insert the best-fitting new extent into the tree.
 */
static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
					 struct extent_map *existing,
					 struct extent_map *em,
					 u64 map_start)
{
	struct extent_map *prev;
	struct extent_map *next;
	u64 start;
	u64 end;
	u64 start_diff;

	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));

	if (existing->start > map_start) {
		next = existing;
		prev = prev_extent_map(next);
	} else {
		prev = existing;
		next = next_extent_map(prev);
	}

	start = prev ? extent_map_end(prev) : em->start;
	start = max_t(u64, start, em->start);
	end = next ? next->start : extent_map_end(em);
	end = min_t(u64, end, extent_map_end(em));
	start_diff = start - em->start;
	em->start = start;
	em->len = end - start;
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += start_diff;
		em->block_len = em->len;
	}
	return add_extent_mapping(em_tree, em, 0);
}

/**
 * Add extent mapping into em_tree
 *
 * @fs_info:	the filesystem
 * @em_tree:	extent tree into which we want to insert the extent mapping
 * @em_in:	extent we are inserting
 * @start:	start of the logical range btrfs_get_extent() is requesting
 * @len:	length of the logical range btrfs_get_extent() is requesting
 *
 * Note that @em_in's range may be different from [start, start + len),
 * but they must overlap.
 *
 * Insert @em_in into @em_tree. In case there is an overlapping range, handle
 * the -EEXIST by either:
 * a) Returning the existing extent in @em_in if @start is within the
 *    existing em.
 * b) Merging the existing extent with @em_in.
 *
 * Return 0 on success, otherwise -EEXIST.
 */
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
			     struct extent_map_tree *em_tree,
			     struct extent_map **em_in, u64 start, u64 len)
{
	int ret;
	struct extent_map *em = *em_in;

	ret = add_extent_mapping(em_tree, em, 0);
	/*
	 * It is possible that someone inserted the extent into the tree
	 * while we had the lock dropped.  It is also possible that
	 * an overlapping map exists in the tree.
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		ret = 0;

		existing = search_extent_mapping(em_tree, start, len);

		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

		/*
		 * existing will always be non-NULL, since there must be an
		 * extent causing the -EEXIST.
		 */
		if (start >= existing->start &&
		    start < extent_map_end(existing)) {
			free_extent_map(em);
			*em_in = existing;
			ret = 0;
		} else {
			u64 orig_start = em->start;
			u64 orig_len = em->len;

			/*
			 * The existing extent map is the one nearest to
			 * the [start, start + len) range which overlaps.
			 */
			ret = merge_extent_mapping(em_tree, existing,
						   em, start);
			if (ret) {
				free_extent_map(em);
				*em_in = NULL;
				WARN_ONCE(ret,
"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
					  ret, existing->start, existing->len,
					  orig_start, orig_len);
			}
			free_extent_map(existing);
		}
	}

	ASSERT(ret == 0 || ret == -EEXIST);
	return ret;
}

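/*
 * Caller sketch (modelled on btrfs_get_extent()): the tree's write lock is
 * held across the call and, on success, *em_in points either at the map that
 * was inserted or at the pre-existing map covering @start:
 *
 *	write_lock(&em_tree->lock);
 *	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 *	write_unlock(&em_tree->lock);
 *	// on success, drop em with free_extent_map() once done with it
 */
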
/*
 * Drop all extent maps from a tree in the fastest possible way, rescheduling
 * if needed. This avoids searching the tree, from the root down to the first
 * extent map, before each deletion.
 */
static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
{
	write_lock(&tree->lock);
	while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
		struct extent_map *em;
		struct rb_node *node;

		node = rb_first_cached(&tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
		remove_extent_mapping(tree, em);
		free_extent_map(em);
		cond_resched_rwlock_write(&tree->lock);
	}
	write_unlock(&tree->lock);
}

/*
 * Drop all extent maps in a given range.
 *
 * @inode:	The target inode.
 * @start:	Start offset of the range.
 * @end:	End offset of the range (inclusive value).
 * @skip_pinned:	Indicate if pinned extent maps should be ignored or not.
 *
 * This drops all the extent maps that intersect the given range [@start, @end].
 * Extent maps that partially overlap the range and extend behind or beyond it,
 * are split.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
				 bool skip_pinned)
{
	struct extent_map *split;
	struct extent_map *split2;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		if (start == 0 && !skip_pinned) {
			drop_all_extent_maps_fast(em_tree);
			return;
		}
		len = (u64)-1;
	} else {
		/* Make end offset exclusive for use in the loop below. */
		end++;
	}

	/*
	 * It's ok if we fail to allocate the extent maps, see the comment near
	 * the bottom of the loop below. We only need two spare extent maps in
	 * the worst case, where the first extent map that intersects our range
	 * starts before the range and the last extent map that intersects our
	 * range ends after our range (and they might be the same extent map),
	 * because we need to split those two extent maps at the boundaries.
	 */
	split = alloc_extent_map();
	split2 = alloc_extent_map();

	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);

	while (em) {
		/* extent_map_end() returns exclusive value (last byte + 1). */
		const u64 em_end = extent_map_end(em);
		struct extent_map *next_em = NULL;
		u64 gen;
		unsigned long flags;
		bool modified;
		bool compressed;

		if (em_end < end) {
			next_em = next_extent_map(em);
			if (next_em) {
				if (next_em->start < end)
					refcount_inc(&next_em->refs);
				else
					next_em = NULL;
			}
		}

		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			start = em_end;
			if (end != (u64)-1)
				len = start + len - em_end;
			goto next;
		}

		/*
		 * Snapshot the flags before clearing bits: the copy in @flags
		 * (with EXTENT_FLAG_LOGGING cleared) is what the split extent
		 * maps inherit below.
		 */
		flags = em->flags;
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
		modified = !list_empty(&em->list);

		/*
		 * The extent map does not cross our target range, so no need to
		 * split it, we can remove it directly.
		 */
		if (em->start >= start && em_end <= end)
			goto remove_em;

		gen = em->generation;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		if (em->start < start) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = em->start;
			split->len = start - em->start;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
							    em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			split->compress_type = em->compress_type;
			replace_extent_mapping(em_tree, em, split, modified);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em_end > end) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = start + len;
			split->len = em_end - (start + len);
			split->block_start = em->block_start;
			split->flags = flags;
			split->compress_type = em->compress_type;
			split->generation = gen;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
							    em->orig_block_len);

				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->orig_start = em->orig_start;
				} else {
					const u64 diff = start + len - em->start;

					split->block_len = split->len;
					split->block_start += diff;
					split->orig_start = em->orig_start;
				}
			} else {
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->orig_block_len = 0;
			}

			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				int ret;

				ret = add_extent_mapping(em_tree, split,
							 modified);
				/* Logic error, shouldn't happen. */
				ASSERT(ret == 0);
				if (WARN_ON(ret != 0) && modified)
					btrfs_set_inode_full_sync(inode);
			}
			free_extent_map(split);
			split = NULL;
		}
remove_em:
		if (extent_map_in_tree(em)) {
			/*
			 * If the extent map is still in the tree it means that
			 * either of the following is true:
			 *
			 * 1) It fits entirely in our range (doesn't end beyond
			 *    it or starts before it);
			 *
			 * 2) It starts before our range and/or ends after our
			 *    range, and we were not able to allocate the extent
			 *    maps for split operations, @split and @split2.
			 *
			 * If we are at case 2) then we just remove the entire
			 * extent map - this is fine since if anyone needs to
			 * access the subranges outside our range, they will
			 * just load it again from the subvolume tree's file
			 * extent item. However if the extent map was in the
			 * list of modified extents, then we must mark the inode
			 * for a full fsync, otherwise a fast fsync will miss
			 * this extent if it's new and needs to be logged.
			 */
			if ((em->start < start || em_end > end) && modified) {
				ASSERT(!split);
				btrfs_set_inode_full_sync(inode);
			}
			remove_extent_mapping(em_tree, em);
		}

		/*
		 * Once for the tree reference (we replaced or removed the
		 * extent map from the tree).
		 */
		free_extent_map(em);
next:
		/* Once for us (for our lookup reference). */
		free_extent_map(em);

		em = next_em;
	}

	write_unlock(&em_tree->lock);

	free_extent_map(split);
	free_extent_map(split2);
}
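
/*
 * Usage sketch: callers are expected to hold the affected file range locked in
 * the inode's io tree, e.g. when punching a hole:
 *
 *	lock_extent(&inode->io_tree, start, end, &cached_state);
 *	btrfs_drop_extent_map_range(inode, start, end, false);
 *	...
 *	unlock_extent(&inode->io_tree, start, end, &cached_state);
 */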

/*
 * Replace a range in the inode's extent map tree with a new extent map.
 *
 * @inode:	The target inode.
 * @new_em:	The new extent map to add to the inode's extent map tree.
 * @modified:	Indicate if the new extent map should be added to the list of
 *		modified extents (for fast fsync tracking).
 *
 * Drops all the extent maps in the inode's extent map tree that intersect the
 * range of the new extent map and adds the new extent map to the tree.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
				   struct extent_map *new_em,
				   bool modified)
{
	const u64 end = new_em->start + new_em->len - 1;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret;

	ASSERT(!extent_map_in_tree(new_em));

	/*
	 * The caller has locked an appropriate file range in the inode's io
	 * tree, but getting -EEXIST when adding the new extent map can still
	 * happen in case there are extents that partially cover the range, and
	 * this is due to two tasks operating on different parts of the extent.
	 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
	 * btrfs_get_extent") for an example and details.
	 */
	do {
		btrfs_drop_extent_map_range(inode, new_em->start, end, false);
		write_lock(&tree->lock);
		ret = add_extent_mapping(tree, new_em, modified);
		write_unlock(&tree->lock);
	} while (ret == -EEXIST);

	return ret;
}