btrfs: migrate btrfs_trans_release_chunk_metadata
[linux-block.git] / fs / btrfs / extent-tree.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);

		/*
		 * If not empty, someone is still holding mutex of
		 * full_stripe_lock, which can only be released by caller.
		 * And it will definitely cause use-after-free when caller
		 * tries to release full stripe lock.
		 *
		 * No better way to resolve, but only to warn.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

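/*
 * Example usage (comment added for clarity; the wrappers appear further down
 * in this file):
 *
 *     bg = block_group_cache_tree_search(info, bytenr, 0);
 *             block group at or after bytenr (btrfs_lookup_first_block_group)
 *     bg = block_group_cache_tree_search(info, bytenr, 1);
 *             block group containing bytenr (btrfs_lookup_block_group)
 *
 * Either call returns NULL if nothing matches.  On success the caller holds a
 * reference taken with btrfs_get_block_group() and must drop it with
 * btrfs_put_block_group().
 */
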
static int add_excluded_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

static void free_excluded_extents(struct btrfs_block_group_cache *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

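/*
 * exclude_super_stripes - account the superblock copies that live inside this
 * block group.  The stripes backing the primary superblock and its mirrors
 * (btrfs_sb_offset(0 .. BTRFS_SUPER_MIRROR_MAX - 1)) must never be handed out
 * by the allocator, so the overlapping ranges are added to the excluded
 * extents and charged to cache->bytes_super.
 * (Descriptive comment added for clarity; not part of the original source.)
 */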
3c4da657 204static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
817d52f8 205{
3c4da657 206 struct btrfs_fs_info *fs_info = cache->fs_info;
817d52f8
JB
207 u64 bytenr;
208 u64 *logical;
209 int stripe_len;
210 int i, nr, ret;
211
06b2331f
YZ
212 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
213 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
214 cache->bytes_super += stripe_len;
2ff7e61e 215 ret = add_excluded_extent(fs_info, cache->key.objectid,
06b2331f 216 stripe_len);
835d974f
JB
217 if (ret)
218 return ret;
06b2331f
YZ
219 }
220
817d52f8
JB
221 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
222 bytenr = btrfs_sb_offset(i);
0b246afa 223 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
63a9c7b9 224 bytenr, &logical, &nr, &stripe_len);
835d974f
JB
225 if (ret)
226 return ret;
11833d66 227
817d52f8 228 while (nr--) {
51bf5f0b
JB
229 u64 start, len;
230
231 if (logical[nr] > cache->key.objectid +
232 cache->key.offset)
233 continue;
234
235 if (logical[nr] + stripe_len <= cache->key.objectid)
236 continue;
237
238 start = logical[nr];
239 if (start < cache->key.objectid) {
240 start = cache->key.objectid;
241 len = (logical[nr] + stripe_len) - start;
242 } else {
243 len = min_t(u64, stripe_len,
244 cache->key.objectid +
245 cache->key.offset - start);
246 }
247
248 cache->bytes_super += len;
2ff7e61e 249 ret = add_excluded_extent(fs_info, start, len);
835d974f
JB
250 if (ret) {
251 kfree(logical);
252 return ret;
253 }
817d52f8 254 }
11833d66 255
817d52f8
JB
256 kfree(logical);
257 }
817d52f8
JB
258 return 0;
259}
260
11833d66
YZ
261static struct btrfs_caching_control *
262get_caching_control(struct btrfs_block_group_cache *cache)
263{
264 struct btrfs_caching_control *ctl;
265
266 spin_lock(&cache->lock);
dde5abee
JB
267 if (!cache->caching_ctl) {
268 spin_unlock(&cache->lock);
11833d66
YZ
269 return NULL;
270 }
271
272 ctl = cache->caching_ctl;
1e4f4714 273 refcount_inc(&ctl->count);
11833d66
YZ
274 spin_unlock(&cache->lock);
275 return ctl;
276}
277
278static void put_caching_control(struct btrfs_caching_control *ctl)
279{
1e4f4714 280 if (refcount_dec_and_test(&ctl->count))
11833d66
YZ
281 kfree(ctl);
282}
283
d0bd4560 284#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 285static void fragment_free_space(struct btrfs_block_group_cache *block_group)
d0bd4560 286{
2ff7e61e 287 struct btrfs_fs_info *fs_info = block_group->fs_info;
d0bd4560
JB
288 u64 start = block_group->key.objectid;
289 u64 len = block_group->key.offset;
290 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
0b246afa 291 fs_info->nodesize : fs_info->sectorsize;
d0bd4560
JB
292 u64 step = chunk << 1;
293
294 while (len > chunk) {
295 btrfs_remove_free_space(block_group, start, chunk);
296 start += step;
297 if (len < step)
298 len = 0;
299 else
300 len -= step;
301 }
302}
303#endif
304
0f9dd46c
JB
305/*
306 * this is only called by cache_block_group, since we could have freed extents
307 * we need to check the pinned_extents for any extents that can't be used yet
308 * since their free space will be released as soon as the transaction commits.
309 */
a5ed9182 310u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
4457c1c7 311 u64 start, u64 end)
0f9dd46c 312{
4457c1c7 313 struct btrfs_fs_info *info = block_group->fs_info;
817d52f8 314 u64 extent_start, extent_end, size, total_added = 0;
0f9dd46c
JB
315 int ret;
316
317 while (start < end) {
11833d66 318 ret = find_first_extent_bit(info->pinned_extents, start,
0f9dd46c 319 &extent_start, &extent_end,
e6138876
JB
320 EXTENT_DIRTY | EXTENT_UPTODATE,
321 NULL);
0f9dd46c
JB
322 if (ret)
323 break;
324
06b2331f 325 if (extent_start <= start) {
0f9dd46c
JB
326 start = extent_end + 1;
327 } else if (extent_start > start && extent_start < end) {
328 size = extent_start - start;
817d52f8 329 total_added += size;
ea6a478e
JB
330 ret = btrfs_add_free_space(block_group, start,
331 size);
79787eaa 332 BUG_ON(ret); /* -ENOMEM or logic error */
0f9dd46c
JB
333 start = extent_end + 1;
334 } else {
335 break;
336 }
337 }
338
339 if (start < end) {
340 size = end - start;
817d52f8 341 total_added += size;
ea6a478e 342 ret = btrfs_add_free_space(block_group, start, size);
79787eaa 343 BUG_ON(ret); /* -ENOMEM or logic error */
0f9dd46c
JB
344 }
345
817d52f8 346 return total_added;
0f9dd46c
JB
347}
348
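/*
 * Example with illustrative numbers (comment added for clarity): while caching
 * a block group spanning [1M, 5M), if find_first_extent_bit() reports a
 * pinned/excluded extent at [2M, 3M), add_new_free_space() adds [1M, 2M) to
 * the free space cache, skips [2M, 3M) (that range only becomes free once the
 * transaction commits), and then adds [3M, 5M) when no further pinned extents
 * are found.  The return value is the total number of bytes added.
 */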
73fa48b6 349static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
e37c9e69 350{
0b246afa
JM
351 struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
352 struct btrfs_fs_info *fs_info = block_group->fs_info;
353 struct btrfs_root *extent_root = fs_info->extent_root;
e37c9e69 354 struct btrfs_path *path;
5f39d397 355 struct extent_buffer *leaf;
11833d66 356 struct btrfs_key key;
817d52f8 357 u64 total_found = 0;
11833d66
YZ
358 u64 last = 0;
359 u32 nritems;
73fa48b6 360 int ret;
d0bd4560 361 bool wakeup = true;
f510cfec 362
e37c9e69
CM
363 path = btrfs_alloc_path();
364 if (!path)
73fa48b6 365 return -ENOMEM;
7d7d6068 366
817d52f8 367 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
11833d66 368
d0bd4560
JB
369#ifdef CONFIG_BTRFS_DEBUG
370 /*
371 * If we're fragmenting we don't want to make anybody think we can
372 * allocate from this block group until we've had a chance to fragment
373 * the free space.
374 */
2ff7e61e 375 if (btrfs_should_fragment_free_space(block_group))
d0bd4560
JB
376 wakeup = false;
377#endif
5cd57b2c 378 /*
817d52f8
JB
379 * We don't want to deadlock with somebody trying to allocate a new
380 * extent for the extent root while also trying to search the extent
381 * root to add free space. So we skip locking and search the commit
382 * root, since its read-only
5cd57b2c
CM
383 */
384 path->skip_locking = 1;
817d52f8 385 path->search_commit_root = 1;
e4058b54 386 path->reada = READA_FORWARD;
817d52f8 387
e4404d6e 388 key.objectid = last;
e37c9e69 389 key.offset = 0;
11833d66 390 key.type = BTRFS_EXTENT_ITEM_KEY;
013f1b12 391
52ee28d2 392next:
11833d66 393 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
e37c9e69 394 if (ret < 0)
73fa48b6 395 goto out;
a512bbf8 396
11833d66
YZ
397 leaf = path->nodes[0];
398 nritems = btrfs_header_nritems(leaf);
399
d397712b 400 while (1) {
7841cb28 401 if (btrfs_fs_closing(fs_info) > 1) {
f25784b3 402 last = (u64)-1;
817d52f8 403 break;
f25784b3 404 }
817d52f8 405
11833d66
YZ
406 if (path->slots[0] < nritems) {
407 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
408 } else {
409 ret = find_next_key(path, 0, &key);
410 if (ret)
e37c9e69 411 break;
817d52f8 412
c9ea7b24 413 if (need_resched() ||
9e351cc8 414 rwsem_is_contended(&fs_info->commit_root_sem)) {
d0bd4560
JB
415 if (wakeup)
416 caching_ctl->progress = last;
ff5714cc 417 btrfs_release_path(path);
9e351cc8 418 up_read(&fs_info->commit_root_sem);
589d8ade 419 mutex_unlock(&caching_ctl->mutex);
11833d66 420 cond_resched();
73fa48b6
OS
421 mutex_lock(&caching_ctl->mutex);
422 down_read(&fs_info->commit_root_sem);
423 goto next;
589d8ade 424 }
0a3896d0
JB
425
426 ret = btrfs_next_leaf(extent_root, path);
427 if (ret < 0)
73fa48b6 428 goto out;
0a3896d0
JB
429 if (ret)
430 break;
589d8ade
JB
431 leaf = path->nodes[0];
432 nritems = btrfs_header_nritems(leaf);
433 continue;
11833d66 434 }
817d52f8 435
52ee28d2
LB
436 if (key.objectid < last) {
437 key.objectid = last;
438 key.offset = 0;
439 key.type = BTRFS_EXTENT_ITEM_KEY;
440
d0bd4560
JB
441 if (wakeup)
442 caching_ctl->progress = last;
52ee28d2
LB
443 btrfs_release_path(path);
444 goto next;
445 }
446
11833d66
YZ
447 if (key.objectid < block_group->key.objectid) {
448 path->slots[0]++;
817d52f8 449 continue;
e37c9e69 450 }
0f9dd46c 451
e37c9e69 452 if (key.objectid >= block_group->key.objectid +
0f9dd46c 453 block_group->key.offset)
e37c9e69 454 break;
7d7d6068 455
3173a18f
JB
456 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
457 key.type == BTRFS_METADATA_ITEM_KEY) {
4457c1c7 458 total_found += add_new_free_space(block_group, last,
817d52f8 459 key.objectid);
3173a18f
JB
460 if (key.type == BTRFS_METADATA_ITEM_KEY)
461 last = key.objectid +
da17066c 462 fs_info->nodesize;
3173a18f
JB
463 else
464 last = key.objectid + key.offset;
817d52f8 465
73fa48b6 466 if (total_found > CACHING_CTL_WAKE_UP) {
11833d66 467 total_found = 0;
d0bd4560
JB
468 if (wakeup)
469 wake_up(&caching_ctl->wait);
11833d66 470 }
817d52f8 471 }
e37c9e69
CM
472 path->slots[0]++;
473 }
817d52f8 474 ret = 0;
e37c9e69 475
4457c1c7 476 total_found += add_new_free_space(block_group, last,
817d52f8
JB
477 block_group->key.objectid +
478 block_group->key.offset);
11833d66 479 caching_ctl->progress = (u64)-1;
817d52f8 480
73fa48b6
OS
481out:
482 btrfs_free_path(path);
483 return ret;
484}
485
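/*
 * caching_thread - background worker that populates a block group's free
 * space information.  Depending on whether the free space tree is enabled
 * (the FREE_SPACE_TREE compat_ro bit) it walks either the free space tree or
 * the extent tree, then marks the block group BTRFS_CACHE_FINISHED (or
 * BTRFS_CACHE_ERROR on failure) and wakes anyone waiting on caching_ctl->wait.
 * (Descriptive comment added for clarity; not part of the original source.)
 */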
486static noinline void caching_thread(struct btrfs_work *work)
487{
488 struct btrfs_block_group_cache *block_group;
489 struct btrfs_fs_info *fs_info;
490 struct btrfs_caching_control *caching_ctl;
491 int ret;
492
493 caching_ctl = container_of(work, struct btrfs_caching_control, work);
494 block_group = caching_ctl->block_group;
495 fs_info = block_group->fs_info;
496
497 mutex_lock(&caching_ctl->mutex);
498 down_read(&fs_info->commit_root_sem);
499
1e144fb8
OS
500 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
501 ret = load_free_space_tree(caching_ctl);
502 else
503 ret = load_extent_tree_free(caching_ctl);
73fa48b6 504
817d52f8 505 spin_lock(&block_group->lock);
11833d66 506 block_group->caching_ctl = NULL;
73fa48b6 507 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
817d52f8 508 spin_unlock(&block_group->lock);
0f9dd46c 509
d0bd4560 510#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 511 if (btrfs_should_fragment_free_space(block_group)) {
d0bd4560
JB
512 u64 bytes_used;
513
514 spin_lock(&block_group->space_info->lock);
515 spin_lock(&block_group->lock);
516 bytes_used = block_group->key.offset -
517 btrfs_block_group_used(&block_group->item);
518 block_group->space_info->bytes_used += bytes_used >> 1;
519 spin_unlock(&block_group->lock);
520 spin_unlock(&block_group->space_info->lock);
2ff7e61e 521 fragment_free_space(block_group);
d0bd4560
JB
522 }
523#endif
524
525 caching_ctl->progress = (u64)-1;
11833d66 526
9e351cc8 527 up_read(&fs_info->commit_root_sem);
9e715da8 528 free_excluded_extents(block_group);
11833d66 529 mutex_unlock(&caching_ctl->mutex);
73fa48b6 530
11833d66
YZ
531 wake_up(&caching_ctl->wait);
532
533 put_caching_control(caching_ctl);
11dfe35a 534 btrfs_put_block_group(block_group);
817d52f8
JB
535}
536
9d66e233 537static int cache_block_group(struct btrfs_block_group_cache *cache,
9d66e233 538 int load_cache_only)
817d52f8 539{
291c7d2f 540 DEFINE_WAIT(wait);
11833d66
YZ
541 struct btrfs_fs_info *fs_info = cache->fs_info;
542 struct btrfs_caching_control *caching_ctl;
817d52f8
JB
543 int ret = 0;
544
291c7d2f 545 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
79787eaa
JM
546 if (!caching_ctl)
547 return -ENOMEM;
291c7d2f
JB
548
549 INIT_LIST_HEAD(&caching_ctl->list);
550 mutex_init(&caching_ctl->mutex);
551 init_waitqueue_head(&caching_ctl->wait);
552 caching_ctl->block_group = cache;
553 caching_ctl->progress = cache->key.objectid;
1e4f4714 554 refcount_set(&caching_ctl->count, 1);
9e0af237
LB
555 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
556 caching_thread, NULL, NULL);
291c7d2f
JB
557
558 spin_lock(&cache->lock);
559 /*
560 * This should be a rare occasion, but this could happen I think in the
561 * case where one thread starts to load the space cache info, and then
562 * some other thread starts a transaction commit which tries to do an
563 * allocation while the other thread is still loading the space cache
564 * info. The previous loop should have kept us from choosing this block
565 * group, but if we've moved to the state where we will wait on caching
566 * block groups we need to first check if we're doing a fast load here,
567 * so we can wait for it to finish, otherwise we could end up allocating
568 * from a block group who's cache gets evicted for one reason or
569 * another.
570 */
571 while (cache->cached == BTRFS_CACHE_FAST) {
572 struct btrfs_caching_control *ctl;
573
574 ctl = cache->caching_ctl;
1e4f4714 575 refcount_inc(&ctl->count);
291c7d2f
JB
576 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
577 spin_unlock(&cache->lock);
578
579 schedule();
580
581 finish_wait(&ctl->wait, &wait);
582 put_caching_control(ctl);
583 spin_lock(&cache->lock);
584 }
585
586 if (cache->cached != BTRFS_CACHE_NO) {
587 spin_unlock(&cache->lock);
588 kfree(caching_ctl);
11833d66 589 return 0;
291c7d2f
JB
590 }
591 WARN_ON(cache->caching_ctl);
592 cache->caching_ctl = caching_ctl;
593 cache->cached = BTRFS_CACHE_FAST;
594 spin_unlock(&cache->lock);
11833d66 595
d8953d69 596 if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
cb83b7b8 597 mutex_lock(&caching_ctl->mutex);
bb6cb1c5 598 ret = load_free_space_cache(cache);
9d66e233
JB
599
600 spin_lock(&cache->lock);
601 if (ret == 1) {
291c7d2f 602 cache->caching_ctl = NULL;
9d66e233
JB
603 cache->cached = BTRFS_CACHE_FINISHED;
604 cache->last_byte_to_unpin = (u64)-1;
cb83b7b8 605 caching_ctl->progress = (u64)-1;
9d66e233 606 } else {
291c7d2f
JB
607 if (load_cache_only) {
608 cache->caching_ctl = NULL;
609 cache->cached = BTRFS_CACHE_NO;
610 } else {
611 cache->cached = BTRFS_CACHE_STARTED;
4f69cb98 612 cache->has_caching_ctl = 1;
291c7d2f 613 }
9d66e233
JB
614 }
615 spin_unlock(&cache->lock);
d0bd4560
JB
616#ifdef CONFIG_BTRFS_DEBUG
617 if (ret == 1 &&
2ff7e61e 618 btrfs_should_fragment_free_space(cache)) {
d0bd4560
JB
619 u64 bytes_used;
620
621 spin_lock(&cache->space_info->lock);
622 spin_lock(&cache->lock);
623 bytes_used = cache->key.offset -
624 btrfs_block_group_used(&cache->item);
625 cache->space_info->bytes_used += bytes_used >> 1;
626 spin_unlock(&cache->lock);
627 spin_unlock(&cache->space_info->lock);
2ff7e61e 628 fragment_free_space(cache);
d0bd4560
JB
629 }
630#endif
cb83b7b8
JB
631 mutex_unlock(&caching_ctl->mutex);
632
291c7d2f 633 wake_up(&caching_ctl->wait);
3c14874a 634 if (ret == 1) {
291c7d2f 635 put_caching_control(caching_ctl);
9e715da8 636 free_excluded_extents(cache);
9d66e233 637 return 0;
3c14874a 638 }
291c7d2f
JB
639 } else {
640 /*
1e144fb8
OS
641 * We're either using the free space tree or no caching at all.
642 * Set cached to the appropriate value and wakeup any waiters.
291c7d2f
JB
643 */
644 spin_lock(&cache->lock);
645 if (load_cache_only) {
646 cache->caching_ctl = NULL;
647 cache->cached = BTRFS_CACHE_NO;
648 } else {
649 cache->cached = BTRFS_CACHE_STARTED;
4f69cb98 650 cache->has_caching_ctl = 1;
291c7d2f
JB
651 }
652 spin_unlock(&cache->lock);
653 wake_up(&caching_ctl->wait);
9d66e233
JB
654 }
655
291c7d2f
JB
656 if (load_cache_only) {
657 put_caching_control(caching_ctl);
11833d66 658 return 0;
817d52f8 659 }
817d52f8 660
9e351cc8 661 down_write(&fs_info->commit_root_sem);
1e4f4714 662 refcount_inc(&caching_ctl->count);
11833d66 663 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
9e351cc8 664 up_write(&fs_info->commit_root_sem);
11833d66 665
11dfe35a 666 btrfs_get_block_group(cache);
11833d66 667
e66f0bb1 668 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
817d52f8 669
ef8bbdfe 670 return ret;
e37c9e69
CM
671}
672
/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

78192442 692static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
0d9f824d 693{
ddf30cf0
QW
694 if (ref->type == BTRFS_REF_METADATA) {
695 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
78192442 696 return BTRFS_BLOCK_GROUP_SYSTEM;
0d9f824d 697 else
78192442 698 return BTRFS_BLOCK_GROUP_METADATA;
0d9f824d 699 }
78192442
QW
700 return BTRFS_BLOCK_GROUP_DATA;
701}
702
703static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
704 struct btrfs_ref *ref)
705{
706 struct btrfs_space_info *space_info;
707 u64 flags = generic_ref_to_space_flags(ref);
708
280c2908 709 space_info = btrfs_find_space_info(fs_info, flags);
78192442
QW
710 ASSERT(space_info);
711 percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
712 BTRFS_TOTAL_BYTES_PINNED_BATCH);
713}
714
715static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
716 struct btrfs_ref *ref)
717{
718 struct btrfs_space_info *space_info;
719 u64 flags = generic_ref_to_space_flags(ref);
0d9f824d 720
280c2908 721 space_info = btrfs_find_space_info(fs_info, flags);
55e8196a 722 ASSERT(space_info);
78192442 723 percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
dec59fa3 724 BTRFS_TOTAL_BYTES_PINNED_BATCH);
0d9f824d
OS
725}
726
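/*
 * Note (added for clarity): the return convention below follows
 * btrfs_search_slot(): 0 means an EXTENT_ITEM for (start, len) exists,
 * > 0 means it does not, and < 0 means an error occurred.
 */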
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

a22285a6 746/*
3173a18f 747 * helper function to lookup reference count and flags of a tree block.
a22285a6
YZ
748 *
749 * the head node for delayed ref is used to store the sum of all the
750 * reference count modifications queued up in the rbtree. the head
751 * node may also store the extent flags to set. This way you can check
752 * to see what the reference count and extent flags would be if all of
753 * the delayed refs are not processed.
754 */
755int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2ff7e61e 756 struct btrfs_fs_info *fs_info, u64 bytenr,
3173a18f 757 u64 offset, int metadata, u64 *refs, u64 *flags)
a22285a6
YZ
758{
759 struct btrfs_delayed_ref_head *head;
760 struct btrfs_delayed_ref_root *delayed_refs;
761 struct btrfs_path *path;
762 struct btrfs_extent_item *ei;
763 struct extent_buffer *leaf;
764 struct btrfs_key key;
765 u32 item_size;
766 u64 num_refs;
767 u64 extent_flags;
768 int ret;
769
3173a18f
JB
770 /*
771 * If we don't have skinny metadata, don't bother doing anything
772 * different
773 */
0b246afa
JM
774 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
775 offset = fs_info->nodesize;
3173a18f
JB
776 metadata = 0;
777 }
778
a22285a6
YZ
779 path = btrfs_alloc_path();
780 if (!path)
781 return -ENOMEM;
782
a22285a6
YZ
783 if (!trans) {
784 path->skip_locking = 1;
785 path->search_commit_root = 1;
786 }
639eefc8
FDBM
787
788search_again:
789 key.objectid = bytenr;
790 key.offset = offset;
791 if (metadata)
792 key.type = BTRFS_METADATA_ITEM_KEY;
793 else
794 key.type = BTRFS_EXTENT_ITEM_KEY;
795
0b246afa 796 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
a22285a6
YZ
797 if (ret < 0)
798 goto out_free;
799
3173a18f 800 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
74be9510
FDBM
801 if (path->slots[0]) {
802 path->slots[0]--;
803 btrfs_item_key_to_cpu(path->nodes[0], &key,
804 path->slots[0]);
805 if (key.objectid == bytenr &&
806 key.type == BTRFS_EXTENT_ITEM_KEY &&
0b246afa 807 key.offset == fs_info->nodesize)
74be9510
FDBM
808 ret = 0;
809 }
3173a18f
JB
810 }
811
a22285a6
YZ
812 if (ret == 0) {
813 leaf = path->nodes[0];
814 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
815 if (item_size >= sizeof(*ei)) {
816 ei = btrfs_item_ptr(leaf, path->slots[0],
817 struct btrfs_extent_item);
818 num_refs = btrfs_extent_refs(leaf, ei);
819 extent_flags = btrfs_extent_flags(leaf, ei);
820 } else {
ba3c2b19
NB
821 ret = -EINVAL;
822 btrfs_print_v0_err(fs_info);
823 if (trans)
824 btrfs_abort_transaction(trans, ret);
825 else
826 btrfs_handle_fs_error(fs_info, ret, NULL);
827
828 goto out_free;
a22285a6 829 }
ba3c2b19 830
a22285a6
YZ
831 BUG_ON(num_refs == 0);
832 } else {
833 num_refs = 0;
834 extent_flags = 0;
835 ret = 0;
836 }
837
838 if (!trans)
839 goto out;
840
841 delayed_refs = &trans->transaction->delayed_refs;
842 spin_lock(&delayed_refs->lock);
f72ad18e 843 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
a22285a6
YZ
844 if (head) {
845 if (!mutex_trylock(&head->mutex)) {
d278850e 846 refcount_inc(&head->refs);
a22285a6
YZ
847 spin_unlock(&delayed_refs->lock);
848
b3b4aa74 849 btrfs_release_path(path);
a22285a6 850
8cc33e5c
DS
851 /*
852 * Mutex was contended, block until it's released and try
853 * again
854 */
a22285a6
YZ
855 mutex_lock(&head->mutex);
856 mutex_unlock(&head->mutex);
d278850e 857 btrfs_put_delayed_ref_head(head);
639eefc8 858 goto search_again;
a22285a6 859 }
d7df2c79 860 spin_lock(&head->lock);
a22285a6
YZ
861 if (head->extent_op && head->extent_op->update_flags)
862 extent_flags |= head->extent_op->flags_to_set;
863 else
864 BUG_ON(num_refs == 0);
865
d278850e 866 num_refs += head->ref_mod;
d7df2c79 867 spin_unlock(&head->lock);
a22285a6
YZ
868 mutex_unlock(&head->mutex);
869 }
870 spin_unlock(&delayed_refs->lock);
871out:
872 WARN_ON(num_refs == 0);
873 if (refs)
874 *refs = num_refs;
875 if (flags)
876 *flags = extent_flags;
877out_free:
878 btrfs_free_path(path);
879 return ret;
880}
881
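/*
 * Example (illustrative, comment added for clarity): if the extent item on
 * disk records 2 references and the delayed ref head for that bytenr has a
 * pending ref_mod of +1, btrfs_lookup_extent_info() reports *refs == 3, i.e.
 * the count the extent will have once all queued delayed refs are applied.
 */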
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  The implicit back ref is optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  The full back ref is for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually the full back ref is generic, and
 * can be used in all cases where the implicit back ref is used.  The major
 * shortcoming of the full back ref is its overhead: every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for the
 * pointers in the block.  Remove these full back refs and add implicit
 * back refs for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * the pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block and increase the lower level extents' reference counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent, and
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back ref is used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is the
 * objectid of the block's owner tree.  The key offset for the full back
 * refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block are required.  This information is stored in
 * the tree block info structure.
 */
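/*
 * Illustrative example (numbers are made up, comment not in the original
 * source): a data extent at bytenr 12582912, referenced once from inode 257
 * at file offset 0 in the tree of root 5, carries the implicit back ref key
 *
 *     (12582912, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * while a full back ref for the same extent, shared via the leaf at bytenr
 * 30408704, would be keyed as
 *
 *     (12582912, BTRFS_SHARED_DATA_REF_KEY, 30408704)
 *
 * The hash above is the one computed by hash_extent_data_ref() below.
 */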

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
993int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
994 struct btrfs_extent_inline_ref *iref,
995 enum btrfs_inline_ref_type is_data)
996{
997 int type = btrfs_extent_inline_ref_type(eb, iref);
64ecdb64 998 u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
167ce953
LB
999
1000 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1001 type == BTRFS_SHARED_BLOCK_REF_KEY ||
1002 type == BTRFS_SHARED_DATA_REF_KEY ||
1003 type == BTRFS_EXTENT_DATA_REF_KEY) {
1004 if (is_data == BTRFS_REF_TYPE_BLOCK) {
64ecdb64 1005 if (type == BTRFS_TREE_BLOCK_REF_KEY)
167ce953 1006 return type;
64ecdb64
LB
1007 if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1008 ASSERT(eb->fs_info);
1009 /*
1010 * Every shared one has parent tree
1011 * block, which must be aligned to
1012 * nodesize.
1013 */
1014 if (offset &&
1015 IS_ALIGNED(offset, eb->fs_info->nodesize))
1016 return type;
1017 }
167ce953 1018 } else if (is_data == BTRFS_REF_TYPE_DATA) {
64ecdb64 1019 if (type == BTRFS_EXTENT_DATA_REF_KEY)
167ce953 1020 return type;
64ecdb64
LB
1021 if (type == BTRFS_SHARED_DATA_REF_KEY) {
1022 ASSERT(eb->fs_info);
1023 /*
1024 * Every shared one has parent tree
1025 * block, which must be aligned to
1026 * nodesize.
1027 */
1028 if (offset &&
1029 IS_ALIGNED(offset, eb->fs_info->nodesize))
1030 return type;
1031 }
167ce953
LB
1032 } else {
1033 ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1034 return type;
1035 }
1036 }
1037
1038 btrfs_print_leaf((struct extent_buffer *)eb);
1039 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1040 eb->start, type);
1041 WARN_ON(1);
1042
1043 return BTRFS_REF_TYPE_INVALID;
1044}
1045
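/*
 * Typical caller pattern (illustrative, comment added for clarity):
 *
 *     type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 *     if (type == BTRFS_REF_TYPE_INVALID)
 *             return -EUCLEAN;
 *
 * i.e. a malformed inline ref is treated as filesystem corruption instead of
 * being dereferenced.
 */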
5d4f98a2
YZ
1046static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1047{
1048 u32 high_crc = ~(u32)0;
1049 u32 low_crc = ~(u32)0;
1050 __le64 lenum;
1051
1052 lenum = cpu_to_le64(root_objectid);
65019df8 1053 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
5d4f98a2 1054 lenum = cpu_to_le64(owner);
65019df8 1055 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2 1056 lenum = cpu_to_le64(offset);
65019df8 1057 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2
YZ
1058
1059 return ((u64)high_crc << 31) ^ (u64)low_crc;
1060}
1061
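/*
 * Note (added for clarity): the value computed above becomes the key offset
 * of an implicit data back ref; lookup_extent_data_ref() and
 * insert_extent_data_ref() both set
 *
 *     key.offset = hash_extent_data_ref(root_objectid, owner, offset);
 *
 * for BTRFS_EXTENT_DATA_REF_KEY items, so equal (root, inode, offset) triples
 * always map to the same key offset.
 */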
1062static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1063 struct btrfs_extent_data_ref *ref)
1064{
1065 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1066 btrfs_extent_data_ref_objectid(leaf, ref),
1067 btrfs_extent_data_ref_offset(leaf, ref));
1068}
1069
1070static int match_extent_data_ref(struct extent_buffer *leaf,
1071 struct btrfs_extent_data_ref *ref,
1072 u64 root_objectid, u64 owner, u64 offset)
1073{
1074 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1075 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1076 btrfs_extent_data_ref_offset(leaf, ref) != offset)
1077 return 0;
1078 return 1;
1079}
1080
1081static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1082 struct btrfs_path *path,
1083 u64 bytenr, u64 parent,
1084 u64 root_objectid,
1085 u64 owner, u64 offset)
1086{
bd1d53ef 1087 struct btrfs_root *root = trans->fs_info->extent_root;
5d4f98a2
YZ
1088 struct btrfs_key key;
1089 struct btrfs_extent_data_ref *ref;
31840ae1 1090 struct extent_buffer *leaf;
5d4f98a2 1091 u32 nritems;
74493f7a 1092 int ret;
5d4f98a2
YZ
1093 int recow;
1094 int err = -ENOENT;
74493f7a 1095
31840ae1 1096 key.objectid = bytenr;
5d4f98a2
YZ
1097 if (parent) {
1098 key.type = BTRFS_SHARED_DATA_REF_KEY;
1099 key.offset = parent;
1100 } else {
1101 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1102 key.offset = hash_extent_data_ref(root_objectid,
1103 owner, offset);
1104 }
1105again:
1106 recow = 0;
1107 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1108 if (ret < 0) {
1109 err = ret;
1110 goto fail;
1111 }
31840ae1 1112
5d4f98a2
YZ
1113 if (parent) {
1114 if (!ret)
1115 return 0;
5d4f98a2 1116 goto fail;
31840ae1
ZY
1117 }
1118
1119 leaf = path->nodes[0];
5d4f98a2
YZ
1120 nritems = btrfs_header_nritems(leaf);
1121 while (1) {
1122 if (path->slots[0] >= nritems) {
1123 ret = btrfs_next_leaf(root, path);
1124 if (ret < 0)
1125 err = ret;
1126 if (ret)
1127 goto fail;
1128
1129 leaf = path->nodes[0];
1130 nritems = btrfs_header_nritems(leaf);
1131 recow = 1;
1132 }
1133
1134 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135 if (key.objectid != bytenr ||
1136 key.type != BTRFS_EXTENT_DATA_REF_KEY)
1137 goto fail;
1138
1139 ref = btrfs_item_ptr(leaf, path->slots[0],
1140 struct btrfs_extent_data_ref);
1141
1142 if (match_extent_data_ref(leaf, ref, root_objectid,
1143 owner, offset)) {
1144 if (recow) {
b3b4aa74 1145 btrfs_release_path(path);
5d4f98a2
YZ
1146 goto again;
1147 }
1148 err = 0;
1149 break;
1150 }
1151 path->slots[0]++;
31840ae1 1152 }
5d4f98a2
YZ
1153fail:
1154 return err;
31840ae1
ZY
1155}
1156
5d4f98a2 1157static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1158 struct btrfs_path *path,
1159 u64 bytenr, u64 parent,
1160 u64 root_objectid, u64 owner,
1161 u64 offset, int refs_to_add)
31840ae1 1162{
62b895af 1163 struct btrfs_root *root = trans->fs_info->extent_root;
31840ae1
ZY
1164 struct btrfs_key key;
1165 struct extent_buffer *leaf;
5d4f98a2 1166 u32 size;
31840ae1
ZY
1167 u32 num_refs;
1168 int ret;
74493f7a 1169
74493f7a 1170 key.objectid = bytenr;
5d4f98a2
YZ
1171 if (parent) {
1172 key.type = BTRFS_SHARED_DATA_REF_KEY;
1173 key.offset = parent;
1174 size = sizeof(struct btrfs_shared_data_ref);
1175 } else {
1176 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1177 key.offset = hash_extent_data_ref(root_objectid,
1178 owner, offset);
1179 size = sizeof(struct btrfs_extent_data_ref);
1180 }
74493f7a 1181
5d4f98a2
YZ
1182 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1183 if (ret && ret != -EEXIST)
1184 goto fail;
1185
1186 leaf = path->nodes[0];
1187 if (parent) {
1188 struct btrfs_shared_data_ref *ref;
31840ae1 1189 ref = btrfs_item_ptr(leaf, path->slots[0],
5d4f98a2
YZ
1190 struct btrfs_shared_data_ref);
1191 if (ret == 0) {
1192 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1193 } else {
1194 num_refs = btrfs_shared_data_ref_count(leaf, ref);
1195 num_refs += refs_to_add;
1196 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
31840ae1 1197 }
5d4f98a2
YZ
1198 } else {
1199 struct btrfs_extent_data_ref *ref;
1200 while (ret == -EEXIST) {
1201 ref = btrfs_item_ptr(leaf, path->slots[0],
1202 struct btrfs_extent_data_ref);
1203 if (match_extent_data_ref(leaf, ref, root_objectid,
1204 owner, offset))
1205 break;
b3b4aa74 1206 btrfs_release_path(path);
5d4f98a2
YZ
1207 key.offset++;
1208 ret = btrfs_insert_empty_item(trans, root, path, &key,
1209 size);
1210 if (ret && ret != -EEXIST)
1211 goto fail;
31840ae1 1212
5d4f98a2
YZ
1213 leaf = path->nodes[0];
1214 }
1215 ref = btrfs_item_ptr(leaf, path->slots[0],
1216 struct btrfs_extent_data_ref);
1217 if (ret == 0) {
1218 btrfs_set_extent_data_ref_root(leaf, ref,
1219 root_objectid);
1220 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1221 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1222 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1223 } else {
1224 num_refs = btrfs_extent_data_ref_count(leaf, ref);
1225 num_refs += refs_to_add;
1226 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
31840ae1 1227 }
31840ae1 1228 }
5d4f98a2
YZ
1229 btrfs_mark_buffer_dirty(leaf);
1230 ret = 0;
1231fail:
b3b4aa74 1232 btrfs_release_path(path);
7bb86316 1233 return ret;
74493f7a
CM
1234}
1235
5d4f98a2 1236static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2 1237 struct btrfs_path *path,
fcebe456 1238 int refs_to_drop, int *last_ref)
31840ae1 1239{
5d4f98a2
YZ
1240 struct btrfs_key key;
1241 struct btrfs_extent_data_ref *ref1 = NULL;
1242 struct btrfs_shared_data_ref *ref2 = NULL;
31840ae1 1243 struct extent_buffer *leaf;
5d4f98a2 1244 u32 num_refs = 0;
31840ae1
ZY
1245 int ret = 0;
1246
1247 leaf = path->nodes[0];
5d4f98a2
YZ
1248 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1249
1250 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1251 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1252 struct btrfs_extent_data_ref);
1253 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1254 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1255 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1256 struct btrfs_shared_data_ref);
1257 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
6d8ff4e4 1258 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
ba3c2b19
NB
1259 btrfs_print_v0_err(trans->fs_info);
1260 btrfs_abort_transaction(trans, -EINVAL);
1261 return -EINVAL;
5d4f98a2
YZ
1262 } else {
1263 BUG();
1264 }
1265
56bec294
CM
1266 BUG_ON(num_refs < refs_to_drop);
1267 num_refs -= refs_to_drop;
5d4f98a2 1268
31840ae1 1269 if (num_refs == 0) {
e9f6290d 1270 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
fcebe456 1271 *last_ref = 1;
31840ae1 1272 } else {
5d4f98a2
YZ
1273 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1274 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1275 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1276 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
31840ae1
ZY
1277 btrfs_mark_buffer_dirty(leaf);
1278 }
31840ae1
ZY
1279 return ret;
1280}
1281
9ed0dea0 1282static noinline u32 extent_data_ref_count(struct btrfs_path *path,
5d4f98a2 1283 struct btrfs_extent_inline_ref *iref)
15916de8 1284{
5d4f98a2
YZ
1285 struct btrfs_key key;
1286 struct extent_buffer *leaf;
1287 struct btrfs_extent_data_ref *ref1;
1288 struct btrfs_shared_data_ref *ref2;
1289 u32 num_refs = 0;
3de28d57 1290 int type;
5d4f98a2
YZ
1291
1292 leaf = path->nodes[0];
1293 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ba3c2b19
NB
1294
1295 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
5d4f98a2 1296 if (iref) {
3de28d57
LB
1297 /*
1298 * If type is invalid, we should have bailed out earlier than
1299 * this call.
1300 */
1301 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1302 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1303 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
5d4f98a2
YZ
1304 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1305 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1306 } else {
1307 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1308 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1309 }
1310 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1311 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1312 struct btrfs_extent_data_ref);
1313 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1314 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1315 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1316 struct btrfs_shared_data_ref);
1317 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
5d4f98a2
YZ
1318 } else {
1319 WARN_ON(1);
1320 }
1321 return num_refs;
1322}
15916de8 1323
5d4f98a2 1324static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1325 struct btrfs_path *path,
1326 u64 bytenr, u64 parent,
1327 u64 root_objectid)
1f3c79a2 1328{
b8582eea 1329 struct btrfs_root *root = trans->fs_info->extent_root;
5d4f98a2 1330 struct btrfs_key key;
1f3c79a2 1331 int ret;
1f3c79a2 1332
5d4f98a2
YZ
1333 key.objectid = bytenr;
1334 if (parent) {
1335 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1336 key.offset = parent;
1337 } else {
1338 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1339 key.offset = root_objectid;
1f3c79a2
LH
1340 }
1341
5d4f98a2
YZ
1342 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1343 if (ret > 0)
1344 ret = -ENOENT;
5d4f98a2 1345 return ret;
1f3c79a2
LH
1346}
1347
5d4f98a2 1348static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1349 struct btrfs_path *path,
1350 u64 bytenr, u64 parent,
1351 u64 root_objectid)
31840ae1 1352{
5d4f98a2 1353 struct btrfs_key key;
31840ae1 1354 int ret;
31840ae1 1355
5d4f98a2
YZ
1356 key.objectid = bytenr;
1357 if (parent) {
1358 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1359 key.offset = parent;
1360 } else {
1361 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1362 key.offset = root_objectid;
1363 }
1364
10728404 1365 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
87bde3cd 1366 path, &key, 0);
b3b4aa74 1367 btrfs_release_path(path);
31840ae1
ZY
1368 return ret;
1369}
1370
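/*
 * Pick the back reference key type for an extent (comment added for clarity;
 * not part of the original source):
 *
 *   metadata (owner < BTRFS_FIRST_FREE_OBJECTID):
 *       parent != 0 -> BTRFS_SHARED_BLOCK_REF_KEY, else BTRFS_TREE_BLOCK_REF_KEY
 *   data (owner >= BTRFS_FIRST_FREE_OBJECTID):
 *       parent != 0 -> BTRFS_SHARED_DATA_REF_KEY, else BTRFS_EXTENT_DATA_REF_KEY
 */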
static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

2c47e605
YZ
1388static int find_next_key(struct btrfs_path *path, int level,
1389 struct btrfs_key *key)
56bec294 1390
02217ed2 1391{
2c47e605 1392 for (; level < BTRFS_MAX_LEVEL; level++) {
5d4f98a2
YZ
1393 if (!path->nodes[level])
1394 break;
5d4f98a2
YZ
1395 if (path->slots[level] + 1 >=
1396 btrfs_header_nritems(path->nodes[level]))
1397 continue;
1398 if (level == 0)
1399 btrfs_item_key_to_cpu(path->nodes[level], key,
1400 path->slots[level] + 1);
1401 else
1402 btrfs_node_key_to_cpu(path->nodes[level], key,
1403 path->slots[level] + 1);
1404 return 0;
1405 }
1406 return 1;
1407}
037e6390 1408
5d4f98a2
YZ
1409/*
1410 * look for inline back ref. if back ref is found, *ref_ret is set
1411 * to the address of inline back ref, and 0 is returned.
1412 *
1413 * if back ref isn't found, *ref_ret is set to the address where it
1414 * should be inserted, and -ENOENT is returned.
1415 *
1416 * if insert is true and there are too many inline back refs, the path
1417 * points to the extent item, and -EAGAIN is returned.
1418 *
1419 * NOTE: inline back refs are ordered in the same way that back ref
1420 * items in the tree are ordered.
1421 */
1422static noinline_for_stack
1423int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1424 struct btrfs_path *path,
1425 struct btrfs_extent_inline_ref **ref_ret,
1426 u64 bytenr, u64 num_bytes,
1427 u64 parent, u64 root_objectid,
1428 u64 owner, u64 offset, int insert)
1429{
867cc1fb 1430 struct btrfs_fs_info *fs_info = trans->fs_info;
87bde3cd 1431 struct btrfs_root *root = fs_info->extent_root;
5d4f98a2
YZ
1432 struct btrfs_key key;
1433 struct extent_buffer *leaf;
1434 struct btrfs_extent_item *ei;
1435 struct btrfs_extent_inline_ref *iref;
1436 u64 flags;
1437 u64 item_size;
1438 unsigned long ptr;
1439 unsigned long end;
1440 int extra_size;
1441 int type;
1442 int want;
1443 int ret;
1444 int err = 0;
0b246afa 1445 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3de28d57 1446 int needed;
26b8003f 1447
db94535d 1448 key.objectid = bytenr;
31840ae1 1449 key.type = BTRFS_EXTENT_ITEM_KEY;
56bec294 1450 key.offset = num_bytes;
31840ae1 1451
5d4f98a2
YZ
1452 want = extent_ref_type(parent, owner);
1453 if (insert) {
1454 extra_size = btrfs_extent_inline_ref_size(want);
85d4198e 1455 path->keep_locks = 1;
5d4f98a2
YZ
1456 } else
1457 extra_size = -1;
3173a18f
JB
1458
1459 /*
16d1c062
NB
1460 * Owner is our level, so we can just add one to get the level for the
1461 * block we are interested in.
3173a18f
JB
1462 */
1463 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1464 key.type = BTRFS_METADATA_ITEM_KEY;
1465 key.offset = owner;
1466 }
1467
1468again:
5d4f98a2 1469 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
b9473439 1470 if (ret < 0) {
5d4f98a2
YZ
1471 err = ret;
1472 goto out;
1473 }
3173a18f
JB
1474
1475 /*
1476 * We may be a newly converted file system which still has the old fat
1477 * extent entries for metadata, so try and see if we have one of those.
1478 */
1479 if (ret > 0 && skinny_metadata) {
1480 skinny_metadata = false;
1481 if (path->slots[0]) {
1482 path->slots[0]--;
1483 btrfs_item_key_to_cpu(path->nodes[0], &key,
1484 path->slots[0]);
1485 if (key.objectid == bytenr &&
1486 key.type == BTRFS_EXTENT_ITEM_KEY &&
1487 key.offset == num_bytes)
1488 ret = 0;
1489 }
1490 if (ret) {
9ce49a0b 1491 key.objectid = bytenr;
3173a18f
JB
1492 key.type = BTRFS_EXTENT_ITEM_KEY;
1493 key.offset = num_bytes;
1494 btrfs_release_path(path);
1495 goto again;
1496 }
1497 }
1498
79787eaa
JM
1499 if (ret && !insert) {
1500 err = -ENOENT;
1501 goto out;
fae7f21c 1502 } else if (WARN_ON(ret)) {
492104c8 1503 err = -EIO;
492104c8 1504 goto out;
79787eaa 1505 }
5d4f98a2
YZ
1506
1507 leaf = path->nodes[0];
1508 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
6d8ff4e4 1509 if (unlikely(item_size < sizeof(*ei))) {
ba3c2b19
NB
1510 err = -EINVAL;
1511 btrfs_print_v0_err(fs_info);
1512 btrfs_abort_transaction(trans, err);
1513 goto out;
1514 }
5d4f98a2 1515
5d4f98a2
YZ
1516 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1517 flags = btrfs_extent_flags(leaf, ei);
1518
1519 ptr = (unsigned long)(ei + 1);
1520 end = (unsigned long)ei + item_size;
1521
3173a18f 1522 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
5d4f98a2
YZ
1523 ptr += sizeof(struct btrfs_tree_block_info);
1524 BUG_ON(ptr > end);
5d4f98a2
YZ
1525 }
1526
3de28d57
LB
1527 if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1528 needed = BTRFS_REF_TYPE_DATA;
1529 else
1530 needed = BTRFS_REF_TYPE_BLOCK;
1531
5d4f98a2
YZ
1532 err = -ENOENT;
1533 while (1) {
1534 if (ptr >= end) {
1535 WARN_ON(ptr > end);
1536 break;
1537 }
1538 iref = (struct btrfs_extent_inline_ref *)ptr;
3de28d57
LB
1539 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1540 if (type == BTRFS_REF_TYPE_INVALID) {
af431dcb 1541 err = -EUCLEAN;
3de28d57
LB
1542 goto out;
1543 }
1544
5d4f98a2
YZ
1545 if (want < type)
1546 break;
1547 if (want > type) {
1548 ptr += btrfs_extent_inline_ref_size(type);
1549 continue;
1550 }
1551
1552 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1553 struct btrfs_extent_data_ref *dref;
1554 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1555 if (match_extent_data_ref(leaf, dref, root_objectid,
1556 owner, offset)) {
1557 err = 0;
1558 break;
1559 }
1560 if (hash_extent_data_ref_item(leaf, dref) <
1561 hash_extent_data_ref(root_objectid, owner, offset))
1562 break;
1563 } else {
1564 u64 ref_offset;
1565 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1566 if (parent > 0) {
1567 if (parent == ref_offset) {
1568 err = 0;
1569 break;
1570 }
1571 if (ref_offset < parent)
1572 break;
1573 } else {
1574 if (root_objectid == ref_offset) {
1575 err = 0;
1576 break;
1577 }
1578 if (ref_offset < root_objectid)
1579 break;
1580 }
1581 }
1582 ptr += btrfs_extent_inline_ref_size(type);
1583 }
1584 if (err == -ENOENT && insert) {
1585 if (item_size + extra_size >=
1586 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1587 err = -EAGAIN;
1588 goto out;
1589 }
1590 /*
1591 * To add new inline back ref, we have to make sure
1592 * there is no corresponding back ref item.
1593 * For simplicity, we just do not add new inline back
1594 * ref if there is any kind of item for this block
1595 */
2c47e605
YZ
1596 if (find_next_key(path, 0, &key) == 0 &&
1597 key.objectid == bytenr &&
85d4198e 1598 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
5d4f98a2
YZ
1599 err = -EAGAIN;
1600 goto out;
1601 }
1602 }
1603 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1604out:
85d4198e 1605 if (insert) {
5d4f98a2
YZ
1606 path->keep_locks = 0;
1607 btrfs_unlock_up_safe(path, 1);
1608 }
1609 return err;
1610}
1611
1612/*
1613 * helper to add new inline back ref
1614 */
1615static noinline_for_stack
87bde3cd 1616void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
143bede5
JM
1617 struct btrfs_path *path,
1618 struct btrfs_extent_inline_ref *iref,
1619 u64 parent, u64 root_objectid,
1620 u64 owner, u64 offset, int refs_to_add,
1621 struct btrfs_delayed_extent_op *extent_op)
5d4f98a2
YZ
1622{
1623 struct extent_buffer *leaf;
1624 struct btrfs_extent_item *ei;
1625 unsigned long ptr;
1626 unsigned long end;
1627 unsigned long item_offset;
1628 u64 refs;
1629 int size;
1630 int type;
5d4f98a2
YZ
1631
1632 leaf = path->nodes[0];
1633 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1634 item_offset = (unsigned long)iref - (unsigned long)ei;
1635
1636 type = extent_ref_type(parent, owner);
1637 size = btrfs_extent_inline_ref_size(type);
1638
c71dd880 1639 btrfs_extend_item(path, size);
5d4f98a2
YZ
1640
1641 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1642 refs = btrfs_extent_refs(leaf, ei);
1643 refs += refs_to_add;
1644 btrfs_set_extent_refs(leaf, ei, refs);
1645 if (extent_op)
1646 __run_delayed_extent_op(extent_op, leaf, ei);
1647
1648 ptr = (unsigned long)ei + item_offset;
1649 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1650 if (ptr < end - size)
1651 memmove_extent_buffer(leaf, ptr + size, ptr,
1652 end - size - ptr);
1653
1654 iref = (struct btrfs_extent_inline_ref *)ptr;
1655 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1656 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1657 struct btrfs_extent_data_ref *dref;
1658 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1659 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1660 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1661 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1662 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1663 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1664 struct btrfs_shared_data_ref *sref;
1665 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1666 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1667 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1668 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1669 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1670 } else {
1671 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1672 }
1673 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
1674}
1675
1676static int lookup_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1677 struct btrfs_path *path,
1678 struct btrfs_extent_inline_ref **ref_ret,
1679 u64 bytenr, u64 num_bytes, u64 parent,
1680 u64 root_objectid, u64 owner, u64 offset)
1681{
1682 int ret;
1683
867cc1fb
NB
1684 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1685 num_bytes, parent, root_objectid,
1686 owner, offset, 0);
5d4f98a2 1687 if (ret != -ENOENT)
54aa1f4d 1688 return ret;
5d4f98a2 1689
b3b4aa74 1690 btrfs_release_path(path);
5d4f98a2
YZ
1691 *ref_ret = NULL;
1692
1693 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
b8582eea
NB
1694 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1695 root_objectid);
5d4f98a2 1696 } else {
bd1d53ef
NB
1697 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1698 root_objectid, owner, offset);
b9473439 1699 }
5d4f98a2
YZ
1700 return ret;
1701}
31840ae1 1702
5d4f98a2
YZ
1703/*
1704 * helper to update/remove inline back ref
1705 */
1706static noinline_for_stack
61a18f1c 1707void update_inline_extent_backref(struct btrfs_path *path,
143bede5
JM
1708 struct btrfs_extent_inline_ref *iref,
1709 int refs_to_mod,
fcebe456
JB
1710 struct btrfs_delayed_extent_op *extent_op,
1711 int *last_ref)
5d4f98a2 1712{
61a18f1c 1713 struct extent_buffer *leaf = path->nodes[0];
5d4f98a2
YZ
1714 struct btrfs_extent_item *ei;
1715 struct btrfs_extent_data_ref *dref = NULL;
1716 struct btrfs_shared_data_ref *sref = NULL;
1717 unsigned long ptr;
1718 unsigned long end;
1719 u32 item_size;
1720 int size;
1721 int type;
5d4f98a2
YZ
1722 u64 refs;
1723
5d4f98a2
YZ
1724 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1725 refs = btrfs_extent_refs(leaf, ei);
1726 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1727 refs += refs_to_mod;
1728 btrfs_set_extent_refs(leaf, ei, refs);
1729 if (extent_op)
1730 __run_delayed_extent_op(extent_op, leaf, ei);
1731
3de28d57
LB
1732 /*
1733 * If type is invalid, we should have bailed out after
1734 * lookup_inline_extent_backref().
1735 */
1736 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1737 ASSERT(type != BTRFS_REF_TYPE_INVALID);
5d4f98a2
YZ
1738
1739 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1740 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1741 refs = btrfs_extent_data_ref_count(leaf, dref);
1742 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1743 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1744 refs = btrfs_shared_data_ref_count(leaf, sref);
1745 } else {
1746 refs = 1;
1747 BUG_ON(refs_to_mod != -1);
56bec294 1748 }
31840ae1 1749
5d4f98a2
YZ
1750 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1751 refs += refs_to_mod;
1752
1753 if (refs > 0) {
1754 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1755 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1756 else
1757 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1758 } else {
fcebe456 1759 *last_ref = 1;
5d4f98a2
YZ
1760 size = btrfs_extent_inline_ref_size(type);
1761 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1762 ptr = (unsigned long)iref;
1763 end = (unsigned long)ei + item_size;
1764 if (ptr + size < end)
1765 memmove_extent_buffer(leaf, ptr, ptr + size,
1766 end - ptr - size);
1767 item_size -= size;
78ac4f9e 1768 btrfs_truncate_item(path, item_size, 1);
5d4f98a2
YZ
1769 }
1770 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
1771}
1772
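/*
 * Illustrative standalone sketch (not from this file): the shift-and-shrink
 * step update_inline_extent_backref() performs above when an inline ref
 * count reaches zero. The tail of the item is slid down over the dead ref
 * and the item is shortened, which is what the memmove +
 * btrfs_truncate_item() pair does inside the extent buffer. The function
 * name and the flat char buffer are assumptions for the example only.
 */
#include <string.h>

static unsigned long example_cut_inline_ref(char *item, unsigned long item_size,
					    unsigned long ref_offset,
					    unsigned long ref_size)
{
	/* slide everything that follows the removed ref down over it */
	if (ref_offset + ref_size < item_size)
		memmove(item + ref_offset, item + ref_offset + ref_size,
			item_size - ref_offset - ref_size);
	/* the item then shrinks by ref_size, as btrfs_truncate_item() does */
	return item_size - ref_size;
}
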
1773static noinline_for_stack
1774int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1775 struct btrfs_path *path,
1776 u64 bytenr, u64 num_bytes, u64 parent,
1777 u64 root_objectid, u64 owner,
1778 u64 offset, int refs_to_add,
1779 struct btrfs_delayed_extent_op *extent_op)
1780{
1781 struct btrfs_extent_inline_ref *iref;
1782 int ret;
1783
867cc1fb
NB
1784 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1785 num_bytes, parent, root_objectid,
1786 owner, offset, 1);
5d4f98a2
YZ
1787 if (ret == 0) {
1788 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
61a18f1c
NB
1789 update_inline_extent_backref(path, iref, refs_to_add,
1790 extent_op, NULL);
5d4f98a2 1791 } else if (ret == -ENOENT) {
a639cdeb 1792 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
143bede5
JM
1793 root_objectid, owner, offset,
1794 refs_to_add, extent_op);
1795 ret = 0;
771ed689 1796 }
5d4f98a2
YZ
1797 return ret;
1798}
31840ae1 1799
5d4f98a2 1800static int insert_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1801 struct btrfs_path *path,
1802 u64 bytenr, u64 parent, u64 root_objectid,
1803 u64 owner, u64 offset, int refs_to_add)
1804{
1805 int ret;
1806 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1807 BUG_ON(refs_to_add != 1);
10728404
NB
1808 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1809 root_objectid);
5d4f98a2 1810 } else {
62b895af
NB
1811 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1812 root_objectid, owner, offset,
1813 refs_to_add);
5d4f98a2
YZ
1814 }
1815 return ret;
1816}
56bec294 1817
5d4f98a2 1818static int remove_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1819 struct btrfs_path *path,
1820 struct btrfs_extent_inline_ref *iref,
fcebe456 1821 int refs_to_drop, int is_data, int *last_ref)
5d4f98a2 1822{
143bede5 1823 int ret = 0;
b9473439 1824
5d4f98a2
YZ
1825 BUG_ON(!is_data && refs_to_drop != 1);
1826 if (iref) {
61a18f1c
NB
1827 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1828 last_ref);
5d4f98a2 1829 } else if (is_data) {
e9f6290d 1830 ret = remove_extent_data_ref(trans, path, refs_to_drop,
fcebe456 1831 last_ref);
5d4f98a2 1832 } else {
fcebe456 1833 *last_ref = 1;
87cc7a8a 1834 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
5d4f98a2
YZ
1835 }
1836 return ret;
1837}
1838
d04c6b88
JM
1839static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1840 u64 *discarded_bytes)
5d4f98a2 1841{
86557861
JM
1842 int j, ret = 0;
1843 u64 bytes_left, end;
4d89d377 1844 u64 aligned_start = ALIGN(start, 1 << 9);
d04c6b88 1845
4d89d377
JM
1846 if (WARN_ON(start != aligned_start)) {
1847 len -= aligned_start - start;
1848 len = round_down(len, 1 << 9);
1849 start = aligned_start;
1850 }
d04c6b88 1851
4d89d377 1852 *discarded_bytes = 0;
86557861
JM
1853
1854 if (!len)
1855 return 0;
1856
1857 end = start + len;
1858 bytes_left = len;
1859
1860 /* Skip any superblocks on this device. */
1861 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1862 u64 sb_start = btrfs_sb_offset(j);
1863 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1864 u64 size = sb_start - start;
1865
1866 if (!in_range(sb_start, start, bytes_left) &&
1867 !in_range(sb_end, start, bytes_left) &&
1868 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1869 continue;
1870
1871 /*
1872 * Superblock spans beginning of range. Adjust start and
1873 * try again.
1874 */
1875 if (sb_start <= start) {
1876 start += sb_end - start;
1877 if (start > end) {
1878 bytes_left = 0;
1879 break;
1880 }
1881 bytes_left = end - start;
1882 continue;
1883 }
1884
1885 if (size) {
1886 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1887 GFP_NOFS, 0);
1888 if (!ret)
1889 *discarded_bytes += size;
1890 else if (ret != -EOPNOTSUPP)
1891 return ret;
1892 }
1893
1894 start = sb_end;
1895 if (start > end) {
1896 bytes_left = 0;
1897 break;
1898 }
1899 bytes_left = end - start;
1900 }
1901
1902 if (bytes_left) {
1903 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
4d89d377
JM
1904 GFP_NOFS, 0);
1905 if (!ret)
86557861 1906 *discarded_bytes += bytes_left;
4d89d377 1907 }
d04c6b88 1908 return ret;
5d4f98a2 1909}
5d4f98a2 1910
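/*
 * Illustrative standalone sketch (not from this file): how a discard range
 * is clipped around one reserved region, the same splitting
 * btrfs_issue_discard() applies above for every superblock mirror. The
 * 64KiB offset and 4KiB size are assumptions for the example; the real
 * locations come from btrfs_sb_offset() and BTRFS_SUPER_INFO_SIZE.
 */
#include <stdint.h>
#include <stdio.h>

static void example_discard_around_sb(uint64_t start, uint64_t len)
{
	const uint64_t sb_start = 64 * 1024;		/* assumed reserved offset */
	const uint64_t sb_end = sb_start + 4096;	/* assumed reserved size */
	uint64_t end = start + len;

	if (end <= sb_start || start >= sb_end) {
		/* no overlap: discard the whole range */
		printf("discard [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)end);
		return;
	}
	if (start < sb_start)	/* piece before the reserved region */
		printf("discard [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)sb_start);
	if (end > sb_end)	/* piece after the reserved region */
		printf("discard [%llu, %llu)\n",
		       (unsigned long long)sb_end, (unsigned long long)end);
}
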
2ff7e61e 1911int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1edb647b 1912 u64 num_bytes, u64 *actual_bytes)
5d4f98a2 1913{
5d4f98a2 1914 int ret;
5378e607 1915 u64 discarded_bytes = 0;
a1d3c478 1916 struct btrfs_bio *bbio = NULL;
5d4f98a2 1917
e244a0ae 1918
2999241d
FM
1919 /*
1920 * Avoid races with device replace and make sure our bbio has devices
1921 * associated with its stripes that don't go away while we are discarding.
1922 */
0b246afa 1923 btrfs_bio_counter_inc_blocked(fs_info);
5d4f98a2 1924 /* Tell the block device(s) that the sectors can be discarded */
0b246afa
JM
1925 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1926 &bbio, 0);
79787eaa 1927 /* Error condition is -ENOMEM */
5d4f98a2 1928 if (!ret) {
a1d3c478 1929 struct btrfs_bio_stripe *stripe = bbio->stripes;
5d4f98a2
YZ
1930 int i;
1931
5d4f98a2 1932
a1d3c478 1933 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
d04c6b88 1934 u64 bytes;
38b5f68e
AJ
1935 struct request_queue *req_q;
1936
627e0873
FM
1937 if (!stripe->dev->bdev) {
1938 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1939 continue;
1940 }
38b5f68e
AJ
1941 req_q = bdev_get_queue(stripe->dev->bdev);
1942 if (!blk_queue_discard(req_q))
d5e2003c
JB
1943 continue;
1944
5378e607
LD
1945 ret = btrfs_issue_discard(stripe->dev->bdev,
1946 stripe->physical,
d04c6b88
JM
1947 stripe->length,
1948 &bytes);
5378e607 1949 if (!ret)
d04c6b88 1950 discarded_bytes += bytes;
5378e607 1951 else if (ret != -EOPNOTSUPP)
79787eaa 1952 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
d5e2003c
JB
1953
1954 /*
1955 * Just in case we get back EOPNOTSUPP for some reason,
1956 * just ignore the return value so we don't screw up
1957 * people calling discard_extent.
1958 */
1959 ret = 0;
5d4f98a2 1960 }
6e9606d2 1961 btrfs_put_bbio(bbio);
5d4f98a2 1962 }
0b246afa 1963 btrfs_bio_counter_dec(fs_info);
5378e607
LD
1964
1965 if (actual_bytes)
1966 *actual_bytes = discarded_bytes;
1967
5d4f98a2 1968
53b381b3
DW
1969 if (ret == -EOPNOTSUPP)
1970 ret = 0;
5d4f98a2 1971 return ret;
5d4f98a2
YZ
1972}
1973
79787eaa 1974/* Can return -ENOMEM */
5d4f98a2 1975int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
82fa113f 1976 struct btrfs_ref *generic_ref)
5d4f98a2 1977{
82fa113f 1978 struct btrfs_fs_info *fs_info = trans->fs_info;
d7eae340 1979 int old_ref_mod, new_ref_mod;
5d4f98a2 1980 int ret;
66d7e7f0 1981
82fa113f
QW
1982 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1983 generic_ref->action);
1984 BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1985 generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 1986
82fa113f
QW
1987 if (generic_ref->type == BTRFS_REF_METADATA)
1988 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
ed4f255b 1989 NULL, &old_ref_mod, &new_ref_mod);
82fa113f
QW
1990 else
1991 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
d7eae340 1992 &old_ref_mod, &new_ref_mod);
d7eae340 1993
82fa113f 1994 btrfs_ref_tree_mod(fs_info, generic_ref);
8a5040f7 1995
ddf30cf0 1996 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
78192442 1997 sub_pinned_bytes(fs_info, generic_ref);
d7eae340 1998
5d4f98a2
YZ
1999 return ret;
2000}
2001
bd3c685e
NB
2002/*
2003 * __btrfs_inc_extent_ref - insert backreference for a given extent
2004 *
2005 * @trans: Handle of transaction
2006 *
2007 * @node: The delayed ref node used to get the bytenr/length for
2008 * extent whose references are incremented.
2009 *
2010 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2011 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2012 * bytenr of the parent block. Since new extents are always
2013 * created with indirect references, this will only be the case
2014 * when relocating a shared extent. In that case, root_objectid
2015 * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
2016 * be 0.
2017 *
2018 * @root_objectid: The id of the root where this modification has originated,
2019 * this can be either one of the well-known metadata trees or
2020 * the subvolume id which references this extent.
2021 *
2022 * @owner: For data extents it is the inode number of the owning file.
2023 * For metadata extents this parameter holds the level in the
2024 * tree of the extent.
2025 *
2026 * @offset: For metadata extents the offset is ignored and is currently
2027 * always passed as 0. For data extents it is the file offset
2028 * this extent belongs to.
2029 *
2030 * @refs_to_add: Number of references to add
2031 *
2032 * @extent_op: Pointer to a structure holding information necessary when
2033 * updating a tree block's flags
2034 *
2035 */
5d4f98a2 2036static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
c682f9b3 2037 struct btrfs_delayed_ref_node *node,
5d4f98a2
YZ
2038 u64 parent, u64 root_objectid,
2039 u64 owner, u64 offset, int refs_to_add,
2040 struct btrfs_delayed_extent_op *extent_op)
2041{
2042 struct btrfs_path *path;
2043 struct extent_buffer *leaf;
2044 struct btrfs_extent_item *item;
fcebe456 2045 struct btrfs_key key;
c682f9b3
QW
2046 u64 bytenr = node->bytenr;
2047 u64 num_bytes = node->num_bytes;
5d4f98a2
YZ
2048 u64 refs;
2049 int ret;
5d4f98a2
YZ
2050
2051 path = btrfs_alloc_path();
2052 if (!path)
2053 return -ENOMEM;
2054
e4058b54 2055 path->reada = READA_FORWARD;
5d4f98a2
YZ
2056 path->leave_spinning = 1;
2057 /* this will setup the path even if it fails to insert the back ref */
a639cdeb
NB
2058 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2059 parent, root_objectid, owner,
2060 offset, refs_to_add, extent_op);
0ed4792a 2061 if ((ret < 0 && ret != -EAGAIN) || !ret)
5d4f98a2 2062 goto out;
fcebe456
JB
2063
2064 /*
2065 * Ok we had -EAGAIN which means we didn't have space to insert an
2066 * inline extent ref, so just update the reference count and add a
2067 * normal backref.
2068 */
5d4f98a2 2069 leaf = path->nodes[0];
fcebe456 2070 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5d4f98a2
YZ
2071 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2072 refs = btrfs_extent_refs(leaf, item);
2073 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2074 if (extent_op)
2075 __run_delayed_extent_op(extent_op, leaf, item);
56bec294 2076
5d4f98a2 2077 btrfs_mark_buffer_dirty(leaf);
b3b4aa74 2078 btrfs_release_path(path);
56bec294 2079
e4058b54 2080 path->reada = READA_FORWARD;
b9473439 2081 path->leave_spinning = 1;
56bec294 2082 /* now insert the actual backref */
37593410
NB
2083 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2084 owner, offset, refs_to_add);
79787eaa 2085 if (ret)
66642832 2086 btrfs_abort_transaction(trans, ret);
5d4f98a2 2087out:
56bec294 2088 btrfs_free_path(path);
30d133fc 2089 return ret;
56bec294
CM
2090}
2091
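/*
 * Illustrative sketch (not from this file) of the fallback pattern used by
 * __btrfs_inc_extent_ref() above: try the compact inline form first, and
 * only when there is no room for it (-EAGAIN) bump the count on the extent
 * item and record the reference as a separate keyed backref. The callback
 * names are hypothetical stand-ins.
 */
#include <errno.h>

static int example_add_backref(int (*try_inline)(void), int (*add_keyed)(void))
{
	int ret = try_inline();

	if (ret != -EAGAIN)
		return ret;	/* inserted inline, or a hard error */
	/* no space inline: fall back to a standalone backref item */
	return add_keyed();
}
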
5d4f98a2 2092static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
2093 struct btrfs_delayed_ref_node *node,
2094 struct btrfs_delayed_extent_op *extent_op,
2095 int insert_reserved)
56bec294 2096{
5d4f98a2
YZ
2097 int ret = 0;
2098 struct btrfs_delayed_data_ref *ref;
2099 struct btrfs_key ins;
2100 u64 parent = 0;
2101 u64 ref_root = 0;
2102 u64 flags = 0;
2103
2104 ins.objectid = node->bytenr;
2105 ins.offset = node->num_bytes;
2106 ins.type = BTRFS_EXTENT_ITEM_KEY;
2107
2108 ref = btrfs_delayed_node_to_data_ref(node);
2bf98ef3 2109 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
599c75ec 2110
5d4f98a2
YZ
2111 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2112 parent = ref->parent;
fcebe456 2113 ref_root = ref->root;
5d4f98a2
YZ
2114
2115 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 2116 if (extent_op)
5d4f98a2 2117 flags |= extent_op->flags_to_set;
ef89b824
NB
2118 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2119 flags, ref->objectid,
2120 ref->offset, &ins,
2121 node->ref_mod);
5d4f98a2 2122 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2590d0f1
NB
2123 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2124 ref->objectid, ref->offset,
2125 node->ref_mod, extent_op);
5d4f98a2 2126 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
e72cb923 2127 ret = __btrfs_free_extent(trans, node, parent,
5d4f98a2
YZ
2128 ref_root, ref->objectid,
2129 ref->offset, node->ref_mod,
c682f9b3 2130 extent_op);
5d4f98a2
YZ
2131 } else {
2132 BUG();
2133 }
2134 return ret;
2135}
2136
2137static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2138 struct extent_buffer *leaf,
2139 struct btrfs_extent_item *ei)
2140{
2141 u64 flags = btrfs_extent_flags(leaf, ei);
2142 if (extent_op->update_flags) {
2143 flags |= extent_op->flags_to_set;
2144 btrfs_set_extent_flags(leaf, ei, flags);
2145 }
2146
2147 if (extent_op->update_key) {
2148 struct btrfs_tree_block_info *bi;
2149 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2150 bi = (struct btrfs_tree_block_info *)(ei + 1);
2151 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2152 }
2153}
2154
2155static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
d278850e 2156 struct btrfs_delayed_ref_head *head,
5d4f98a2
YZ
2157 struct btrfs_delayed_extent_op *extent_op)
2158{
20b9a2d6 2159 struct btrfs_fs_info *fs_info = trans->fs_info;
5d4f98a2
YZ
2160 struct btrfs_key key;
2161 struct btrfs_path *path;
2162 struct btrfs_extent_item *ei;
2163 struct extent_buffer *leaf;
2164 u32 item_size;
56bec294 2165 int ret;
5d4f98a2 2166 int err = 0;
b1c79e09 2167 int metadata = !extent_op->is_data;
5d4f98a2 2168
79787eaa
JM
2169 if (trans->aborted)
2170 return 0;
2171
0b246afa 2172 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3173a18f
JB
2173 metadata = 0;
2174
5d4f98a2
YZ
2175 path = btrfs_alloc_path();
2176 if (!path)
2177 return -ENOMEM;
2178
d278850e 2179 key.objectid = head->bytenr;
5d4f98a2 2180
3173a18f 2181 if (metadata) {
3173a18f 2182 key.type = BTRFS_METADATA_ITEM_KEY;
b1c79e09 2183 key.offset = extent_op->level;
3173a18f
JB
2184 } else {
2185 key.type = BTRFS_EXTENT_ITEM_KEY;
d278850e 2186 key.offset = head->num_bytes;
3173a18f
JB
2187 }
2188
2189again:
e4058b54 2190 path->reada = READA_FORWARD;
5d4f98a2 2191 path->leave_spinning = 1;
0b246afa 2192 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
5d4f98a2
YZ
2193 if (ret < 0) {
2194 err = ret;
2195 goto out;
2196 }
2197 if (ret > 0) {
3173a18f 2198 if (metadata) {
55994887
FDBM
2199 if (path->slots[0] > 0) {
2200 path->slots[0]--;
2201 btrfs_item_key_to_cpu(path->nodes[0], &key,
2202 path->slots[0]);
d278850e 2203 if (key.objectid == head->bytenr &&
55994887 2204 key.type == BTRFS_EXTENT_ITEM_KEY &&
d278850e 2205 key.offset == head->num_bytes)
55994887
FDBM
2206 ret = 0;
2207 }
2208 if (ret > 0) {
2209 btrfs_release_path(path);
2210 metadata = 0;
3173a18f 2211
d278850e
JB
2212 key.objectid = head->bytenr;
2213 key.offset = head->num_bytes;
55994887
FDBM
2214 key.type = BTRFS_EXTENT_ITEM_KEY;
2215 goto again;
2216 }
2217 } else {
2218 err = -EIO;
2219 goto out;
3173a18f 2220 }
5d4f98a2
YZ
2221 }
2222
2223 leaf = path->nodes[0];
2224 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ba3c2b19 2225
6d8ff4e4 2226 if (unlikely(item_size < sizeof(*ei))) {
ba3c2b19
NB
2227 err = -EINVAL;
2228 btrfs_print_v0_err(fs_info);
2229 btrfs_abort_transaction(trans, err);
2230 goto out;
2231 }
2232
5d4f98a2
YZ
2233 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2234 __run_delayed_extent_op(extent_op, leaf, ei);
56bec294 2235
5d4f98a2
YZ
2236 btrfs_mark_buffer_dirty(leaf);
2237out:
2238 btrfs_free_path(path);
2239 return err;
56bec294
CM
2240}
2241
5d4f98a2 2242static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
2243 struct btrfs_delayed_ref_node *node,
2244 struct btrfs_delayed_extent_op *extent_op,
2245 int insert_reserved)
56bec294
CM
2246{
2247 int ret = 0;
5d4f98a2 2248 struct btrfs_delayed_tree_ref *ref;
5d4f98a2
YZ
2249 u64 parent = 0;
2250 u64 ref_root = 0;
56bec294 2251
5d4f98a2 2252 ref = btrfs_delayed_node_to_tree_ref(node);
f97806f2 2253 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
599c75ec 2254
5d4f98a2
YZ
2255 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2256 parent = ref->parent;
fcebe456 2257 ref_root = ref->root;
5d4f98a2 2258
02794222 2259 if (node->ref_mod != 1) {
f97806f2 2260 btrfs_err(trans->fs_info,
02794222
LB
2261 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2262 node->bytenr, node->ref_mod, node->action, ref_root,
2263 parent);
2264 return -EIO;
2265 }
5d4f98a2 2266 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 2267 BUG_ON(!extent_op || !extent_op->update_flags);
21ebfbe7 2268 ret = alloc_reserved_tree_block(trans, node, extent_op);
5d4f98a2 2269 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2590d0f1
NB
2270 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2271 ref->level, 0, 1, extent_op);
5d4f98a2 2272 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
e72cb923 2273 ret = __btrfs_free_extent(trans, node, parent, ref_root,
c682f9b3 2274 ref->level, 0, 1, extent_op);
5d4f98a2
YZ
2275 } else {
2276 BUG();
2277 }
56bec294
CM
2278 return ret;
2279}
2280
2281/* helper function to actually process a single delayed ref entry */
5d4f98a2 2282static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
2283 struct btrfs_delayed_ref_node *node,
2284 struct btrfs_delayed_extent_op *extent_op,
2285 int insert_reserved)
56bec294 2286{
79787eaa
JM
2287 int ret = 0;
2288
857cc2fc
JB
2289 if (trans->aborted) {
2290 if (insert_reserved)
5fac7f9e 2291 btrfs_pin_extent(trans->fs_info, node->bytenr,
857cc2fc 2292 node->num_bytes, 1);
79787eaa 2293 return 0;
857cc2fc 2294 }
79787eaa 2295
5d4f98a2
YZ
2296 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2297 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
f97806f2 2298 ret = run_delayed_tree_ref(trans, node, extent_op,
5d4f98a2
YZ
2299 insert_reserved);
2300 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2301 node->type == BTRFS_SHARED_DATA_REF_KEY)
2bf98ef3 2302 ret = run_delayed_data_ref(trans, node, extent_op,
5d4f98a2
YZ
2303 insert_reserved);
2304 else
2305 BUG();
80ee54bf
JB
2306 if (ret && insert_reserved)
2307 btrfs_pin_extent(trans->fs_info, node->bytenr,
2308 node->num_bytes, 1);
5d4f98a2 2309 return ret;
56bec294
CM
2310}
2311
c6fc2454 2312static inline struct btrfs_delayed_ref_node *
56bec294
CM
2313select_delayed_ref(struct btrfs_delayed_ref_head *head)
2314{
cffc3374
FM
2315 struct btrfs_delayed_ref_node *ref;
2316
e3d03965 2317 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
c6fc2454 2318 return NULL;
d7df2c79 2319
cffc3374
FM
2320 /*
2321 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2322 * This is to prevent a ref count from going down to zero, which deletes
2323 * the extent item from the extent tree, when there still are references
2324 * to add, which would fail because they would not find the extent item.
2325 */
1d57ee94
WX
2326 if (!list_empty(&head->ref_add_list))
2327 return list_first_entry(&head->ref_add_list,
2328 struct btrfs_delayed_ref_node, add_list);
2329
e3d03965 2330 ref = rb_entry(rb_first_cached(&head->ref_tree),
0e0adbcf 2331 struct btrfs_delayed_ref_node, ref_node);
1d57ee94
WX
2332 ASSERT(list_empty(&ref->add_list));
2333 return ref;
56bec294
CM
2334}
2335
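/*
 * Illustrative sketch (not from this file) of the selection policy in
 * select_delayed_ref() above: queued additions are handed out before
 * anything else so a pending drop can never take the extent's refcount
 * through zero while more references still wait to be inserted. The types
 * below are stand-ins, not the kernel structures.
 */
struct example_delayed_ref {
	int action;	/* +1 for an add, -1 for a drop */
};

static const struct example_delayed_ref *example_select(
		const struct example_delayed_ref *adds, int nr_adds,
		const struct example_delayed_ref *others, int nr_others)
{
	if (nr_adds > 0)
		return &adds[0];	/* prefer BTRFS_ADD_DELAYED_REF */
	if (nr_others > 0)
		return &others[0];	/* else first ref in tree order */
	return (const struct example_delayed_ref *)0;
}
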
2eadaa22
JB
2336static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2337 struct btrfs_delayed_ref_head *head)
2338{
2339 spin_lock(&delayed_refs->lock);
2340 head->processing = 0;
2341 delayed_refs->num_heads_ready++;
2342 spin_unlock(&delayed_refs->lock);
2343 btrfs_delayed_ref_unlock(head);
2344}
2345
bedc6617
JB
2346static struct btrfs_delayed_extent_op *cleanup_extent_op(
2347 struct btrfs_delayed_ref_head *head)
b00e6250
JB
2348{
2349 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
b00e6250
JB
2350
2351 if (!extent_op)
bedc6617
JB
2352 return NULL;
2353
b00e6250 2354 if (head->must_insert_reserved) {
bedc6617 2355 head->extent_op = NULL;
b00e6250 2356 btrfs_free_delayed_extent_op(extent_op);
bedc6617 2357 return NULL;
b00e6250 2358 }
bedc6617
JB
2359 return extent_op;
2360}
2361
2362static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2363 struct btrfs_delayed_ref_head *head)
2364{
2365 struct btrfs_delayed_extent_op *extent_op;
2366 int ret;
2367
2368 extent_op = cleanup_extent_op(head);
2369 if (!extent_op)
2370 return 0;
2371 head->extent_op = NULL;
b00e6250 2372 spin_unlock(&head->lock);
20b9a2d6 2373 ret = run_delayed_extent_op(trans, head, extent_op);
b00e6250
JB
2374 btrfs_free_delayed_extent_op(extent_op);
2375 return ret ? ret : 1;
2376}
2377
31890da0
JB
2378void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2379 struct btrfs_delayed_ref_root *delayed_refs,
2380 struct btrfs_delayed_ref_head *head)
07c47775 2381{
ba2c4d4e 2382 int nr_items = 1; /* Dropping this ref head update. */
07c47775
JB
2383
2384 if (head->total_ref_mod < 0) {
2385 struct btrfs_space_info *space_info;
2386 u64 flags;
2387
2388 if (head->is_data)
2389 flags = BTRFS_BLOCK_GROUP_DATA;
2390 else if (head->is_system)
2391 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2392 else
2393 flags = BTRFS_BLOCK_GROUP_METADATA;
280c2908 2394 space_info = btrfs_find_space_info(fs_info, flags);
07c47775
JB
2395 ASSERT(space_info);
2396 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2397 -head->num_bytes,
2398 BTRFS_TOTAL_BYTES_PINNED_BATCH);
2399
ba2c4d4e
JB
2400 /*
2401 * We had csum deletions accounted for in our delayed refs rsv,
2402 * we need to drop the csum leaves for this update from our
2403 * delayed_refs_rsv.
2404 */
07c47775
JB
2405 if (head->is_data) {
2406 spin_lock(&delayed_refs->lock);
2407 delayed_refs->pending_csums -= head->num_bytes;
2408 spin_unlock(&delayed_refs->lock);
ba2c4d4e
JB
2409 nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2410 head->num_bytes);
07c47775
JB
2411 }
2412 }
2413
ba2c4d4e 2414 btrfs_delayed_refs_rsv_release(fs_info, nr_items);
07c47775
JB
2415}
2416
194ab0bc 2417static int cleanup_ref_head(struct btrfs_trans_handle *trans,
194ab0bc
JB
2418 struct btrfs_delayed_ref_head *head)
2419{
f9871edd
NB
2420
2421 struct btrfs_fs_info *fs_info = trans->fs_info;
194ab0bc
JB
2422 struct btrfs_delayed_ref_root *delayed_refs;
2423 int ret;
2424
2425 delayed_refs = &trans->transaction->delayed_refs;
2426
bedc6617 2427 ret = run_and_cleanup_extent_op(trans, head);
194ab0bc
JB
2428 if (ret < 0) {
2429 unselect_delayed_ref_head(delayed_refs, head);
2430 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2431 return ret;
2432 } else if (ret) {
2433 return ret;
2434 }
2435
2436 /*
2437 * Need to drop our head ref lock and re-acquire the delayed ref lock
2438 * and then re-check to make sure nobody got added.
2439 */
2440 spin_unlock(&head->lock);
2441 spin_lock(&delayed_refs->lock);
2442 spin_lock(&head->lock);
e3d03965 2443 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
194ab0bc
JB
2444 spin_unlock(&head->lock);
2445 spin_unlock(&delayed_refs->lock);
2446 return 1;
2447 }
d7baffda 2448 btrfs_delete_ref_head(delayed_refs, head);
c1103f7a 2449 spin_unlock(&head->lock);
1e7a1421 2450 spin_unlock(&delayed_refs->lock);
c1103f7a 2451
c1103f7a 2452 if (head->must_insert_reserved) {
d278850e
JB
2453 btrfs_pin_extent(fs_info, head->bytenr,
2454 head->num_bytes, 1);
c1103f7a 2455 if (head->is_data) {
d278850e
JB
2456 ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2457 head->num_bytes);
c1103f7a
JB
2458 }
2459 }
2460
31890da0 2461 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
07c47775
JB
2462
2463 trace_run_delayed_ref_head(fs_info, head, 0);
c1103f7a 2464 btrfs_delayed_ref_unlock(head);
d278850e 2465 btrfs_put_delayed_ref_head(head);
194ab0bc
JB
2466 return 0;
2467}
2468
b1cdbcb5
NB
2469static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2470 struct btrfs_trans_handle *trans)
2471{
2472 struct btrfs_delayed_ref_root *delayed_refs =
2473 &trans->transaction->delayed_refs;
2474 struct btrfs_delayed_ref_head *head = NULL;
2475 int ret;
2476
2477 spin_lock(&delayed_refs->lock);
5637c74b 2478 head = btrfs_select_ref_head(delayed_refs);
b1cdbcb5
NB
2479 if (!head) {
2480 spin_unlock(&delayed_refs->lock);
2481 return head;
2482 }
2483
2484 /*
2485 * Grab the lock that says we are going to process all the refs for
2486 * this head
2487 */
9e920a6f 2488 ret = btrfs_delayed_ref_lock(delayed_refs, head);
b1cdbcb5
NB
2489 spin_unlock(&delayed_refs->lock);
2490
2491 /*
2492 * We may have dropped the spin lock to get the head mutex lock, and
2493 * that might have given someone else time to free the head. If that's
2494 * true, it has been removed from our list and we can move on.
2495 */
2496 if (ret == -EAGAIN)
2497 head = ERR_PTR(-EAGAIN);
2498
2499 return head;
2500}
2501
e7261386
NB
2502static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2503 struct btrfs_delayed_ref_head *locked_ref,
2504 unsigned long *run_refs)
2505{
2506 struct btrfs_fs_info *fs_info = trans->fs_info;
2507 struct btrfs_delayed_ref_root *delayed_refs;
2508 struct btrfs_delayed_extent_op *extent_op;
2509 struct btrfs_delayed_ref_node *ref;
2510 int must_insert_reserved = 0;
2511 int ret;
2512
2513 delayed_refs = &trans->transaction->delayed_refs;
2514
0110a4c4
NB
2515 lockdep_assert_held(&locked_ref->mutex);
2516 lockdep_assert_held(&locked_ref->lock);
2517
e7261386
NB
2518 while ((ref = select_delayed_ref(locked_ref))) {
2519 if (ref->seq &&
2520 btrfs_check_delayed_seq(fs_info, ref->seq)) {
2521 spin_unlock(&locked_ref->lock);
2522 unselect_delayed_ref_head(delayed_refs, locked_ref);
2523 return -EAGAIN;
2524 }
2525
2526 (*run_refs)++;
2527 ref->in_tree = 0;
2528 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2529 RB_CLEAR_NODE(&ref->ref_node);
2530 if (!list_empty(&ref->add_list))
2531 list_del(&ref->add_list);
2532 /*
2533 * When we play the delayed ref, also correct the ref_mod on
2534 * head
2535 */
2536 switch (ref->action) {
2537 case BTRFS_ADD_DELAYED_REF:
2538 case BTRFS_ADD_DELAYED_EXTENT:
2539 locked_ref->ref_mod -= ref->ref_mod;
2540 break;
2541 case BTRFS_DROP_DELAYED_REF:
2542 locked_ref->ref_mod += ref->ref_mod;
2543 break;
2544 default:
2545 WARN_ON(1);
2546 }
2547 atomic_dec(&delayed_refs->num_entries);
2548
2549 /*
2550 * Record the must_insert_reserved flag before we drop the
2551 * spin lock.
2552 */
2553 must_insert_reserved = locked_ref->must_insert_reserved;
2554 locked_ref->must_insert_reserved = 0;
2555
2556 extent_op = locked_ref->extent_op;
2557 locked_ref->extent_op = NULL;
2558 spin_unlock(&locked_ref->lock);
2559
2560 ret = run_one_delayed_ref(trans, ref, extent_op,
2561 must_insert_reserved);
2562
2563 btrfs_free_delayed_extent_op(extent_op);
2564 if (ret) {
2565 unselect_delayed_ref_head(delayed_refs, locked_ref);
2566 btrfs_put_delayed_ref(ref);
2567 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2568 ret);
2569 return ret;
2570 }
2571
2572 btrfs_put_delayed_ref(ref);
2573 cond_resched();
2574
2575 spin_lock(&locked_ref->lock);
2576 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2577 }
2578
2579 return 0;
2580}
2581
79787eaa
JM
2582/*
2583 * Returns 0 on success or if called with an already aborted transaction.
2584 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2585 */
d7df2c79 2586static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
d7df2c79 2587 unsigned long nr)
56bec294 2588{
0a1e458a 2589 struct btrfs_fs_info *fs_info = trans->fs_info;
56bec294 2590 struct btrfs_delayed_ref_root *delayed_refs;
56bec294 2591 struct btrfs_delayed_ref_head *locked_ref = NULL;
0a2b2a84 2592 ktime_t start = ktime_get();
56bec294 2593 int ret;
d7df2c79 2594 unsigned long count = 0;
0a2b2a84 2595 unsigned long actual_count = 0;
56bec294
CM
2596
2597 delayed_refs = &trans->transaction->delayed_refs;
0110a4c4 2598 do {
56bec294 2599 if (!locked_ref) {
b1cdbcb5 2600 locked_ref = btrfs_obtain_ref_head(trans);
0110a4c4
NB
2601 if (IS_ERR_OR_NULL(locked_ref)) {
2602 if (PTR_ERR(locked_ref) == -EAGAIN) {
2603 continue;
2604 } else {
2605 break;
2606 }
56bec294 2607 }
0110a4c4 2608 count++;
56bec294 2609 }
2c3cf7d5
FM
2610 /*
2611 * We need to try and merge add/drops of the same ref since we
2612 * can run into issues with relocate dropping the implicit ref
2613 * and then it being added back again before the drop can
2614 * finish. If we merged anything we need to re-loop so we can
2615 * get a good ref.
2616 * Or we can get node references of the same type that weren't
2617 * merged when created due to bumps in the tree mod seq, and
2618 * we need to merge them to prevent adding an inline extent
2619 * backref before dropping it (triggering a BUG_ON at
2620 * insert_inline_extent_backref()).
2621 */
d7df2c79 2622 spin_lock(&locked_ref->lock);
be97f133 2623 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
ae1e206b 2624
0110a4c4
NB
2625 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2626 &actual_count);
2627 if (ret < 0 && ret != -EAGAIN) {
2628 /*
2629 * Error, btrfs_run_delayed_refs_for_head already
2630 * unlocked everything so just bail out
2631 */
2632 return ret;
2633 } else if (!ret) {
2634 /*
2635 * Success, perform the usual cleanup of a processed
2636 * head
2637 */
f9871edd 2638 ret = cleanup_ref_head(trans, locked_ref);
194ab0bc 2639 if (ret > 0) {
b00e6250
JB
2640 /* We dropped our lock, we need to loop. */
2641 ret = 0;
d7df2c79 2642 continue;
194ab0bc
JB
2643 } else if (ret) {
2644 return ret;
5d4f98a2 2645 }
22cd2e7d 2646 }
1ce7a5ec 2647
b00e6250 2648 /*
0110a4c4
NB
2649 * Either success case or btrfs_run_delayed_refs_for_head
2650 * returned -EAGAIN, meaning we need to select another head
b00e6250 2651 */
b00e6250 2652
0110a4c4 2653 locked_ref = NULL;
c3e69d58 2654 cond_resched();
0110a4c4 2655 } while ((nr != -1 && count < nr) || locked_ref);
0a2b2a84
JB
2656
2657 /*
2658 * We don't want to include ref heads since we can have empty ref heads
2659 * and those will drastically skew our runtime down since we just do
2660 * accounting, no actual extent tree updates.
2661 */
2662 if (actual_count > 0) {
2663 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2664 u64 avg;
2665
2666 /*
2667 * We weigh the current average higher than our current runtime
2668 * to avoid large swings in the average.
2669 */
2670 spin_lock(&delayed_refs->lock);
2671 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
f8c269d7 2672 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
0a2b2a84
JB
2673 spin_unlock(&delayed_refs->lock);
2674 }
d7df2c79 2675 return 0;
c3e69d58
CM
2676}
2677
709c0486
AJ
2678#ifdef SCRAMBLE_DELAYED_REFS
2679/*
2680 * Normally delayed refs get processed in ascending bytenr order. This
2681 * correlates in most cases to the order added. To expose dependencies on this
2682 * order, we start to process the tree in the middle instead of the beginning
2683 */
2684static u64 find_middle(struct rb_root *root)
2685{
2686 struct rb_node *n = root->rb_node;
2687 struct btrfs_delayed_ref_node *entry;
2688 int alt = 1;
2689 u64 middle;
2690 u64 first = 0, last = 0;
2691
2692 n = rb_first(root);
2693 if (n) {
2694 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2695 first = entry->bytenr;
2696 }
2697 n = rb_last(root);
2698 if (n) {
2699 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2700 last = entry->bytenr;
2701 }
2702 n = root->rb_node;
2703
2704 while (n) {
2705 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2706 WARN_ON(!entry->in_tree);
2707
2708 middle = entry->bytenr;
2709
2710 if (alt)
2711 n = n->rb_left;
2712 else
2713 n = n->rb_right;
2714
2715 alt = 1 - alt;
2716 }
2717 return middle;
2718}
2719#endif
2720
2ff7e61e 2721static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
1be41b78
JB
2722{
2723 u64 num_bytes;
2724
2725 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2726 sizeof(struct btrfs_extent_inline_ref));
0b246afa 2727 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1be41b78
JB
2728 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2729
2730 /*
2731 * We don't ever fill up leaves all the way so multiply by 2 just to be
01327610 2732 * closer to what we're really going to want to use.
1be41b78 2733 */
0b246afa 2734 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
1be41b78
JB
2735}
2736
1262133b
JB
2737/*
2738 * Takes the number of bytes to be checksummed and figures out how many leaves it
2739 * would require to store the csums for that many bytes.
2740 */
2ff7e61e 2741u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
1262133b
JB
2742{
2743 u64 csum_size;
2744 u64 num_csums_per_leaf;
2745 u64 num_csums;
2746
0b246afa 2747 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
1262133b 2748 num_csums_per_leaf = div64_u64(csum_size,
0b246afa
JM
2749 (u64)btrfs_super_csum_size(fs_info->super_copy));
2750 num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
1262133b
JB
2751 num_csums += num_csums_per_leaf - 1;
2752 num_csums = div64_u64(num_csums, num_csums_per_leaf);
2753 return num_csums;
2754}
2755
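/*
 * Worked example of the rounding above (illustrative assumptions only:
 * 4 byte crc32c csums, 4KiB sectors, roughly 16KiB of usable item space per
 * leaf). 1GiB of data needs 262144 csums; at ~4070 csums per leaf that
 * rounds up to 65 leaves. A standalone userspace sketch of the same math:
 */
#include <stdint.h>

static uint64_t example_csum_bytes_to_leaves(uint64_t csum_bytes)
{
	const uint64_t item_space = 16283;	/* assumed max item size per leaf */
	const uint64_t csum_size = 4;		/* assumed crc32c checksum size */
	const uint64_t sectorsize = 4096;	/* assumed sector size */
	uint64_t per_leaf = item_space / csum_size;	/* ~4070 csums per leaf */
	uint64_t num_csums = csum_bytes / sectorsize;

	/* round up to whole leaves, mirroring the div64_u64 math above */
	return (num_csums + per_leaf - 1) / per_leaf;
}
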
c3e69d58
CM
2756/*
2757 * this starts processing the delayed reference count updates and
2758 * extent insertions we have queued up so far. count can be
2759 * 0, which means to process everything in the tree at the start
2760 * of the run (but not newly added entries), or it can be some target
2761 * number you'd like to process.
79787eaa
JM
2762 *
2763 * Returns 0 on success or if called with an aborted transaction
2764 * Returns <0 on error and aborts the transaction
c3e69d58
CM
2765 */
2766int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
c79a70b1 2767 unsigned long count)
c3e69d58 2768{
c79a70b1 2769 struct btrfs_fs_info *fs_info = trans->fs_info;
c3e69d58
CM
2770 struct rb_node *node;
2771 struct btrfs_delayed_ref_root *delayed_refs;
c46effa6 2772 struct btrfs_delayed_ref_head *head;
c3e69d58
CM
2773 int ret;
2774 int run_all = count == (unsigned long)-1;
c3e69d58 2775
79787eaa
JM
2776 /* We'll clean this up in btrfs_cleanup_transaction */
2777 if (trans->aborted)
2778 return 0;
2779
0b246afa 2780 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
511711af
CM
2781 return 0;
2782
c3e69d58 2783 delayed_refs = &trans->transaction->delayed_refs;
26455d33 2784 if (count == 0)
d7df2c79 2785 count = atomic_read(&delayed_refs->num_entries) * 2;
bb721703 2786
c3e69d58 2787again:
709c0486
AJ
2788#ifdef SCRAMBLE_DELAYED_REFS
2789 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2790#endif
0a1e458a 2791 ret = __btrfs_run_delayed_refs(trans, count);
d7df2c79 2792 if (ret < 0) {
66642832 2793 btrfs_abort_transaction(trans, ret);
d7df2c79 2794 return ret;
eb099670 2795 }
c3e69d58 2796
56bec294 2797 if (run_all) {
119e80df 2798 btrfs_create_pending_block_groups(trans);
ea658bad 2799
d7df2c79 2800 spin_lock(&delayed_refs->lock);
5c9d028b 2801 node = rb_first_cached(&delayed_refs->href_root);
d7df2c79
JB
2802 if (!node) {
2803 spin_unlock(&delayed_refs->lock);
56bec294 2804 goto out;
d7df2c79 2805 }
d278850e
JB
2806 head = rb_entry(node, struct btrfs_delayed_ref_head,
2807 href_node);
2808 refcount_inc(&head->refs);
2809 spin_unlock(&delayed_refs->lock);
e9d0b13b 2810
d278850e
JB
2811 /* Mutex was contended, block until it's released and retry. */
2812 mutex_lock(&head->mutex);
2813 mutex_unlock(&head->mutex);
56bec294 2814
d278850e 2815 btrfs_put_delayed_ref_head(head);
d7df2c79 2816 cond_resched();
56bec294 2817 goto again;
5f39d397 2818 }
54aa1f4d 2819out:
a28ec197
CM
2820 return 0;
2821}
2822
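/*
 * Illustrative usage note (the real callers live elsewhere in the tree; the
 * calls below only demonstrate the count semantics documented above, they
 * are not code from this file):
 *
 *	btrfs_run_delayed_refs(trans, 0);	// what was queued at the start
 *	btrfs_run_delayed_refs(trans, 32);	// process roughly 32 heads
 *	btrfs_run_delayed_refs(trans, (unsigned long)-1);  // drain everything
 */
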
5d4f98a2 2823int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
5d4f98a2 2824 u64 bytenr, u64 num_bytes, u64 flags,
b1c79e09 2825 int level, int is_data)
5d4f98a2
YZ
2826{
2827 struct btrfs_delayed_extent_op *extent_op;
2828 int ret;
2829
78a6184a 2830 extent_op = btrfs_alloc_delayed_extent_op();
5d4f98a2
YZ
2831 if (!extent_op)
2832 return -ENOMEM;
2833
2834 extent_op->flags_to_set = flags;
35b3ad50
DS
2835 extent_op->update_flags = true;
2836 extent_op->update_key = false;
2837 extent_op->is_data = is_data ? true : false;
b1c79e09 2838 extent_op->level = level;
5d4f98a2 2839
c6e340bc 2840 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
5d4f98a2 2841 if (ret)
78a6184a 2842 btrfs_free_delayed_extent_op(extent_op);
5d4f98a2
YZ
2843 return ret;
2844}
2845
e4c3b2dc 2846static noinline int check_delayed_ref(struct btrfs_root *root,
5d4f98a2
YZ
2847 struct btrfs_path *path,
2848 u64 objectid, u64 offset, u64 bytenr)
2849{
2850 struct btrfs_delayed_ref_head *head;
2851 struct btrfs_delayed_ref_node *ref;
2852 struct btrfs_delayed_data_ref *data_ref;
2853 struct btrfs_delayed_ref_root *delayed_refs;
e4c3b2dc 2854 struct btrfs_transaction *cur_trans;
0e0adbcf 2855 struct rb_node *node;
5d4f98a2
YZ
2856 int ret = 0;
2857
998ac6d2 2858 spin_lock(&root->fs_info->trans_lock);
e4c3b2dc 2859 cur_trans = root->fs_info->running_transaction;
998ac6d2 2860 if (cur_trans)
2861 refcount_inc(&cur_trans->use_count);
2862 spin_unlock(&root->fs_info->trans_lock);
e4c3b2dc
LB
2863 if (!cur_trans)
2864 return 0;
2865
2866 delayed_refs = &cur_trans->delayed_refs;
5d4f98a2 2867 spin_lock(&delayed_refs->lock);
f72ad18e 2868 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
d7df2c79
JB
2869 if (!head) {
2870 spin_unlock(&delayed_refs->lock);
998ac6d2 2871 btrfs_put_transaction(cur_trans);
d7df2c79
JB
2872 return 0;
2873 }
5d4f98a2
YZ
2874
2875 if (!mutex_trylock(&head->mutex)) {
d278850e 2876 refcount_inc(&head->refs);
5d4f98a2
YZ
2877 spin_unlock(&delayed_refs->lock);
2878
b3b4aa74 2879 btrfs_release_path(path);
5d4f98a2 2880
8cc33e5c
DS
2881 /*
2882 * Mutex was contended, block until it's released and let
2883 * caller try again
2884 */
5d4f98a2
YZ
2885 mutex_lock(&head->mutex);
2886 mutex_unlock(&head->mutex);
d278850e 2887 btrfs_put_delayed_ref_head(head);
998ac6d2 2888 btrfs_put_transaction(cur_trans);
5d4f98a2
YZ
2889 return -EAGAIN;
2890 }
d7df2c79 2891 spin_unlock(&delayed_refs->lock);
5d4f98a2 2892
d7df2c79 2893 spin_lock(&head->lock);
0e0adbcf
JB
2894 /*
2895 * XXX: We should replace this with a proper search function in the
2896 * future.
2897 */
e3d03965
LB
2898 for (node = rb_first_cached(&head->ref_tree); node;
2899 node = rb_next(node)) {
0e0adbcf 2900 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
d7df2c79
JB
2901 /* If it's a shared ref we know a cross reference exists */
2902 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2903 ret = 1;
2904 break;
2905 }
5d4f98a2 2906
d7df2c79 2907 data_ref = btrfs_delayed_node_to_data_ref(ref);
5d4f98a2 2908
d7df2c79
JB
2909 /*
2910 * If our ref doesn't match the one we're currently looking at
2911 * then we have a cross reference.
2912 */
2913 if (data_ref->root != root->root_key.objectid ||
2914 data_ref->objectid != objectid ||
2915 data_ref->offset != offset) {
2916 ret = 1;
2917 break;
2918 }
5d4f98a2 2919 }
d7df2c79 2920 spin_unlock(&head->lock);
5d4f98a2 2921 mutex_unlock(&head->mutex);
998ac6d2 2922 btrfs_put_transaction(cur_trans);
5d4f98a2
YZ
2923 return ret;
2924}
2925
e4c3b2dc 2926static noinline int check_committed_ref(struct btrfs_root *root,
5d4f98a2
YZ
2927 struct btrfs_path *path,
2928 u64 objectid, u64 offset, u64 bytenr)
be20aa9d 2929{
0b246afa
JM
2930 struct btrfs_fs_info *fs_info = root->fs_info;
2931 struct btrfs_root *extent_root = fs_info->extent_root;
f321e491 2932 struct extent_buffer *leaf;
5d4f98a2
YZ
2933 struct btrfs_extent_data_ref *ref;
2934 struct btrfs_extent_inline_ref *iref;
2935 struct btrfs_extent_item *ei;
f321e491 2936 struct btrfs_key key;
5d4f98a2 2937 u32 item_size;
3de28d57 2938 int type;
be20aa9d 2939 int ret;
925baedd 2940
be20aa9d 2941 key.objectid = bytenr;
31840ae1 2942 key.offset = (u64)-1;
f321e491 2943 key.type = BTRFS_EXTENT_ITEM_KEY;
be20aa9d 2944
be20aa9d
CM
2945 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2946 if (ret < 0)
2947 goto out;
79787eaa 2948 BUG_ON(ret == 0); /* Corruption */
80ff3856
YZ
2949
2950 ret = -ENOENT;
2951 if (path->slots[0] == 0)
31840ae1 2952 goto out;
be20aa9d 2953
31840ae1 2954 path->slots[0]--;
f321e491 2955 leaf = path->nodes[0];
5d4f98a2 2956 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
be20aa9d 2957
5d4f98a2 2958 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
be20aa9d 2959 goto out;
f321e491 2960
5d4f98a2
YZ
2961 ret = 1;
2962 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
5d4f98a2 2963 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
bd09835d 2964
5d4f98a2
YZ
2965 if (item_size != sizeof(*ei) +
2966 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2967 goto out;
be20aa9d 2968
5d4f98a2
YZ
2969 if (btrfs_extent_generation(leaf, ei) <=
2970 btrfs_root_last_snapshot(&root->root_item))
2971 goto out;
2972
2973 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3de28d57
LB
2974
2975 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2976 if (type != BTRFS_EXTENT_DATA_REF_KEY)
5d4f98a2
YZ
2977 goto out;
2978
2979 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2980 if (btrfs_extent_refs(leaf, ei) !=
2981 btrfs_extent_data_ref_count(leaf, ref) ||
2982 btrfs_extent_data_ref_root(leaf, ref) !=
2983 root->root_key.objectid ||
2984 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2985 btrfs_extent_data_ref_offset(leaf, ref) != offset)
2986 goto out;
2987
2988 ret = 0;
2989out:
2990 return ret;
2991}
2992
e4c3b2dc
LB
2993int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2994 u64 bytenr)
5d4f98a2
YZ
2995{
2996 struct btrfs_path *path;
2997 int ret;
5d4f98a2
YZ
2998
2999 path = btrfs_alloc_path();
3000 if (!path)
9132c4ff 3001 return -ENOMEM;
5d4f98a2
YZ
3002
3003 do {
e4c3b2dc 3004 ret = check_committed_ref(root, path, objectid,
5d4f98a2
YZ
3005 offset, bytenr);
3006 if (ret && ret != -ENOENT)
f321e491 3007 goto out;
80ff3856 3008
380fd066
MT
3009 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3010 } while (ret == -EAGAIN);
5d4f98a2 3011
be20aa9d 3012out:
80ff3856 3013 btrfs_free_path(path);
f0486c68
YZ
3014 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3015 WARN_ON(ret > 0);
f321e491 3016 return ret;
be20aa9d 3017}
c5739bba 3018
5d4f98a2 3019static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
b7a9f29f 3020 struct btrfs_root *root,
5d4f98a2 3021 struct extent_buffer *buf,
e339a6b0 3022 int full_backref, int inc)
31840ae1 3023{
0b246afa 3024 struct btrfs_fs_info *fs_info = root->fs_info;
31840ae1 3025 u64 bytenr;
5d4f98a2
YZ
3026 u64 num_bytes;
3027 u64 parent;
31840ae1 3028 u64 ref_root;
31840ae1 3029 u32 nritems;
31840ae1
ZY
3030 struct btrfs_key key;
3031 struct btrfs_file_extent_item *fi;
82fa113f
QW
3032 struct btrfs_ref generic_ref = { 0 };
3033 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
31840ae1 3034 int i;
82fa113f 3035 int action;
31840ae1
ZY
3036 int level;
3037 int ret = 0;
fccb84c9 3038
0b246afa 3039 if (btrfs_is_testing(fs_info))
faa2dbf0 3040 return 0;
fccb84c9 3041
31840ae1 3042 ref_root = btrfs_header_owner(buf);
31840ae1
ZY
3043 nritems = btrfs_header_nritems(buf);
3044 level = btrfs_header_level(buf);
3045
27cdeb70 3046 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
5d4f98a2 3047 return 0;
31840ae1 3048
5d4f98a2
YZ
3049 if (full_backref)
3050 parent = buf->start;
3051 else
3052 parent = 0;
82fa113f
QW
3053 if (inc)
3054 action = BTRFS_ADD_DELAYED_REF;
3055 else
3056 action = BTRFS_DROP_DELAYED_REF;
5d4f98a2
YZ
3057
3058 for (i = 0; i < nritems; i++) {
31840ae1 3059 if (level == 0) {
5d4f98a2 3060 btrfs_item_key_to_cpu(buf, &key, i);
962a298f 3061 if (key.type != BTRFS_EXTENT_DATA_KEY)
31840ae1 3062 continue;
5d4f98a2 3063 fi = btrfs_item_ptr(buf, i,
31840ae1
ZY
3064 struct btrfs_file_extent_item);
3065 if (btrfs_file_extent_type(buf, fi) ==
3066 BTRFS_FILE_EXTENT_INLINE)
3067 continue;
3068 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3069 if (bytenr == 0)
3070 continue;
5d4f98a2
YZ
3071
3072 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3073 key.offset -= btrfs_file_extent_offset(buf, fi);
82fa113f
QW
3074 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3075 num_bytes, parent);
3076 generic_ref.real_root = root->root_key.objectid;
3077 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
3078 key.offset);
3079 generic_ref.skip_qgroup = for_reloc;
dd28b6a5 3080 if (inc)
82fa113f 3081 ret = btrfs_inc_extent_ref(trans, &generic_ref);
dd28b6a5 3082 else
ffd4bb2a 3083 ret = btrfs_free_extent(trans, &generic_ref);
31840ae1
ZY
3084 if (ret)
3085 goto fail;
3086 } else {
5d4f98a2 3087 bytenr = btrfs_node_blockptr(buf, i);
0b246afa 3088 num_bytes = fs_info->nodesize;
82fa113f
QW
3089 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3090 num_bytes, parent);
3091 generic_ref.real_root = root->root_key.objectid;
3092 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
3093 generic_ref.skip_qgroup = for_reloc;
dd28b6a5 3094 if (inc)
82fa113f 3095 ret = btrfs_inc_extent_ref(trans, &generic_ref);
dd28b6a5 3096 else
ffd4bb2a 3097 ret = btrfs_free_extent(trans, &generic_ref);
31840ae1
ZY
3098 if (ret)
3099 goto fail;
3100 }
3101 }
3102 return 0;
3103fail:
5d4f98a2
YZ
3104 return ret;
3105}
3106
3107int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 3108 struct extent_buffer *buf, int full_backref)
5d4f98a2 3109{
e339a6b0 3110 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
5d4f98a2
YZ
3111}
3112
3113int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 3114 struct extent_buffer *buf, int full_backref)
5d4f98a2 3115{
e339a6b0 3116 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
31840ae1
ZY
3117}
3118
9078a3e1 3119static int write_one_cache_group(struct btrfs_trans_handle *trans,
9078a3e1
CM
3120 struct btrfs_path *path,
3121 struct btrfs_block_group_cache *cache)
3122{
39db232d 3123 struct btrfs_fs_info *fs_info = trans->fs_info;
9078a3e1 3124 int ret;
0b246afa 3125 struct btrfs_root *extent_root = fs_info->extent_root;
5f39d397
CM
3126 unsigned long bi;
3127 struct extent_buffer *leaf;
9078a3e1 3128
9078a3e1 3129 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
df95e7f0
JB
3130 if (ret) {
3131 if (ret > 0)
3132 ret = -ENOENT;
54aa1f4d 3133 goto fail;
df95e7f0 3134 }
5f39d397
CM
3135
3136 leaf = path->nodes[0];
3137 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3138 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3139 btrfs_mark_buffer_dirty(leaf);
54aa1f4d 3140fail:
24b89d08 3141 btrfs_release_path(path);
df95e7f0 3142 return ret;
9078a3e1
CM
3143
3144}
3145
f87b7eb8
DS
3146static struct btrfs_block_group_cache *next_block_group(
3147 struct btrfs_block_group_cache *cache)
4a8c9a62 3148{
f87b7eb8 3149 struct btrfs_fs_info *fs_info = cache->fs_info;
4a8c9a62 3150 struct rb_node *node;
292cbd51 3151
0b246afa 3152 spin_lock(&fs_info->block_group_cache_lock);
292cbd51
FM
3153
3154 /* If our block group was removed, we need a full search. */
3155 if (RB_EMPTY_NODE(&cache->cache_node)) {
3156 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3157
0b246afa 3158 spin_unlock(&fs_info->block_group_cache_lock);
292cbd51 3159 btrfs_put_block_group(cache);
0b246afa 3160 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
 return cache;
292cbd51 3161 }
4a8c9a62
YZ
3162 node = rb_next(&cache->cache_node);
3163 btrfs_put_block_group(cache);
3164 if (node) {
3165 cache = rb_entry(node, struct btrfs_block_group_cache,
3166 cache_node);
11dfe35a 3167 btrfs_get_block_group(cache);
4a8c9a62
YZ
3168 } else
3169 cache = NULL;
0b246afa 3170 spin_unlock(&fs_info->block_group_cache_lock);
4a8c9a62
YZ
3171 return cache;
3172}
3173
0af3d00b
JB
3174static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3175 struct btrfs_trans_handle *trans,
3176 struct btrfs_path *path)
3177{
0b246afa
JM
3178 struct btrfs_fs_info *fs_info = block_group->fs_info;
3179 struct btrfs_root *root = fs_info->tree_root;
0af3d00b 3180 struct inode *inode = NULL;
364ecf36 3181 struct extent_changeset *data_reserved = NULL;
0af3d00b 3182 u64 alloc_hint = 0;
2b20982e 3183 int dcs = BTRFS_DC_ERROR;
f8c269d7 3184 u64 num_pages = 0;
0af3d00b
JB
3185 int retries = 0;
3186 int ret = 0;
3187
3188 /*
3189 * If this block group is smaller than 100 megs don't bother caching the
3190 * block group.
3191 */
ee22184b 3192 if (block_group->key.offset < (100 * SZ_1M)) {
0af3d00b
JB
3193 spin_lock(&block_group->lock);
3194 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3195 spin_unlock(&block_group->lock);
3196 return 0;
3197 }
3198
0c0ef4bc
JB
3199 if (trans->aborted)
3200 return 0;
0af3d00b 3201again:
7949f339 3202 inode = lookup_free_space_inode(block_group, path);
0af3d00b
JB
3203 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3204 ret = PTR_ERR(inode);
b3b4aa74 3205 btrfs_release_path(path);
0af3d00b
JB
3206 goto out;
3207 }
3208
3209 if (IS_ERR(inode)) {
3210 BUG_ON(retries);
3211 retries++;
3212
3213 if (block_group->ro)
3214 goto out_free;
3215
4ca75f1b 3216 ret = create_free_space_inode(trans, block_group, path);
0af3d00b
JB
3217 if (ret)
3218 goto out_free;
3219 goto again;
3220 }
3221
3222 /*
3223 * We want to set the generation to 0, that way if anything goes wrong
3224 * from here on out we know not to trust this cache when we load up next
3225 * time.
3226 */
3227 BTRFS_I(inode)->generation = 0;
3228 ret = btrfs_update_inode(trans, root, inode);
0c0ef4bc
JB
3229 if (ret) {
3230 /*
3231 * So theoretically we could recover from this, simply set the
3232 * super cache generation to 0 so we know to invalidate the
3233 * cache, but then we'd have to keep track of the block groups
3234 * that fail this way so we know we _have_ to reset this cache
3235 * before the next commit or risk reading stale cache. So to
3236 * limit our exposure to horrible edge cases let's just abort the
3237 * transaction, this only happens in really bad situations
3238 * anyway.
3239 */
66642832 3240 btrfs_abort_transaction(trans, ret);
0c0ef4bc
JB
3241 goto out_put;
3242 }
0af3d00b
JB
3243 WARN_ON(ret);
3244
8e138e0d
JB
3245 /* We've already setup this transaction, go ahead and exit */
3246 if (block_group->cache_generation == trans->transid &&
3247 i_size_read(inode)) {
3248 dcs = BTRFS_DC_SETUP;
3249 goto out_put;
3250 }
3251
0af3d00b 3252 if (i_size_read(inode) > 0) {
2ff7e61e 3253 ret = btrfs_check_trunc_cache_free_space(fs_info,
0b246afa 3254 &fs_info->global_block_rsv);
7b61cd92
MX
3255 if (ret)
3256 goto out_put;
3257
77ab86bf 3258 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
0af3d00b
JB
3259 if (ret)
3260 goto out_put;
3261 }
3262
3263 spin_lock(&block_group->lock);
cf7c1ef6 3264 if (block_group->cached != BTRFS_CACHE_FINISHED ||
0b246afa 3265 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
cf7c1ef6
LB
3266 /*
3267 * don't bother trying to write stuff out _if_
3268 * a) we're not cached,
1a79c1f2
LB
3269 * b) we're using the nospace_cache mount option,
3270 * c) we're using v2 space_cache (FREE_SPACE_TREE).
cf7c1ef6 3271 */
2b20982e 3272 dcs = BTRFS_DC_WRITTEN;
0af3d00b
JB
3273 spin_unlock(&block_group->lock);
3274 goto out_put;
3275 }
3276 spin_unlock(&block_group->lock);
3277
2968b1f4
JB
3278 /*
3279 * We hit an ENOSPC when setting up the cache in this transaction, just
3280 * skip doing the setup, we've already cleared the cache so we're safe.
3281 */
3282 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3283 ret = -ENOSPC;
3284 goto out_put;
3285 }
3286
6fc823b1
JB
3287 /*
3288 * Try to preallocate enough space based on how big the block group is.
3289 * Keep in mind this has to include any pinned space which could end up
3290 * taking up quite a bit since it's not folded into the other space
3291 * cache.
3292 */
ee22184b 3293 num_pages = div_u64(block_group->key.offset, SZ_256M);
0af3d00b
JB
3294 if (!num_pages)
3295 num_pages = 1;
3296
0af3d00b 3297 num_pages *= 16;
09cbfeaf 3298 num_pages *= PAGE_SIZE;
0af3d00b 3299
364ecf36 3300 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
0af3d00b
JB
3301 if (ret)
3302 goto out_put;
3303
3304 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3305 num_pages, num_pages,
3306 &alloc_hint);
2968b1f4
JB
3307 /*
3308 * Our cache requires contiguous chunks so that we don't modify a bunch
3309 * of metadata or split extents when writing the cache out, which means
3310 * we can hit ENOSPC if we are heavily fragmented in addition to just normal
3311 * out of space conditions. So if we hit this just skip setting up any
3312 * other block groups for this transaction, maybe we'll unpin enough
3313 * space the next time around.
3314 */
2b20982e
JB
3315 if (!ret)
3316 dcs = BTRFS_DC_SETUP;
2968b1f4
JB
3317 else if (ret == -ENOSPC)
3318 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
c09544e0 3319
0af3d00b
JB
3320out_put:
3321 iput(inode);
3322out_free:
b3b4aa74 3323 btrfs_release_path(path);
0af3d00b
JB
3324out:
3325 spin_lock(&block_group->lock);
e65cbb94 3326 if (!ret && dcs == BTRFS_DC_SETUP)
5b0e95bf 3327 block_group->cache_generation = trans->transid;
2b20982e 3328 block_group->disk_cache_state = dcs;
0af3d00b
JB
3329 spin_unlock(&block_group->lock);
3330
364ecf36 3331 extent_changeset_free(data_reserved);
0af3d00b
JB
3332 return ret;
3333}
3334
bbebb3e0 3335int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
dcdf7f6d 3336{
bbebb3e0 3337 struct btrfs_fs_info *fs_info = trans->fs_info;
dcdf7f6d
JB
3338 struct btrfs_block_group_cache *cache, *tmp;
3339 struct btrfs_transaction *cur_trans = trans->transaction;
3340 struct btrfs_path *path;
3341
3342 if (list_empty(&cur_trans->dirty_bgs) ||
0b246afa 3343 !btrfs_test_opt(fs_info, SPACE_CACHE))
dcdf7f6d
JB
3344 return 0;
3345
3346 path = btrfs_alloc_path();
3347 if (!path)
3348 return -ENOMEM;
3349
3350 /* Could add new block groups, use _safe just in case */
3351 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3352 dirty_list) {
3353 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3354 cache_save_setup(cache, trans, path);
3355 }
3356
3357 btrfs_free_path(path);
3358 return 0;
3359}
3360
1bbc621e
CM
3361/*
3362 * transaction commit does final block group cache writeback during a
3363 * critical section where nothing is allowed to change the FS. This is
3364 * required in order for the cache to actually match the block group,
3365 * but can introduce a lot of latency into the commit.
3366 *
3367 * So, btrfs_start_dirty_block_groups is here to kick off block group
3368 * cache IO. There's a chance we'll have to redo some of it if the
3369 * block group changes again during the commit, but it greatly reduces
3370 * the commit latency by getting rid of the easy block groups while
3371 * we're still allowing others to join the commit.
3372 */
21217054 3373int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
9078a3e1 3374{
21217054 3375 struct btrfs_fs_info *fs_info = trans->fs_info;
4a8c9a62 3376 struct btrfs_block_group_cache *cache;
ce93ec54
JB
3377 struct btrfs_transaction *cur_trans = trans->transaction;
3378 int ret = 0;
c9dc4c65 3379 int should_put;
1bbc621e
CM
3380 struct btrfs_path *path = NULL;
3381 LIST_HEAD(dirty);
3382 struct list_head *io = &cur_trans->io_bgs;
c9dc4c65 3383 int num_started = 0;
1bbc621e
CM
3384 int loops = 0;
3385
3386 spin_lock(&cur_trans->dirty_bgs_lock);
b58d1a9e
FM
3387 if (list_empty(&cur_trans->dirty_bgs)) {
3388 spin_unlock(&cur_trans->dirty_bgs_lock);
3389 return 0;
1bbc621e 3390 }
b58d1a9e 3391 list_splice_init(&cur_trans->dirty_bgs, &dirty);
1bbc621e 3392 spin_unlock(&cur_trans->dirty_bgs_lock);
ce93ec54 3393
1bbc621e 3394again:
1bbc621e
CM
3395 /*
3396 * make sure all the block groups on our dirty list actually
3397 * exist
3398 */
6c686b35 3399 btrfs_create_pending_block_groups(trans);
1bbc621e
CM
3400
3401 if (!path) {
3402 path = btrfs_alloc_path();
3403 if (!path)
3404 return -ENOMEM;
3405 }
3406
b58d1a9e
FM
3407 /*
3408 * cache_write_mutex is here only to save us from balance or automatic
3409 * removal of empty block groups deleting this block group while we are
3410 * writing out the cache
3411 */
3412 mutex_lock(&trans->transaction->cache_write_mutex);
1bbc621e 3413 while (!list_empty(&dirty)) {
ba2c4d4e
JB
3414 bool drop_reserve = true;
3415
1bbc621e
CM
3416 cache = list_first_entry(&dirty,
3417 struct btrfs_block_group_cache,
3418 dirty_list);
1bbc621e
CM
3419 /*
3420 * this can happen if something re-dirties a block
3421 * group that is already under IO. Just wait for it to
3422 * finish and then do it all again
3423 */
3424 if (!list_empty(&cache->io_list)) {
3425 list_del_init(&cache->io_list);
afdb5718 3426 btrfs_wait_cache_io(trans, cache, path);
1bbc621e
CM
3427 btrfs_put_block_group(cache);
3428 }
3429
3430
3431 /*
3432 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3433 * if it should update the cache_state. Don't delete
3434 * until after we wait.
3435 *
3436 * Since we're not running in the commit critical section
3437 * we need the dirty_bgs_lock to protect from update_block_group
3438 */
3439 spin_lock(&cur_trans->dirty_bgs_lock);
3440 list_del_init(&cache->dirty_list);
3441 spin_unlock(&cur_trans->dirty_bgs_lock);
3442
3443 should_put = 1;
3444
3445 cache_save_setup(cache, trans, path);
3446
3447 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3448 cache->io_ctl.inode = NULL;
fe041534 3449 ret = btrfs_write_out_cache(trans, cache, path);
1bbc621e
CM
3450 if (ret == 0 && cache->io_ctl.inode) {
3451 num_started++;
3452 should_put = 0;
3453
3454 /*
45ae2c18
NB
3455 * The cache_write_mutex is protecting the
3456 * io_list, also refer to the definition of
3457 * btrfs_transaction::io_bgs for more details
1bbc621e
CM
3458 */
3459 list_add_tail(&cache->io_list, io);
3460 } else {
3461 /*
3462 * if we failed to write the cache, the
3463 * generation will be bad and life goes on
3464 */
3465 ret = 0;
3466 }
3467 }
ff1f8250 3468 if (!ret) {
39db232d 3469 ret = write_one_cache_group(trans, path, cache);
ff1f8250
FM
3470 /*
3471 * Our block group might still be attached to the list
3472 * of new block groups in the transaction handle of some
3473 * other task (struct btrfs_trans_handle->new_bgs). This
3474 * means its block group item isn't yet in the extent
3475 * tree. If this happens ignore the error, as we will
3476 * try again later in the critical section of the
3477 * transaction commit.
3478 */
3479 if (ret == -ENOENT) {
3480 ret = 0;
3481 spin_lock(&cur_trans->dirty_bgs_lock);
3482 if (list_empty(&cache->dirty_list)) {
3483 list_add_tail(&cache->dirty_list,
3484 &cur_trans->dirty_bgs);
3485 btrfs_get_block_group(cache);
ba2c4d4e 3486 drop_reserve = false;
ff1f8250
FM
3487 }
3488 spin_unlock(&cur_trans->dirty_bgs_lock);
3489 } else if (ret) {
66642832 3490 btrfs_abort_transaction(trans, ret);
ff1f8250
FM
3491 }
3492 }
1bbc621e 3493
52042d8e 3494 /* if it's not on the io list, we need to put the block group */
1bbc621e
CM
3495 if (should_put)
3496 btrfs_put_block_group(cache);
ba2c4d4e
JB
3497 if (drop_reserve)
3498 btrfs_delayed_refs_rsv_release(fs_info, 1);
1bbc621e
CM
3499
3500 if (ret)
3501 break;
b58d1a9e
FM
3502
3503 /*
3504 * Avoid blocking other tasks for too long. It might even save
3505 * us from writing caches for block groups that are going to be
3506 * removed.
3507 */
3508 mutex_unlock(&trans->transaction->cache_write_mutex);
3509 mutex_lock(&trans->transaction->cache_write_mutex);
1bbc621e 3510 }
b58d1a9e 3511 mutex_unlock(&trans->transaction->cache_write_mutex);
1bbc621e
CM
3512
3513 /*
3514 * go through delayed refs for all the stuff we've just kicked off
3515 * and then loop back (just once)
3516 */
c79a70b1 3517 ret = btrfs_run_delayed_refs(trans, 0);
1bbc621e
CM
3518 if (!ret && loops == 0) {
3519 loops++;
3520 spin_lock(&cur_trans->dirty_bgs_lock);
3521 list_splice_init(&cur_trans->dirty_bgs, &dirty);
b58d1a9e
FM
3522 /*
3523 * dirty_bgs_lock protects us from concurrent block group
3524 * deletes too (not just cache_write_mutex).
3525 */
3526 if (!list_empty(&dirty)) {
3527 spin_unlock(&cur_trans->dirty_bgs_lock);
3528 goto again;
3529 }
1bbc621e 3530 spin_unlock(&cur_trans->dirty_bgs_lock);
c79a1751 3531 } else if (ret < 0) {
2ff7e61e 3532 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
1bbc621e
CM
3533 }
3534
3535 btrfs_free_path(path);
3536 return ret;
3537}
3538
5742d15f 3539int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
1bbc621e 3540{
5742d15f 3541 struct btrfs_fs_info *fs_info = trans->fs_info;
1bbc621e
CM
3542 struct btrfs_block_group_cache *cache;
3543 struct btrfs_transaction *cur_trans = trans->transaction;
3544 int ret = 0;
3545 int should_put;
3546 struct btrfs_path *path;
3547 struct list_head *io = &cur_trans->io_bgs;
3548 int num_started = 0;
9078a3e1
CM
3549
3550 path = btrfs_alloc_path();
3551 if (!path)
3552 return -ENOMEM;
3553
ce93ec54 3554 /*
e44081ef
FM
3555 * Even though we are in the critical section of the transaction commit,
3556 * we can still have concurrent tasks adding elements to this
3557 * transaction's list of dirty block groups. These tasks correspond to
3558 * endio free space workers started when writeback finishes for a
3559 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3560 * allocate new block groups as a result of COWing nodes of the root
3561 * tree when updating the free space inode. The writeback for the space
3562 * caches is triggered by an earlier call to
3563 * btrfs_start_dirty_block_groups() and iterations of the following
3564 * loop.
3565 * Also we want to do the cache_save_setup first and then run the
ce93ec54
JB
3566 * delayed refs to make sure we have the best chance at doing this all
3567 * in one shot.
3568 */
e44081ef 3569 spin_lock(&cur_trans->dirty_bgs_lock);
ce93ec54
JB
3570 while (!list_empty(&cur_trans->dirty_bgs)) {
3571 cache = list_first_entry(&cur_trans->dirty_bgs,
3572 struct btrfs_block_group_cache,
3573 dirty_list);
c9dc4c65
CM
3574
3575 /*
3576 * this can happen if cache_save_setup re-dirties a block
3577 * group that is already under IO. Just wait for it to
3578 * finish and then do it all again
3579 */
3580 if (!list_empty(&cache->io_list)) {
e44081ef 3581 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3582 list_del_init(&cache->io_list);
afdb5718 3583 btrfs_wait_cache_io(trans, cache, path);
c9dc4c65 3584 btrfs_put_block_group(cache);
e44081ef 3585 spin_lock(&cur_trans->dirty_bgs_lock);
c9dc4c65
CM
3586 }
3587
1bbc621e
CM
3588 /*
3589 * don't remove from the dirty list until after we've waited
3590 * on any pending IO
3591 */
ce93ec54 3592 list_del_init(&cache->dirty_list);
e44081ef 3593 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65
CM
3594 should_put = 1;
3595
1bbc621e 3596 cache_save_setup(cache, trans, path);
c9dc4c65 3597
ce93ec54 3598 if (!ret)
c79a70b1 3599 ret = btrfs_run_delayed_refs(trans,
2ff7e61e 3600 (unsigned long) -1);
c9dc4c65
CM
3601
3602 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3603 cache->io_ctl.inode = NULL;
fe041534 3604 ret = btrfs_write_out_cache(trans, cache, path);
c9dc4c65
CM
3605 if (ret == 0 && cache->io_ctl.inode) {
3606 num_started++;
3607 should_put = 0;
1bbc621e 3608 list_add_tail(&cache->io_list, io);
c9dc4c65
CM
3609 } else {
3610 /*
3611 * if we failed to write the cache, the
3612 * generation will be bad and life goes on
3613 */
3614 ret = 0;
3615 }
3616 }
ff1f8250 3617 if (!ret) {
39db232d 3618 ret = write_one_cache_group(trans, path, cache);
2bc0bb5f
FM
3619 /*
3620 * One of the free space endio workers might have
3621 * created a new block group while updating a free space
3622 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3623 * and hasn't released its transaction handle yet, in
3624 * which case the new block group is still attached to
3625 * its transaction handle and its creation has not
3626 * finished yet (no block group item in the extent tree
3627 * yet, etc). If this is the case, wait for all free
3628 * space endio workers to finish and retry. This is a
3629 * very rare case so no need for a more efficient and
3630 * complex approach.
3631 */
3632 if (ret == -ENOENT) {
3633 wait_event(cur_trans->writer_wait,
3634 atomic_read(&cur_trans->num_writers) == 1);
39db232d 3635 ret = write_one_cache_group(trans, path, cache);
2bc0bb5f 3636 }
ff1f8250 3637 if (ret)
66642832 3638 btrfs_abort_transaction(trans, ret);
ff1f8250 3639 }
c9dc4c65
CM
3640
3641 /* if it's not on the io list, we need to put the block group */
3642 if (should_put)
3643 btrfs_put_block_group(cache);
ba2c4d4e 3644 btrfs_delayed_refs_rsv_release(fs_info, 1);
e44081ef 3645 spin_lock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3646 }
e44081ef 3647 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3648
45ae2c18
NB
3649 /*
3650 * Refer to the definition of io_bgs member for details why it's safe
3651 * to use it without any locking
3652 */
1bbc621e
CM
3653 while (!list_empty(io)) {
3654 cache = list_first_entry(io, struct btrfs_block_group_cache,
c9dc4c65
CM
3655 io_list);
3656 list_del_init(&cache->io_list);
afdb5718 3657 btrfs_wait_cache_io(trans, cache, path);
0cb59c99
JB
3658 btrfs_put_block_group(cache);
3659 }
3660
9078a3e1 3661 btrfs_free_path(path);
ce93ec54 3662 return ret;
9078a3e1
CM
3663}
3664
2ff7e61e 3665int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
d2fb3437
YZ
3666{
3667 struct btrfs_block_group_cache *block_group;
3668 int readonly = 0;
3669
0b246afa 3670 block_group = btrfs_lookup_block_group(fs_info, bytenr);
d2fb3437
YZ
3671 if (!block_group || block_group->ro)
3672 readonly = 1;
3673 if (block_group)
fa9c0d79 3674 btrfs_put_block_group(block_group);
d2fb3437
YZ
3675 return readonly;
3676}
3677
f78c436c
FM
3678bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3679{
3680 struct btrfs_block_group_cache *bg;
3681 bool ret = true;
3682
3683 bg = btrfs_lookup_block_group(fs_info, bytenr);
3684 if (!bg)
3685 return false;
3686
3687 spin_lock(&bg->lock);
3688 if (bg->ro)
3689 ret = false;
3690 else
3691 atomic_inc(&bg->nocow_writers);
3692 spin_unlock(&bg->lock);
3693
3694 /* no put on block group, done by btrfs_dec_nocow_writers */
3695 if (!ret)
3696 btrfs_put_block_group(bg);
3697
3698 return ret;
3699
3700}
3701
3702void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3703{
3704 struct btrfs_block_group_cache *bg;
3705
3706 bg = btrfs_lookup_block_group(fs_info, bytenr);
3707 ASSERT(bg);
3708 if (atomic_dec_and_test(&bg->nocow_writers))
4625956a 3709 wake_up_var(&bg->nocow_writers);
f78c436c
FM
3710 /*
3711 * Once for our lookup and once for the lookup done by a previous call
3712 * to btrfs_inc_nocow_writers()
3713 */
3714 btrfs_put_block_group(bg);
3715 btrfs_put_block_group(bg);
3716}
3717
f78c436c
FM
3718void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3719{
4625956a 3720 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
f78c436c
FM
3721}
3722
8790d502
CM
3723static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3724{
899c81ea
ID
3725 u64 extra_flags = chunk_to_extended(flags) &
3726 BTRFS_EXTENDED_PROFILE_MASK;
a46d11a8 3727
de98ced9 3728 write_seqlock(&fs_info->profiles_lock);
a46d11a8
ID
3729 if (flags & BTRFS_BLOCK_GROUP_DATA)
3730 fs_info->avail_data_alloc_bits |= extra_flags;
3731 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3732 fs_info->avail_metadata_alloc_bits |= extra_flags;
3733 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3734 fs_info->avail_system_alloc_bits |= extra_flags;
de98ced9 3735 write_sequnlock(&fs_info->profiles_lock);
8790d502 3736}
593060d7 3737
fc67c450
ID
3738/*
3739 * returns target flags in extended format or 0 if restripe for this
3740 * chunk_type is not in progress
c6664b42 3741 *
dccdb07b 3742 * should be called with balance_lock held
fc67c450
ID
3743 */
3744static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3745{
3746 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3747 u64 target = 0;
3748
fc67c450
ID
3749 if (!bctl)
3750 return 0;
3751
3752 if (flags & BTRFS_BLOCK_GROUP_DATA &&
3753 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3754 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3755 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3756 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3757 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3758 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3759 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3760 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3761 }
3762
3763 return target;
3764}
3765
a46d11a8
ID
3766/*
3767 * @flags: available profiles in extended format (see ctree.h)
3768 *
e4d8ec0f
ID
3769 * Returns reduced profile in chunk format. If profile changing is in
3770 * progress (either running or paused) picks the target profile (if it's
3771 * already available), otherwise falls back to plain reducing.
a46d11a8 3772 */
2ff7e61e 3773static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
ec44a35c 3774{
0b246afa 3775 u64 num_devices = fs_info->fs_devices->rw_devices;
fc67c450 3776 u64 target;
9c170b26
ZL
3777 u64 raid_type;
3778 u64 allowed = 0;
a061fc8d 3779
fc67c450
ID
3780 /*
3781 * see if restripe for this chunk_type is in progress, if so
3782 * try to reduce to the target profile
3783 */
0b246afa
JM
3784 spin_lock(&fs_info->balance_lock);
3785 target = get_restripe_target(fs_info, flags);
fc67c450
ID
3786 if (target) {
3787 /* pick target profile only if it's already available */
3788 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
0b246afa 3789 spin_unlock(&fs_info->balance_lock);
fc67c450 3790 return extended_to_chunk(target);
e4d8ec0f
ID
3791 }
3792 }
0b246afa 3793 spin_unlock(&fs_info->balance_lock);
e4d8ec0f 3794
53b381b3 3795 /* First, mask out the RAID levels which aren't possible */
9c170b26
ZL
3796 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3797 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
41a6e891 3798 allowed |= btrfs_raid_array[raid_type].bg_flag;
9c170b26
ZL
3799 }
3800 allowed &= flags;
3801
3802 if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3803 allowed = BTRFS_BLOCK_GROUP_RAID6;
3804 else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3805 allowed = BTRFS_BLOCK_GROUP_RAID5;
3806 else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3807 allowed = BTRFS_BLOCK_GROUP_RAID10;
3808 else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3809 allowed = BTRFS_BLOCK_GROUP_RAID1;
3810 else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3811 allowed = BTRFS_BLOCK_GROUP_RAID0;
3812
3813 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3814
3815 return extended_to_chunk(flags | allowed);
ec44a35c
CM
3816}
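/*
 * Example walk-through (added illustration): with two rw devices, no
 * balance/restripe in progress and flags = DATA | RAID1 | RAID0, both
 * RAID1 and RAID0 pass the devs_min mask above, and the precedence chain
 * (RAID6 > RAID5 > RAID10 > RAID1 > RAID0) picks RAID1, so the function
 * returns the chunk-format DATA | RAID1.
 */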
3817
2ff7e61e 3818static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
6a63209f 3819{
de98ced9 3820 unsigned seq;
f8213bdc 3821 u64 flags;
de98ced9
MX
3822
3823 do {
f8213bdc 3824 flags = orig_flags;
0b246afa 3825 seq = read_seqbegin(&fs_info->profiles_lock);
de98ced9
MX
3826
3827 if (flags & BTRFS_BLOCK_GROUP_DATA)
0b246afa 3828 flags |= fs_info->avail_data_alloc_bits;
de98ced9 3829 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
0b246afa 3830 flags |= fs_info->avail_system_alloc_bits;
de98ced9 3831 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
0b246afa
JM
3832 flags |= fs_info->avail_metadata_alloc_bits;
3833 } while (read_seqretry(&fs_info->profiles_lock, seq));
6fef8df1 3834
2ff7e61e 3835 return btrfs_reduce_alloc_profile(fs_info, flags);
6a63209f
JB
3836}
3837
1b86826d 3838static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
9ed74f2d 3839{
0b246afa 3840 struct btrfs_fs_info *fs_info = root->fs_info;
b742bb82 3841 u64 flags;
53b381b3 3842 u64 ret;
9ed74f2d 3843
b742bb82
YZ
3844 if (data)
3845 flags = BTRFS_BLOCK_GROUP_DATA;
0b246afa 3846 else if (root == fs_info->chunk_root)
b742bb82 3847 flags = BTRFS_BLOCK_GROUP_SYSTEM;
9ed74f2d 3848 else
b742bb82 3849 flags = BTRFS_BLOCK_GROUP_METADATA;
9ed74f2d 3850
2ff7e61e 3851 ret = get_alloc_profile(fs_info, flags);
53b381b3 3852 return ret;
6a63209f 3853}
9ed74f2d 3854
1b86826d
JM
3855u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
3856{
3857 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
3858}
3859
3860u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
3861{
3862 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3863}
3864
3865u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
3866{
3867 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3868}
3869
04f4f916 3870int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
6a63209f 3871{
04f4f916 3872 struct btrfs_root *root = inode->root;
b4d7c3c9 3873 struct btrfs_fs_info *fs_info = root->fs_info;
1174cade 3874 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
ab6e2410 3875 u64 used;
94b947b2 3876 int ret = 0;
c99f1b0c
ZL
3877 int need_commit = 2;
3878 int have_pinned_space;
6a63209f 3879
6a63209f 3880 /* make sure bytes are sectorsize aligned */
0b246afa 3881 bytes = ALIGN(bytes, fs_info->sectorsize);
6a63209f 3882
9dced186 3883 if (btrfs_is_free_space_inode(inode)) {
c99f1b0c 3884 need_commit = 0;
9dced186 3885 ASSERT(current->journal_info);
0af3d00b
JB
3886 }
3887
6a63209f
JB
3888again:
3889 /* make sure we have enough space to handle the data first */
3890 spin_lock(&data_sinfo->lock);
4136135b 3891 used = btrfs_space_info_used(data_sinfo, true);
ab6e2410
JB
3892
3893 if (used + bytes > data_sinfo->total_bytes) {
4e06bdd6 3894 struct btrfs_trans_handle *trans;
9ed74f2d 3895
6a63209f
JB
3896 /*
3897 * if we don't have enough free bytes in this space then we need
3898 * to alloc a new chunk.
3899 */
b9fd47cd 3900 if (!data_sinfo->full) {
6a63209f 3901 u64 alloc_target;
9ed74f2d 3902
0e4f8f88 3903 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
6a63209f 3904 spin_unlock(&data_sinfo->lock);
1174cade 3905
1b86826d 3906 alloc_target = btrfs_data_alloc_profile(fs_info);
9dced186
MX
3907 /*
3908 * It is ugly that we don't call nolock join
3909 * transaction for the free space inode case here.
3910 * But it is safe because we only do the data space
3911 * reservation for the free space cache in the
3912 * transaction context, the common join transaction
3913 * just increases the counter of the current transaction
3914 * handle, and doesn't try to acquire the trans_lock of
3915 * the fs.
3916 */
7a7eaa40 3917 trans = btrfs_join_transaction(root);
a22285a6
YZ
3918 if (IS_ERR(trans))
3919 return PTR_ERR(trans);
9ed74f2d 3920
fc471cb0
JB
3921 ret = btrfs_chunk_alloc(trans, alloc_target,
3922 CHUNK_ALLOC_NO_FORCE);
3a45bb20 3923 btrfs_end_transaction(trans);
d52a5b5f
MX
3924 if (ret < 0) {
3925 if (ret != -ENOSPC)
3926 return ret;
c99f1b0c
ZL
3927 else {
3928 have_pinned_space = 1;
d52a5b5f 3929 goto commit_trans;
c99f1b0c 3930 }
d52a5b5f 3931 }
9ed74f2d 3932
6a63209f
JB
3933 goto again;
3934 }
f2bb8f5c
JB
3935
3936 /*
b150a4f1 3937 * If we don't have enough pinned space to deal with this
94b947b2
ZL
3938 * allocation, and no removed chunk in current transaction,
3939 * don't bother committing the transaction.
f2bb8f5c 3940 */
dec59fa3 3941 have_pinned_space = __percpu_counter_compare(
c99f1b0c 3942 &data_sinfo->total_bytes_pinned,
dec59fa3
EL
3943 used + bytes - data_sinfo->total_bytes,
3944 BTRFS_TOTAL_BYTES_PINNED_BATCH);
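/*
 * Added note: a non-negative have_pinned_space means total_bytes_pinned
 * covers the shortfall (used + bytes - total_bytes), i.e. committing the
 * transaction may unpin enough space to satisfy this reservation, which
 * is what the commit_trans path below relies on.
 */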
6a63209f 3945 spin_unlock(&data_sinfo->lock);
6a63209f 3946
4e06bdd6 3947 /* commit the current transaction and try again */
d52a5b5f 3948commit_trans:
92e2f7e3 3949 if (need_commit) {
c99f1b0c 3950 need_commit--;
b150a4f1 3951
e1746e83 3952 if (need_commit > 0) {
82b3e53b 3953 btrfs_start_delalloc_roots(fs_info, -1);
6374e57a 3954 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
0b246afa 3955 (u64)-1);
e1746e83 3956 }
9a4e7276 3957
7a7eaa40 3958 trans = btrfs_join_transaction(root);
a22285a6
YZ
3959 if (IS_ERR(trans))
3960 return PTR_ERR(trans);
c99f1b0c 3961 if (have_pinned_space >= 0 ||
3204d33c
JB
3962 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
3963 &trans->transaction->flags) ||
c99f1b0c 3964 need_commit > 0) {
3a45bb20 3965 ret = btrfs_commit_transaction(trans);
94b947b2
ZL
3966 if (ret)
3967 return ret;
d7c15171 3968 /*
c2d6cb16
FM
3969 * The cleaner kthread might still be doing iput
3970 * operations. Wait for it to finish so that
034f784d
JB
3971 * more space is released. We don't need to
3972 * explicitly run the delayed iputs here because
3973 * the commit_transaction would have woken up
3974 * the cleaner.
d7c15171 3975 */
034f784d
JB
3976 ret = btrfs_wait_on_delayed_iputs(fs_info);
3977 if (ret)
3978 return ret;
94b947b2
ZL
3979 goto again;
3980 } else {
3a45bb20 3981 btrfs_end_transaction(trans);
94b947b2 3982 }
4e06bdd6 3983 }
9ed74f2d 3984
0b246afa 3985 trace_btrfs_space_reservation(fs_info,
cab45e22
JM
3986 "space_info:enospc",
3987 data_sinfo->flags, bytes, 1);
6a63209f
JB
3988 return -ENOSPC;
3989 }
bb96c4e5 3990 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
0b246afa 3991 trace_btrfs_space_reservation(fs_info, "space_info",
2bcc0328 3992 data_sinfo->flags, bytes, 1);
6a63209f 3993 spin_unlock(&data_sinfo->lock);
6a63209f 3994
4559b0a7 3995 return 0;
9ed74f2d 3996}
6a63209f 3997
364ecf36
QW
3998int btrfs_check_data_free_space(struct inode *inode,
3999 struct extent_changeset **reserved, u64 start, u64 len)
4ceff079 4000{
0b246afa 4001 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4ceff079
QW
4002 int ret;
4003
4004 /* align the range */
0b246afa
JM
4005 len = round_up(start + len, fs_info->sectorsize) -
4006 round_down(start, fs_info->sectorsize);
4007 start = round_down(start, fs_info->sectorsize);
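/*
 * Alignment example (added illustration): with a 4 KiB sectorsize,
 * start = 6000 and len = 3000 become start = 4096 and
 * len = round_up(9000, 4096) - 4096 = 8192, i.e. the two sectors that
 * the unaligned range touches.
 */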
4ceff079 4008
04f4f916 4009 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4ceff079
QW
4010 if (ret < 0)
4011 return ret;
4012
1e5ec2e7 4013 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
364ecf36 4014 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
7bc329c1 4015 if (ret < 0)
1e5ec2e7 4016 btrfs_free_reserved_data_space_noquota(inode, start, len);
364ecf36
QW
4017 else
4018 ret = 0;
4ceff079
QW
4019 return ret;
4020}
4021
4ceff079
QW
4022/*
4023 * Called if we need to clear a data reservation for this inode
4024 * Normally in an error case.
4025 *
51773bec
QW
4026 * This one will *NOT* use accurate qgroup reserved space API, just for case
4027 * which we can't sleep and is sure it won't affect qgroup reserved space.
4028 * Like clear_bit_hook().
4ceff079 4029 */
51773bec
QW
4030void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4031 u64 len)
4ceff079 4032{
0b246afa 4033 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4ceff079
QW
4034 struct btrfs_space_info *data_sinfo;
4035
4036 /* Make sure the range is aligned to sectorsize */
0b246afa
JM
4037 len = round_up(start + len, fs_info->sectorsize) -
4038 round_down(start, fs_info->sectorsize);
4039 start = round_down(start, fs_info->sectorsize);
4ceff079 4040
0b246afa 4041 data_sinfo = fs_info->data_sinfo;
4ceff079 4042 spin_lock(&data_sinfo->lock);
bb96c4e5 4043 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
0b246afa 4044 trace_btrfs_space_reservation(fs_info, "space_info",
4ceff079
QW
4045 data_sinfo->flags, len, 0);
4046 spin_unlock(&data_sinfo->lock);
4047}
4048
51773bec
QW
4049/*
4050 * Called if we need to clear a data reservation for this inode
4051 * Normally in an error case.
4052 *
01327610 4053 * This one will handle the per-inode data rsv map for accurate reserved
51773bec
QW
4054 * space framework.
4055 */
bc42bda2
QW
4056void btrfs_free_reserved_data_space(struct inode *inode,
4057 struct extent_changeset *reserved, u64 start, u64 len)
51773bec 4058{
0c476a5d
JM
4059 struct btrfs_root *root = BTRFS_I(inode)->root;
4060
4061 /* Make sure the range is aligned to sectorsize */
da17066c
JM
4062 len = round_up(start + len, root->fs_info->sectorsize) -
4063 round_down(start, root->fs_info->sectorsize);
4064 start = round_down(start, root->fs_info->sectorsize);
0c476a5d 4065
51773bec 4066 btrfs_free_reserved_data_space_noquota(inode, start, len);
bc42bda2 4067 btrfs_qgroup_free_data(inode, reserved, start, len);
51773bec
QW
4068}
4069
97e728d4 4070static void force_metadata_allocation(struct btrfs_fs_info *info)
e3ccfa98 4071{
97e728d4
JB
4072 struct list_head *head = &info->space_info;
4073 struct btrfs_space_info *found;
e3ccfa98 4074
97e728d4
JB
4075 rcu_read_lock();
4076 list_for_each_entry_rcu(found, head, list) {
4077 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
0e4f8f88 4078 found->force_alloc = CHUNK_ALLOC_FORCE;
e3ccfa98 4079 }
97e728d4 4080 rcu_read_unlock();
e3ccfa98
JB
4081}
4082
2ff7e61e 4083static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
698d0082 4084 struct btrfs_space_info *sinfo, int force)
32c00aff 4085{
8d8aafee 4086 u64 bytes_used = btrfs_space_info_used(sinfo, false);
e5bc2458 4087 u64 thresh;
e3ccfa98 4088
0e4f8f88
CM
4089 if (force == CHUNK_ALLOC_FORCE)
4090 return 1;
4091
4092 /*
4093 * in limited mode, we want to have some free space up to
4094 * about 1% of the FS size.
4095 */
4096 if (force == CHUNK_ALLOC_LIMITED) {
0b246afa 4097 thresh = btrfs_super_total_bytes(fs_info->super_copy);
ee22184b 4098 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
0e4f8f88 4099
8d8aafee 4100 if (sinfo->total_bytes - bytes_used < thresh)
0e4f8f88
CM
4101 return 1;
4102 }
0e4f8f88 4103
8d8aafee 4104 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
14ed0ca6 4105 return 0;
424499db 4106 return 1;
32c00aff
JB
4107}
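/*
 * Illustration (added note): in CHUNK_ALLOC_LIMITED mode on a 1 TiB
 * filesystem the threshold is max(64M, ~1% of the fs), roughly 10 GiB,
 * so a chunk is allocated while less than that is free in this space
 * info; otherwise (no force) allocation only happens once the space
 * info is about 80% used, per the div_factor(total_bytes, 8) check
 * above.
 */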
4108
2ff7e61e 4109static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
15d1ff81
LB
4110{
4111 u64 num_dev;
4112
9fa02ac7
DS
4113 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4114 if (!num_dev)
0b246afa 4115 num_dev = fs_info->fs_devices->rw_devices;
15d1ff81 4116
39c2d7fa 4117 return num_dev;
15d1ff81
LB
4118}
4119
39c2d7fa
FM
4120/*
4121 * Reserve space in the system space info necessary for allocating a chunk
4122 * or removing a chunk. @type is the chunk profile (block group flags) the
4123 * operation is for.
4124 */
451a2c13 4125void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
15d1ff81 4126{
451a2c13 4127 struct btrfs_fs_info *fs_info = trans->fs_info;
15d1ff81
LB
4128 struct btrfs_space_info *info;
4129 u64 left;
4130 u64 thresh;
4fbcdf66 4131 int ret = 0;
39c2d7fa 4132 u64 num_devs;
4fbcdf66
FM
4133
4134 /*
4135 * Needed because we can end up allocating a system chunk and for an
4136 * atomic and race free space reservation in the chunk block reserve.
4137 */
a32bf9a3 4138 lockdep_assert_held(&fs_info->chunk_mutex);
15d1ff81 4139
280c2908 4140 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
15d1ff81 4141 spin_lock(&info->lock);
4136135b 4142 left = info->total_bytes - btrfs_space_info_used(info, true);
15d1ff81
LB
4143 spin_unlock(&info->lock);
4144
2ff7e61e 4145 num_devs = get_profile_num_devs(fs_info, type);
39c2d7fa
FM
4146
4147 /* num_devs device items to update and 1 chunk item to add or remove */
0b246afa
JM
4148 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4149 btrfs_calc_trans_metadata_size(fs_info, 1);
39c2d7fa 4150
0b246afa
JM
4151 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4152 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4153 left, thresh, type);
5da6afeb 4154 btrfs_dump_space_info(fs_info, info, 0, 0);
15d1ff81
LB
4155 }
4156
4157 if (left < thresh) {
1b86826d 4158 u64 flags = btrfs_system_alloc_profile(fs_info);
15d1ff81 4159
4fbcdf66
FM
4160 /*
4161 * Ignore failure to create system chunk. We might end up not
4162 * needing it, as we might not need to COW all nodes/leafs from
4163 * the paths we visit in the chunk tree (they were already COWed
4164 * or created in the current transaction for example).
4165 */
c216b203 4166 ret = btrfs_alloc_chunk(trans, flags);
4fbcdf66
FM
4167 }
4168
4169 if (!ret) {
0b246afa
JM
4170 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4171 &fs_info->chunk_block_rsv,
4fbcdf66
FM
4172 thresh, BTRFS_RESERVE_NO_FLUSH);
4173 if (!ret)
4174 trans->chunk_bytes_reserved += thresh;
15d1ff81
LB
4175 }
4176}
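/*
 * Usage sketch (added note, mirroring btrfs_chunk_alloc() later in this
 * file):
 *
 *	mutex_lock(&fs_info->chunk_mutex);
 *	check_system_chunk(trans, flags);
 *	ret = btrfs_alloc_chunk(trans, flags);
 *	mutex_unlock(&fs_info->chunk_mutex);
 */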
4177
28b737f6
LB
4178/*
4179 * If force is CHUNK_ALLOC_FORCE:
4180 * - return 1 if it successfully allocates a chunk,
4181 * - return errors including -ENOSPC otherwise.
4182 * If force is NOT CHUNK_ALLOC_FORCE:
4183 * - return 0 if it doesn't need to allocate a new chunk,
4184 * - return 1 if it successfully allocates a chunk,
4185 * - return errors including -ENOSPC otherwise.
4186 */
fc471cb0
JB
4187int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4188 enum btrfs_chunk_alloc_enum force)
9ed74f2d 4189{
01458828 4190 struct btrfs_fs_info *fs_info = trans->fs_info;
6324fbf3 4191 struct btrfs_space_info *space_info;
2556fbb0
NB
4192 bool wait_for_alloc = false;
4193 bool should_alloc = false;
9ed74f2d 4194 int ret = 0;
9ed74f2d 4195
c6b305a8
JB
4196 /* Don't re-enter if we're already allocating a chunk */
4197 if (trans->allocating_chunk)
4198 return -ENOSPC;
4199
280c2908 4200 space_info = btrfs_find_space_info(fs_info, flags);
dc2d3005 4201 ASSERT(space_info);
9ed74f2d 4202
2556fbb0
NB
4203 do {
4204 spin_lock(&space_info->lock);
4205 if (force < space_info->force_alloc)
4206 force = space_info->force_alloc;
4207 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4208 if (space_info->full) {
4209 /* No more free physical space */
4210 if (should_alloc)
4211 ret = -ENOSPC;
4212 else
4213 ret = 0;
4214 spin_unlock(&space_info->lock);
4215 return ret;
4216 } else if (!should_alloc) {
4217 spin_unlock(&space_info->lock);
4218 return 0;
4219 } else if (space_info->chunk_alloc) {
4220 /*
4221 * Someone is already allocating, so we need to block
4222 * until this someone is finished and then loop to
4223 * recheck if we should continue with our allocation
4224 * attempt.
4225 */
4226 wait_for_alloc = true;
4227 spin_unlock(&space_info->lock);
4228 mutex_lock(&fs_info->chunk_mutex);
4229 mutex_unlock(&fs_info->chunk_mutex);
4230 } else {
4231 /* Proceed with allocation */
4232 space_info->chunk_alloc = 1;
4233 wait_for_alloc = false;
4234 spin_unlock(&space_info->lock);
4235 }
6d74119f 4236
1e1c50a9 4237 cond_resched();
2556fbb0 4238 } while (wait_for_alloc);
6d74119f 4239
2556fbb0 4240 mutex_lock(&fs_info->chunk_mutex);
c6b305a8
JB
4241 trans->allocating_chunk = true;
4242
67377734
JB
4243 /*
4244 * If we have mixed data/metadata chunks we want to make sure we keep
4245 * allocating mixed chunks instead of individual chunks.
4246 */
4247 if (btrfs_mixed_space_info(space_info))
4248 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4249
97e728d4
JB
4250 /*
4251 * if we're doing a data chunk, go ahead and make sure that
4252 * we keep a reasonable number of metadata chunks allocated in the
4253 * FS as well.
4254 */
9ed74f2d 4255 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
97e728d4
JB
4256 fs_info->data_chunk_allocations++;
4257 if (!(fs_info->data_chunk_allocations %
4258 fs_info->metadata_ratio))
4259 force_metadata_allocation(fs_info);
9ed74f2d
JB
4260 }
4261
15d1ff81
LB
4262 /*
4263 * Check if we have enough space in SYSTEM chunk because we may need
4264 * to update devices.
4265 */
451a2c13 4266 check_system_chunk(trans, flags);
15d1ff81 4267
c216b203 4268 ret = btrfs_alloc_chunk(trans, flags);
c6b305a8 4269 trans->allocating_chunk = false;
92b8e897 4270
9ed74f2d 4271 spin_lock(&space_info->lock);
57f1642e
NB
4272 if (ret < 0) {
4273 if (ret == -ENOSPC)
4274 space_info->full = 1;
4275 else
4276 goto out;
4277 } else {
424499db 4278 ret = 1;
21a94f7a 4279 space_info->max_extent_size = 0;
57f1642e 4280 }
6d74119f 4281
0e4f8f88 4282 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
a81cb9a2 4283out:
6d74119f 4284 space_info->chunk_alloc = 0;
9ed74f2d 4285 spin_unlock(&space_info->lock);
a25c75d5 4286 mutex_unlock(&fs_info->chunk_mutex);
00d80e34
FM
4287 /*
4288 * When we allocate a new chunk we reserve space in the chunk block
4289 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4290 * add new nodes/leafs to it if we end up needing to do it when
4291 * inserting the chunk item and updating device items as part of the
4292 * second phase of chunk allocation, performed by
4293 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4294 * large number of new block groups to create in our transaction
4295 * handle's new_bgs list to avoid exhausting the chunk block reserve
4296 * in extreme cases - like having a single transaction create many new
4297 * block groups when starting to write out the free space caches of all
4298 * the block groups that were made dirty during the lifetime of the
4299 * transaction.
4300 */
5ce55557 4301 if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
6c686b35 4302 btrfs_create_pending_block_groups(trans);
5ce55557 4303
0f9dd46c 4304 return ret;
6324fbf3 4305}
9ed74f2d 4306
69fe2d75
JB
4307/**
4308 * btrfs_inode_rsv_release - release any excessive reservation.
4309 * @inode - the inode we need to release from.
43b18595
QW
4310 * @qgroup_free - free or convert qgroup meta.
4311 * Unlike normal operation, qgroup meta reservation needs to know if we are
4312 * freeing qgroup reservation or just converting it into per-trans. Normally
4313 * @qgroup_free is true for error handling, and false for normal release.
69fe2d75
JB
4314 *
4315 * This is the same as btrfs_block_rsv_release, except that it handles the
4316 * tracepoint for the reservation.
4317 */
43b18595 4318static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
69fe2d75
JB
4319{
4320 struct btrfs_fs_info *fs_info = inode->root->fs_info;
69fe2d75
JB
4321 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4322 u64 released = 0;
ff6bc37e 4323 u64 qgroup_to_release = 0;
69fe2d75
JB
4324
4325 /*
4326 * Since we statically set the block_rsv->size we just want to say we
4327 * are releasing 0 bytes, and then we'll just get the reservation over
4328 * the size free'd.
4329 */
ba2c4d4e
JB
4330 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
4331 &qgroup_to_release);
69fe2d75
JB
4332 if (released > 0)
4333 trace_btrfs_space_reservation(fs_info, "delalloc",
4334 btrfs_ino(inode), released, 0);
43b18595 4335 if (qgroup_free)
ff6bc37e 4336 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
43b18595 4337 else
ff6bc37e
QW
4338 btrfs_qgroup_convert_reserved_meta(inode->root,
4339 qgroup_to_release);
69fe2d75
JB
4340}
4341
d5c12070
MX
4342/*
4343 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4344 * root: the root of the parent directory
4345 * rsv: block reservation
4346 * items: the number of items that we need do reservation
a5b7f429 4347 * use_global_rsv: allow fallback to the global block reservation
d5c12070
MX
4348 *
4349 * This function is used to reserve the space for snapshot/subvolume
4350 * creation and deletion. Those operations are different with the
4351 * common file/directory operations, they change two fs/file trees
4352 * and root tree, the number of items that the qgroup reserves is
4353 * different with the free space reservation. So we can not use
01327610 4354 * the space reservation mechanism in start_transaction().
d5c12070
MX
4355 */
4356int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
a5b7f429 4357 struct btrfs_block_rsv *rsv, int items,
ee3441b4 4358 bool use_global_rsv)
a22285a6 4359{
a5b7f429 4360 u64 qgroup_num_bytes = 0;
d5c12070
MX
4361 u64 num_bytes;
4362 int ret;
0b246afa
JM
4363 struct btrfs_fs_info *fs_info = root->fs_info;
4364 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
d5c12070 4365
0b246afa 4366 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
d5c12070 4367 /* One for parent inode, two for dir entries */
a5b7f429
LF
4368 qgroup_num_bytes = 3 * fs_info->nodesize;
4369 ret = btrfs_qgroup_reserve_meta_prealloc(root,
4370 qgroup_num_bytes, true);
d5c12070
MX
4371 if (ret)
4372 return ret;
d5c12070
MX
4373 }
4374
0b246afa 4375 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
280c2908 4376 rsv->space_info = btrfs_find_space_info(fs_info,
d5c12070
MX
4377 BTRFS_BLOCK_GROUP_METADATA);
4378 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4379 BTRFS_RESERVE_FLUSH_ALL);
ee3441b4
JM
4380
4381 if (ret == -ENOSPC && use_global_rsv)
3a584174 4382 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
ee3441b4 4383
a5b7f429
LF
4384 if (ret && qgroup_num_bytes)
4385 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
d5c12070
MX
4386
4387 return ret;
4388}
4389
2ff7e61e 4390void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
7775c818 4391 struct btrfs_block_rsv *rsv)
d5c12070 4392{
2ff7e61e 4393 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
97e728d4
JB
4394}
4395
69fe2d75
JB
4396static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
4397 struct btrfs_inode *inode)
9e0baf60 4398{
69fe2d75
JB
4399 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4400 u64 reserve_size = 0;
ff6bc37e 4401 u64 qgroup_rsv_size = 0;
69fe2d75
JB
4402 u64 csum_leaves;
4403 unsigned outstanding_extents;
9e0baf60 4404
69fe2d75
JB
4405 lockdep_assert_held(&inode->lock);
4406 outstanding_extents = inode->outstanding_extents;
4407 if (outstanding_extents)
4408 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
4409 outstanding_extents + 1);
4410 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
4411 inode->csum_bytes);
4412 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
4413 csum_leaves);
ff6bc37e
QW
4414 /*
4415 * For qgroup rsv, the calculation is very simple:
4416 * account one nodesize for each outstanding extent
4417 *
4418 * This overestimates in most cases.
4419 */
139a5617 4420 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
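/*
 * Example (added illustration): with a 16 KiB nodesize and 3 outstanding
 * extents, qgroup_rsv_size is 3 * 16 KiB = 48 KiB.
 */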
9e0baf60 4421
69fe2d75
JB
4422 spin_lock(&block_rsv->lock);
4423 block_rsv->size = reserve_size;
ff6bc37e 4424 block_rsv->qgroup_rsv_size = qgroup_rsv_size;
69fe2d75 4425 spin_unlock(&block_rsv->lock);
0ca1f7ce 4426}
c146afad 4427
c8eaeac7
JB
4428static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
4429 u64 num_bytes, u64 *meta_reserve,
4430 u64 *qgroup_reserve)
4431{
4432 u64 nr_extents = count_max_extents(num_bytes);
4433 u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
4434
4435 /* We add one for the inode update at finish ordered time */
4436 *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
4437 nr_extents + csum_leaves + 1);
4438 *qgroup_reserve = nr_extents * fs_info->nodesize;
4439}
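/*
 * Example (added illustration, assuming BTRFS_MAX_EXTENT_SIZE of 128 MiB):
 * a 1 MiB reservation gives nr_extents = 1 and typically csum_leaves = 1,
 * so meta_reserve covers 3 items (extent + csum leaf + inode update) and
 * qgroup_reserve is one nodesize.
 */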
4440
9f3db423 4441int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
0ca1f7ce 4442{
c8eaeac7
JB
4443 struct btrfs_root *root = inode->root;
4444 struct btrfs_fs_info *fs_info = root->fs_info;
4445 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4446 u64 meta_reserve, qgroup_reserve;
69fe2d75 4447 unsigned nr_extents;
08e007d2 4448 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
eb6b88d9 4449 int ret = 0;
c64c2bd8 4450 bool delalloc_lock = true;
6324fbf3 4451
c64c2bd8
JB
4452 /* If we are a free space inode we need to not flush since we will be in
4453 * the middle of a transaction commit. We also don't need the delalloc
4454 * mutex since we won't race with anybody. We need this mostly to make
4455 * lockdep shut its filthy mouth.
bac357dc
JB
4456 *
4457 * If we have a transaction open (can happen if we call truncate_block
4458 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
c64c2bd8
JB
4459 */
4460 if (btrfs_is_free_space_inode(inode)) {
08e007d2 4461 flush = BTRFS_RESERVE_NO_FLUSH;
c64c2bd8 4462 delalloc_lock = false;
da07d4ab
NB
4463 } else {
4464 if (current->journal_info)
4465 flush = BTRFS_RESERVE_FLUSH_LIMIT;
c09544e0 4466
da07d4ab
NB
4467 if (btrfs_transaction_in_commit(fs_info))
4468 schedule_timeout(1);
4469 }
ec44a35c 4470
c64c2bd8 4471 if (delalloc_lock)
9f3db423 4472 mutex_lock(&inode->delalloc_mutex);
c64c2bd8 4473
0b246afa 4474 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
69fe2d75 4475
c8eaeac7
JB
4476 /*
4477 * We always want to do it this way, every other way is wrong and ends
4478 * in tears. Pre-reserving the amount we are going to add will always
4479 * be the right way, because otherwise if we have enough parallelism we
4480 * could end up with thousands of inodes all holding little bits of
4481 * reservations they were able to make previously and the only way to
4482 * reclaim that space is to ENOSPC out the operations and clear
4483 * everything out and try again, which is bad. This way we just
4484 * over-reserve slightly, and clean up the mess when we are done.
4485 */
4486 calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
4487 &qgroup_reserve);
4488 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
4489 if (ret)
4490 goto out_fail;
0d9764f6 4491 ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
c8eaeac7
JB
4492 if (ret)
4493 goto out_qgroup;
4494
4495 /*
4496 * Now we need to update our outstanding extents and csum bytes _first_
4497 * and then add the reservation to the block_rsv. This keeps us from
4498 * racing with an ordered completion or some such that would think it
4499 * needs to free the reservation we just made.
4500 */
9f3db423 4501 spin_lock(&inode->lock);
69fe2d75 4502 nr_extents = count_max_extents(num_bytes);
8b62f87b 4503 btrfs_mod_outstanding_extents(inode, nr_extents);
69fe2d75
JB
4504 inode->csum_bytes += num_bytes;
4505 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
9f3db423 4506 spin_unlock(&inode->lock);
57a45ced 4507
c8eaeac7 4508 /* Now we can safely add our space to our block rsv */
0b50174a 4509 btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
c8eaeac7
JB
4510 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4511 btrfs_ino(inode), meta_reserve, 1);
4512
4513 spin_lock(&block_rsv->lock);
4514 block_rsv->qgroup_rsv_reserved += qgroup_reserve;
4515 spin_unlock(&block_rsv->lock);
25179201 4516
c64c2bd8 4517 if (delalloc_lock)
9f3db423 4518 mutex_unlock(&inode->delalloc_mutex);
0ca1f7ce 4519 return 0;
c8eaeac7
JB
4520out_qgroup:
4521 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
88e081bf 4522out_fail:
43b18595 4523 btrfs_inode_rsv_release(inode, true);
88e081bf 4524 if (delalloc_lock)
9f3db423 4525 mutex_unlock(&inode->delalloc_mutex);
88e081bf 4526 return ret;
0ca1f7ce
YZ
4527}
4528
7709cde3
JB
4529/**
4530 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
8b62f87b
JB
4531 * @inode: the inode to release the reservation for.
4532 * @num_bytes: the number of bytes we are releasing.
43b18595 4533 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
7709cde3
JB
4534 *
4535 * This will release the metadata reservation for an inode. This can be called
4536 * once we complete IO for a given set of bytes to release their metadata
8b62f87b 4537 * reservations, or on error for the same reason.
7709cde3 4538 */
43b18595
QW
4539void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
4540 bool qgroup_free)
0ca1f7ce 4541{
3ffbd68c 4542 struct btrfs_fs_info *fs_info = inode->root->fs_info;
0ca1f7ce 4543
0b246afa 4544 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
691fa059 4545 spin_lock(&inode->lock);
69fe2d75
JB
4546 inode->csum_bytes -= num_bytes;
4547 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
691fa059 4548 spin_unlock(&inode->lock);
0ca1f7ce 4549
0b246afa 4550 if (btrfs_is_testing(fs_info))
6a3891c5
JB
4551 return;
4552
43b18595 4553 btrfs_inode_rsv_release(inode, qgroup_free);
0ca1f7ce
YZ
4554}
4555
8b62f87b
JB
4556/**
4557 * btrfs_delalloc_release_extents - release our outstanding_extents
4558 * @inode: the inode to balance the reservation for.
4559 * @num_bytes: the number of bytes we originally reserved with
43b18595 4560 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
8b62f87b
JB
4561 *
4562 * When we reserve space we increase outstanding_extents for the extents we may
4563 * add. Once we've set the range as delalloc or created our ordered extents we
4564 * have outstanding_extents to track the real usage, so we use this to free our
4565 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
4566 * with btrfs_delalloc_reserve_metadata.
4567 */
43b18595
QW
4568void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
4569 bool qgroup_free)
8b62f87b 4570{
3ffbd68c 4571 struct btrfs_fs_info *fs_info = inode->root->fs_info;
8b62f87b 4572 unsigned num_extents;
8b62f87b
JB
4573
4574 spin_lock(&inode->lock);
4575 num_extents = count_max_extents(num_bytes);
4576 btrfs_mod_outstanding_extents(inode, -num_extents);
69fe2d75 4577 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
8b62f87b
JB
4578 spin_unlock(&inode->lock);
4579
8b62f87b
JB
4580 if (btrfs_is_testing(fs_info))
4581 return;
4582
43b18595 4583 btrfs_inode_rsv_release(inode, qgroup_free);
8b62f87b
JB
4584}
4585
1ada3a62 4586/**
7cf5b976 4587 * btrfs_delalloc_reserve_space - reserve data and metadata space for
1ada3a62
QW
4588 * delalloc
4589 * @inode: inode we're writing to
4590 * @start: start range we are writing to
4591 * @len: how long the range we are writing to
364ecf36
QW
4592 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
4593 * current reservation.
1ada3a62 4594 *
1ada3a62
QW
4595 * This will do the following things
4596 *
4597 * o reserve space in data space info for num bytes
4598 * and reserve precious corresponding qgroup space
4599 * (Done in check_data_free_space)
4600 *
4601 * o reserve space for metadata space, based on the number of outstanding
4602 * extents and how much csums will be needed
4603 * also reserve metadata space in a per root over-reserve method.
4604 * o add to the inodes->delalloc_bytes
4605 * o add it to the fs_info's delalloc inodes list.
4606 * (Above 3 all done in delalloc_reserve_metadata)
4607 *
4608 * Return 0 for success
4609 * Return <0 for error (-ENOSPC or -EDQUOT)
4610 */
364ecf36
QW
4611int btrfs_delalloc_reserve_space(struct inode *inode,
4612 struct extent_changeset **reserved, u64 start, u64 len)
1ada3a62
QW
4613{
4614 int ret;
4615
364ecf36 4616 ret = btrfs_check_data_free_space(inode, reserved, start, len);
1ada3a62
QW
4617 if (ret < 0)
4618 return ret;
9f3db423 4619 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
1ada3a62 4620 if (ret < 0)
bc42bda2 4621 btrfs_free_reserved_data_space(inode, *reserved, start, len);
1ada3a62
QW
4622 return ret;
4623}
4624
7709cde3 4625/**
7cf5b976 4626 * btrfs_delalloc_release_space - release data and metadata space for delalloc
1ada3a62
QW
4627 * @inode: inode we're releasing space for
4628 * @start: start position of the space already reserved
4629 * @len: the len of the space already reserved
8b62f87b 4630 * @release_bytes: the len of the space we consumed or didn't use
1ada3a62
QW
4631 *
4632 * This function will release the metadata space that was not used and will
4633 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4634 * list if there are no delalloc bytes left.
4635 * Also it will handle the qgroup reserved space.
4636 */
bc42bda2 4637void btrfs_delalloc_release_space(struct inode *inode,
8b62f87b 4638 struct extent_changeset *reserved,
43b18595 4639 u64 start, u64 len, bool qgroup_free)
1ada3a62 4640{
43b18595 4641 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
bc42bda2 4642 btrfs_free_reserved_data_space(inode, reserved, start, len);
6324fbf3
CM
4643}
4644
ce93ec54 4645static int update_block_group(struct btrfs_trans_handle *trans,
6b279408 4646 u64 bytenr, u64 num_bytes, int alloc)
9078a3e1 4647{
6b279408 4648 struct btrfs_fs_info *info = trans->fs_info;
0af3d00b 4649 struct btrfs_block_group_cache *cache = NULL;
db94535d 4650 u64 total = num_bytes;
9078a3e1 4651 u64 old_val;
db94535d 4652 u64 byte_in_group;
0af3d00b 4653 int factor;
ba2c4d4e 4654 int ret = 0;
3e1ad54f 4655
5d4f98a2 4656 /* block accounting for super block */
eb73c1b7 4657 spin_lock(&info->delalloc_root_lock);
6c41761f 4658 old_val = btrfs_super_bytes_used(info->super_copy);
5d4f98a2
YZ
4659 if (alloc)
4660 old_val += num_bytes;
4661 else
4662 old_val -= num_bytes;
6c41761f 4663 btrfs_set_super_bytes_used(info->super_copy, old_val);
eb73c1b7 4664 spin_unlock(&info->delalloc_root_lock);
5d4f98a2 4665
d397712b 4666 while (total) {
db94535d 4667 cache = btrfs_lookup_block_group(info, bytenr);
ba2c4d4e
JB
4668 if (!cache) {
4669 ret = -ENOENT;
4670 break;
4671 }
46df06b8
DS
4672 factor = btrfs_bg_type_to_factor(cache->flags);
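/*
 * Added note: for duplicated profiles (RAID1, RAID10, DUP) the factor is
 * 2, so the disk_used accounting below moves twice the logical num_bytes;
 * for the other profiles the factor is 1.
 */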
4673
9d66e233
JB
4674 /*
4675 * If this block group has free space cache written out, we
4676 * need to make sure to load it if we are removing space. This
4677 * is because we need the unpinning stage to actually add the
4678 * space back to the block group, otherwise we will leak space.
4679 */
4680 if (!alloc && cache->cached == BTRFS_CACHE_NO)
f6373bf3 4681 cache_block_group(cache, 1);
0af3d00b 4682
db94535d
CM
4683 byte_in_group = bytenr - cache->key.objectid;
4684 WARN_ON(byte_in_group > cache->key.offset);
9078a3e1 4685
25179201 4686 spin_lock(&cache->space_info->lock);
c286ac48 4687 spin_lock(&cache->lock);
0af3d00b 4688
6202df69 4689 if (btrfs_test_opt(info, SPACE_CACHE) &&
0af3d00b
JB
4690 cache->disk_cache_state < BTRFS_DC_CLEAR)
4691 cache->disk_cache_state = BTRFS_DC_CLEAR;
4692
9078a3e1 4693 old_val = btrfs_block_group_used(&cache->item);
db94535d 4694 num_bytes = min(total, cache->key.offset - byte_in_group);
cd1bc465 4695 if (alloc) {
db94535d 4696 old_val += num_bytes;
11833d66
YZ
4697 btrfs_set_block_group_used(&cache->item, old_val);
4698 cache->reserved -= num_bytes;
11833d66 4699 cache->space_info->bytes_reserved -= num_bytes;
b742bb82
YZ
4700 cache->space_info->bytes_used += num_bytes;
4701 cache->space_info->disk_used += num_bytes * factor;
c286ac48 4702 spin_unlock(&cache->lock);
25179201 4703 spin_unlock(&cache->space_info->lock);
cd1bc465 4704 } else {
db94535d 4705 old_val -= num_bytes;
ae0ab003
FM
4706 btrfs_set_block_group_used(&cache->item, old_val);
4707 cache->pinned += num_bytes;
bb96c4e5
JB
4708 btrfs_space_info_update_bytes_pinned(info,
4709 cache->space_info, num_bytes);
ae0ab003
FM
4710 cache->space_info->bytes_used -= num_bytes;
4711 cache->space_info->disk_used -= num_bytes * factor;
4712 spin_unlock(&cache->lock);
4713 spin_unlock(&cache->space_info->lock);
47ab2a6c 4714
0b246afa 4715 trace_btrfs_space_reservation(info, "pinned",
c51e7bb1
JB
4716 cache->space_info->flags,
4717 num_bytes, 1);
dec59fa3
EL
4718 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
4719 num_bytes,
4720 BTRFS_TOTAL_BYTES_PINNED_BATCH);
ae0ab003
FM
4721 set_extent_dirty(info->pinned_extents,
4722 bytenr, bytenr + num_bytes - 1,
4723 GFP_NOFS | __GFP_NOFAIL);
cd1bc465 4724 }
1bbc621e
CM
4725
4726 spin_lock(&trans->transaction->dirty_bgs_lock);
4727 if (list_empty(&cache->dirty_list)) {
4728 list_add_tail(&cache->dirty_list,
4729 &trans->transaction->dirty_bgs);
ba2c4d4e 4730 trans->delayed_ref_updates++;
1bbc621e
CM
4731 btrfs_get_block_group(cache);
4732 }
4733 spin_unlock(&trans->transaction->dirty_bgs_lock);
4734
036a9348
FM
4735 /*
4736 * No longer have used bytes in this block group, queue it for
4737 * deletion. We do this after adding the block group to the
4738 * dirty list to avoid races between cleaner kthread and space
4739 * cache writeout.
4740 */
031f24da
QW
4741 if (!alloc && old_val == 0)
4742 btrfs_mark_bg_unused(cache);
036a9348 4743
fa9c0d79 4744 btrfs_put_block_group(cache);
db94535d
CM
4745 total -= num_bytes;
4746 bytenr += num_bytes;
9078a3e1 4747 }
ba2c4d4e
JB
4748
4749 /* Modified block groups are accounted for in the delayed_refs_rsv. */
4750 btrfs_update_delayed_refs_rsv(trans);
4751 return ret;
9078a3e1 4752}
6324fbf3 4753
2ff7e61e 4754static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
a061fc8d 4755{
0f9dd46c 4756 struct btrfs_block_group_cache *cache;
d2fb3437 4757 u64 bytenr;
0f9dd46c 4758
0b246afa
JM
4759 spin_lock(&fs_info->block_group_cache_lock);
4760 bytenr = fs_info->first_logical_byte;
4761 spin_unlock(&fs_info->block_group_cache_lock);
a1897fdd
LB
4762
4763 if (bytenr < (u64)-1)
4764 return bytenr;
4765
0b246afa 4766 cache = btrfs_lookup_first_block_group(fs_info, search_start);
0f9dd46c 4767 if (!cache)
a061fc8d 4768 return 0;
0f9dd46c 4769
d2fb3437 4770 bytenr = cache->key.objectid;
fa9c0d79 4771 btrfs_put_block_group(cache);
d2fb3437
YZ
4772
4773 return bytenr;
a061fc8d
CM
4774}
4775
fdf08605 4776static int pin_down_extent(struct btrfs_block_group_cache *cache,
f0486c68 4777 u64 bytenr, u64 num_bytes, int reserved)
324ae4df 4778{
fdf08605
DS
4779 struct btrfs_fs_info *fs_info = cache->fs_info;
4780
11833d66
YZ
4781 spin_lock(&cache->space_info->lock);
4782 spin_lock(&cache->lock);
4783 cache->pinned += num_bytes;
bb96c4e5
JB
4784 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
4785 num_bytes);
11833d66
YZ
4786 if (reserved) {
4787 cache->reserved -= num_bytes;
4788 cache->space_info->bytes_reserved -= num_bytes;
4789 }
4790 spin_unlock(&cache->lock);
4791 spin_unlock(&cache->space_info->lock);
68b38550 4792
0b246afa 4793 trace_btrfs_space_reservation(fs_info, "pinned",
c51e7bb1 4794 cache->space_info->flags, num_bytes, 1);
dec59fa3
EL
4795 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
4796 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
0b246afa 4797 set_extent_dirty(fs_info->pinned_extents, bytenr,
f0486c68
YZ
4798 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4799 return 0;
4800}
68b38550 4801
f0486c68
YZ
4802/*
4803 * this function must be called within transaction
4804 */
2ff7e61e 4805int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
f0486c68
YZ
4806 u64 bytenr, u64 num_bytes, int reserved)
4807{
4808 struct btrfs_block_group_cache *cache;
68b38550 4809
0b246afa 4810 cache = btrfs_lookup_block_group(fs_info, bytenr);
79787eaa 4811 BUG_ON(!cache); /* Logic error */
f0486c68 4812
fdf08605 4813 pin_down_extent(cache, bytenr, num_bytes, reserved);
f0486c68
YZ
4814
4815 btrfs_put_block_group(cache);
11833d66
YZ
4816 return 0;
4817}
4818
f0486c68 4819/*
e688b725
CM
4820 * this function must be called within transaction
4821 */
2ff7e61e 4822int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
e688b725
CM
4823 u64 bytenr, u64 num_bytes)
4824{
4825 struct btrfs_block_group_cache *cache;
b50c6e25 4826 int ret;
e688b725 4827
0b246afa 4828 cache = btrfs_lookup_block_group(fs_info, bytenr);
b50c6e25
JB
4829 if (!cache)
4830 return -EINVAL;
e688b725
CM
4831
4832 /*
4833 * pull in the free space cache (if any) so that our pin
4834 * removes the free space from the cache. We have load_only set
4835 * to one because the slow code to read in the free extents does check
4836 * the pinned extents.
4837 */
f6373bf3 4838 cache_block_group(cache, 1);
e688b725 4839
fdf08605 4840 pin_down_extent(cache, bytenr, num_bytes, 0);
e688b725
CM
4841
4842 /* remove us from the free space cache (if we're there at all) */
b50c6e25 4843 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
e688b725 4844 btrfs_put_block_group(cache);
b50c6e25 4845 return ret;
e688b725
CM
4846}
4847
2ff7e61e
JM
4848static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
4849 u64 start, u64 num_bytes)
8c2a1a30
JB
4850{
4851 int ret;
4852 struct btrfs_block_group_cache *block_group;
4853 struct btrfs_caching_control *caching_ctl;
4854
0b246afa 4855 block_group = btrfs_lookup_block_group(fs_info, start);
8c2a1a30
JB
4856 if (!block_group)
4857 return -EINVAL;
4858
4859 cache_block_group(block_group, 0);
4860 caching_ctl = get_caching_control(block_group);
4861
4862 if (!caching_ctl) {
4863 /* Logic error */
4864 BUG_ON(!block_group_cache_done(block_group));
4865 ret = btrfs_remove_free_space(block_group, start, num_bytes);
4866 } else {
4867 mutex_lock(&caching_ctl->mutex);
4868
4869 if (start >= caching_ctl->progress) {
2ff7e61e 4870 ret = add_excluded_extent(fs_info, start, num_bytes);
8c2a1a30
JB
4871 } else if (start + num_bytes <= caching_ctl->progress) {
4872 ret = btrfs_remove_free_space(block_group,
4873 start, num_bytes);
4874 } else {
4875 num_bytes = caching_ctl->progress - start;
4876 ret = btrfs_remove_free_space(block_group,
4877 start, num_bytes);
4878 if (ret)
4879 goto out_lock;
4880
4881 num_bytes = (start + num_bytes) -
4882 caching_ctl->progress;
4883 start = caching_ctl->progress;
2ff7e61e 4884 ret = add_excluded_extent(fs_info, start, num_bytes);
8c2a1a30
JB
4885 }
4886out_lock:
4887 mutex_unlock(&caching_ctl->mutex);
4888 put_caching_control(caching_ctl);
4889 }
4890 btrfs_put_block_group(block_group);
4891 return ret;
4892}
4893
bcdc428c 4894int btrfs_exclude_logged_extents(struct extent_buffer *eb)
8c2a1a30 4895{
bcdc428c 4896 struct btrfs_fs_info *fs_info = eb->fs_info;
8c2a1a30
JB
4897 struct btrfs_file_extent_item *item;
4898 struct btrfs_key key;
4899 int found_type;
4900 int i;
b89311ef 4901 int ret = 0;
8c2a1a30 4902
2ff7e61e 4903 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
8c2a1a30
JB
4904 return 0;
4905
4906 for (i = 0; i < btrfs_header_nritems(eb); i++) {
4907 btrfs_item_key_to_cpu(eb, &key, i);
4908 if (key.type != BTRFS_EXTENT_DATA_KEY)
4909 continue;
4910 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
4911 found_type = btrfs_file_extent_type(eb, item);
4912 if (found_type == BTRFS_FILE_EXTENT_INLINE)
4913 continue;
4914 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
4915 continue;
4916 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
4917 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
b89311ef
GJ
4918 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
4919 if (ret)
4920 break;
8c2a1a30
JB
4921 }
4922
b89311ef 4923 return ret;
8c2a1a30
JB
4924}
4925
9cfa3e34
FM
4926static void
4927btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
4928{
4929 atomic_inc(&bg->reservations);
4930}
4931
4932void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
4933 const u64 start)
4934{
4935 struct btrfs_block_group_cache *bg;
4936
4937 bg = btrfs_lookup_block_group(fs_info, start);
4938 ASSERT(bg);
4939 if (atomic_dec_and_test(&bg->reservations))
4625956a 4940 wake_up_var(&bg->reservations);
9cfa3e34
FM
4941 btrfs_put_block_group(bg);
4942}
4943
9cfa3e34
FM
4944void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
4945{
4946 struct btrfs_space_info *space_info = bg->space_info;
4947
4948 ASSERT(bg->ro);
4949
4950 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
4951 return;
4952
4953 /*
4954 * Our block group is read only but before we set it to read only,
4955 * some task might have already allocated an extent from it, but it
4956 * has not yet created the corresponding ordered extent (and added it to a
4957 * root's list of ordered extents).
4958 * Therefore wait for any task currently allocating extents, since the
4959 * block group's reservations counter is incremented while a read lock
4960 * on the groups' semaphore is held and decremented after releasing
4961 * the read access on that semaphore and creating the ordered extent.
4962 */
4963 down_write(&space_info->groups_sem);
4964 up_write(&space_info->groups_sem);
4965
4625956a 4966 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
9cfa3e34
FM
4967}
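/*
 * The down_write()/up_write() pair above acts as a barrier rather than a
 * critical section: the write lock cannot be taken until every reader that
 * might still bump bg->reservations has dropped groups_sem.  A minimal
 * sketch of the same idiom, assuming a counter that is only incremented
 * while a read lock on sem is held:
 *
 *	down_write(&sem);
 *	up_write(&sem);
 *	wait_var_event(&counter, !atomic_read(&counter));
 */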
4968
fb25e914 4969/**
4824f1f4 4970 * btrfs_add_reserved_bytes - update the block_group and space info counters
fb25e914 4971 * @cache: The cache we are manipulating
18513091
WX
4972 * @ram_bytes: The number of bytes of file content; it will be the same as
4973 * @num_bytes except on the compression path.
fb25e914 4974 * @num_bytes: The number of bytes in question
e570fd27 4975 * @delalloc: The blocks are allocated for the delalloc write
fb25e914 4976 *
745699ef
XW
4977 * This is called by the allocator when it reserves space. If this is a
4978 * reservation and the block group has become read only we cannot make the
4979 * reservation and return -EAGAIN, otherwise this function always succeeds.
f0486c68 4980 */
4824f1f4 4981static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
18513091 4982 u64 ram_bytes, u64 num_bytes, int delalloc)
11833d66 4983{
fb25e914 4984 struct btrfs_space_info *space_info = cache->space_info;
f0486c68 4985 int ret = 0;
79787eaa 4986
fb25e914
JB
4987 spin_lock(&space_info->lock);
4988 spin_lock(&cache->lock);
4824f1f4
WX
4989 if (cache->ro) {
4990 ret = -EAGAIN;
fb25e914 4991 } else {
4824f1f4
WX
4992 cache->reserved += num_bytes;
4993 space_info->bytes_reserved += num_bytes;
bb96c4e5
JB
4994 btrfs_space_info_update_bytes_may_use(cache->fs_info,
4995 space_info, -ram_bytes);
e570fd27 4996 if (delalloc)
4824f1f4 4997 cache->delalloc_bytes += num_bytes;
324ae4df 4998 }
fb25e914
JB
4999 spin_unlock(&cache->lock);
5000 spin_unlock(&space_info->lock);
f0486c68 5001 return ret;
324ae4df 5002}
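/*
 * A minimal caller sketch matching how find_free_extent() below consumes the
 * -EAGAIN return: when the block group has gone read-only, the candidate
 * range is handed back to the free space cache and the search moves on.
 *
 *	ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes,
 *				       delalloc);
 *	if (ret == -EAGAIN) {
 *		btrfs_add_free_space(block_group, ffe_ctl.found_offset,
 *				     num_bytes);
 *		goto loop;
 *	}
 */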
9078a3e1 5003
4824f1f4
WX
5004/**
5005 * btrfs_free_reserved_bytes - update the block_group and space info counters
5006 * @cache: The cache we are manipulating
5007 * @num_bytes: The number of bytes in question
5008 * @delalloc: The blocks are allocated for the delalloc write
5009 *
5010 * This is called by somebody who is freeing space that was never actually used
5011 * on disk. For example if you reserve some space for a new leaf in transaction
5012 * A and before transaction A commits you free that leaf, you call this with
5013 * reserve set to 0 in order to clear the reservation.
5014 */
5015
556f3ca8 5016static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
5017 u64 num_bytes, int delalloc)
4824f1f4
WX
5018{
5019 struct btrfs_space_info *space_info = cache->space_info;
4824f1f4
WX
5020
5021 spin_lock(&space_info->lock);
5022 spin_lock(&cache->lock);
5023 if (cache->ro)
5024 space_info->bytes_readonly += num_bytes;
5025 cache->reserved -= num_bytes;
5026 space_info->bytes_reserved -= num_bytes;
21a94f7a 5027 space_info->max_extent_size = 0;
4824f1f4
WX
5028
5029 if (delalloc)
5030 cache->delalloc_bytes -= num_bytes;
5031 spin_unlock(&cache->lock);
5032 spin_unlock(&space_info->lock);
4824f1f4 5033}
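/*
 * A minimal caller sketch matching btrfs_free_tree_block() below: when a
 * clean, last-referenced buffer allocated in the current transaction is
 * freed, its range goes straight back to the free space cache and the
 * reservation is dropped.
 *
 *	btrfs_add_free_space(cache, buf->start, buf->len);
 *	btrfs_free_reserved_bytes(cache, buf->len, 0);
 */
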
8b74c03e 5034void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
e8569813 5035{
11833d66
YZ
5036 struct btrfs_caching_control *next;
5037 struct btrfs_caching_control *caching_ctl;
5038 struct btrfs_block_group_cache *cache;
e8569813 5039
9e351cc8 5040 down_write(&fs_info->commit_root_sem);
25179201 5041
11833d66
YZ
5042 list_for_each_entry_safe(caching_ctl, next,
5043 &fs_info->caching_block_groups, list) {
5044 cache = caching_ctl->block_group;
5045 if (block_group_cache_done(cache)) {
5046 cache->last_byte_to_unpin = (u64)-1;
5047 list_del_init(&caching_ctl->list);
5048 put_caching_control(caching_ctl);
e8569813 5049 } else {
11833d66 5050 cache->last_byte_to_unpin = caching_ctl->progress;
e8569813 5051 }
e8569813 5052 }
11833d66
YZ
5053
5054 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5055 fs_info->pinned_extents = &fs_info->freed_extents[1];
5056 else
5057 fs_info->pinned_extents = &fs_info->freed_extents[0];
5058
9e351cc8 5059 up_write(&fs_info->commit_root_sem);
8929ecfa 5060
67f9c220 5061 btrfs_update_global_block_rsv(fs_info);
e8569813
ZY
5062}
5063
c759c4e1
JB
5064/*
5065 * Returns the free cluster for the given space info and sets empty_cluster to
5066 * what it should be based on the mount options.
5067 */
5068static struct btrfs_free_cluster *
2ff7e61e
JM
5069fetch_cluster_info(struct btrfs_fs_info *fs_info,
5070 struct btrfs_space_info *space_info, u64 *empty_cluster)
c759c4e1
JB
5071{
5072 struct btrfs_free_cluster *ret = NULL;
c759c4e1
JB
5073
5074 *empty_cluster = 0;
5075 if (btrfs_mixed_space_info(space_info))
5076 return ret;
5077
c759c4e1 5078 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
0b246afa 5079 ret = &fs_info->meta_alloc_cluster;
583b7231
HK
5080 if (btrfs_test_opt(fs_info, SSD))
5081 *empty_cluster = SZ_2M;
5082 else
ee22184b 5083 *empty_cluster = SZ_64K;
583b7231
HK
5084 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
5085 btrfs_test_opt(fs_info, SSD_SPREAD)) {
5086 *empty_cluster = SZ_2M;
0b246afa 5087 ret = &fs_info->data_alloc_cluster;
c759c4e1
JB
5088 }
5089
5090 return ret;
5091}
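/*
 * Caller sketch (see unpin_extent_range() and find_free_extent() below); the
 * empty_cluster hint follows the mount options: 2M for metadata on SSD (64K
 * otherwise), 2M for data with SSD_SPREAD, and 0 for mixed block groups,
 * which never use a cluster.
 *
 *	cluster = fetch_cluster_info(fs_info, space_info, &empty_cluster);
 */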
5092
2ff7e61e
JM
5093static int unpin_extent_range(struct btrfs_fs_info *fs_info,
5094 u64 start, u64 end,
678886bd 5095 const bool return_free_space)
ccd467d6 5096{
11833d66 5097 struct btrfs_block_group_cache *cache = NULL;
7b398f8e
JB
5098 struct btrfs_space_info *space_info;
5099 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
c759c4e1 5100 struct btrfs_free_cluster *cluster = NULL;
11833d66 5101 u64 len;
c759c4e1
JB
5102 u64 total_unpinned = 0;
5103 u64 empty_cluster = 0;
7b398f8e 5104 bool readonly;
ccd467d6 5105
11833d66 5106 while (start <= end) {
7b398f8e 5107 readonly = false;
11833d66
YZ
5108 if (!cache ||
5109 start >= cache->key.objectid + cache->key.offset) {
5110 if (cache)
5111 btrfs_put_block_group(cache);
c759c4e1 5112 total_unpinned = 0;
11833d66 5113 cache = btrfs_lookup_block_group(fs_info, start);
79787eaa 5114 BUG_ON(!cache); /* Logic error */
c759c4e1 5115
2ff7e61e 5116 cluster = fetch_cluster_info(fs_info,
c759c4e1
JB
5117 cache->space_info,
5118 &empty_cluster);
5119 empty_cluster <<= 1;
11833d66
YZ
5120 }
5121
5122 len = cache->key.objectid + cache->key.offset - start;
5123 len = min(len, end + 1 - start);
5124
5125 if (start < cache->last_byte_to_unpin) {
5126 len = min(len, cache->last_byte_to_unpin - start);
678886bd
FM
5127 if (return_free_space)
5128 btrfs_add_free_space(cache, start, len);
11833d66
YZ
5129 }
5130
f0486c68 5131 start += len;
c759c4e1 5132 total_unpinned += len;
7b398f8e 5133 space_info = cache->space_info;
f0486c68 5134
c759c4e1
JB
5135 /*
5136 * If this space cluster has been marked as fragmented and we've
5137 * unpinned enough in this block group to potentially allow a
5138 * cluster to be created inside of it go ahead and clear the
5139 * fragmented check.
5140 */
5141 if (cluster && cluster->fragmented &&
5142 total_unpinned > empty_cluster) {
5143 spin_lock(&cluster->lock);
5144 cluster->fragmented = 0;
5145 spin_unlock(&cluster->lock);
5146 }
5147
7b398f8e 5148 spin_lock(&space_info->lock);
11833d66
YZ
5149 spin_lock(&cache->lock);
5150 cache->pinned -= len;
bb96c4e5 5151 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
c51e7bb1
JB
5152
5153 trace_btrfs_space_reservation(fs_info, "pinned",
5154 space_info->flags, len, 0);
4f4db217 5155 space_info->max_extent_size = 0;
dec59fa3
EL
5156 percpu_counter_add_batch(&space_info->total_bytes_pinned,
5157 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
7b398f8e
JB
5158 if (cache->ro) {
5159 space_info->bytes_readonly += len;
5160 readonly = true;
5161 }
11833d66 5162 spin_unlock(&cache->lock);
957780eb
JB
5163 if (!readonly && return_free_space &&
5164 global_rsv->space_info == space_info) {
5165 u64 to_add = len;
92ac58ec 5166
7b398f8e
JB
5167 spin_lock(&global_rsv->lock);
5168 if (!global_rsv->full) {
957780eb
JB
5169 to_add = min(len, global_rsv->size -
5170 global_rsv->reserved);
5171 global_rsv->reserved += to_add;
bb96c4e5
JB
5172 btrfs_space_info_update_bytes_may_use(fs_info,
5173 space_info, to_add);
7b398f8e
JB
5174 if (global_rsv->reserved >= global_rsv->size)
5175 global_rsv->full = 1;
957780eb
JB
5176 trace_btrfs_space_reservation(fs_info,
5177 "space_info",
5178 space_info->flags,
5179 to_add, 1);
5180 len -= to_add;
7b398f8e
JB
5181 }
5182 spin_unlock(&global_rsv->lock);
957780eb
JB
5183 /* Add to any tickets we may have */
5184 if (len)
d44b72aa
JB
5185 btrfs_space_info_add_new_bytes(fs_info,
5186 space_info, len);
7b398f8e
JB
5187 }
5188 spin_unlock(&space_info->lock);
ccd467d6 5189 }
11833d66
YZ
5190
5191 if (cache)
5192 btrfs_put_block_group(cache);
ccd467d6
CM
5193 return 0;
5194}
5195
5ead2dd0 5196int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
a28ec197 5197{
5ead2dd0 5198 struct btrfs_fs_info *fs_info = trans->fs_info;
e33e17ee
JM
5199 struct btrfs_block_group_cache *block_group, *tmp;
5200 struct list_head *deleted_bgs;
11833d66 5201 struct extent_io_tree *unpin;
1a5bc167
CM
5202 u64 start;
5203 u64 end;
a28ec197 5204 int ret;
a28ec197 5205
11833d66
YZ
5206 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5207 unpin = &fs_info->freed_extents[1];
5208 else
5209 unpin = &fs_info->freed_extents[0];
5210
e33e17ee 5211 while (!trans->aborted) {
0e6ec385
FM
5212 struct extent_state *cached_state = NULL;
5213
d4b450cd 5214 mutex_lock(&fs_info->unused_bg_unpin_mutex);
1a5bc167 5215 ret = find_first_extent_bit(unpin, 0, &start, &end,
0e6ec385 5216 EXTENT_DIRTY, &cached_state);
d4b450cd
FM
5217 if (ret) {
5218 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
a28ec197 5219 break;
d4b450cd 5220 }
1f3c79a2 5221
0b246afa 5222 if (btrfs_test_opt(fs_info, DISCARD))
2ff7e61e 5223 ret = btrfs_discard_extent(fs_info, start,
5378e607 5224 end + 1 - start, NULL);
1f3c79a2 5225
0e6ec385 5226 clear_extent_dirty(unpin, start, end, &cached_state);
2ff7e61e 5227 unpin_extent_range(fs_info, start, end, true);
d4b450cd 5228 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
0e6ec385 5229 free_extent_state(cached_state);
b9473439 5230 cond_resched();
a28ec197 5231 }
817d52f8 5232
e33e17ee
JM
5233 /*
5234 * Transaction is finished. We don't need the lock anymore. We
5235 * do need to clean up the block groups in case of a transaction
5236 * abort.
5237 */
5238 deleted_bgs = &trans->transaction->deleted_bgs;
5239 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
5240 u64 trimmed = 0;
5241
5242 ret = -EROFS;
5243 if (!trans->aborted)
2ff7e61e 5244 ret = btrfs_discard_extent(fs_info,
e33e17ee
JM
5245 block_group->key.objectid,
5246 block_group->key.offset,
5247 &trimmed);
5248
5249 list_del_init(&block_group->bg_list);
5250 btrfs_put_block_group_trimming(block_group);
5251 btrfs_put_block_group(block_group);
5252
5253 if (ret) {
5254 const char *errstr = btrfs_decode_error(ret);
5255 btrfs_warn(fs_info,
913e1535 5256 "discard failed while removing blockgroup: errno=%d %s",
e33e17ee
JM
5257 ret, errstr);
5258 }
5259 }
5260
e20d96d6
CM
5261 return 0;
5262}
5263
5d4f98a2 5264static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
e72cb923
NB
5265 struct btrfs_delayed_ref_node *node, u64 parent,
5266 u64 root_objectid, u64 owner_objectid,
5267 u64 owner_offset, int refs_to_drop,
5268 struct btrfs_delayed_extent_op *extent_op)
a28ec197 5269{
e72cb923 5270 struct btrfs_fs_info *info = trans->fs_info;
e2fa7227 5271 struct btrfs_key key;
5d4f98a2 5272 struct btrfs_path *path;
1261ec42 5273 struct btrfs_root *extent_root = info->extent_root;
5f39d397 5274 struct extent_buffer *leaf;
5d4f98a2
YZ
5275 struct btrfs_extent_item *ei;
5276 struct btrfs_extent_inline_ref *iref;
a28ec197 5277 int ret;
5d4f98a2 5278 int is_data;
952fccac
CM
5279 int extent_slot = 0;
5280 int found_extent = 0;
5281 int num_to_del = 1;
5d4f98a2
YZ
5282 u32 item_size;
5283 u64 refs;
c682f9b3
QW
5284 u64 bytenr = node->bytenr;
5285 u64 num_bytes = node->num_bytes;
fcebe456 5286 int last_ref = 0;
0b246afa 5287 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
037e6390 5288
5caf2a00 5289 path = btrfs_alloc_path();
54aa1f4d
CM
5290 if (!path)
5291 return -ENOMEM;
5f26f772 5292
e4058b54 5293 path->reada = READA_FORWARD;
b9473439 5294 path->leave_spinning = 1;
5d4f98a2
YZ
5295
5296 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5297 BUG_ON(!is_data && refs_to_drop != 1);
5298
3173a18f 5299 if (is_data)
897ca819 5300 skinny_metadata = false;
3173a18f 5301
fbe4801b
NB
5302 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
5303 parent, root_objectid, owner_objectid,
5d4f98a2 5304 owner_offset);
7bb86316 5305 if (ret == 0) {
952fccac 5306 extent_slot = path->slots[0];
5d4f98a2
YZ
5307 while (extent_slot >= 0) {
5308 btrfs_item_key_to_cpu(path->nodes[0], &key,
952fccac 5309 extent_slot);
5d4f98a2 5310 if (key.objectid != bytenr)
952fccac 5311 break;
5d4f98a2
YZ
5312 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5313 key.offset == num_bytes) {
952fccac
CM
5314 found_extent = 1;
5315 break;
5316 }
3173a18f
JB
5317 if (key.type == BTRFS_METADATA_ITEM_KEY &&
5318 key.offset == owner_objectid) {
5319 found_extent = 1;
5320 break;
5321 }
952fccac
CM
5322 if (path->slots[0] - extent_slot > 5)
5323 break;
5d4f98a2 5324 extent_slot--;
952fccac 5325 }
a79865c6 5326
31840ae1 5327 if (!found_extent) {
5d4f98a2 5328 BUG_ON(iref);
87cc7a8a 5329 ret = remove_extent_backref(trans, path, NULL,
87bde3cd 5330 refs_to_drop,
fcebe456 5331 is_data, &last_ref);
005d6427 5332 if (ret) {
66642832 5333 btrfs_abort_transaction(trans, ret);
005d6427
DS
5334 goto out;
5335 }
b3b4aa74 5336 btrfs_release_path(path);
b9473439 5337 path->leave_spinning = 1;
5d4f98a2
YZ
5338
5339 key.objectid = bytenr;
5340 key.type = BTRFS_EXTENT_ITEM_KEY;
5341 key.offset = num_bytes;
5342
3173a18f
JB
5343 if (!is_data && skinny_metadata) {
5344 key.type = BTRFS_METADATA_ITEM_KEY;
5345 key.offset = owner_objectid;
5346 }
5347
31840ae1
ZY
5348 ret = btrfs_search_slot(trans, extent_root,
5349 &key, path, -1, 1);
3173a18f
JB
5350 if (ret > 0 && skinny_metadata && path->slots[0]) {
5351 /*
5352 * Couldn't find our skinny metadata item,
5353 * see if we have ye olde extent item.
5354 */
5355 path->slots[0]--;
5356 btrfs_item_key_to_cpu(path->nodes[0], &key,
5357 path->slots[0]);
5358 if (key.objectid == bytenr &&
5359 key.type == BTRFS_EXTENT_ITEM_KEY &&
5360 key.offset == num_bytes)
5361 ret = 0;
5362 }
5363
5364 if (ret > 0 && skinny_metadata) {
5365 skinny_metadata = false;
9ce49a0b 5366 key.objectid = bytenr;
3173a18f
JB
5367 key.type = BTRFS_EXTENT_ITEM_KEY;
5368 key.offset = num_bytes;
5369 btrfs_release_path(path);
5370 ret = btrfs_search_slot(trans, extent_root,
5371 &key, path, -1, 1);
5372 }
5373
f3465ca4 5374 if (ret) {
5d163e0e
JM
5375 btrfs_err(info,
5376 "umm, got %d back from search, was looking for %llu",
5377 ret, bytenr);
b783e62d 5378 if (ret > 0)
a4f78750 5379 btrfs_print_leaf(path->nodes[0]);
f3465ca4 5380 }
005d6427 5381 if (ret < 0) {
66642832 5382 btrfs_abort_transaction(trans, ret);
005d6427
DS
5383 goto out;
5384 }
31840ae1
ZY
5385 extent_slot = path->slots[0];
5386 }
fae7f21c 5387 } else if (WARN_ON(ret == -ENOENT)) {
a4f78750 5388 btrfs_print_leaf(path->nodes[0]);
c2cf52eb
SK
5389 btrfs_err(info,
5390 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
c1c9ff7c
GU
5391 bytenr, parent, root_objectid, owner_objectid,
5392 owner_offset);
66642832 5393 btrfs_abort_transaction(trans, ret);
c4a050bb 5394 goto out;
79787eaa 5395 } else {
66642832 5396 btrfs_abort_transaction(trans, ret);
005d6427 5397 goto out;
7bb86316 5398 }
5f39d397
CM
5399
5400 leaf = path->nodes[0];
5d4f98a2 5401 item_size = btrfs_item_size_nr(leaf, extent_slot);
6d8ff4e4 5402 if (unlikely(item_size < sizeof(*ei))) {
ba3c2b19
NB
5403 ret = -EINVAL;
5404 btrfs_print_v0_err(info);
5405 btrfs_abort_transaction(trans, ret);
5406 goto out;
5407 }
952fccac 5408 ei = btrfs_item_ptr(leaf, extent_slot,
123abc88 5409 struct btrfs_extent_item);
3173a18f
JB
5410 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
5411 key.type == BTRFS_EXTENT_ITEM_KEY) {
5d4f98a2
YZ
5412 struct btrfs_tree_block_info *bi;
5413 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5414 bi = (struct btrfs_tree_block_info *)(ei + 1);
5415 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5416 }
56bec294 5417
5d4f98a2 5418 refs = btrfs_extent_refs(leaf, ei);
32b02538 5419 if (refs < refs_to_drop) {
5d163e0e
JM
5420 btrfs_err(info,
5421 "trying to drop %d refs but we only have %Lu for bytenr %Lu",
5422 refs_to_drop, refs, bytenr);
32b02538 5423 ret = -EINVAL;
66642832 5424 btrfs_abort_transaction(trans, ret);
32b02538
JB
5425 goto out;
5426 }
56bec294 5427 refs -= refs_to_drop;
5f39d397 5428
5d4f98a2
YZ
5429 if (refs > 0) {
5430 if (extent_op)
5431 __run_delayed_extent_op(extent_op, leaf, ei);
5432 /*
5433 * In the case of inline back ref, reference count will
5434 * be updated by remove_extent_backref
952fccac 5435 */
5d4f98a2
YZ
5436 if (iref) {
5437 BUG_ON(!found_extent);
5438 } else {
5439 btrfs_set_extent_refs(leaf, ei, refs);
5440 btrfs_mark_buffer_dirty(leaf);
5441 }
5442 if (found_extent) {
87cc7a8a
NB
5443 ret = remove_extent_backref(trans, path, iref,
5444 refs_to_drop, is_data,
5445 &last_ref);
005d6427 5446 if (ret) {
66642832 5447 btrfs_abort_transaction(trans, ret);
005d6427
DS
5448 goto out;
5449 }
952fccac 5450 }
5d4f98a2 5451 } else {
5d4f98a2
YZ
5452 if (found_extent) {
5453 BUG_ON(is_data && refs_to_drop !=
9ed0dea0 5454 extent_data_ref_count(path, iref));
5d4f98a2
YZ
5455 if (iref) {
5456 BUG_ON(path->slots[0] != extent_slot);
5457 } else {
5458 BUG_ON(path->slots[0] != extent_slot + 1);
5459 path->slots[0] = extent_slot;
5460 num_to_del = 2;
5461 }
78fae27e 5462 }
b9473439 5463
fcebe456 5464 last_ref = 1;
952fccac
CM
5465 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5466 num_to_del);
005d6427 5467 if (ret) {
66642832 5468 btrfs_abort_transaction(trans, ret);
005d6427
DS
5469 goto out;
5470 }
b3b4aa74 5471 btrfs_release_path(path);
21af804c 5472
5d4f98a2 5473 if (is_data) {
5b4aacef 5474 ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
005d6427 5475 if (ret) {
66642832 5476 btrfs_abort_transaction(trans, ret);
005d6427
DS
5477 goto out;
5478 }
459931ec
CM
5479 }
5480
e7355e50 5481 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
1e144fb8 5482 if (ret) {
66642832 5483 btrfs_abort_transaction(trans, ret);
1e144fb8
OS
5484 goto out;
5485 }
5486
6b279408 5487 ret = update_block_group(trans, bytenr, num_bytes, 0);
005d6427 5488 if (ret) {
66642832 5489 btrfs_abort_transaction(trans, ret);
005d6427
DS
5490 goto out;
5491 }
a28ec197 5492 }
fcebe456
JB
5493 btrfs_release_path(path);
5494
79787eaa 5495out:
5caf2a00 5496 btrfs_free_path(path);
a28ec197
CM
5497 return ret;
5498}
5499
1887be66 5500/*
f0486c68 5501 * when we free a block, it is possible (and likely) that we free the last
1887be66
CM
5502 * delayed ref for that extent as well. This searches the delayed ref tree for
5503 * a given extent, and if there are no other delayed refs to be processed, it
5504 * removes it from the tree.
5505 */
5506static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2ff7e61e 5507 u64 bytenr)
1887be66
CM
5508{
5509 struct btrfs_delayed_ref_head *head;
5510 struct btrfs_delayed_ref_root *delayed_refs;
f0486c68 5511 int ret = 0;
1887be66
CM
5512
5513 delayed_refs = &trans->transaction->delayed_refs;
5514 spin_lock(&delayed_refs->lock);
f72ad18e 5515 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
1887be66 5516 if (!head)
cf93da7b 5517 goto out_delayed_unlock;
1887be66 5518
d7df2c79 5519 spin_lock(&head->lock);
e3d03965 5520 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1887be66
CM
5521 goto out;
5522
bedc6617
JB
5523 if (cleanup_extent_op(head) != NULL)
5524 goto out;
5d4f98a2 5525
1887be66
CM
5526 /*
5527 * waiting for the lock here would deadlock. If someone else has it
5528 * locked they are already in the process of dropping it anyway
5529 */
5530 if (!mutex_trylock(&head->mutex))
5531 goto out;
5532
d7baffda 5533 btrfs_delete_ref_head(delayed_refs, head);
d7df2c79 5534 head->processing = 0;
d7baffda 5535
d7df2c79 5536 spin_unlock(&head->lock);
1887be66
CM
5537 spin_unlock(&delayed_refs->lock);
5538
f0486c68
YZ
5539 BUG_ON(head->extent_op);
5540 if (head->must_insert_reserved)
5541 ret = 1;
5542
31890da0 5543 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
f0486c68 5544 mutex_unlock(&head->mutex);
d278850e 5545 btrfs_put_delayed_ref_head(head);
f0486c68 5546 return ret;
1887be66 5547out:
d7df2c79 5548 spin_unlock(&head->lock);
cf93da7b
CM
5549
5550out_delayed_unlock:
1887be66
CM
5551 spin_unlock(&delayed_refs->lock);
5552 return 0;
5553}
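/*
 * A minimal caller sketch matching btrfs_free_tree_block() below: a return
 * of 1 means the removed head had must_insert_reserved set, so the caller
 * still owns the reserved extent and releases it itself; a return of 0 means
 * normal delayed ref processing will take care of the extent and the buffer
 * can be left alone.
 *
 *	ret = check_ref_cleanup(trans, buf->start);
 *	if (!ret)
 *		goto out;
 */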
5554
f0486c68
YZ
5555void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5556 struct btrfs_root *root,
5557 struct extent_buffer *buf,
5581a51a 5558 u64 parent, int last_ref)
f0486c68 5559{
0b246afa 5560 struct btrfs_fs_info *fs_info = root->fs_info;
ed4f255b 5561 struct btrfs_ref generic_ref = { 0 };
b150a4f1 5562 int pin = 1;
f0486c68
YZ
5563 int ret;
5564
ed4f255b
QW
5565 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
5566 buf->start, buf->len, parent);
5567 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
5568 root->root_key.objectid);
5569
f0486c68 5570 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
d7eae340
OS
5571 int old_ref_mod, new_ref_mod;
5572
8a5040f7 5573 btrfs_ref_tree_mod(fs_info, &generic_ref);
ed4f255b 5574 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
d7eae340 5575 &old_ref_mod, &new_ref_mod);
79787eaa 5576 BUG_ON(ret); /* -ENOMEM */
d7eae340 5577 pin = old_ref_mod >= 0 && new_ref_mod < 0;
f0486c68
YZ
5578 }
5579
0a16c7d7 5580 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
6219872d
FM
5581 struct btrfs_block_group_cache *cache;
5582
f0486c68 5583 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
2ff7e61e 5584 ret = check_ref_cleanup(trans, buf->start);
f0486c68 5585 if (!ret)
37be25bc 5586 goto out;
f0486c68
YZ
5587 }
5588
4da8b76d 5589 pin = 0;
0b246afa 5590 cache = btrfs_lookup_block_group(fs_info, buf->start);
6219872d 5591
f0486c68 5592 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
fdf08605 5593 pin_down_extent(cache, buf->start, buf->len, 1);
6219872d 5594 btrfs_put_block_group(cache);
37be25bc 5595 goto out;
f0486c68
YZ
5596 }
5597
5598 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5599
5600 btrfs_add_free_space(cache, buf->start, buf->len);
4824f1f4 5601 btrfs_free_reserved_bytes(cache, buf->len, 0);
6219872d 5602 btrfs_put_block_group(cache);
71ff6437 5603 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
f0486c68
YZ
5604 }
5605out:
b150a4f1 5606 if (pin)
78192442 5607 add_pinned_bytes(fs_info, &generic_ref);
b150a4f1 5608
0a16c7d7
OS
5609 if (last_ref) {
5610 /*
5611 * Deleting the buffer, clear the corrupt flag since it doesn't
5612 * matter anymore.
5613 */
5614 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5615 }
f0486c68
YZ
5616}
5617
79787eaa 5618/* Can return -ENOMEM */
ffd4bb2a 5619int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
925baedd 5620{
ffd4bb2a 5621 struct btrfs_fs_info *fs_info = trans->fs_info;
d7eae340 5622 int old_ref_mod, new_ref_mod;
925baedd
CM
5623 int ret;
5624
f5ee5c9a 5625 if (btrfs_is_testing(fs_info))
faa2dbf0 5626 return 0;
fccb84c9 5627
56bec294
CM
5628 /*
5629 * tree log blocks never actually go into the extent allocation
5630 * tree, just update pinning info and exit early.
56bec294 5631 */
ffd4bb2a
QW
5632 if ((ref->type == BTRFS_REF_METADATA &&
5633 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
5634 (ref->type == BTRFS_REF_DATA &&
5635 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
b9473439 5636 /* unlocks the pinned mutex */
ffd4bb2a 5637 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
d7eae340 5638 old_ref_mod = new_ref_mod = 0;
56bec294 5639 ret = 0;
ffd4bb2a
QW
5640 } else if (ref->type == BTRFS_REF_METADATA) {
5641 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
d7eae340 5642 &old_ref_mod, &new_ref_mod);
5d4f98a2 5643 } else {
ffd4bb2a 5644 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
d7eae340 5645 &old_ref_mod, &new_ref_mod);
56bec294 5646 }
d7eae340 5647
ffd4bb2a
QW
5648 if (!((ref->type == BTRFS_REF_METADATA &&
5649 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
5650 (ref->type == BTRFS_REF_DATA &&
5651 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
5652 btrfs_ref_tree_mod(fs_info, ref);
8a5040f7 5653
ddf30cf0 5654 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
78192442 5655 add_pinned_bytes(fs_info, ref);
d7eae340 5656
925baedd
CM
5657 return ret;
5658}
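/*
 * A minimal caller sketch, assuming a COW'd tree block whose old copy is
 * being dropped; this mirrors the btrfs_ref setup used by
 * btrfs_free_tree_block() above.  btrfs_free_extent() can return -ENOMEM.
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
 *			       buf->start, buf->len, parent);
 *	btrfs_init_tree_ref(&ref, btrfs_header_level(buf),
 *			    root->root_key.objectid);
 *	ret = btrfs_free_extent(trans, &ref);
 */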
5659
817d52f8
JB
5660/*
5661 * when we wait for progress in the block group caching, it's because
5662 * our allocation attempt failed at least once. So, we must sleep
5663 * and let some progress happen before we try again.
5664 *
5665 * This function will sleep at least once waiting for new free space to
5666 * show up, and then it will check the block group free space numbers
5667 * for our min num_bytes. Another option is to have it go ahead
5668 * and look in the rbtree for a free extent of a given size, but this
5669 * is a good start.
36cce922
JB
5670 *
5671 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
5672 * any of the information in this block group.
817d52f8 5673 */
36cce922 5674static noinline void
817d52f8
JB
5675wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5676 u64 num_bytes)
5677{
11833d66 5678 struct btrfs_caching_control *caching_ctl;
817d52f8 5679
11833d66
YZ
5680 caching_ctl = get_caching_control(cache);
5681 if (!caching_ctl)
36cce922 5682 return;
817d52f8 5683
11833d66 5684 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
34d52cb6 5685 (cache->free_space_ctl->free_space >= num_bytes));
11833d66
YZ
5686
5687 put_caching_control(caching_ctl);
11833d66
YZ
5688}
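/*
 * A minimal caller sketch matching the clustered/unclustered helpers below:
 * after a failed attempt against a still-caching block group, sleep until
 * either caching finishes or enough free space shows up, then ask the main
 * loop to retry this block group.
 *
 *	wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
 *			ffe_ctl->empty_cluster + ffe_ctl->empty_size);
 *	return -EAGAIN;
 */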
5689
5690static noinline int
5691wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5692{
5693 struct btrfs_caching_control *caching_ctl;
36cce922 5694 int ret = 0;
11833d66
YZ
5695
5696 caching_ctl = get_caching_control(cache);
5697 if (!caching_ctl)
36cce922 5698 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
11833d66
YZ
5699
5700 wait_event(caching_ctl->wait, block_group_cache_done(cache));
36cce922
JB
5701 if (cache->cached == BTRFS_CACHE_ERROR)
5702 ret = -EIO;
11833d66 5703 put_caching_control(caching_ctl);
36cce922 5704 return ret;
817d52f8
JB
5705}
5706
5707enum btrfs_loop_type {
f262fa8d
DS
5708 LOOP_CACHING_NOWAIT,
5709 LOOP_CACHING_WAIT,
5710 LOOP_ALLOC_CHUNK,
5711 LOOP_NO_EMPTY_SIZE,
817d52f8
JB
5712};
5713
e570fd27
MX
5714static inline void
5715btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
5716 int delalloc)
5717{
5718 if (delalloc)
5719 down_read(&cache->data_rwsem);
5720}
5721
5722static inline void
5723btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
5724 int delalloc)
5725{
5726 btrfs_get_block_group(cache);
5727 if (delalloc)
5728 down_read(&cache->data_rwsem);
5729}
5730
5731static struct btrfs_block_group_cache *
5732btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
5733 struct btrfs_free_cluster *cluster,
5734 int delalloc)
5735{
89771cc9 5736 struct btrfs_block_group_cache *used_bg = NULL;
6719afdc 5737
e570fd27 5738 spin_lock(&cluster->refill_lock);
6719afdc
GU
5739 while (1) {
5740 used_bg = cluster->block_group;
5741 if (!used_bg)
5742 return NULL;
5743
5744 if (used_bg == block_group)
e570fd27
MX
5745 return used_bg;
5746
6719afdc 5747 btrfs_get_block_group(used_bg);
e570fd27 5748
6719afdc
GU
5749 if (!delalloc)
5750 return used_bg;
e570fd27 5751
6719afdc
GU
5752 if (down_read_trylock(&used_bg->data_rwsem))
5753 return used_bg;
e570fd27 5754
6719afdc 5755 spin_unlock(&cluster->refill_lock);
e570fd27 5756
e321f8a8
LB
5757 /* We should only have one level of nesting. */
5758 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
e570fd27 5759
6719afdc
GU
5760 spin_lock(&cluster->refill_lock);
5761 if (used_bg == cluster->block_group)
5762 return used_bg;
e570fd27 5763
6719afdc
GU
5764 up_read(&used_bg->data_rwsem);
5765 btrfs_put_block_group(used_bg);
5766 }
e570fd27
MX
5767}
5768
5769static inline void
5770btrfs_release_block_group(struct btrfs_block_group_cache *cache,
5771 int delalloc)
5772{
5773 if (delalloc)
5774 up_read(&cache->data_rwsem);
5775 btrfs_put_block_group(cache);
5776}
5777
b4bd745d
QW
5778/*
5779 * Structure used internally for find_free_extent() function. Wraps needed
5780 * parameters.
5781 */
5782struct find_free_extent_ctl {
5783 /* Basic allocation info */
5784 u64 ram_bytes;
5785 u64 num_bytes;
5786 u64 empty_size;
5787 u64 flags;
5788 int delalloc;
5789
5790 /* Where to start the search inside the bg */
5791 u64 search_start;
5792
5793 /* For clustered allocation */
5794 u64 empty_cluster;
5795
5796 bool have_caching_bg;
5797 bool orig_have_caching_bg;
5798
5799 /* RAID index, converted from flags */
5800 int index;
5801
e72d79d6
QW
5802 /*
5803 * Current loop number, check find_free_extent_update_loop() for details
5804 */
b4bd745d
QW
5805 int loop;
5806
5807 /*
5808 * Whether we're refilling a cluster, if true we need to re-search
5809 * current block group but don't try to refill the cluster again.
5810 */
5811 bool retry_clustered;
5812
5813 /*
5814 * Whether we're updating free space cache, if true we need to re-search
5815 * current block group but don't try updating free space cache again.
5816 */
5817 bool retry_unclustered;
5818
5819 /* If current block group is cached */
5820 int cached;
5821
5822 /* Max contiguous hole found */
5823 u64 max_extent_size;
5824
5825 /* Total free space from free space cache, not always contiguous */
5826 u64 total_free_space;
5827
5828 /* Found result */
5829 u64 found_offset;
5830};
5831
d06e3bb6
QW
5832
5833/*
5834 * Helper function for find_free_extent().
5835 *
5836 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
5837 * Return -EAGAIN to inform caller that we need to re-search this block group
5838 * Return >0 to inform caller that we found nothing
5839 * Return 0 means we have found a location and set ffe_ctl->found_offset.
5840 */
5841static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
5842 struct btrfs_free_cluster *last_ptr,
5843 struct find_free_extent_ctl *ffe_ctl,
5844 struct btrfs_block_group_cache **cluster_bg_ret)
5845{
d06e3bb6
QW
5846 struct btrfs_block_group_cache *cluster_bg;
5847 u64 aligned_cluster;
5848 u64 offset;
5849 int ret;
5850
5851 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
5852 if (!cluster_bg)
5853 goto refill_cluster;
5854 if (cluster_bg != bg && (cluster_bg->ro ||
5855 !block_group_bits(cluster_bg, ffe_ctl->flags)))
5856 goto release_cluster;
5857
5858 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
5859 ffe_ctl->num_bytes, cluster_bg->key.objectid,
5860 &ffe_ctl->max_extent_size);
5861 if (offset) {
5862 /* We have a block, we're done */
5863 spin_unlock(&last_ptr->refill_lock);
5864 trace_btrfs_reserve_extent_cluster(cluster_bg,
5865 ffe_ctl->search_start, ffe_ctl->num_bytes);
5866 *cluster_bg_ret = cluster_bg;
5867 ffe_ctl->found_offset = offset;
5868 return 0;
5869 }
5870 WARN_ON(last_ptr->block_group != cluster_bg);
5871
5872release_cluster:
5873 /*
5874 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
5875 * lets just skip it and let the allocator find whatever block it can
5876 * find. If we reach this point, we will have tried the cluster
5877 * allocator plenty of times and not have found anything, so we are
5878 * likely way too fragmented for the clustering stuff to find anything.
5879 *
5880 * However, if the cluster is taken from the current block group,
5881 * release the cluster first, so that we stand a better chance of
5882 * succeeding in the unclustered allocation.
5883 */
5884 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
5885 spin_unlock(&last_ptr->refill_lock);
5886 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
5887 return -ENOENT;
5888 }
5889
5890 /* This cluster didn't work out, free it and start over */
5891 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5892
5893 if (cluster_bg != bg)
5894 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
5895
5896refill_cluster:
5897 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
5898 spin_unlock(&last_ptr->refill_lock);
5899 return -ENOENT;
5900 }
5901
5902 aligned_cluster = max_t(u64,
5903 ffe_ctl->empty_cluster + ffe_ctl->empty_size,
5904 bg->full_stripe_len);
2ceeae2e
DS
5905 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
5906 ffe_ctl->num_bytes, aligned_cluster);
d06e3bb6
QW
5907 if (ret == 0) {
5908 /* Now pull our allocation out of this cluster */
5909 offset = btrfs_alloc_from_cluster(bg, last_ptr,
5910 ffe_ctl->num_bytes, ffe_ctl->search_start,
5911 &ffe_ctl->max_extent_size);
5912 if (offset) {
5913 /* We found one, proceed */
5914 spin_unlock(&last_ptr->refill_lock);
5915 trace_btrfs_reserve_extent_cluster(bg,
5916 ffe_ctl->search_start,
5917 ffe_ctl->num_bytes);
5918 ffe_ctl->found_offset = offset;
5919 return 0;
5920 }
5921 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
5922 !ffe_ctl->retry_clustered) {
5923 spin_unlock(&last_ptr->refill_lock);
5924
5925 ffe_ctl->retry_clustered = true;
5926 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
5927 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
5928 return -EAGAIN;
5929 }
5930 /*
5931 * At this point we either didn't find a cluster or we weren't able to
5932 * allocate a block from our cluster. Free the cluster we've been
5933 * trying to use, and go to the next block group.
5934 */
5935 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5936 spin_unlock(&last_ptr->refill_lock);
5937 return 1;
5938}
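/*
 * Caller dispatch sketch, matching the main loop of find_free_extent()
 * below: 0 means ffe_ctl->found_offset is valid (possibly in a different
 * block group returned via cluster_bg), -EAGAIN re-runs the same group,
 * >0 moves on to the next group, and -ENOENT falls through to the
 * unclustered path.
 *
 *	ret = find_free_extent_clustered(block_group, last_ptr, &ffe_ctl,
 *					 &cluster_bg);
 *	if (ret == 0)
 *		goto checks;
 *	else if (ret == -EAGAIN)
 *		goto have_block_group;
 *	else if (ret > 0)
 *		goto loop;
 */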
5939
e1a41848
QW
5940/*
5941 * Return >0 to inform caller that we found nothing
5942 * Return 0 when we found a free extent and set ffe_ctl->found_offset
5943 * Return -EAGAIN to inform caller that we need to re-search this block group
5944 */
5945static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
5946 struct btrfs_free_cluster *last_ptr,
5947 struct find_free_extent_ctl *ffe_ctl)
5948{
5949 u64 offset;
5950
5951 /*
5952 * We are doing an unclustered allocation, set the fragmented flag so
5953 * we don't bother trying to setup a cluster again until we get more
5954 * space.
5955 */
5956 if (unlikely(last_ptr)) {
5957 spin_lock(&last_ptr->lock);
5958 last_ptr->fragmented = 1;
5959 spin_unlock(&last_ptr->lock);
5960 }
5961 if (ffe_ctl->cached) {
5962 struct btrfs_free_space_ctl *free_space_ctl;
5963
5964 free_space_ctl = bg->free_space_ctl;
5965 spin_lock(&free_space_ctl->tree_lock);
5966 if (free_space_ctl->free_space <
5967 ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
5968 ffe_ctl->empty_size) {
5969 ffe_ctl->total_free_space = max_t(u64,
5970 ffe_ctl->total_free_space,
5971 free_space_ctl->free_space);
5972 spin_unlock(&free_space_ctl->tree_lock);
5973 return 1;
5974 }
5975 spin_unlock(&free_space_ctl->tree_lock);
5976 }
5977
5978 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
5979 ffe_ctl->num_bytes, ffe_ctl->empty_size,
5980 &ffe_ctl->max_extent_size);
5981
5982 /*
5983 * If we didn't find a chunk, and we haven't failed on this block group
5984 * before, and this block group is in the middle of caching and we are
5985 * ok with waiting, then go ahead and wait for progress to be made, and
5986 * set @retry_unclustered to true.
5987 *
5988 * If @retry_unclustered is true then we've already waited on this
5989 * block group once and should move on to the next block group.
5990 */
5991 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
5992 ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
5993 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
5994 ffe_ctl->empty_size);
5995 ffe_ctl->retry_unclustered = true;
5996 return -EAGAIN;
5997 } else if (!offset) {
5998 return 1;
5999 }
6000 ffe_ctl->found_offset = offset;
6001 return 0;
6002}
6003
e72d79d6
QW
6004/*
6005 * Return >0 means caller needs to re-search for free extent
6006 * Return 0 means we have the needed free extent.
6007 * Return <0 means we failed to locate any free extent.
6008 */
6009static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
6010 struct btrfs_free_cluster *last_ptr,
6011 struct btrfs_key *ins,
6012 struct find_free_extent_ctl *ffe_ctl,
6013 int full_search, bool use_cluster)
6014{
6015 struct btrfs_root *root = fs_info->extent_root;
6016 int ret;
6017
6018 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
6019 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
6020 ffe_ctl->orig_have_caching_bg = true;
6021
6022 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
6023 ffe_ctl->have_caching_bg)
6024 return 1;
6025
6026 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
6027 return 1;
6028
6029 if (ins->objectid) {
6030 if (!use_cluster && last_ptr) {
6031 spin_lock(&last_ptr->lock);
6032 last_ptr->window_start = ins->objectid;
6033 spin_unlock(&last_ptr->lock);
6034 }
6035 return 0;
6036 }
6037
6038 /*
6039 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6040 * caching kthreads as we move along
6041 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6042 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6043 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6044 * again
6045 */
6046 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
6047 ffe_ctl->index = 0;
6048 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
6049 /*
6050 * We want to skip the LOOP_CACHING_WAIT step if we
6051 * don't have any uncached bgs and we've already done a
6052 * full search through.
6053 */
6054 if (ffe_ctl->orig_have_caching_bg || !full_search)
6055 ffe_ctl->loop = LOOP_CACHING_WAIT;
6056 else
6057 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
6058 } else {
6059 ffe_ctl->loop++;
6060 }
6061
6062 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
6063 struct btrfs_trans_handle *trans;
6064 int exist = 0;
6065
6066 trans = current->journal_info;
6067 if (trans)
6068 exist = 1;
6069 else
6070 trans = btrfs_join_transaction(root);
6071
6072 if (IS_ERR(trans)) {
6073 ret = PTR_ERR(trans);
6074 return ret;
6075 }
6076
fc471cb0
JB
6077 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
6078 CHUNK_ALLOC_FORCE);
e72d79d6
QW
6079
6080 /*
6081 * If we can't allocate a new chunk we've already looped
6082 * through at least once, move on to the NO_EMPTY_SIZE
6083 * case.
6084 */
6085 if (ret == -ENOSPC)
6086 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
6087
6088 /* Do not bail out on ENOSPC since we can do more. */
6089 if (ret < 0 && ret != -ENOSPC)
6090 btrfs_abort_transaction(trans, ret);
6091 else
6092 ret = 0;
6093 if (!exist)
6094 btrfs_end_transaction(trans);
6095 if (ret)
6096 return ret;
6097 }
6098
6099 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
6100 /*
6101 * Don't loop again if we already have no empty_size and
6102 * no empty_cluster.
6103 */
6104 if (ffe_ctl->empty_size == 0 &&
6105 ffe_ctl->empty_cluster == 0)
6106 return -ENOSPC;
6107 ffe_ctl->empty_size = 0;
6108 ffe_ctl->empty_cluster = 0;
6109 }
6110 return 1;
6111 }
6112 return -ENOSPC;
6113}
6114
fec577fb
CM
6115/*
6116 * walks the btree of allocated extents and finds a hole of a given size.
6117 * The key ins is changed to record the hole:
a4820398 6118 * ins->objectid == start position
62e2749e 6119 * ins->flags = BTRFS_EXTENT_ITEM_KEY
a4820398 6120 * ins->offset == the size of the hole.
fec577fb 6121 * Any available blocks before search_start are skipped.
a4820398
MX
6122 *
6123 * If there is no suitable free space, we will record the max size of
6124 * the free space extent currently.
e72d79d6
QW
6125 *
6126 * The overall logic and call chain:
6127 *
6128 * find_free_extent()
6129 * |- Iterate through all block groups
6130 * | |- Get a valid block group
6131 * | |- Try to do clustered allocation in that block group
6132 * | |- Try to do unclustered allocation in that block group
6133 * | |- Check if the result is valid
6134 * | | |- If valid, then exit
6135 * | |- Jump to next block group
6136 * |
6137 * |- Push harder to find free extents
6138 * |- If not found, re-iterate all block groups
fec577fb 6139 */
87bde3cd 6140static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
18513091
WX
6141 u64 ram_bytes, u64 num_bytes, u64 empty_size,
6142 u64 hint_byte, struct btrfs_key *ins,
6143 u64 flags, int delalloc)
fec577fb 6144{
80eb234a 6145 int ret = 0;
fa9c0d79 6146 struct btrfs_free_cluster *last_ptr = NULL;
80eb234a 6147 struct btrfs_block_group_cache *block_group = NULL;
b4bd745d 6148 struct find_free_extent_ctl ffe_ctl = {0};
80eb234a 6149 struct btrfs_space_info *space_info;
67377734 6150 bool use_cluster = true;
a5e681d9 6151 bool full_search = false;
fec577fb 6152
0b246afa 6153 WARN_ON(num_bytes < fs_info->sectorsize);
b4bd745d
QW
6154
6155 ffe_ctl.ram_bytes = ram_bytes;
6156 ffe_ctl.num_bytes = num_bytes;
6157 ffe_ctl.empty_size = empty_size;
6158 ffe_ctl.flags = flags;
6159 ffe_ctl.search_start = 0;
6160 ffe_ctl.retry_clustered = false;
6161 ffe_ctl.retry_unclustered = false;
6162 ffe_ctl.delalloc = delalloc;
6163 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
6164 ffe_ctl.have_caching_bg = false;
6165 ffe_ctl.orig_have_caching_bg = false;
6166 ffe_ctl.found_offset = 0;
6167
962a298f 6168 ins->type = BTRFS_EXTENT_ITEM_KEY;
80eb234a
JB
6169 ins->objectid = 0;
6170 ins->offset = 0;
b1a4d965 6171
71ff6437 6172 trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
3f7de037 6173
280c2908 6174 space_info = btrfs_find_space_info(fs_info, flags);
1b1d1f66 6175 if (!space_info) {
0b246afa 6176 btrfs_err(fs_info, "No space info for %llu", flags);
1b1d1f66
JB
6177 return -ENOSPC;
6178 }
2552d17e 6179
67377734 6180 /*
4f4db217
JB
6181 * If our free space is heavily fragmented we may not be able to make
6182 * big contiguous allocations, so instead of doing the expensive search
6183 * for free space, simply return ENOSPC with our max_extent_size so we
6184 * can go ahead and search for a more manageable chunk.
6185 *
6186 * If our max_extent_size is large enough for our allocation simply
6187 * disable clustering since we will likely not be able to find enough
6188 * space to create a cluster and induce latency trying.
67377734 6189 */
4f4db217
JB
6190 if (unlikely(space_info->max_extent_size)) {
6191 spin_lock(&space_info->lock);
6192 if (space_info->max_extent_size &&
6193 num_bytes > space_info->max_extent_size) {
6194 ins->offset = space_info->max_extent_size;
6195 spin_unlock(&space_info->lock);
6196 return -ENOSPC;
6197 } else if (space_info->max_extent_size) {
6198 use_cluster = false;
6199 }
6200 spin_unlock(&space_info->lock);
fa9c0d79 6201 }
0f9dd46c 6202
b4bd745d
QW
6203 last_ptr = fetch_cluster_info(fs_info, space_info,
6204 &ffe_ctl.empty_cluster);
239b14b3 6205 if (last_ptr) {
fa9c0d79
CM
6206 spin_lock(&last_ptr->lock);
6207 if (last_ptr->block_group)
6208 hint_byte = last_ptr->window_start;
c759c4e1
JB
6209 if (last_ptr->fragmented) {
6210 /*
6211 * We still set window_start so we can keep track of the
6212 * last place we found an allocation to try and save
6213 * some time.
6214 */
6215 hint_byte = last_ptr->window_start;
6216 use_cluster = false;
6217 }
fa9c0d79 6218 spin_unlock(&last_ptr->lock);
239b14b3 6219 }
fa9c0d79 6220
b4bd745d
QW
6221 ffe_ctl.search_start = max(ffe_ctl.search_start,
6222 first_logical_byte(fs_info, 0));
6223 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
6224 if (ffe_ctl.search_start == hint_byte) {
6225 block_group = btrfs_lookup_block_group(fs_info,
6226 ffe_ctl.search_start);
817d52f8
JB
6227 /*
6228 * we don't want to use the block group if it doesn't match our
6229 * allocation bits, or if it's not cached.
ccf0e725
JB
6230 *
6231 * However if we are re-searching with an ideal block group
6232 * picked out then we don't care that the block group is cached.
817d52f8 6233 */
b6919a58 6234 if (block_group && block_group_bits(block_group, flags) &&
285ff5af 6235 block_group->cached != BTRFS_CACHE_NO) {
2552d17e 6236 down_read(&space_info->groups_sem);
44fb5511
CM
6237 if (list_empty(&block_group->list) ||
6238 block_group->ro) {
6239 /*
6240 * someone is removing this block group,
6241 * we can't jump into the have_block_group
6242 * target because our list pointers are not
6243 * valid
6244 */
6245 btrfs_put_block_group(block_group);
6246 up_read(&space_info->groups_sem);
ccf0e725 6247 } else {
b4bd745d 6248 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
3e72ee88 6249 block_group->flags);
e570fd27 6250 btrfs_lock_block_group(block_group, delalloc);
44fb5511 6251 goto have_block_group;
ccf0e725 6252 }
2552d17e 6253 } else if (block_group) {
fa9c0d79 6254 btrfs_put_block_group(block_group);
2552d17e 6255 }
42e70e7a 6256 }
2552d17e 6257search:
b4bd745d
QW
6258 ffe_ctl.have_caching_bg = false;
6259 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
6260 ffe_ctl.index == 0)
a5e681d9 6261 full_search = true;
80eb234a 6262 down_read(&space_info->groups_sem);
b4bd745d
QW
6263 list_for_each_entry(block_group,
6264 &space_info->block_groups[ffe_ctl.index], list) {
14443937
JM
6265 /* If the block group is read-only, we can skip it entirely. */
6266 if (unlikely(block_group->ro))
6267 continue;
6268
e570fd27 6269 btrfs_grab_block_group(block_group, delalloc);
b4bd745d 6270 ffe_ctl.search_start = block_group->key.objectid;
42e70e7a 6271
83a50de9
CM
6272 /*
6273 * this can happen if we end up cycling through all the
6274 * raid types, but we want to make sure we only allocate
6275 * for the proper type.
6276 */
b6919a58 6277 if (!block_group_bits(block_group, flags)) {
bece2e82 6278 u64 extra = BTRFS_BLOCK_GROUP_DUP |
c7369b3f 6279 BTRFS_BLOCK_GROUP_RAID1_MASK |
a07e8a46 6280 BTRFS_BLOCK_GROUP_RAID56_MASK |
83a50de9
CM
6281 BTRFS_BLOCK_GROUP_RAID10;
6282
6283 /*
6284 * if they asked for extra copies and this block group
6285 * doesn't provide them, bail. This does allow us to
6286 * fill raid0 from raid1.
6287 */
b6919a58 6288 if ((flags & extra) && !(block_group->flags & extra))
83a50de9
CM
6289 goto loop;
6290 }
6291
2552d17e 6292have_block_group:
b4bd745d
QW
6293 ffe_ctl.cached = block_group_cache_done(block_group);
6294 if (unlikely(!ffe_ctl.cached)) {
6295 ffe_ctl.have_caching_bg = true;
f6373bf3 6296 ret = cache_block_group(block_group, 0);
1d4284bd
CM
6297 BUG_ON(ret < 0);
6298 ret = 0;
817d52f8
JB
6299 }
6300
36cce922
JB
6301 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6302 goto loop;
0f9dd46c 6303
0a24325e 6304 /*
062c05c4
AO
6305 * Ok we want to try and use the cluster allocator, so
6306 * lets look there
0a24325e 6307 */
c759c4e1 6308 if (last_ptr && use_cluster) {
d06e3bb6 6309 struct btrfs_block_group_cache *cluster_bg = NULL;
fa9c0d79 6310
d06e3bb6
QW
6311 ret = find_free_extent_clustered(block_group, last_ptr,
6312 &ffe_ctl, &cluster_bg);
062c05c4 6313
fa9c0d79 6314 if (ret == 0) {
d06e3bb6
QW
6315 if (cluster_bg && cluster_bg != block_group) {
6316 btrfs_release_block_group(block_group,
6317 delalloc);
6318 block_group = cluster_bg;
fa9c0d79 6319 }
d06e3bb6
QW
6320 goto checks;
6321 } else if (ret == -EAGAIN) {
817d52f8 6322 goto have_block_group;
d06e3bb6
QW
6323 } else if (ret > 0) {
6324 goto loop;
fa9c0d79 6325 }
d06e3bb6 6326 /* ret == -ENOENT case falls through */
fa9c0d79
CM
6327 }
6328
e1a41848
QW
6329 ret = find_free_extent_unclustered(block_group, last_ptr,
6330 &ffe_ctl);
6331 if (ret == -EAGAIN)
817d52f8 6332 goto have_block_group;
e1a41848 6333 else if (ret > 0)
1cdda9b8 6334 goto loop;
e1a41848 6335 /* ret == 0 case falls through */
fa9c0d79 6336checks:
b4bd745d
QW
6337 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
6338 fs_info->stripesize);
25179201 6339
2552d17e 6340 /* move on to the next group */
b4bd745d 6341 if (ffe_ctl.search_start + num_bytes >
215a63d1 6342 block_group->key.objectid + block_group->key.offset) {
b4bd745d
QW
6343 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6344 num_bytes);
2552d17e 6345 goto loop;
6226cb0a 6346 }
f5a31e16 6347
b4bd745d
QW
6348 if (ffe_ctl.found_offset < ffe_ctl.search_start)
6349 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6350 ffe_ctl.search_start - ffe_ctl.found_offset);
2552d17e 6351
18513091
WX
6352 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
6353 num_bytes, delalloc);
f0486c68 6354 if (ret == -EAGAIN) {
b4bd745d
QW
6355 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6356 num_bytes);
2552d17e 6357 goto loop;
0f9dd46c 6358 }
9cfa3e34 6359 btrfs_inc_block_group_reservations(block_group);
0b86a832 6360
f0486c68 6361 /* we are all good, lets return */
b4bd745d 6362 ins->objectid = ffe_ctl.search_start;
2552d17e 6363 ins->offset = num_bytes;
d2fb3437 6364
b4bd745d
QW
6365 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
6366 num_bytes);
e570fd27 6367 btrfs_release_block_group(block_group, delalloc);
2552d17e
JB
6368 break;
6369loop:
b4bd745d
QW
6370 ffe_ctl.retry_clustered = false;
6371 ffe_ctl.retry_unclustered = false;
3e72ee88 6372 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
b4bd745d 6373 ffe_ctl.index);
e570fd27 6374 btrfs_release_block_group(block_group, delalloc);
14443937 6375 cond_resched();
2552d17e
JB
6376 }
6377 up_read(&space_info->groups_sem);
6378
e72d79d6
QW
6379 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
6380 full_search, use_cluster);
6381 if (ret > 0)
b742bb82
YZ
6382 goto search;
6383
4f4db217 6384 if (ret == -ENOSPC) {
b4bd745d
QW
6385 /*
6386 * Use ffe_ctl->total_free_space as fallback if we can't find
6387 * any contiguous hole.
6388 */
6389 if (!ffe_ctl.max_extent_size)
6390 ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
4f4db217 6391 spin_lock(&space_info->lock);
b4bd745d 6392 space_info->max_extent_size = ffe_ctl.max_extent_size;
4f4db217 6393 spin_unlock(&space_info->lock);
b4bd745d 6394 ins->offset = ffe_ctl.max_extent_size;
4f4db217 6395 }
0f70abe2 6396 return ret;
fec577fb 6397}
ec44a35c 6398
6f47c706
NB
6399/*
6400 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
6401 * hole that is at least as big as @num_bytes.
6402 *
6403 * @root - The root that will contain this extent
6404 *
6405 * @ram_bytes - The amount of space in ram that @num_bytes take. This
6406 * is used for accounting purposes. This value differs
6407 * from @num_bytes only in the case of compressed extents.
6408 *
6409 * @num_bytes - Number of bytes to allocate on-disk.
6410 *
6411 * @min_alloc_size - Indicates the minimum amount of space that the
6412 * allocator should try to satisfy. In some cases
6413 * @num_bytes may be larger than what is required and if
6414 * the filesystem is fragmented then allocation fails.
6415 * However, the presence of @min_alloc_size gives a
6416 * chance to try and satisfy the smaller allocation.
6417 *
6418 * @empty_size - A hint that you plan on doing more COW. This is the
6419 * size in bytes the allocator should try to find free
6420 * next to the block it returns. This is just a hint and
6421 * may be ignored by the allocator.
6422 *
6423 * @hint_byte - Hint to the allocator to start searching above the byte
6424 * address passed. It might be ignored.
6425 *
6426 * @ins - This key is modified to record the found hole. It will
6427 * have the following values:
6428 * ins->objectid == start position
6429 * ins->flags = BTRFS_EXTENT_ITEM_KEY
6430 * ins->offset == the size of the hole.
6431 *
6432 * @is_data - Boolean flag indicating whether an extent is
6433 * allocated for data (true) or metadata (false)
6434 *
6435 * @delalloc - Boolean flag indicating whether this allocation is for
6436 * delalloc or not. If 'true' data_rwsem of block groups
6437 * is going to be acquired.
6438 *
6439 *
6440 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
6441 * case -ENOSPC is returned then @ins->offset will contain the size of the
6442 * largest available hole the allocator managed to find.
6443 */
18513091 6444int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
11833d66
YZ
6445 u64 num_bytes, u64 min_alloc_size,
6446 u64 empty_size, u64 hint_byte,
e570fd27 6447 struct btrfs_key *ins, int is_data, int delalloc)
fec577fb 6448{
ab8d0fc4 6449 struct btrfs_fs_info *fs_info = root->fs_info;
36af4e07 6450 bool final_tried = num_bytes == min_alloc_size;
b6919a58 6451 u64 flags;
fec577fb 6452 int ret;
925baedd 6453
1b86826d 6454 flags = get_alloc_profile_by_root(root, is_data);
98d20f67 6455again:
0b246afa 6456 WARN_ON(num_bytes < fs_info->sectorsize);
87bde3cd 6457 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
18513091 6458 hint_byte, ins, flags, delalloc);
9cfa3e34 6459 if (!ret && !is_data) {
ab8d0fc4 6460 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
9cfa3e34 6461 } else if (ret == -ENOSPC) {
a4820398
MX
6462 if (!final_tried && ins->offset) {
6463 num_bytes = min(num_bytes >> 1, ins->offset);
da17066c 6464 num_bytes = round_down(num_bytes,
0b246afa 6465 fs_info->sectorsize);
9e622d6b 6466 num_bytes = max(num_bytes, min_alloc_size);
18513091 6467 ram_bytes = num_bytes;
9e622d6b
MX
6468 if (num_bytes == min_alloc_size)
6469 final_tried = true;
6470 goto again;
ab8d0fc4 6471 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
9e622d6b
MX
6472 struct btrfs_space_info *sinfo;
6473
280c2908 6474 sinfo = btrfs_find_space_info(fs_info, flags);
0b246afa 6475 btrfs_err(fs_info,
5d163e0e
JM
6476 "allocation failed flags %llu, wanted %llu",
6477 flags, num_bytes);
53804280 6478 if (sinfo)
5da6afeb
JB
6479 btrfs_dump_space_info(fs_info, sinfo,
6480 num_bytes, 1);
9e622d6b 6481 }
925baedd 6482 }
0f9dd46c
JB
6483
6484 return ret;
e6dcd2dc
CM
6485}
6486
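Not part of the kernel source: a small userspace sketch of the size-reduction arithmetic in the -ENOSPC retry path above. On every retry the request is halved, capped by the largest hole the allocator reported back in ins->offset, rounded down to the sector size and clamped to @min_alloc_size; once the request has shrunk to @min_alloc_size the next failure is final. All numeric values below are invented for illustration, and note the kernel only retries while ins->offset is non-zero.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the shrink step of btrfs_reserve_extent()'s -ENOSPC retry loop. */
static uint64_t shrink_request(uint64_t num_bytes, uint64_t largest_hole,
			       uint64_t min_alloc_size, uint64_t sectorsize)
{
	uint64_t n = num_bytes >> 1;

	if (largest_hole < n)
		n = largest_hole;
	/* round_down(); assumes a power-of-two sector size */
	n &= ~(sectorsize - 1);
	if (n < min_alloc_size)
		n = min_alloc_size;
	return n;
}

int main(void)
{
	/* Hypothetical request: 1 MiB wanted, 4 KiB sectors, 64 KiB minimum. */
	uint64_t num_bytes = 1024 * 1024, min_alloc = 64 * 1024, sector = 4096;
	uint64_t largest_hole = 300 * 1024;	/* what ins->offset might report */

	while (num_bytes > min_alloc) {
		num_bytes = shrink_request(num_bytes, largest_hole,
					   min_alloc, sector);
		printf("retrying with %llu bytes\n",
		       (unsigned long long)num_bytes);
	}
	return 0;
}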
2ff7e61e 6487static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27
MX
6488 u64 start, u64 len,
6489 int pin, int delalloc)
65b51a00 6490{
0f9dd46c 6491 struct btrfs_block_group_cache *cache;
1f3c79a2 6492 int ret = 0;
0f9dd46c 6493
0b246afa 6494 cache = btrfs_lookup_block_group(fs_info, start);
0f9dd46c 6495 if (!cache) {
0b246afa
JM
6496 btrfs_err(fs_info, "Unable to find block group for %llu",
6497 start);
0f9dd46c
JB
6498 return -ENOSPC;
6499 }
1f3c79a2 6500
e688b725 6501 if (pin)
fdf08605 6502 pin_down_extent(cache, start, len, 1);
e688b725 6503 else {
0b246afa 6504 if (btrfs_test_opt(fs_info, DISCARD))
2ff7e61e 6505 ret = btrfs_discard_extent(fs_info, start, len, NULL);
e688b725 6506 btrfs_add_free_space(cache, start, len);
4824f1f4 6507 btrfs_free_reserved_bytes(cache, len, delalloc);
71ff6437 6508 trace_btrfs_reserved_extent_free(fs_info, start, len);
e688b725 6509 }
31193213 6510
fa9c0d79 6511 btrfs_put_block_group(cache);
e6dcd2dc
CM
6512 return ret;
6513}
6514
2ff7e61e 6515int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27 6516 u64 start, u64 len, int delalloc)
e688b725 6517{
2ff7e61e 6518 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
e688b725
CM
6519}
6520
2ff7e61e 6521int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
e688b725
CM
6522 u64 start, u64 len)
6523{
2ff7e61e 6524 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
e688b725
CM
6525}
6526
5d4f98a2 6527static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
6528 u64 parent, u64 root_objectid,
6529 u64 flags, u64 owner, u64 offset,
6530 struct btrfs_key *ins, int ref_mod)
e6dcd2dc 6531{
ef89b824 6532 struct btrfs_fs_info *fs_info = trans->fs_info;
e6dcd2dc 6533 int ret;
e6dcd2dc 6534 struct btrfs_extent_item *extent_item;
5d4f98a2 6535 struct btrfs_extent_inline_ref *iref;
e6dcd2dc 6536 struct btrfs_path *path;
5d4f98a2
YZ
6537 struct extent_buffer *leaf;
6538 int type;
6539 u32 size;
26b8003f 6540
5d4f98a2
YZ
6541 if (parent > 0)
6542 type = BTRFS_SHARED_DATA_REF_KEY;
6543 else
6544 type = BTRFS_EXTENT_DATA_REF_KEY;
58176a96 6545
5d4f98a2 6546 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7bb86316
CM
6547
6548 path = btrfs_alloc_path();
db5b493a
TI
6549 if (!path)
6550 return -ENOMEM;
47e4bb98 6551
b9473439 6552 path->leave_spinning = 1;
5d4f98a2
YZ
6553 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6554 ins, size);
79787eaa
JM
6555 if (ret) {
6556 btrfs_free_path(path);
6557 return ret;
6558 }
0f9dd46c 6559
5d4f98a2
YZ
6560 leaf = path->nodes[0];
6561 extent_item = btrfs_item_ptr(leaf, path->slots[0],
47e4bb98 6562 struct btrfs_extent_item);
5d4f98a2
YZ
6563 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6564 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6565 btrfs_set_extent_flags(leaf, extent_item,
6566 flags | BTRFS_EXTENT_FLAG_DATA);
6567
6568 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6569 btrfs_set_extent_inline_ref_type(leaf, iref, type);
6570 if (parent > 0) {
6571 struct btrfs_shared_data_ref *ref;
6572 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6573 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6574 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6575 } else {
6576 struct btrfs_extent_data_ref *ref;
6577 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6578 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6579 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6580 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6581 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6582 }
47e4bb98
CM
6583
6584 btrfs_mark_buffer_dirty(path->nodes[0]);
7bb86316 6585 btrfs_free_path(path);
f510cfec 6586
25a356d3 6587 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
1e144fb8
OS
6588 if (ret)
6589 return ret;
6590
6b279408 6591 ret = update_block_group(trans, ins->objectid, ins->offset, 1);
79787eaa 6592 if (ret) { /* -ENOENT, logic error */
c2cf52eb 6593 btrfs_err(fs_info, "update block group failed for %llu %llu",
c1c9ff7c 6594 ins->objectid, ins->offset);
f5947066
CM
6595 BUG();
6596 }
71ff6437 6597 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
e6dcd2dc
CM
6598 return ret;
6599}
6600
5d4f98a2 6601static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4e6bd4e0 6602 struct btrfs_delayed_ref_node *node,
21ebfbe7 6603 struct btrfs_delayed_extent_op *extent_op)
e6dcd2dc 6604{
9dcdbe01 6605 struct btrfs_fs_info *fs_info = trans->fs_info;
e6dcd2dc 6606 int ret;
5d4f98a2 6607 struct btrfs_extent_item *extent_item;
4e6bd4e0 6608 struct btrfs_key extent_key;
5d4f98a2
YZ
6609 struct btrfs_tree_block_info *block_info;
6610 struct btrfs_extent_inline_ref *iref;
6611 struct btrfs_path *path;
6612 struct extent_buffer *leaf;
4e6bd4e0 6613 struct btrfs_delayed_tree_ref *ref;
3173a18f 6614 u32 size = sizeof(*extent_item) + sizeof(*iref);
4e6bd4e0 6615 u64 num_bytes;
21ebfbe7 6616 u64 flags = extent_op->flags_to_set;
0b246afa 6617 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3173a18f 6618
4e6bd4e0
NB
6619 ref = btrfs_delayed_node_to_tree_ref(node);
6620
4e6bd4e0
NB
6621 extent_key.objectid = node->bytenr;
6622 if (skinny_metadata) {
6623 extent_key.offset = ref->level;
6624 extent_key.type = BTRFS_METADATA_ITEM_KEY;
6625 num_bytes = fs_info->nodesize;
6626 } else {
6627 extent_key.offset = node->num_bytes;
6628 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
3173a18f 6629 size += sizeof(*block_info);
4e6bd4e0
NB
6630 num_bytes = node->num_bytes;
6631 }
1c2308f8 6632
5d4f98a2 6633 path = btrfs_alloc_path();
80ee54bf 6634 if (!path)
d8926bb3 6635 return -ENOMEM;
56bec294 6636
5d4f98a2
YZ
6637 path->leave_spinning = 1;
6638 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4e6bd4e0 6639 &extent_key, size);
79787eaa 6640 if (ret) {
dd825259 6641 btrfs_free_path(path);
79787eaa
JM
6642 return ret;
6643 }
5d4f98a2
YZ
6644
6645 leaf = path->nodes[0];
6646 extent_item = btrfs_item_ptr(leaf, path->slots[0],
6647 struct btrfs_extent_item);
6648 btrfs_set_extent_refs(leaf, extent_item, 1);
6649 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6650 btrfs_set_extent_flags(leaf, extent_item,
6651 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5d4f98a2 6652
3173a18f
JB
6653 if (skinny_metadata) {
6654 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6655 } else {
6656 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
21ebfbe7 6657 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4e6bd4e0 6658 btrfs_set_tree_block_level(leaf, block_info, ref->level);
3173a18f
JB
6659 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6660 }
5d4f98a2 6661
d4b20733 6662 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
5d4f98a2
YZ
6663 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6664 btrfs_set_extent_inline_ref_type(leaf, iref,
6665 BTRFS_SHARED_BLOCK_REF_KEY);
d4b20733 6666 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
5d4f98a2
YZ
6667 } else {
6668 btrfs_set_extent_inline_ref_type(leaf, iref,
6669 BTRFS_TREE_BLOCK_REF_KEY);
4e6bd4e0 6670 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
5d4f98a2
YZ
6671 }
6672
6673 btrfs_mark_buffer_dirty(leaf);
6674 btrfs_free_path(path);
6675
4e6bd4e0
NB
6676 ret = remove_from_free_space_tree(trans, extent_key.objectid,
6677 num_bytes);
1e144fb8
OS
6678 if (ret)
6679 return ret;
6680
6b279408 6681 ret = update_block_group(trans, extent_key.objectid,
6202df69 6682 fs_info->nodesize, 1);
79787eaa 6683 if (ret) { /* -ENOENT, logic error */
c2cf52eb 6684 btrfs_err(fs_info, "update block group failed for %llu %llu",
4e6bd4e0 6685 extent_key.objectid, extent_key.offset);
5d4f98a2
YZ
6686 BUG();
6687 }
0be5dc67 6688
4e6bd4e0 6689 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
0b246afa 6690 fs_info->nodesize);
5d4f98a2
YZ
6691 return ret;
6692}
6693
6694int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
84f7d8e6 6695 struct btrfs_root *root, u64 owner,
5846a3c2
QW
6696 u64 offset, u64 ram_bytes,
6697 struct btrfs_key *ins)
5d4f98a2 6698{
76675593 6699 struct btrfs_ref generic_ref = { 0 };
5d4f98a2
YZ
6700 int ret;
6701
84f7d8e6 6702 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 6703
76675593
QW
6704 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
6705 ins->objectid, ins->offset, 0);
6706 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
8a5040f7 6707 btrfs_ref_tree_mod(root->fs_info, &generic_ref);
76675593
QW
6708 ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
6709 ram_bytes, NULL, NULL);
e6dcd2dc
CM
6710 return ret;
6711}
e02119d5
CM
6712
6713/*
6714 * this is used by the tree logging recovery code. It records that
6715 * an extent has been allocated and makes sure to clear the free
6716 * space cache bits as well
6717 */
5d4f98a2 6718int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
6719 u64 root_objectid, u64 owner, u64 offset,
6720 struct btrfs_key *ins)
e02119d5 6721{
61da2abf 6722 struct btrfs_fs_info *fs_info = trans->fs_info;
e02119d5
CM
6723 int ret;
6724 struct btrfs_block_group_cache *block_group;
ed7a6948 6725 struct btrfs_space_info *space_info;
11833d66 6726
8c2a1a30
JB
6727 /*
6728 * Mixed block groups have their extents excluded before the log is
01327610 6729 * processed, so we only need to do the exclude dance if this fs isn't mixed.
8c2a1a30 6730 */
0b246afa 6731 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
2ff7e61e
JM
6732 ret = __exclude_logged_extent(fs_info, ins->objectid,
6733 ins->offset);
b50c6e25 6734 if (ret)
8c2a1a30 6735 return ret;
11833d66
YZ
6736 }
6737
0b246afa 6738 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8c2a1a30
JB
6739 if (!block_group)
6740 return -EINVAL;
6741
ed7a6948
WX
6742 space_info = block_group->space_info;
6743 spin_lock(&space_info->lock);
6744 spin_lock(&block_group->lock);
6745 space_info->bytes_reserved += ins->offset;
6746 block_group->reserved += ins->offset;
6747 spin_unlock(&block_group->lock);
6748 spin_unlock(&space_info->lock);
6749
ef89b824
NB
6750 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
6751 offset, ins, 1);
b50c6e25 6752 btrfs_put_block_group(block_group);
e02119d5
CM
6753 return ret;
6754}
6755
48a3b636
ES
6756static struct extent_buffer *
6757btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
bc877d28 6758 u64 bytenr, int level, u64 owner)
65b51a00 6759{
0b246afa 6760 struct btrfs_fs_info *fs_info = root->fs_info;
65b51a00
CM
6761 struct extent_buffer *buf;
6762
2ff7e61e 6763 buf = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
6764 if (IS_ERR(buf))
6765 return buf;
6766
b72c3aba
QW
6767 /*
6768 * Extra safety check in case the extent tree is corrupted and extent
6769 * allocator chooses to use a tree block which is already used and
6770 * locked.
6771 */
6772 if (buf->lock_owner == current->pid) {
6773 btrfs_err_rl(fs_info,
6774"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
6775 buf->start, btrfs_header_owner(buf), current->pid);
6776 free_extent_buffer(buf);
6777 return ERR_PTR(-EUCLEAN);
6778 }
6779
85d4e461 6780 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
65b51a00 6781 btrfs_tree_lock(buf);
6a884d7d 6782 btrfs_clean_tree_block(buf);
3083ee2e 6783 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
b4ce94de 6784
8bead258 6785 btrfs_set_lock_blocking_write(buf);
4db8c528 6786 set_extent_buffer_uptodate(buf);
b4ce94de 6787
bc877d28
NB
6788 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
6789 btrfs_set_header_level(buf, level);
6790 btrfs_set_header_bytenr(buf, buf->start);
6791 btrfs_set_header_generation(buf, trans->transid);
6792 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
6793 btrfs_set_header_owner(buf, owner);
de37aa51 6794 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
bc877d28 6795 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
d0c803c4 6796 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
656f30db 6797 buf->log_index = root->log_transid % 2;
8cef4e16
YZ
6798 /*
6799 * we allow two log transactions at a time, so use different
52042d8e 6800 * EXTENT bits to differentiate their dirty pages.
8cef4e16 6801 */
656f30db 6802 if (buf->log_index == 0)
8cef4e16
YZ
6803 set_extent_dirty(&root->dirty_log_pages, buf->start,
6804 buf->start + buf->len - 1, GFP_NOFS);
6805 else
6806 set_extent_new(&root->dirty_log_pages, buf->start,
3744dbeb 6807 buf->start + buf->len - 1);
d0c803c4 6808 } else {
656f30db 6809 buf->log_index = -1;
d0c803c4 6810 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
65b51a00 6811 buf->start + buf->len - 1, GFP_NOFS);
d0c803c4 6812 }
64c12921 6813 trans->dirty = true;
b4ce94de 6814 /* this returns a buffer locked for blocking */
65b51a00
CM
6815 return buf;
6816}
6817
fec577fb 6818/*
f0486c68 6819 * finds a free extent and does all the dirty work required for allocation.
67b7859e 6820 * Returns the tree buffer or an ERR_PTR on error.
fec577fb 6821 */
4d75f8a9 6822struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
310712b2
OS
6823 struct btrfs_root *root,
6824 u64 parent, u64 root_objectid,
6825 const struct btrfs_disk_key *key,
6826 int level, u64 hint,
6827 u64 empty_size)
fec577fb 6828{
0b246afa 6829 struct btrfs_fs_info *fs_info = root->fs_info;
e2fa7227 6830 struct btrfs_key ins;
f0486c68 6831 struct btrfs_block_rsv *block_rsv;
5f39d397 6832 struct extent_buffer *buf;
67b7859e 6833 struct btrfs_delayed_extent_op *extent_op;
ed4f255b 6834 struct btrfs_ref generic_ref = { 0 };
f0486c68
YZ
6835 u64 flags = 0;
6836 int ret;
0b246afa
JM
6837 u32 blocksize = fs_info->nodesize;
6838 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
fec577fb 6839
05653ef3 6840#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0b246afa 6841 if (btrfs_is_testing(fs_info)) {
faa2dbf0 6842 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
bc877d28 6843 level, root_objectid);
faa2dbf0
JB
6844 if (!IS_ERR(buf))
6845 root->alloc_bytenr += blocksize;
6846 return buf;
6847 }
05653ef3 6848#endif
fccb84c9 6849
67f9c220 6850 block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
f0486c68
YZ
6851 if (IS_ERR(block_rsv))
6852 return ERR_CAST(block_rsv);
6853
18513091 6854 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
e570fd27 6855 empty_size, hint, &ins, 0, 0);
67b7859e
OS
6856 if (ret)
6857 goto out_unuse;
55c69072 6858
bc877d28
NB
6859 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
6860 root_objectid);
67b7859e
OS
6861 if (IS_ERR(buf)) {
6862 ret = PTR_ERR(buf);
6863 goto out_free_reserved;
6864 }
f0486c68
YZ
6865
6866 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6867 if (parent == 0)
6868 parent = ins.objectid;
6869 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6870 } else
6871 BUG_ON(parent > 0);
6872
6873 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
78a6184a 6874 extent_op = btrfs_alloc_delayed_extent_op();
67b7859e
OS
6875 if (!extent_op) {
6876 ret = -ENOMEM;
6877 goto out_free_buf;
6878 }
f0486c68
YZ
6879 if (key)
6880 memcpy(&extent_op->key, key, sizeof(extent_op->key));
6881 else
6882 memset(&extent_op->key, 0, sizeof(extent_op->key));
6883 extent_op->flags_to_set = flags;
35b3ad50
DS
6884 extent_op->update_key = skinny_metadata ? false : true;
6885 extent_op->update_flags = true;
6886 extent_op->is_data = false;
b1c79e09 6887 extent_op->level = level;
f0486c68 6888
ed4f255b
QW
6889 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
6890 ins.objectid, ins.offset, parent);
6891 generic_ref.real_root = root->root_key.objectid;
6892 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
8a5040f7 6893 btrfs_ref_tree_mod(fs_info, &generic_ref);
ed4f255b 6894 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
7be07912 6895 extent_op, NULL, NULL);
67b7859e
OS
6896 if (ret)
6897 goto out_free_delayed;
f0486c68 6898 }
fec577fb 6899 return buf;
67b7859e
OS
6900
6901out_free_delayed:
6902 btrfs_free_delayed_extent_op(extent_op);
6903out_free_buf:
6904 free_extent_buffer(buf);
6905out_free_reserved:
2ff7e61e 6906 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
67b7859e 6907out_unuse:
67f9c220 6908 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
67b7859e 6909 return ERR_PTR(ret);
fec577fb 6910}
a28ec197 6911
2c47e605
YZ
6912struct walk_control {
6913 u64 refs[BTRFS_MAX_LEVEL];
6914 u64 flags[BTRFS_MAX_LEVEL];
6915 struct btrfs_key update_progress;
aea6f028
JB
6916 struct btrfs_key drop_progress;
6917 int drop_level;
2c47e605
YZ
6918 int stage;
6919 int level;
6920 int shared_level;
6921 int update_ref;
6922 int keep_locks;
1c4850e2
YZ
6923 int reada_slot;
6924 int reada_count;
78c52d9e 6925 int restarted;
2c47e605
YZ
6926};
6927
6928#define DROP_REFERENCE 1
6929#define UPDATE_BACKREF 2
6930
1c4850e2
YZ
6931static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6932 struct btrfs_root *root,
6933 struct walk_control *wc,
6934 struct btrfs_path *path)
6407bf6d 6935{
0b246afa 6936 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
6937 u64 bytenr;
6938 u64 generation;
6939 u64 refs;
94fcca9f 6940 u64 flags;
5d4f98a2 6941 u32 nritems;
1c4850e2
YZ
6942 struct btrfs_key key;
6943 struct extent_buffer *eb;
6407bf6d 6944 int ret;
1c4850e2
YZ
6945 int slot;
6946 int nread = 0;
6407bf6d 6947
1c4850e2
YZ
6948 if (path->slots[wc->level] < wc->reada_slot) {
6949 wc->reada_count = wc->reada_count * 2 / 3;
6950 wc->reada_count = max(wc->reada_count, 2);
6951 } else {
6952 wc->reada_count = wc->reada_count * 3 / 2;
6953 wc->reada_count = min_t(int, wc->reada_count,
0b246afa 6954 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1c4850e2 6955 }
7bb86316 6956
1c4850e2
YZ
6957 eb = path->nodes[wc->level];
6958 nritems = btrfs_header_nritems(eb);
bd56b302 6959
1c4850e2
YZ
6960 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6961 if (nread >= wc->reada_count)
6962 break;
bd56b302 6963
2dd3e67b 6964 cond_resched();
1c4850e2
YZ
6965 bytenr = btrfs_node_blockptr(eb, slot);
6966 generation = btrfs_node_ptr_generation(eb, slot);
2dd3e67b 6967
1c4850e2
YZ
6968 if (slot == path->slots[wc->level])
6969 goto reada;
5d4f98a2 6970
1c4850e2
YZ
6971 if (wc->stage == UPDATE_BACKREF &&
6972 generation <= root->root_key.offset)
bd56b302
CM
6973 continue;
6974
94fcca9f 6975 /* We don't lock the tree block, it's OK to be racy here */
2ff7e61e 6976 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
3173a18f
JB
6977 wc->level - 1, 1, &refs,
6978 &flags);
79787eaa
JM
6979 /* We don't care about errors in readahead. */
6980 if (ret < 0)
6981 continue;
94fcca9f
YZ
6982 BUG_ON(refs == 0);
6983
1c4850e2 6984 if (wc->stage == DROP_REFERENCE) {
1c4850e2
YZ
6985 if (refs == 1)
6986 goto reada;
bd56b302 6987
94fcca9f
YZ
6988 if (wc->level == 1 &&
6989 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6990 continue;
1c4850e2
YZ
6991 if (!wc->update_ref ||
6992 generation <= root->root_key.offset)
6993 continue;
6994 btrfs_node_key_to_cpu(eb, &key, slot);
6995 ret = btrfs_comp_cpu_keys(&key,
6996 &wc->update_progress);
6997 if (ret < 0)
6998 continue;
94fcca9f
YZ
6999 } else {
7000 if (wc->level == 1 &&
7001 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7002 continue;
6407bf6d 7003 }
1c4850e2 7004reada:
2ff7e61e 7005 readahead_tree_block(fs_info, bytenr);
1c4850e2 7006 nread++;
20524f02 7007 }
1c4850e2 7008 wc->reada_slot = slot;
20524f02 7009}
2c47e605 7010
f82d02d9 7011/*
2c016dc2 7012 * helper to process tree block while walking down the tree.
2c47e605 7013 *
2c47e605
YZ
7014 * when wc->stage == UPDATE_BACKREF, this function updates
7015 * back refs for pointers in the block.
7016 *
7017 * NOTE: return value 1 means we should stop walking down.
f82d02d9 7018 */
2c47e605 7019static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 7020 struct btrfs_root *root,
2c47e605 7021 struct btrfs_path *path,
94fcca9f 7022 struct walk_control *wc, int lookup_info)
f82d02d9 7023{
2ff7e61e 7024 struct btrfs_fs_info *fs_info = root->fs_info;
2c47e605
YZ
7025 int level = wc->level;
7026 struct extent_buffer *eb = path->nodes[level];
2c47e605 7027 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
7028 int ret;
7029
2c47e605
YZ
7030 if (wc->stage == UPDATE_BACKREF &&
7031 btrfs_header_owner(eb) != root->root_key.objectid)
7032 return 1;
f82d02d9 7033
2c47e605
YZ
7034 /*
7035 * when the reference count of a tree block is 1, it won't increase
7036 * again. once the full backref flag is set, we never clear it.
7037 */
94fcca9f
YZ
7038 if (lookup_info &&
7039 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
7040 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
2c47e605 7041 BUG_ON(!path->locks[level]);
2ff7e61e 7042 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 7043 eb->start, level, 1,
2c47e605
YZ
7044 &wc->refs[level],
7045 &wc->flags[level]);
79787eaa
JM
7046 BUG_ON(ret == -ENOMEM);
7047 if (ret)
7048 return ret;
2c47e605
YZ
7049 BUG_ON(wc->refs[level] == 0);
7050 }
5d4f98a2 7051
2c47e605
YZ
7052 if (wc->stage == DROP_REFERENCE) {
7053 if (wc->refs[level] > 1)
7054 return 1;
f82d02d9 7055
2c47e605 7056 if (path->locks[level] && !wc->keep_locks) {
bd681513 7057 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
7058 path->locks[level] = 0;
7059 }
7060 return 0;
7061 }
f82d02d9 7062
2c47e605
YZ
7063 /* wc->stage == UPDATE_BACKREF */
7064 if (!(wc->flags[level] & flag)) {
7065 BUG_ON(!path->locks[level]);
e339a6b0 7066 ret = btrfs_inc_ref(trans, root, eb, 1);
79787eaa 7067 BUG_ON(ret); /* -ENOMEM */
e339a6b0 7068 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 7069 BUG_ON(ret); /* -ENOMEM */
f5c8daa5 7070 ret = btrfs_set_disk_extent_flags(trans, eb->start,
b1c79e09
JB
7071 eb->len, flag,
7072 btrfs_header_level(eb), 0);
79787eaa 7073 BUG_ON(ret); /* -ENOMEM */
2c47e605
YZ
7074 wc->flags[level] |= flag;
7075 }
7076
7077 /*
7078 * the block is shared by multiple trees, so it's not good to
7079 * keep the tree lock
7080 */
7081 if (path->locks[level] && level > 0) {
bd681513 7082 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
7083 path->locks[level] = 0;
7084 }
7085 return 0;
7086}
7087
78c52d9e
JB
7088/*
7089 * This is used to verify a ref exists for this root to deal with a bug where we
7090 * would have a drop_progress key that hadn't been updated properly.
7091 */
7092static int check_ref_exists(struct btrfs_trans_handle *trans,
7093 struct btrfs_root *root, u64 bytenr, u64 parent,
7094 int level)
7095{
7096 struct btrfs_path *path;
7097 struct btrfs_extent_inline_ref *iref;
7098 int ret;
7099
7100 path = btrfs_alloc_path();
7101 if (!path)
7102 return -ENOMEM;
7103
7104 ret = lookup_extent_backref(trans, path, &iref, bytenr,
7105 root->fs_info->nodesize, parent,
7106 root->root_key.objectid, level, 0);
7107 btrfs_free_path(path);
7108 if (ret == -ENOENT)
7109 return 0;
7110 if (ret < 0)
7111 return ret;
7112 return 1;
7113}
7114
1c4850e2 7115/*
2c016dc2 7116 * helper to process tree block pointer.
1c4850e2
YZ
7117 *
7118 * when wc->stage == DROP_REFERENCE, this function checks the
7119 * reference count of the block pointed to. if the block
7120 * is shared and we need to update back refs for the subtree
7121 * rooted at the block, this function changes wc->stage to
7122 * UPDATE_BACKREF. if the block is shared and there is no
7123 * need to update backrefs, this function drops the reference
7124 * to the block.
7125 *
7126 * NOTE: return value 1 means we should stop walking down.
7127 */
7128static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7129 struct btrfs_root *root,
7130 struct btrfs_path *path,
94fcca9f 7131 struct walk_control *wc, int *lookup_info)
1c4850e2 7132{
0b246afa 7133 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
7134 u64 bytenr;
7135 u64 generation;
7136 u64 parent;
1c4850e2 7137 struct btrfs_key key;
581c1760 7138 struct btrfs_key first_key;
ffd4bb2a 7139 struct btrfs_ref ref = { 0 };
1c4850e2
YZ
7140 struct extent_buffer *next;
7141 int level = wc->level;
7142 int reada = 0;
7143 int ret = 0;
1152651a 7144 bool need_account = false;
1c4850e2
YZ
7145
7146 generation = btrfs_node_ptr_generation(path->nodes[level],
7147 path->slots[level]);
7148 /*
7149 * if the lower level block was created before the snapshot
7150 * was created, we know there is no need to update back refs
7151 * for the subtree
7152 */
7153 if (wc->stage == UPDATE_BACKREF &&
94fcca9f
YZ
7154 generation <= root->root_key.offset) {
7155 *lookup_info = 1;
1c4850e2 7156 return 1;
94fcca9f 7157 }
1c4850e2
YZ
7158
7159 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
581c1760
QW
7160 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
7161 path->slots[level]);
1c4850e2 7162
0b246afa 7163 next = find_extent_buffer(fs_info, bytenr);
1c4850e2 7164 if (!next) {
2ff7e61e 7165 next = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
7166 if (IS_ERR(next))
7167 return PTR_ERR(next);
7168
b2aaaa3b
JB
7169 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
7170 level - 1);
1c4850e2
YZ
7171 reada = 1;
7172 }
7173 btrfs_tree_lock(next);
8bead258 7174 btrfs_set_lock_blocking_write(next);
1c4850e2 7175
2ff7e61e 7176 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
94fcca9f
YZ
7177 &wc->refs[level - 1],
7178 &wc->flags[level - 1]);
4867268c
JB
7179 if (ret < 0)
7180 goto out_unlock;
79787eaa 7181
c2cf52eb 7182 if (unlikely(wc->refs[level - 1] == 0)) {
0b246afa 7183 btrfs_err(fs_info, "Missing references.");
4867268c
JB
7184 ret = -EIO;
7185 goto out_unlock;
c2cf52eb 7186 }
94fcca9f 7187 *lookup_info = 0;
1c4850e2 7188
94fcca9f 7189 if (wc->stage == DROP_REFERENCE) {
1c4850e2 7190 if (wc->refs[level - 1] > 1) {
1152651a 7191 need_account = true;
94fcca9f
YZ
7192 if (level == 1 &&
7193 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7194 goto skip;
7195
1c4850e2
YZ
7196 if (!wc->update_ref ||
7197 generation <= root->root_key.offset)
7198 goto skip;
7199
7200 btrfs_node_key_to_cpu(path->nodes[level], &key,
7201 path->slots[level]);
7202 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7203 if (ret < 0)
7204 goto skip;
7205
7206 wc->stage = UPDATE_BACKREF;
7207 wc->shared_level = level - 1;
7208 }
94fcca9f
YZ
7209 } else {
7210 if (level == 1 &&
7211 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7212 goto skip;
1c4850e2
YZ
7213 }
7214
b9fab919 7215 if (!btrfs_buffer_uptodate(next, generation, 0)) {
1c4850e2
YZ
7216 btrfs_tree_unlock(next);
7217 free_extent_buffer(next);
7218 next = NULL;
94fcca9f 7219 *lookup_info = 1;
1c4850e2
YZ
7220 }
7221
7222 if (!next) {
7223 if (reada && level == 1)
7224 reada_walk_down(trans, root, wc, path);
581c1760
QW
7225 next = read_tree_block(fs_info, bytenr, generation, level - 1,
7226 &first_key);
64c043de
LB
7227 if (IS_ERR(next)) {
7228 return PTR_ERR(next);
7229 } else if (!extent_buffer_uptodate(next)) {
416bc658 7230 free_extent_buffer(next);
97d9a8a4 7231 return -EIO;
416bc658 7232 }
1c4850e2 7233 btrfs_tree_lock(next);
8bead258 7234 btrfs_set_lock_blocking_write(next);
1c4850e2
YZ
7235 }
7236
7237 level--;
4867268c
JB
7238 ASSERT(level == btrfs_header_level(next));
7239 if (level != btrfs_header_level(next)) {
7240 btrfs_err(root->fs_info, "mismatched level");
7241 ret = -EIO;
7242 goto out_unlock;
7243 }
1c4850e2
YZ
7244 path->nodes[level] = next;
7245 path->slots[level] = 0;
bd681513 7246 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
1c4850e2
YZ
7247 wc->level = level;
7248 if (wc->level == 1)
7249 wc->reada_slot = 0;
7250 return 0;
7251skip:
7252 wc->refs[level - 1] = 0;
7253 wc->flags[level - 1] = 0;
94fcca9f
YZ
7254 if (wc->stage == DROP_REFERENCE) {
7255 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7256 parent = path->nodes[level]->start;
7257 } else {
4867268c 7258 ASSERT(root->root_key.objectid ==
94fcca9f 7259 btrfs_header_owner(path->nodes[level]));
4867268c
JB
7260 if (root->root_key.objectid !=
7261 btrfs_header_owner(path->nodes[level])) {
7262 btrfs_err(root->fs_info,
7263 "mismatched block owner");
7264 ret = -EIO;
7265 goto out_unlock;
7266 }
94fcca9f
YZ
7267 parent = 0;
7268 }
1c4850e2 7269
78c52d9e
JB
7270 /*
7271 * If we had a drop_progress we need to verify the refs are set
7272 * as expected. If we find our ref then we know that from here
7273 * on out everything should be correct, and we can clear the
7274 * ->restarted flag.
7275 */
7276 if (wc->restarted) {
7277 ret = check_ref_exists(trans, root, bytenr, parent,
7278 level - 1);
7279 if (ret < 0)
7280 goto out_unlock;
7281 if (ret == 0)
7282 goto no_delete;
7283 ret = 0;
7284 wc->restarted = 0;
7285 }
7286
2cd86d30
QW
7287 /*
7288 * Reloc trees don't contribute to qgroup numbers, and we have
7289 * already accounted for them at merge time (replace_path),
7290 * thus we can skip the expensive subtree trace here.
7291 */
7292 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
7293 need_account) {
deb40627 7294 ret = btrfs_qgroup_trace_subtree(trans, next,
33d1f05c 7295 generation, level - 1);
1152651a 7296 if (ret) {
0b246afa 7297 btrfs_err_rl(fs_info,
5d163e0e
JM
7298 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
7299 ret);
1152651a
MF
7300 }
7301 }
aea6f028
JB
7302
7303 /*
7304 * We need to update the next key in our walk control so we can
7305 * update the drop_progress key accordingly. We don't care if
7306 * find_next_key doesn't find a key because that means we're at
7307 * the end and are going to clean up now.
7308 */
7309 wc->drop_level = level;
7310 find_next_key(path, level, &wc->drop_progress);
7311
ffd4bb2a
QW
7312 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
7313 fs_info->nodesize, parent);
7314 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
7315 ret = btrfs_free_extent(trans, &ref);
4867268c
JB
7316 if (ret)
7317 goto out_unlock;
1c4850e2 7318 }
78c52d9e 7319no_delete:
4867268c
JB
7320 *lookup_info = 1;
7321 ret = 1;
7322
7323out_unlock:
1c4850e2
YZ
7324 btrfs_tree_unlock(next);
7325 free_extent_buffer(next);
4867268c
JB
7326
7327 return ret;
1c4850e2
YZ
7328}
7329
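The stage switching implemented by walk_down_proc() and do_walk_down() above can be hard to follow from the code alone. The standalone sketch below restates only the decision made for a child block while in the DROP_REFERENCE stage (it is not kernel code; the FULL_BACKREF and update_progress checks are deliberately left out, and the names are local to the example): a child referenced only by this tree is walked into and freed, a shared child is normally just unreferenced and skipped, but if the caller asked for backref updates and the child is newer than the snapshot point, the walk switches to UPDATE_BACKREF for that subtree.

#include <stdbool.h>
#include <stdio.h>

enum sketch_stage { STAGE_DROP_REFERENCE, STAGE_UPDATE_BACKREF };

/*
 * Simplified restatement of do_walk_down()'s choice for one child block.
 * Returns the stage to continue in; *descend tells the caller whether to
 * walk into the child or just drop/skip our reference to it.
 */
static enum sketch_stage classify_child(unsigned long long refs,
					bool update_ref,
					unsigned long long child_generation,
					unsigned long long snapshot_generation,
					bool *descend)
{
	if (refs == 1) {
		/* Only this tree references the child: keep walking down. */
		*descend = true;
		return STAGE_DROP_REFERENCE;
	}
	if (update_ref && child_generation > snapshot_generation) {
		/*
		 * Shared child created after the snapshot point, and the
		 * caller asked for backref updates: switch stages and walk
		 * the subtree to fix up its backrefs.
		 */
		*descend = true;
		return STAGE_UPDATE_BACKREF;
	}
	/* Shared child we don't need to rewrite: drop our ref and skip it. */
	*descend = false;
	return STAGE_DROP_REFERENCE;
}

int main(void)
{
	bool descend;
	enum sketch_stage stage = classify_child(2, true, 100, 50, &descend);

	printf("stage=%s descend=%d\n",
	       stage == STAGE_UPDATE_BACKREF ? "UPDATE_BACKREF" : "DROP_REFERENCE",
	       descend);
	return 0;
}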
2c47e605 7330/*
2c016dc2 7331 * helper to process tree block while walking up the tree.
2c47e605
YZ
7332 *
7333 * when wc->stage == DROP_REFERENCE, this function drops the
7334 * reference count on the block.
7335 *
7336 * when wc->stage == UPDATE_BACKREF, this function changes
7337 * wc->stage back to DROP_REFERENCE if we changed wc->stage
7338 * to UPDATE_BACKREF previously while processing the block.
7339 *
7340 * NOTE: return value 1 means we should stop walking up.
7341 */
7342static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7343 struct btrfs_root *root,
7344 struct btrfs_path *path,
7345 struct walk_control *wc)
7346{
0b246afa 7347 struct btrfs_fs_info *fs_info = root->fs_info;
f0486c68 7348 int ret;
2c47e605
YZ
7349 int level = wc->level;
7350 struct extent_buffer *eb = path->nodes[level];
7351 u64 parent = 0;
7352
7353 if (wc->stage == UPDATE_BACKREF) {
7354 BUG_ON(wc->shared_level < level);
7355 if (level < wc->shared_level)
7356 goto out;
7357
2c47e605
YZ
7358 ret = find_next_key(path, level + 1, &wc->update_progress);
7359 if (ret > 0)
7360 wc->update_ref = 0;
7361
7362 wc->stage = DROP_REFERENCE;
7363 wc->shared_level = -1;
7364 path->slots[level] = 0;
7365
7366 /*
7367 * check reference count again if the block isn't locked.
7368 * we should start walking down the tree again if reference
7369 * count is one.
7370 */
7371 if (!path->locks[level]) {
7372 BUG_ON(level == 0);
7373 btrfs_tree_lock(eb);
8bead258 7374 btrfs_set_lock_blocking_write(eb);
bd681513 7375 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 7376
2ff7e61e 7377 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 7378 eb->start, level, 1,
2c47e605
YZ
7379 &wc->refs[level],
7380 &wc->flags[level]);
79787eaa
JM
7381 if (ret < 0) {
7382 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 7383 path->locks[level] = 0;
79787eaa
JM
7384 return ret;
7385 }
2c47e605
YZ
7386 BUG_ON(wc->refs[level] == 0);
7387 if (wc->refs[level] == 1) {
bd681513 7388 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 7389 path->locks[level] = 0;
2c47e605
YZ
7390 return 1;
7391 }
f82d02d9 7392 }
2c47e605 7393 }
f82d02d9 7394
2c47e605
YZ
7395 /* wc->stage == DROP_REFERENCE */
7396 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5d4f98a2 7397
2c47e605
YZ
7398 if (wc->refs[level] == 1) {
7399 if (level == 0) {
7400 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
e339a6b0 7401 ret = btrfs_dec_ref(trans, root, eb, 1);
2c47e605 7402 else
e339a6b0 7403 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 7404 BUG_ON(ret); /* -ENOMEM */
c4140cbf
QW
7405 if (is_fstree(root->root_key.objectid)) {
7406 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
7407 if (ret) {
7408 btrfs_err_rl(fs_info,
7409 "error %d accounting leaf items, quota is out of sync, rescan required",
5d163e0e 7410 ret);
c4140cbf 7411 }
1152651a 7412 }
2c47e605 7413 }
6a884d7d 7414 /* make block locked assertion in btrfs_clean_tree_block happy */
2c47e605
YZ
7415 if (!path->locks[level] &&
7416 btrfs_header_generation(eb) == trans->transid) {
7417 btrfs_tree_lock(eb);
8bead258 7418 btrfs_set_lock_blocking_write(eb);
bd681513 7419 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 7420 }
6a884d7d 7421 btrfs_clean_tree_block(eb);
2c47e605
YZ
7422 }
7423
7424 if (eb == root->node) {
7425 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7426 parent = eb->start;
65c6e82b
QW
7427 else if (root->root_key.objectid != btrfs_header_owner(eb))
7428 goto owner_mismatch;
2c47e605
YZ
7429 } else {
7430 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7431 parent = path->nodes[level + 1]->start;
65c6e82b
QW
7432 else if (root->root_key.objectid !=
7433 btrfs_header_owner(path->nodes[level + 1]))
7434 goto owner_mismatch;
f82d02d9 7435 }
f82d02d9 7436
5581a51a 7437 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
2c47e605
YZ
7438out:
7439 wc->refs[level] = 0;
7440 wc->flags[level] = 0;
f0486c68 7441 return 0;
65c6e82b
QW
7442
7443owner_mismatch:
7444 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
7445 btrfs_header_owner(eb), root->root_key.objectid);
7446 return -EUCLEAN;
2c47e605
YZ
7447}
7448
7449static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7450 struct btrfs_root *root,
7451 struct btrfs_path *path,
7452 struct walk_control *wc)
7453{
2c47e605 7454 int level = wc->level;
94fcca9f 7455 int lookup_info = 1;
2c47e605
YZ
7456 int ret;
7457
7458 while (level >= 0) {
94fcca9f 7459 ret = walk_down_proc(trans, root, path, wc, lookup_info);
2c47e605
YZ
7460 if (ret > 0)
7461 break;
7462
7463 if (level == 0)
7464 break;
7465
7a7965f8
YZ
7466 if (path->slots[level] >=
7467 btrfs_header_nritems(path->nodes[level]))
7468 break;
7469
94fcca9f 7470 ret = do_walk_down(trans, root, path, wc, &lookup_info);
1c4850e2
YZ
7471 if (ret > 0) {
7472 path->slots[level]++;
7473 continue;
90d2c51d
MX
7474 } else if (ret < 0)
7475 return ret;
1c4850e2 7476 level = wc->level;
f82d02d9 7477 }
f82d02d9
YZ
7478 return 0;
7479}
7480
d397712b 7481static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 7482 struct btrfs_root *root,
f82d02d9 7483 struct btrfs_path *path,
2c47e605 7484 struct walk_control *wc, int max_level)
20524f02 7485{
2c47e605 7486 int level = wc->level;
20524f02 7487 int ret;
9f3a7427 7488
2c47e605
YZ
7489 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7490 while (level < max_level && path->nodes[level]) {
7491 wc->level = level;
7492 if (path->slots[level] + 1 <
7493 btrfs_header_nritems(path->nodes[level])) {
7494 path->slots[level]++;
20524f02
CM
7495 return 0;
7496 } else {
2c47e605
YZ
7497 ret = walk_up_proc(trans, root, path, wc);
7498 if (ret > 0)
7499 return 0;
65c6e82b
QW
7500 if (ret < 0)
7501 return ret;
bd56b302 7502
2c47e605 7503 if (path->locks[level]) {
bd681513
CM
7504 btrfs_tree_unlock_rw(path->nodes[level],
7505 path->locks[level]);
2c47e605 7506 path->locks[level] = 0;
f82d02d9 7507 }
2c47e605
YZ
7508 free_extent_buffer(path->nodes[level]);
7509 path->nodes[level] = NULL;
7510 level++;
20524f02
CM
7511 }
7512 }
7513 return 1;
7514}
7515
9aca1d51 7516/*
2c47e605
YZ
7517 * drop a subvolume tree.
7518 *
7519 * this function traverses the tree freeing any blocks that are only
7520 * referenced by the tree.
7521 *
7522 * when a shared tree block is found, this function decreases its
7523 * reference count by one. if update_ref is true, this function
7524 * also makes sure backrefs for the shared block and all lower level
7525 * blocks are properly updated.
9d1a2a3a
DS
7526 *
7527 * If called with for_reloc == 0, may exit early with -EAGAIN
9aca1d51 7528 */
2c536799 7529int btrfs_drop_snapshot(struct btrfs_root *root,
66d7e7f0
AJ
7530 struct btrfs_block_rsv *block_rsv, int update_ref,
7531 int for_reloc)
20524f02 7532{
ab8d0fc4 7533 struct btrfs_fs_info *fs_info = root->fs_info;
5caf2a00 7534 struct btrfs_path *path;
2c47e605 7535 struct btrfs_trans_handle *trans;
ab8d0fc4 7536 struct btrfs_root *tree_root = fs_info->tree_root;
9f3a7427 7537 struct btrfs_root_item *root_item = &root->root_item;
2c47e605
YZ
7538 struct walk_control *wc;
7539 struct btrfs_key key;
7540 int err = 0;
7541 int ret;
7542 int level;
d29a9f62 7543 bool root_dropped = false;
20524f02 7544
4fd786e6 7545 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
1152651a 7546
5caf2a00 7547 path = btrfs_alloc_path();
cb1b69f4
TI
7548 if (!path) {
7549 err = -ENOMEM;
7550 goto out;
7551 }
20524f02 7552
2c47e605 7553 wc = kzalloc(sizeof(*wc), GFP_NOFS);
38a1a919
MF
7554 if (!wc) {
7555 btrfs_free_path(path);
cb1b69f4
TI
7556 err = -ENOMEM;
7557 goto out;
38a1a919 7558 }
2c47e605 7559
a22285a6 7560 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
7561 if (IS_ERR(trans)) {
7562 err = PTR_ERR(trans);
7563 goto out_free;
7564 }
98d5dc13 7565
0568e82d
JB
7566 err = btrfs_run_delayed_items(trans);
7567 if (err)
7568 goto out_end_trans;
7569
3fd0a558
YZ
7570 if (block_rsv)
7571 trans->block_rsv = block_rsv;
2c47e605 7572
83354f07
JB
7573 /*
7574 * This will help us catch people modifying the fs tree while we're
7575 * dropping it. It is unsafe to mess with the fs tree while it's being
7576 * dropped as we unlock the root node and parent nodes as we walk down
7577 * the tree, assuming nothing will change. If something does change
7578 * then we'll have stale information and drop references to blocks we've
7579 * already dropped.
7580 */
7581 set_bit(BTRFS_ROOT_DELETING, &root->state);
9f3a7427 7582 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2c47e605 7583 level = btrfs_header_level(root->node);
5d4f98a2 7584 path->nodes[level] = btrfs_lock_root_node(root);
8bead258 7585 btrfs_set_lock_blocking_write(path->nodes[level]);
9f3a7427 7586 path->slots[level] = 0;
bd681513 7587 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
7588 memset(&wc->update_progress, 0,
7589 sizeof(wc->update_progress));
9f3a7427 7590 } else {
9f3a7427 7591 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2c47e605
YZ
7592 memcpy(&wc->update_progress, &key,
7593 sizeof(wc->update_progress));
7594
6702ed49 7595 level = root_item->drop_level;
2c47e605 7596 BUG_ON(level == 0);
6702ed49 7597 path->lowest_level = level;
2c47e605
YZ
7598 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7599 path->lowest_level = 0;
7600 if (ret < 0) {
7601 err = ret;
79787eaa 7602 goto out_end_trans;
9f3a7427 7603 }
1c4850e2 7604 WARN_ON(ret > 0);
2c47e605 7605
7d9eb12c
CM
7606 /*
7607 * unlock our path, this is safe because only this
7608 * function is allowed to delete this snapshot
7609 */
5d4f98a2 7610 btrfs_unlock_up_safe(path, 0);
2c47e605
YZ
7611
7612 level = btrfs_header_level(root->node);
7613 while (1) {
7614 btrfs_tree_lock(path->nodes[level]);
8bead258 7615 btrfs_set_lock_blocking_write(path->nodes[level]);
fec386ac 7616 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 7617
2ff7e61e 7618 ret = btrfs_lookup_extent_info(trans, fs_info,
2c47e605 7619 path->nodes[level]->start,
3173a18f 7620 level, 1, &wc->refs[level],
2c47e605 7621 &wc->flags[level]);
79787eaa
JM
7622 if (ret < 0) {
7623 err = ret;
7624 goto out_end_trans;
7625 }
2c47e605
YZ
7626 BUG_ON(wc->refs[level] == 0);
7627
7628 if (level == root_item->drop_level)
7629 break;
7630
7631 btrfs_tree_unlock(path->nodes[level]);
fec386ac 7632 path->locks[level] = 0;
2c47e605
YZ
7633 WARN_ON(wc->refs[level] != 1);
7634 level--;
7635 }
9f3a7427 7636 }
2c47e605 7637
78c52d9e 7638 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
2c47e605
YZ
7639 wc->level = level;
7640 wc->shared_level = -1;
7641 wc->stage = DROP_REFERENCE;
7642 wc->update_ref = update_ref;
7643 wc->keep_locks = 0;
0b246afa 7644 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
2c47e605 7645
d397712b 7646 while (1) {
9d1a2a3a 7647
2c47e605
YZ
7648 ret = walk_down_tree(trans, root, path, wc);
7649 if (ret < 0) {
7650 err = ret;
20524f02 7651 break;
2c47e605 7652 }
9aca1d51 7653
2c47e605
YZ
7654 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7655 if (ret < 0) {
7656 err = ret;
20524f02 7657 break;
2c47e605
YZ
7658 }
7659
7660 if (ret > 0) {
7661 BUG_ON(wc->stage != DROP_REFERENCE);
e7a84565
CM
7662 break;
7663 }
2c47e605
YZ
7664
7665 if (wc->stage == DROP_REFERENCE) {
aea6f028
JB
7666 wc->drop_level = wc->level;
7667 btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
7668 &wc->drop_progress,
7669 path->slots[wc->drop_level]);
7670 }
7671 btrfs_cpu_key_to_disk(&root_item->drop_progress,
7672 &wc->drop_progress);
7673 root_item->drop_level = wc->drop_level;
2c47e605
YZ
7674
7675 BUG_ON(wc->level == 0);
3a45bb20 7676 if (btrfs_should_end_transaction(trans) ||
2ff7e61e 7677 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
2c47e605
YZ
7678 ret = btrfs_update_root(trans, tree_root,
7679 &root->root_key,
7680 root_item);
79787eaa 7681 if (ret) {
66642832 7682 btrfs_abort_transaction(trans, ret);
79787eaa
JM
7683 err = ret;
7684 goto out_end_trans;
7685 }
2c47e605 7686
3a45bb20 7687 btrfs_end_transaction_throttle(trans);
2ff7e61e 7688 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
ab8d0fc4
JM
7689 btrfs_debug(fs_info,
7690 "drop snapshot early exit");
3c8f2422
JB
7691 err = -EAGAIN;
7692 goto out_free;
7693 }
7694
a22285a6 7695 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
7696 if (IS_ERR(trans)) {
7697 err = PTR_ERR(trans);
7698 goto out_free;
7699 }
3fd0a558
YZ
7700 if (block_rsv)
7701 trans->block_rsv = block_rsv;
c3e69d58 7702 }
20524f02 7703 }
b3b4aa74 7704 btrfs_release_path(path);
79787eaa
JM
7705 if (err)
7706 goto out_end_trans;
2c47e605 7707
ab9ce7d4 7708 ret = btrfs_del_root(trans, &root->root_key);
79787eaa 7709 if (ret) {
66642832 7710 btrfs_abort_transaction(trans, ret);
e19182c0 7711 err = ret;
79787eaa
JM
7712 goto out_end_trans;
7713 }
2c47e605 7714
76dda93c 7715 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
cb517eab
MX
7716 ret = btrfs_find_root(tree_root, &root->root_key, path,
7717 NULL, NULL);
79787eaa 7718 if (ret < 0) {
66642832 7719 btrfs_abort_transaction(trans, ret);
79787eaa
JM
7720 err = ret;
7721 goto out_end_trans;
7722 } else if (ret > 0) {
84cd948c
JB
7723 /* if we fail to delete the orphan item this time
7724 * around, it'll get picked up the next time.
7725 *
7726 * The most common failure here is just -ENOENT.
7727 */
7728 btrfs_del_orphan_item(trans, tree_root,
7729 root->root_key.objectid);
76dda93c
YZ
7730 }
7731 }
7732
27cdeb70 7733 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
2b9dbef2 7734 btrfs_add_dropped_root(trans, root);
76dda93c
YZ
7735 } else {
7736 free_extent_buffer(root->node);
7737 free_extent_buffer(root->commit_root);
b0feb9d9 7738 btrfs_put_fs_root(root);
76dda93c 7739 }
d29a9f62 7740 root_dropped = true;
79787eaa 7741out_end_trans:
3a45bb20 7742 btrfs_end_transaction_throttle(trans);
79787eaa 7743out_free:
2c47e605 7744 kfree(wc);
5caf2a00 7745 btrfs_free_path(path);
cb1b69f4 7746out:
d29a9f62
JB
7747 /*
7748 * So if we need to stop dropping the snapshot for whatever reason we
7749 * need to make sure to add it back to the dead root list so that we
7750 * keep trying to do the work later. This also cleans up roots that are
7751 * not in the radix tree (like when we recover after a power fail
7752 * or unmount) so we don't leak memory.
7753 */
897ca819 7754 if (!for_reloc && !root_dropped)
d29a9f62 7755 btrfs_add_dead_root(root);
90515e7f 7756 if (err && err != -EAGAIN)
ab8d0fc4 7757 btrfs_handle_fs_error(fs_info, err, NULL);
2c536799 7758 return err;
20524f02 7759}
9078a3e1 7760
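btrfs_drop_snapshot() above can be interrupted by transaction commits, cleaner-thread sleeps or a crash, and it survives that by persisting a cursor: before ending a transaction it stores the last processed key in root_item->drop_progress and the level in root_item->drop_level, and the next invocation re-seeks to that key instead of starting over (the wc->restarted / check_ref_exists() dance then revalidates the first ref it sees). A deliberately tiny userspace illustration of the same resume pattern follows; the structure and the "commit" are invented for the example and are not btrfs code.

#include <stdio.h>

/* Pretend persistent state, standing in for btrfs_root_item::drop_progress. */
struct progress {
	int next_index;		/* resume point, 0 means "start from scratch" */
};

/* Process at most 'batch' items per "transaction", then save the cursor. */
static int drop_some(const int *items, int nr, struct progress *saved, int batch)
{
	int i = saved->next_index;
	int done = 0;

	while (i < nr && done < batch) {
		printf("dropping item %d\n", items[i]);
		i++;
		done++;
	}
	saved->next_index = i;	/* the equivalent of updating drop_progress */
	return i < nr;		/* non-zero: more work, call again later */
}

int main(void)
{
	int items[] = { 10, 11, 12, 13, 14, 15, 16 };
	struct progress p = { 0 };

	/* Each call stands in for one transaction's worth of work. */
	while (drop_some(items, 7, &p, 3))
		printf("-- commit, resume at index %d --\n", p.next_index);
	return 0;
}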
2c47e605
YZ
7761/*
7762 * drop subtree rooted at tree block 'node'.
7763 *
7764 * NOTE: this function will unlock and release tree block 'node'.
66d7e7f0 7765 * Only used by relocation code.
2c47e605 7766 */
f82d02d9
YZ
7767int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7768 struct btrfs_root *root,
7769 struct extent_buffer *node,
7770 struct extent_buffer *parent)
7771{
0b246afa 7772 struct btrfs_fs_info *fs_info = root->fs_info;
f82d02d9 7773 struct btrfs_path *path;
2c47e605 7774 struct walk_control *wc;
f82d02d9
YZ
7775 int level;
7776 int parent_level;
7777 int ret = 0;
7778 int wret;
7779
2c47e605
YZ
7780 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7781
f82d02d9 7782 path = btrfs_alloc_path();
db5b493a
TI
7783 if (!path)
7784 return -ENOMEM;
f82d02d9 7785
2c47e605 7786 wc = kzalloc(sizeof(*wc), GFP_NOFS);
db5b493a
TI
7787 if (!wc) {
7788 btrfs_free_path(path);
7789 return -ENOMEM;
7790 }
2c47e605 7791
b9447ef8 7792 btrfs_assert_tree_locked(parent);
f82d02d9
YZ
7793 parent_level = btrfs_header_level(parent);
7794 extent_buffer_get(parent);
7795 path->nodes[parent_level] = parent;
7796 path->slots[parent_level] = btrfs_header_nritems(parent);
7797
b9447ef8 7798 btrfs_assert_tree_locked(node);
f82d02d9 7799 level = btrfs_header_level(node);
f82d02d9
YZ
7800 path->nodes[level] = node;
7801 path->slots[level] = 0;
bd681513 7802 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
7803
7804 wc->refs[parent_level] = 1;
7805 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7806 wc->level = level;
7807 wc->shared_level = -1;
7808 wc->stage = DROP_REFERENCE;
7809 wc->update_ref = 0;
7810 wc->keep_locks = 1;
0b246afa 7811 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
f82d02d9
YZ
7812
7813 while (1) {
2c47e605
YZ
7814 wret = walk_down_tree(trans, root, path, wc);
7815 if (wret < 0) {
f82d02d9 7816 ret = wret;
f82d02d9 7817 break;
2c47e605 7818 }
f82d02d9 7819
2c47e605 7820 wret = walk_up_tree(trans, root, path, wc, parent_level);
f82d02d9
YZ
7821 if (wret < 0)
7822 ret = wret;
7823 if (wret != 0)
7824 break;
7825 }
7826
2c47e605 7827 kfree(wc);
f82d02d9
YZ
7828 btrfs_free_path(path);
7829 return ret;
7830}
7831
6202df69 7832static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
ec44a35c
CM
7833{
7834 u64 num_devices;
fc67c450 7835 u64 stripped;
e4d8ec0f 7836
fc67c450
ID
7837 /*
7838 * if restripe for this chunk_type is on, pick the target profile and
7839 * return it, otherwise do the usual balance
7840 */
6202df69 7841 stripped = get_restripe_target(fs_info, flags);
fc67c450
ID
7842 if (stripped)
7843 return extended_to_chunk(stripped);
e4d8ec0f 7844
6202df69 7845 num_devices = fs_info->fs_devices->rw_devices;
cd02dca5 7846
a07e8a46 7847 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
c7369b3f 7848 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
fc67c450 7849
ec44a35c
CM
7850 if (num_devices == 1) {
7851 stripped |= BTRFS_BLOCK_GROUP_DUP;
7852 stripped = flags & ~stripped;
7853
7854 /* turn raid0 into single device chunks */
7855 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7856 return stripped;
7857
7858 /* turn mirroring into duplication */
c7369b3f 7859 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
ec44a35c
CM
7860 BTRFS_BLOCK_GROUP_RAID10))
7861 return stripped | BTRFS_BLOCK_GROUP_DUP;
ec44a35c
CM
7862 } else {
7863 /* they already had raid on here, just return */
ec44a35c
CM
7864 if (flags & stripped)
7865 return flags;
7866
7867 stripped |= BTRFS_BLOCK_GROUP_DUP;
7868 stripped = flags & ~stripped;
7869
7870 /* switch duplicated blocks with raid1 */
7871 if (flags & BTRFS_BLOCK_GROUP_DUP)
7872 return stripped | BTRFS_BLOCK_GROUP_RAID1;
7873
e3176ca2 7874 /* this is drive concat, leave it alone */
ec44a35c 7875 }
e3176ca2 7876
ec44a35c
CM
7877 return flags;
7878}
7879
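The profile reduction performed by update_block_group_flags() above is easier to see without the flag arithmetic. The standalone paraphrase below models profiles as an enum and omits the restripe-target branch (it is not kernel code and the names are local to the example): with a single writable device, RAID0 collapses to single and mirrored profiles collapse to DUP; with several devices, DUP is promoted to RAID1 and everything else is left alone.

#include <stdio.h>

enum sketch_profile { P_SINGLE, P_DUP, P_RAID0, P_RAID1, P_RAID10, P_RAID56 };

/* Paraphrase of update_block_group_flags() with flag bits replaced by an enum. */
static enum sketch_profile reduce_profile(enum sketch_profile p, int num_devices)
{
	if (num_devices == 1) {
		if (p == P_RAID0)
			return P_SINGLE;	/* striping needs more than one device */
		if (p == P_RAID1 || p == P_RAID10)
			return P_DUP;		/* mirroring becomes duplication */
		return p;
	}
	if (p == P_DUP)
		return P_RAID1;			/* spread the two copies over devices */
	return p;				/* already striped/mirrored, or plain concat */
}

int main(void)
{
	printf("%d %d %d\n",
	       reduce_profile(P_RAID1, 1),	/* -> P_DUP */
	       reduce_profile(P_RAID0, 1),	/* -> P_SINGLE */
	       reduce_profile(P_DUP, 4));	/* -> P_RAID1 */
	return 0;
}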
868f401a 7880static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
0ef3e66b 7881{
f0486c68
YZ
7882 struct btrfs_space_info *sinfo = cache->space_info;
7883 u64 num_bytes;
3ece54e5 7884 u64 sinfo_used;
199c36ea 7885 u64 min_allocable_bytes;
f0486c68 7886 int ret = -ENOSPC;
0ef3e66b 7887
199c36ea
MX
7888 /*
7889 * We need some metadata space and system metadata space for
7890 * allocating chunks in some corner cases, so keep a small cushion
7891 * unless we are forced to mark the group read-only.
7892 */
7893 if ((sinfo->flags &
7894 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7895 !force)
ee22184b 7896 min_allocable_bytes = SZ_1M;
199c36ea
MX
7897 else
7898 min_allocable_bytes = 0;
7899
f0486c68
YZ
7900 spin_lock(&sinfo->lock);
7901 spin_lock(&cache->lock);
61cfea9b
W
7902
7903 if (cache->ro) {
868f401a 7904 cache->ro++;
61cfea9b
W
7905 ret = 0;
7906 goto out;
7907 }
7908
f0486c68
YZ
7909 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7910 cache->bytes_super - btrfs_block_group_used(&cache->item);
3ece54e5 7911 sinfo_used = btrfs_space_info_used(sinfo, true);
f0486c68 7912
3ece54e5
QW
7913 if (sinfo_used + num_bytes + min_allocable_bytes <=
7914 sinfo->total_bytes) {
f0486c68 7915 sinfo->bytes_readonly += num_bytes;
868f401a 7916 cache->ro++;
633c0aad 7917 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
f0486c68
YZ
7918 ret = 0;
7919 }
61cfea9b 7920out:
f0486c68
YZ
7921 spin_unlock(&cache->lock);
7922 spin_unlock(&sinfo->lock);
3ece54e5
QW
7923 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
7924 btrfs_info(cache->fs_info,
7925 "unable to make block group %llu ro",
7926 cache->key.objectid);
7927 btrfs_info(cache->fs_info,
7928 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
7929 sinfo_used, num_bytes, min_allocable_bytes);
5da6afeb 7930 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
3ece54e5 7931 }
f0486c68
YZ
7932 return ret;
7933}
7d9eb12c 7934
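The check in inc_block_group_ro() above reduces to one inequality: the bytes already used in the space_info, plus the still-unused part of this block group (cache->key.offset minus reserved, pinned, bytes_super and used bytes, all of which becomes unusable once the group is read-only), plus the small cushion, must fit within the space_info's total_bytes. A standalone numeric illustration, with invented sizes:

#include <stdint.h>
#include <stdio.h>

/* The condition inc_block_group_ro() tests, written out as a helper. */
static int can_set_readonly(uint64_t sinfo_used, uint64_t sinfo_total,
			    uint64_t bg_unused, uint64_t min_allocable)
{
	return sinfo_used + bg_unused + min_allocable <= sinfo_total;
}

int main(void)
{
	uint64_t gib = 1024ULL * 1024 * 1024;

	/* Invented example: 10 GiB space_info, 7 GiB used, 2 GiB of the
	 * candidate block group still unused, 1 MiB metadata cushion. */
	printf("ro allowed: %d\n",
	       can_set_readonly(7 * gib, 10 * gib, 2 * gib, 1024 * 1024));
	return 0;
}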
c83488af 7935int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
c286ac48 7936
f0486c68 7937{
c83488af 7938 struct btrfs_fs_info *fs_info = cache->fs_info;
f0486c68
YZ
7939 struct btrfs_trans_handle *trans;
7940 u64 alloc_flags;
7941 int ret;
7d9eb12c 7942
1bbc621e 7943again:
5e00f193 7944 trans = btrfs_join_transaction(fs_info->extent_root);
79787eaa
JM
7945 if (IS_ERR(trans))
7946 return PTR_ERR(trans);
5d4f98a2 7947
1bbc621e
CM
7948 /*
7949 * we're not allowed to set block groups readonly after the dirty
7950 * block groups cache has started writing. If it already started,
7951 * back off and let this transaction commit
7952 */
0b246afa 7953 mutex_lock(&fs_info->ro_block_group_mutex);
3204d33c 7954 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
1bbc621e
CM
7955 u64 transid = trans->transid;
7956
0b246afa 7957 mutex_unlock(&fs_info->ro_block_group_mutex);
3a45bb20 7958 btrfs_end_transaction(trans);
1bbc621e 7959
2ff7e61e 7960 ret = btrfs_wait_for_commit(fs_info, transid);
1bbc621e
CM
7961 if (ret)
7962 return ret;
7963 goto again;
7964 }
7965
153c35b6
CM
7966 /*
7967 * if we are changing raid levels, try to allocate a corresponding
7968 * block group with the new raid level.
7969 */
0b246afa 7970 alloc_flags = update_block_group_flags(fs_info, cache->flags);
153c35b6 7971 if (alloc_flags != cache->flags) {
fc471cb0 7972 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
153c35b6
CM
7973 /*
7974 * ENOSPC is allowed here, we may have enough space
7975 * already allocated at the new raid level to
7976 * carry on
7977 */
7978 if (ret == -ENOSPC)
7979 ret = 0;
7980 if (ret < 0)
7981 goto out;
7982 }
1bbc621e 7983
868f401a 7984 ret = inc_block_group_ro(cache, 0);
f0486c68
YZ
7985 if (!ret)
7986 goto out;
2ff7e61e 7987 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
fc471cb0 7988 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
f0486c68
YZ
7989 if (ret < 0)
7990 goto out;
868f401a 7991 ret = inc_block_group_ro(cache, 0);
f0486c68 7992out:
2f081088 7993 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
0b246afa 7994 alloc_flags = update_block_group_flags(fs_info, cache->flags);
34441361 7995 mutex_lock(&fs_info->chunk_mutex);
451a2c13 7996 check_system_chunk(trans, alloc_flags);
34441361 7997 mutex_unlock(&fs_info->chunk_mutex);
2f081088 7998 }
0b246afa 7999 mutex_unlock(&fs_info->ro_block_group_mutex);
2f081088 8000
3a45bb20 8001 btrfs_end_transaction(trans);
f0486c68
YZ
8002 return ret;
8003}
5d4f98a2 8004
43a7e99d 8005int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
c87f08ca 8006{
43a7e99d 8007 u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
2ff7e61e 8008
fc471cb0 8009 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
c87f08ca
CM
8010}
8011
6d07bcec
MX
8012/*
8013 * helper to account the unused space of all the readonly block groups in the
633c0aad 8014 * space_info. takes mirrors into account.
6d07bcec 8015 */
633c0aad 8016u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6d07bcec
MX
8017{
8018 struct btrfs_block_group_cache *block_group;
8019 u64 free_bytes = 0;
8020 int factor;
8021
01327610 8022 /* It's df, we don't care if it's racy */
633c0aad
JB
8023 if (list_empty(&sinfo->ro_bgs))
8024 return 0;
8025
8026 spin_lock(&sinfo->lock);
8027 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
6d07bcec
MX
8028 spin_lock(&block_group->lock);
8029
8030 if (!block_group->ro) {
8031 spin_unlock(&block_group->lock);
8032 continue;
8033 }
8034
46df06b8 8035 factor = btrfs_bg_type_to_factor(block_group->flags);
6d07bcec
MX
8036 free_bytes += (block_group->key.offset -
8037 btrfs_block_group_used(&block_group->item)) *
8038 factor;
8039
8040 spin_unlock(&block_group->lock);
8041 }
6d07bcec
MX
8042 spin_unlock(&sinfo->lock);
8043
8044 return free_bytes;
8045}
8046
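btrfs_account_ro_block_groups_free_space() above scales each read-only group's unused bytes by a redundancy factor so the result reflects raw device space: mirrored profiles consume roughly twice the logical bytes. Below is a standalone sketch of that accounting; it assumes a factor of 2 for mirrored (RAID1/RAID10/DUP style) groups and 1 otherwise, which is what btrfs_bg_type_to_factor() is expected to return, and the sample sizes are invented.

#include <stdint.h>
#include <stdio.h>

struct ro_group {
	uint64_t size;		/* cache->key.offset */
	uint64_t used;		/* btrfs_block_group_used() */
	int mirrored;		/* RAID1/RAID10/DUP-style profile? */
};

/* Sum the unused space of read-only groups, scaled to raw device bytes. */
static uint64_t ro_free_space(const struct ro_group *g, int nr)
{
	uint64_t free_bytes = 0;
	int i;

	for (i = 0; i < nr; i++) {
		int factor = g[i].mirrored ? 2 : 1;	/* assumed factor */

		free_bytes += (g[i].size - g[i].used) * factor;
	}
	return free_bytes;
}

int main(void)
{
	/* Invented groups: a 1 GiB mirrored group half full and a 1 GiB
	 * single-profile group that is completely full. */
	struct ro_group groups[] = {
		{ 1024u * 1024 * 1024, 512u * 1024 * 1024, 1 },
		{ 1024u * 1024 * 1024, 1024u * 1024 * 1024, 0 },
	};

	printf("ro free (raw bytes): %llu\n",
	       (unsigned long long)ro_free_space(groups, 2));
	return 0;
}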
2ff7e61e 8047void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
5d4f98a2 8048{
f0486c68
YZ
8049 struct btrfs_space_info *sinfo = cache->space_info;
8050 u64 num_bytes;
8051
8052 BUG_ON(!cache->ro);
8053
8054 spin_lock(&sinfo->lock);
8055 spin_lock(&cache->lock);
868f401a
Z
8056 if (!--cache->ro) {
8057 num_bytes = cache->key.offset - cache->reserved -
8058 cache->pinned - cache->bytes_super -
8059 btrfs_block_group_used(&cache->item);
8060 sinfo->bytes_readonly -= num_bytes;
8061 list_del_init(&cache->ro_list);
8062 }
f0486c68
YZ
8063 spin_unlock(&cache->lock);
8064 spin_unlock(&sinfo->lock);
5d4f98a2
YZ
8065}
8066
ba1bf481 8067/*
52042d8e 8068 * Checks to see if it's even possible to relocate this block group.
ba1bf481
JB
8069 *
8070 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
8071 * ok to go ahead and try.
8072 */
6bccf3ab 8073int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
1a40e23b 8074{
ba1bf481
JB
8075 struct btrfs_block_group_cache *block_group;
8076 struct btrfs_space_info *space_info;
0b246afa 8077 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
ba1bf481 8078 struct btrfs_device *device;
cdcb725c 8079 u64 min_free;
6719db6a
JB
8080 u64 dev_min = 1;
8081 u64 dev_nr = 0;
4a5e98f5 8082 u64 target;
0305bc27 8083 int debug;
cdcb725c 8084 int index;
ba1bf481
JB
8085 int full = 0;
8086 int ret = 0;
1a40e23b 8087
0b246afa 8088 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
0305bc27 8089
0b246afa 8090 block_group = btrfs_lookup_block_group(fs_info, bytenr);
1a40e23b 8091
ba1bf481 8092 /* odd, couldn't find the block group, leave it alone */
0305bc27
QW
8093 if (!block_group) {
8094 if (debug)
0b246afa 8095 btrfs_warn(fs_info,
0305bc27
QW
8096 "can't find block group for bytenr %llu",
8097 bytenr);
ba1bf481 8098 return -1;
0305bc27 8099 }
1a40e23b 8100
cdcb725c 8101 min_free = btrfs_block_group_used(&block_group->item);
8102
ba1bf481 8103 /* no bytes used, we're good */
cdcb725c 8104 if (!min_free)
1a40e23b
ZY
8105 goto out;
8106
ba1bf481
JB
8107 space_info = block_group->space_info;
8108 spin_lock(&space_info->lock);
17d217fe 8109
ba1bf481 8110 full = space_info->full;
17d217fe 8111
ba1bf481
JB
8112 /*
8113 * if this is the last block group we have in this space, we can't
7ce618db
CM
8114 * relocate it unless we're able to allocate a new chunk below.
8115 *
8116 * Otherwise, we need to make sure we have room in the space to handle
8117 * all of the extents from this block group. If we can, we're good
ba1bf481 8118 */
7ce618db 8119 if ((space_info->total_bytes != block_group->key.offset) &&
4136135b
LB
8120 (btrfs_space_info_used(space_info, false) + min_free <
8121 space_info->total_bytes)) {
ba1bf481
JB
8122 spin_unlock(&space_info->lock);
8123 goto out;
17d217fe 8124 }
ba1bf481 8125 spin_unlock(&space_info->lock);
ea8c2819 8126
ba1bf481
JB
8127 /*
8128 * ok we don't have enough space, but maybe we have free space on our
8129 * devices to allocate new chunks for relocation, so loop through our
4a5e98f5
ID
8130 * alloc devices and guess if we have enough space. If this block
8131 * group is going to be restriped, run checks against the target
8132 * profile instead of the current one.
ba1bf481
JB
8133 */
8134 ret = -1;
ea8c2819 8135
cdcb725c 8136 /*
8137 * index:
8138 * 0: raid10
8139 * 1: raid1
8140 * 2: dup
8141 * 3: raid0
8142 * 4: single
8143 */
0b246afa 8144 target = get_restripe_target(fs_info, block_group->flags);
4a5e98f5 8145 if (target) {
3e72ee88 8146 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
4a5e98f5
ID
8147 } else {
8148 /*
8149 * this is just a balance, so if we were marked as full
8150 * we know there is no space for a new chunk
8151 */
0305bc27
QW
8152 if (full) {
8153 if (debug)
0b246afa
JM
8154 btrfs_warn(fs_info,
8155 "no space to alloc new chunk for block group %llu",
8156 block_group->key.objectid);
4a5e98f5 8157 goto out;
0305bc27 8158 }
4a5e98f5 8159
3e72ee88 8160 index = btrfs_bg_flags_to_raid_index(block_group->flags);
4a5e98f5
ID
8161 }
8162
e6ec716f 8163 if (index == BTRFS_RAID_RAID10) {
cdcb725c 8164 dev_min = 4;
6719db6a
JB
8165 /* Divide by 2 */
8166 min_free >>= 1;
e6ec716f 8167 } else if (index == BTRFS_RAID_RAID1) {
cdcb725c 8168 dev_min = 2;
e6ec716f 8169 } else if (index == BTRFS_RAID_DUP) {
6719db6a
JB
8170 /* Multiply by 2 */
8171 min_free <<= 1;
e6ec716f 8172 } else if (index == BTRFS_RAID_RAID0) {
cdcb725c 8173 dev_min = fs_devices->rw_devices;
47c5713f 8174 min_free = div64_u64(min_free, dev_min);
cdcb725c 8175 }
8176
0b246afa 8177 mutex_lock(&fs_info->chunk_mutex);
ba1bf481 8178 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7bfc837d 8179 u64 dev_offset;
56bec294 8180
ba1bf481
JB
8181 /*
8182 * check to make sure we can actually find a chunk with enough
8183 * space to fit our block group in.
8184 */
63a212ab 8185 if (device->total_bytes > device->bytes_used + min_free &&
401e29c1 8186 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
60dfdf25 8187 ret = find_free_dev_extent(device, min_free,
7bfc837d 8188 &dev_offset, NULL);
ba1bf481 8189 if (!ret)
cdcb725c 8190 dev_nr++;
8191
8192 if (dev_nr >= dev_min)
73e48b27 8193 break;
cdcb725c 8194
ba1bf481 8195 ret = -1;
725c8463 8196 }
edbd8d4e 8197 }
0305bc27 8198 if (debug && ret == -1)
0b246afa
JM
8199 btrfs_warn(fs_info,
8200 "no space to allocate a new chunk for block group %llu",
8201 block_group->key.objectid);
8202 mutex_unlock(&fs_info->chunk_mutex);
edbd8d4e 8203out:
ba1bf481 8204 btrfs_put_block_group(block_group);
edbd8d4e
CM
8205 return ret;
8206}
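/*
 * Editor's note (worked example, not in the original source): for a RAID1
 * block group with 600MiB used, min_free stays 600MiB and dev_min == 2,
 * so two devices must each fit a 600MiB extent. For RAID10 the same group
 * needs only 300MiB per device (min_free >>= 1) but across four devices,
 * while DUP doubles min_free because both copies live on one device.
 */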
8207
6bccf3ab
JM
8208static int find_first_block_group(struct btrfs_fs_info *fs_info,
8209 struct btrfs_path *path,
8210 struct btrfs_key *key)
0b86a832 8211{
6bccf3ab 8212 struct btrfs_root *root = fs_info->extent_root;
925baedd 8213 int ret = 0;
0b86a832
CM
8214 struct btrfs_key found_key;
8215 struct extent_buffer *leaf;
514c7dca
QW
8216 struct btrfs_block_group_item bg;
8217 u64 flags;
0b86a832 8218 int slot;
edbd8d4e 8219
0b86a832
CM
8220 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8221 if (ret < 0)
925baedd
CM
8222 goto out;
8223
d397712b 8224 while (1) {
0b86a832 8225 slot = path->slots[0];
edbd8d4e 8226 leaf = path->nodes[0];
0b86a832
CM
8227 if (slot >= btrfs_header_nritems(leaf)) {
8228 ret = btrfs_next_leaf(root, path);
8229 if (ret == 0)
8230 continue;
8231 if (ret < 0)
925baedd 8232 goto out;
0b86a832 8233 break;
edbd8d4e 8234 }
0b86a832 8235 btrfs_item_key_to_cpu(leaf, &found_key, slot);
edbd8d4e 8236
0b86a832 8237 if (found_key.objectid >= key->objectid &&
925baedd 8238 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6fb37b75
LB
8239 struct extent_map_tree *em_tree;
8240 struct extent_map *em;
8241
c8bf1b67 8242 em_tree = &root->fs_info->mapping_tree;
6fb37b75
LB
8243 read_lock(&em_tree->lock);
8244 em = lookup_extent_mapping(em_tree, found_key.objectid,
8245 found_key.offset);
8246 read_unlock(&em_tree->lock);
8247 if (!em) {
0b246afa 8248 btrfs_err(fs_info,
6fb37b75
LB
8249 "logical %llu len %llu found bg but no related chunk",
8250 found_key.objectid, found_key.offset);
8251 ret = -ENOENT;
514c7dca
QW
8252 } else if (em->start != found_key.objectid ||
8253 em->len != found_key.offset) {
8254 btrfs_err(fs_info,
8255 "block group %llu len %llu mismatch with chunk %llu len %llu",
8256 found_key.objectid, found_key.offset,
8257 em->start, em->len);
8258 ret = -EUCLEAN;
6fb37b75 8259 } else {
514c7dca
QW
8260 read_extent_buffer(leaf, &bg,
8261 btrfs_item_ptr_offset(leaf, slot),
8262 sizeof(bg));
8263 flags = btrfs_block_group_flags(&bg) &
8264 BTRFS_BLOCK_GROUP_TYPE_MASK;
8265
8266 if (flags != (em->map_lookup->type &
8267 BTRFS_BLOCK_GROUP_TYPE_MASK)) {
8268 btrfs_err(fs_info,
8269"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
8270 found_key.objectid,
8271 found_key.offset, flags,
8272 (BTRFS_BLOCK_GROUP_TYPE_MASK &
8273 em->map_lookup->type));
8274 ret = -EUCLEAN;
8275 } else {
8276 ret = 0;
8277 }
6fb37b75 8278 }
187ee58c 8279 free_extent_map(em);
925baedd
CM
8280 goto out;
8281 }
0b86a832 8282 path->slots[0]++;
edbd8d4e 8283 }
925baedd 8284out:
0b86a832 8285 return ret;
edbd8d4e
CM
8286}
8287
0af3d00b
JB
8288void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8289{
8290 struct btrfs_block_group_cache *block_group;
8291 u64 last = 0;
8292
8293 while (1) {
8294 struct inode *inode;
8295
8296 block_group = btrfs_lookup_first_block_group(info, last);
8297 while (block_group) {
3aa7c7a3 8298 wait_block_group_cache_done(block_group);
0af3d00b
JB
8299 spin_lock(&block_group->lock);
8300 if (block_group->iref)
8301 break;
8302 spin_unlock(&block_group->lock);
f87b7eb8 8303 block_group = next_block_group(block_group);
0af3d00b
JB
8304 }
8305 if (!block_group) {
8306 if (last == 0)
8307 break;
8308 last = 0;
8309 continue;
8310 }
8311
8312 inode = block_group->inode;
8313 block_group->iref = 0;
8314 block_group->inode = NULL;
8315 spin_unlock(&block_group->lock);
f3bca802 8316 ASSERT(block_group->io_ctl.inode == NULL);
0af3d00b
JB
8317 iput(inode);
8318 last = block_group->key.objectid + block_group->key.offset;
8319 btrfs_put_block_group(block_group);
8320 }
8321}
8322
5cdd7db6
FM
8323/*
8324 * Must be called only after stopping all workers, since we could have block
8325 * group caching kthreads running, and therefore they could race with us if we
8326 * freed the block groups before stopping them.
8327 */
1a40e23b
ZY
8328int btrfs_free_block_groups(struct btrfs_fs_info *info)
8329{
8330 struct btrfs_block_group_cache *block_group;
4184ea7f 8331 struct btrfs_space_info *space_info;
11833d66 8332 struct btrfs_caching_control *caching_ctl;
1a40e23b
ZY
8333 struct rb_node *n;
8334
9e351cc8 8335 down_write(&info->commit_root_sem);
11833d66
YZ
8336 while (!list_empty(&info->caching_block_groups)) {
8337 caching_ctl = list_entry(info->caching_block_groups.next,
8338 struct btrfs_caching_control, list);
8339 list_del(&caching_ctl->list);
8340 put_caching_control(caching_ctl);
8341 }
9e351cc8 8342 up_write(&info->commit_root_sem);
11833d66 8343
47ab2a6c
JB
8344 spin_lock(&info->unused_bgs_lock);
8345 while (!list_empty(&info->unused_bgs)) {
8346 block_group = list_first_entry(&info->unused_bgs,
8347 struct btrfs_block_group_cache,
8348 bg_list);
8349 list_del_init(&block_group->bg_list);
8350 btrfs_put_block_group(block_group);
8351 }
8352 spin_unlock(&info->unused_bgs_lock);
8353
1a40e23b
ZY
8354 spin_lock(&info->block_group_cache_lock);
8355 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8356 block_group = rb_entry(n, struct btrfs_block_group_cache,
8357 cache_node);
1a40e23b
ZY
8358 rb_erase(&block_group->cache_node,
8359 &info->block_group_cache_tree);
01eacb27 8360 RB_CLEAR_NODE(&block_group->cache_node);
d899e052
YZ
8361 spin_unlock(&info->block_group_cache_lock);
8362
80eb234a 8363 down_write(&block_group->space_info->groups_sem);
1a40e23b 8364 list_del(&block_group->list);
80eb234a 8365 up_write(&block_group->space_info->groups_sem);
d2fb3437 8366
3c14874a
JB
8367 /*
8368 * We haven't cached this block group, which means we could
8369 * possibly have excluded extents on this block group.
8370 */
36cce922
JB
8371 if (block_group->cached == BTRFS_CACHE_NO ||
8372 block_group->cached == BTRFS_CACHE_ERROR)
9e715da8 8373 free_excluded_extents(block_group);
3c14874a 8374
817d52f8 8375 btrfs_remove_free_space_cache(block_group);
5cdd7db6 8376 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
f3bca802
LB
8377 ASSERT(list_empty(&block_group->dirty_list));
8378 ASSERT(list_empty(&block_group->io_list));
8379 ASSERT(list_empty(&block_group->bg_list));
8380 ASSERT(atomic_read(&block_group->count) == 1);
11dfe35a 8381 btrfs_put_block_group(block_group);
d899e052
YZ
8382
8383 spin_lock(&info->block_group_cache_lock);
1a40e23b
ZY
8384 }
8385 spin_unlock(&info->block_group_cache_lock);
4184ea7f
CM
8386
8387 /* now that all the block groups are freed, go through and
8388 * free all the space_info structs. This is only called during
8389 * the final stages of unmount, and so we know nobody is
8390 * using them. We call synchronize_rcu() once before we start,
8391 * just to be on the safe side.
8392 */
8393 synchronize_rcu();
8394
67f9c220 8395 btrfs_release_global_block_rsv(info);
8929ecfa 8396
67871254 8397 while (!list_empty(&info->space_info)) {
6ab0a202
JM
8398 int i;
8399
4184ea7f
CM
8400 space_info = list_entry(info->space_info.next,
8401 struct btrfs_space_info,
8402 list);
d555b6c3
JB
8403
8404 /*
8405 * Do not hide this behind enospc_debug, this is actually
8406 * important and indicates a real bug if this happens.
8407 */
8408 if (WARN_ON(space_info->bytes_pinned > 0 ||
b069e0c3 8409 space_info->bytes_reserved > 0 ||
d555b6c3 8410 space_info->bytes_may_use > 0))
5da6afeb 8411 btrfs_dump_space_info(info, space_info, 0, 0);
4184ea7f 8412 list_del(&space_info->list);
6ab0a202
JM
8413 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8414 struct kobject *kobj;
c1895442
JM
8415 kobj = space_info->block_group_kobjs[i];
8416 space_info->block_group_kobjs[i] = NULL;
8417 if (kobj) {
6ab0a202
JM
8418 kobject_del(kobj);
8419 kobject_put(kobj);
8420 }
8421 }
8422 kobject_del(&space_info->kobj);
8423 kobject_put(&space_info->kobj);
4184ea7f 8424 }
1a40e23b
ZY
8425 return 0;
8426}
8427
75cb379d
JM
8428/* link_block_group will queue up kobjects to add when we're reclaim-safe */
8429void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
8430{
8431 struct btrfs_space_info *space_info;
8432 struct raid_kobject *rkobj;
8433 LIST_HEAD(list);
75cb379d
JM
8434 int ret = 0;
8435
8436 spin_lock(&fs_info->pending_raid_kobjs_lock);
8437 list_splice_init(&fs_info->pending_raid_kobjs, &list);
8438 spin_unlock(&fs_info->pending_raid_kobjs_lock);
8439
8440 list_for_each_entry(rkobj, &list, list) {
280c2908 8441 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
75cb379d
JM
8442
8443 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
158da513 8444 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
75cb379d
JM
8445 if (ret) {
8446 kobject_put(&rkobj->kobj);
8447 break;
8448 }
8449 }
8450 if (ret)
8451 btrfs_warn(fs_info,
8452 "failed to add kobject for block cache, ignoring");
8453}
8454
c434d21c 8455static void link_block_group(struct btrfs_block_group_cache *cache)
b742bb82 8456{
c434d21c 8457 struct btrfs_space_info *space_info = cache->space_info;
75cb379d 8458 struct btrfs_fs_info *fs_info = cache->fs_info;
3e72ee88 8459 int index = btrfs_bg_flags_to_raid_index(cache->flags);
ed55b6ac 8460 bool first = false;
b742bb82
YZ
8461
8462 down_write(&space_info->groups_sem);
ed55b6ac
JM
8463 if (list_empty(&space_info->block_groups[index]))
8464 first = true;
8465 list_add_tail(&cache->list, &space_info->block_groups[index]);
8466 up_write(&space_info->groups_sem);
8467
8468 if (first) {
75cb379d
JM
8469 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
8470 if (!rkobj) {
8471 btrfs_warn(cache->fs_info,
8472 "couldn't alloc memory for raid level kobject");
8473 return;
6ab0a202 8474 }
75cb379d
JM
8475 rkobj->flags = cache->flags;
8476 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
8477
8478 spin_lock(&fs_info->pending_raid_kobjs_lock);
8479 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
8480 spin_unlock(&fs_info->pending_raid_kobjs_lock);
c1895442 8481 space_info->block_group_kobjs[index] = &rkobj->kobj;
6ab0a202 8482 }
b742bb82
YZ
8483}
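/*
 * Editor's note: the raid kobject created above is not registered with
 * sysfs here; it is only queued on fs_info->pending_raid_kobjs and added
 * later by btrfs_add_raid_kobjects(), which (as its comment says) runs
 * once we are reclaim-safe.
 */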
8484
920e4a58 8485static struct btrfs_block_group_cache *
2ff7e61e
JM
8486btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
8487 u64 start, u64 size)
920e4a58
MX
8488{
8489 struct btrfs_block_group_cache *cache;
8490
8491 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8492 if (!cache)
8493 return NULL;
8494
8495 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8496 GFP_NOFS);
8497 if (!cache->free_space_ctl) {
8498 kfree(cache);
8499 return NULL;
8500 }
8501
8502 cache->key.objectid = start;
8503 cache->key.offset = size;
8504 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8505
0b246afa 8506 cache->fs_info = fs_info;
e4ff5fb5 8507 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1e144fb8
OS
8508 set_free_space_tree_thresholds(cache);
8509
920e4a58
MX
8510 atomic_set(&cache->count, 1);
8511 spin_lock_init(&cache->lock);
e570fd27 8512 init_rwsem(&cache->data_rwsem);
920e4a58
MX
8513 INIT_LIST_HEAD(&cache->list);
8514 INIT_LIST_HEAD(&cache->cluster_list);
47ab2a6c 8515 INIT_LIST_HEAD(&cache->bg_list);
633c0aad 8516 INIT_LIST_HEAD(&cache->ro_list);
ce93ec54 8517 INIT_LIST_HEAD(&cache->dirty_list);
c9dc4c65 8518 INIT_LIST_HEAD(&cache->io_list);
920e4a58 8519 btrfs_init_free_space_ctl(cache);
04216820 8520 atomic_set(&cache->trimming, 0);
a5ed9182 8521 mutex_init(&cache->free_space_lock);
0966a7b1 8522 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
920e4a58
MX
8523
8524 return cache;
8525}
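/*
 * Editor's note (usage sketch based on the callers below): both
 * btrfs_read_block_groups() and btrfs_make_block_group() follow the same
 * pattern - allocate the cache here, fill in ->item and ->flags, exclude
 * the super stripes, then publish it via btrfs_add_block_group_cache()
 * and link_block_group() so it shows up in the rbtree and in its
 * space_info's per-RAID-index list.
 */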
8526
7ef49515
QW
8527
8528/*
8529 * Iterate all chunks and verify that each of them has the corresponding block
8530 * group
8531 */
8532static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
8533{
c8bf1b67 8534 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7ef49515
QW
8535 struct extent_map *em;
8536 struct btrfs_block_group_cache *bg;
8537 u64 start = 0;
8538 int ret = 0;
8539
8540 while (1) {
c8bf1b67 8541 read_lock(&map_tree->lock);
7ef49515
QW
8542 /*
8543 * lookup_extent_mapping will return the first extent map
8544 * intersecting the range, so setting @len to 1 is enough to
8545 * get the first chunk.
8546 */
c8bf1b67
DS
8547 em = lookup_extent_mapping(map_tree, start, 1);
8548 read_unlock(&map_tree->lock);
7ef49515
QW
8549 if (!em)
8550 break;
8551
8552 bg = btrfs_lookup_block_group(fs_info, em->start);
8553 if (!bg) {
8554 btrfs_err(fs_info,
8555 "chunk start=%llu len=%llu doesn't have corresponding block group",
8556 em->start, em->len);
8557 ret = -EUCLEAN;
8558 free_extent_map(em);
8559 break;
8560 }
8561 if (bg->key.objectid != em->start ||
8562 bg->key.offset != em->len ||
8563 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
8564 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
8565 btrfs_err(fs_info,
8566"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
8567 em->start, em->len,
8568 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
8569 bg->key.objectid, bg->key.offset,
8570 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
8571 ret = -EUCLEAN;
8572 free_extent_map(em);
8573 btrfs_put_block_group(bg);
8574 break;
8575 }
8576 start = em->start + em->len;
8577 free_extent_map(em);
8578 btrfs_put_block_group(bg);
8579 }
8580 return ret;
8581}
8582
5b4aacef 8583int btrfs_read_block_groups(struct btrfs_fs_info *info)
9078a3e1
CM
8584{
8585 struct btrfs_path *path;
8586 int ret;
9078a3e1 8587 struct btrfs_block_group_cache *cache;
6324fbf3 8588 struct btrfs_space_info *space_info;
9078a3e1
CM
8589 struct btrfs_key key;
8590 struct btrfs_key found_key;
5f39d397 8591 struct extent_buffer *leaf;
0af3d00b
JB
8592 int need_clear = 0;
8593 u64 cache_gen;
49303381
LB
8594 u64 feature;
8595 int mixed;
8596
8597 feature = btrfs_super_incompat_flags(info->super_copy);
8598 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
96b5179d 8599
9078a3e1 8600 key.objectid = 0;
0b86a832 8601 key.offset = 0;
962a298f 8602 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9078a3e1
CM
8603 path = btrfs_alloc_path();
8604 if (!path)
8605 return -ENOMEM;
e4058b54 8606 path->reada = READA_FORWARD;
9078a3e1 8607
0b246afa
JM
8608 cache_gen = btrfs_super_cache_generation(info->super_copy);
8609 if (btrfs_test_opt(info, SPACE_CACHE) &&
8610 btrfs_super_generation(info->super_copy) != cache_gen)
0af3d00b 8611 need_clear = 1;
0b246afa 8612 if (btrfs_test_opt(info, CLEAR_CACHE))
88c2ba3b 8613 need_clear = 1;
0af3d00b 8614
d397712b 8615 while (1) {
6bccf3ab 8616 ret = find_first_block_group(info, path, &key);
b742bb82
YZ
8617 if (ret > 0)
8618 break;
0b86a832
CM
8619 if (ret != 0)
8620 goto error;
920e4a58 8621
5f39d397
CM
8622 leaf = path->nodes[0];
8623 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
920e4a58 8624
2ff7e61e 8625 cache = btrfs_create_block_group_cache(info, found_key.objectid,
920e4a58 8626 found_key.offset);
9078a3e1 8627 if (!cache) {
0b86a832 8628 ret = -ENOMEM;
f0486c68 8629 goto error;
9078a3e1 8630 }
96303081 8631
cf7c1ef6
LB
8632 if (need_clear) {
8633 /*
8634 * When we mount with old space cache, we need to
8635 * set BTRFS_DC_CLEAR and set dirty flag.
8636 *
8637 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
8638 * truncate the old free space cache inode and
8639 * setup a new one.
8640 * b) Setting 'dirty flag' makes sure that we flush
8641 * the new space cache info onto disk.
8642 */
0b246afa 8643 if (btrfs_test_opt(info, SPACE_CACHE))
ce93ec54 8644 cache->disk_cache_state = BTRFS_DC_CLEAR;
cf7c1ef6 8645 }
0af3d00b 8646
5f39d397
CM
8647 read_extent_buffer(leaf, &cache->item,
8648 btrfs_item_ptr_offset(leaf, path->slots[0]),
8649 sizeof(cache->item));
920e4a58 8650 cache->flags = btrfs_block_group_flags(&cache->item);
49303381
LB
8651 if (!mixed &&
8652 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
8653 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
8654 btrfs_err(info,
8655"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
8656 cache->key.objectid);
8657 ret = -EINVAL;
8658 goto error;
8659 }
0b86a832 8660
9078a3e1 8661 key.objectid = found_key.objectid + found_key.offset;
b3b4aa74 8662 btrfs_release_path(path);
34d52cb6 8663
3c14874a
JB
8664 /*
8665 * We need to exclude the super stripes now so that the space
8666 * info has super bytes accounted for, otherwise we'll think
8667 * we have more space than we actually do.
8668 */
3c4da657 8669 ret = exclude_super_stripes(cache);
835d974f
JB
8670 if (ret) {
8671 /*
8672 * We may have excluded something, so call this just in
8673 * case.
8674 */
9e715da8 8675 free_excluded_extents(cache);
920e4a58 8676 btrfs_put_block_group(cache);
835d974f
JB
8677 goto error;
8678 }
3c14874a 8679
817d52f8
JB
8680 /*
8681 * check for two cases, either we are full, and therefore
8682 * don't need to bother with the caching work since we won't
8683 * find any space, or we are empty, and we can just add all
52042d8e 8684 * the space in and be done with it. This saves us _a_lot_ of
817d52f8
JB
8685 * time, particularly in the full case.
8686 */
8687 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
11833d66 8688 cache->last_byte_to_unpin = (u64)-1;
817d52f8 8689 cache->cached = BTRFS_CACHE_FINISHED;
9e715da8 8690 free_excluded_extents(cache);
817d52f8 8691 } else if (btrfs_block_group_used(&cache->item) == 0) {
11833d66 8692 cache->last_byte_to_unpin = (u64)-1;
817d52f8 8693 cache->cached = BTRFS_CACHE_FINISHED;
4457c1c7 8694 add_new_free_space(cache, found_key.objectid,
817d52f8
JB
8695 found_key.objectid +
8696 found_key.offset);
9e715da8 8697 free_excluded_extents(cache);
817d52f8 8698 }
96b5179d 8699
0b246afa 8700 ret = btrfs_add_block_group_cache(info, cache);
8c579fe7
JB
8701 if (ret) {
8702 btrfs_remove_free_space_cache(cache);
8703 btrfs_put_block_group(cache);
8704 goto error;
8705 }
8706
0b246afa 8707 trace_btrfs_add_block_group(info, cache, 0);
280c2908
JB
8708 btrfs_update_space_info(info, cache->flags, found_key.offset,
8709 btrfs_block_group_used(&cache->item),
8710 cache->bytes_super, &space_info);
8c579fe7 8711
6324fbf3 8712 cache->space_info = space_info;
1b2da372 8713
c434d21c 8714 link_block_group(cache);
0f9dd46c 8715
0b246afa 8716 set_avail_alloc_bits(info, cache->flags);
2ff7e61e 8717 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
868f401a 8718 inc_block_group_ro(cache, 1);
47ab2a6c 8719 } else if (btrfs_block_group_used(&cache->item) == 0) {
031f24da
QW
8720 ASSERT(list_empty(&cache->bg_list));
8721 btrfs_mark_bg_unused(cache);
47ab2a6c 8722 }
9078a3e1 8723 }
b742bb82 8724
0b246afa 8725 list_for_each_entry_rcu(space_info, &info->space_info, list) {
2ff7e61e 8726 if (!(get_alloc_profile(info, space_info->flags) &
b742bb82 8727 (BTRFS_BLOCK_GROUP_RAID10 |
c7369b3f 8728 BTRFS_BLOCK_GROUP_RAID1_MASK |
a07e8a46 8729 BTRFS_BLOCK_GROUP_RAID56_MASK |
b742bb82
YZ
8730 BTRFS_BLOCK_GROUP_DUP)))
8731 continue;
8732 /*
8733 * avoid allocating from un-mirrored block group if there are
8734 * mirrored block groups.
8735 */
1095cc0d 8736 list_for_each_entry(cache,
8737 &space_info->block_groups[BTRFS_RAID_RAID0],
8738 list)
868f401a 8739 inc_block_group_ro(cache, 1);
1095cc0d 8740 list_for_each_entry(cache,
8741 &space_info->block_groups[BTRFS_RAID_SINGLE],
8742 list)
868f401a 8743 inc_block_group_ro(cache, 1);
9078a3e1 8744 }
f0486c68 8745
75cb379d 8746 btrfs_add_raid_kobjects(info);
67f9c220 8747 btrfs_init_global_block_rsv(info);
7ef49515 8748 ret = check_chunk_block_group_mappings(info);
0b86a832 8749error:
9078a3e1 8750 btrfs_free_path(path);
0b86a832 8751 return ret;
9078a3e1 8752}
6324fbf3 8753
6c686b35 8754void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ea658bad 8755{
6c686b35 8756 struct btrfs_fs_info *fs_info = trans->fs_info;
545e3366 8757 struct btrfs_block_group_cache *block_group;
0b246afa 8758 struct btrfs_root *extent_root = fs_info->extent_root;
ea658bad
JB
8759 struct btrfs_block_group_item item;
8760 struct btrfs_key key;
8761 int ret = 0;
8762
5ce55557
FM
8763 if (!trans->can_flush_pending_bgs)
8764 return;
8765
545e3366
JB
8766 while (!list_empty(&trans->new_bgs)) {
8767 block_group = list_first_entry(&trans->new_bgs,
8768 struct btrfs_block_group_cache,
8769 bg_list);
ea658bad 8770 if (ret)
c92f6be3 8771 goto next;
ea658bad
JB
8772
8773 spin_lock(&block_group->lock);
8774 memcpy(&item, &block_group->item, sizeof(item));
8775 memcpy(&key, &block_group->key, sizeof(key));
8776 spin_unlock(&block_group->lock);
8777
8778 ret = btrfs_insert_item(trans, extent_root, &key, &item,
8779 sizeof(item));
8780 if (ret)
66642832 8781 btrfs_abort_transaction(trans, ret);
97aff912 8782 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
6df9a95e 8783 if (ret)
66642832 8784 btrfs_abort_transaction(trans, ret);
e4e0711c 8785 add_block_group_free_space(trans, block_group);
1e144fb8 8786 /* already aborted the transaction if it failed. */
c92f6be3 8787next:
ba2c4d4e 8788 btrfs_delayed_refs_rsv_release(fs_info, 1);
c92f6be3 8789 list_del_init(&block_group->bg_list);
ea658bad 8790 }
5ce55557 8791 btrfs_trans_release_chunk_metadata(trans);
ea658bad
JB
8792}
8793
e7e02096 8794int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
0174484d 8795 u64 type, u64 chunk_offset, u64 size)
6324fbf3 8796{
e7e02096 8797 struct btrfs_fs_info *fs_info = trans->fs_info;
6324fbf3 8798 struct btrfs_block_group_cache *cache;
0b246afa 8799 int ret;
6324fbf3 8800
90787766 8801 btrfs_set_log_full_commit(trans);
e02119d5 8802
2ff7e61e 8803 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
0f9dd46c
JB
8804 if (!cache)
8805 return -ENOMEM;
34d52cb6 8806
6324fbf3 8807 btrfs_set_block_group_used(&cache->item, bytes_used);
0174484d
NB
8808 btrfs_set_block_group_chunk_objectid(&cache->item,
8809 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
6324fbf3
CM
8810 btrfs_set_block_group_flags(&cache->item, type);
8811
920e4a58 8812 cache->flags = type;
11833d66 8813 cache->last_byte_to_unpin = (u64)-1;
817d52f8 8814 cache->cached = BTRFS_CACHE_FINISHED;
1e144fb8 8815 cache->needs_free_space = 1;
3c4da657 8816 ret = exclude_super_stripes(cache);
835d974f
JB
8817 if (ret) {
8818 /*
8819 * We may have excluded something, so call this just in
8820 * case.
8821 */
9e715da8 8822 free_excluded_extents(cache);
920e4a58 8823 btrfs_put_block_group(cache);
835d974f
JB
8824 return ret;
8825 }
96303081 8826
4457c1c7 8827 add_new_free_space(cache, chunk_offset, chunk_offset + size);
817d52f8 8828
9e715da8 8829 free_excluded_extents(cache);
11833d66 8830
d0bd4560 8831#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 8832 if (btrfs_should_fragment_free_space(cache)) {
d0bd4560
JB
8833 u64 new_bytes_used = size - bytes_used;
8834
8835 bytes_used += new_bytes_used >> 1;
2ff7e61e 8836 fragment_free_space(cache);
d0bd4560
JB
8837 }
8838#endif
2e6e5183 8839 /*
2be12ef7
NB
8840 * Ensure the corresponding space_info object is created and
8841 * assigned to our block group. We want our bg to be added to the rbtree
8842 * with its ->space_info set.
2e6e5183 8843 */
280c2908 8844 cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
dc2d3005 8845 ASSERT(cache->space_info);
2e6e5183 8846
0b246afa 8847 ret = btrfs_add_block_group_cache(fs_info, cache);
8c579fe7
JB
8848 if (ret) {
8849 btrfs_remove_free_space_cache(cache);
8850 btrfs_put_block_group(cache);
8851 return ret;
8852 }
8853
2e6e5183
FM
8854 /*
8855 * Now that our block group has its ->space_info set and is inserted in
8856 * the rbtree, update the space info's counters.
8857 */
0b246afa 8858 trace_btrfs_add_block_group(fs_info, cache, 1);
280c2908 8859 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
e40edf2d 8860 cache->bytes_super, &cache->space_info);
67f9c220 8861 btrfs_update_global_block_rsv(fs_info);
1b2da372 8862
c434d21c 8863 link_block_group(cache);
6324fbf3 8864
47ab2a6c 8865 list_add_tail(&cache->bg_list, &trans->new_bgs);
ba2c4d4e
JB
8866 trans->delayed_ref_updates++;
8867 btrfs_update_delayed_refs_rsv(trans);
6324fbf3 8868
0b246afa 8869 set_avail_alloc_bits(fs_info, type);
6324fbf3
CM
8870 return 0;
8871}
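/*
 * Editor's note (illustrative call, not part of the original source):
 * the chunk allocation path creates the in-memory group right after
 * carving out a new chunk, roughly:
 *
 *	ret = btrfs_make_block_group(trans, 0, type, chunk_offset, size);
 *
 * Only the in-memory state is set up here; the on-disk block group item
 * is inserted later from btrfs_create_pending_block_groups() via the
 * trans->new_bgs list this function appends to.
 */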
1a40e23b 8872
10ea00f5
ID
8873static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
8874{
899c81ea
ID
8875 u64 extra_flags = chunk_to_extended(flags) &
8876 BTRFS_EXTENDED_PROFILE_MASK;
10ea00f5 8877
de98ced9 8878 write_seqlock(&fs_info->profiles_lock);
10ea00f5
ID
8879 if (flags & BTRFS_BLOCK_GROUP_DATA)
8880 fs_info->avail_data_alloc_bits &= ~extra_flags;
8881 if (flags & BTRFS_BLOCK_GROUP_METADATA)
8882 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8883 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8884 fs_info->avail_system_alloc_bits &= ~extra_flags;
de98ced9 8885 write_sequnlock(&fs_info->profiles_lock);
10ea00f5
ID
8886}
8887
6d58a55a
DS
8888/*
8889 * Clear incompat bits for the following feature(s):
8890 *
8891 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
8892 * in the whole filesystem
8893 */
8894static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
8895{
8896 if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
8897 struct list_head *head = &fs_info->space_info;
8898 struct btrfs_space_info *sinfo;
8899
8900 list_for_each_entry_rcu(sinfo, head, list) {
8901 bool found = false;
8902
8903 down_read(&sinfo->groups_sem);
8904 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
8905 found = true;
8906 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
8907 found = true;
8908 up_read(&sinfo->groups_sem);
8909
8910 if (found)
8911 return;
8912 }
8913 btrfs_clear_fs_incompat(fs_info, RAID56);
8914 }
8915}
8916
1a40e23b 8917int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5a98ec01 8918 u64 group_start, struct extent_map *em)
1a40e23b 8919{
5a98ec01 8920 struct btrfs_fs_info *fs_info = trans->fs_info;
6bccf3ab 8921 struct btrfs_root *root = fs_info->extent_root;
1a40e23b
ZY
8922 struct btrfs_path *path;
8923 struct btrfs_block_group_cache *block_group;
44fb5511 8924 struct btrfs_free_cluster *cluster;
0b246afa 8925 struct btrfs_root *tree_root = fs_info->tree_root;
1a40e23b 8926 struct btrfs_key key;
0af3d00b 8927 struct inode *inode;
c1895442 8928 struct kobject *kobj = NULL;
1a40e23b 8929 int ret;
10ea00f5 8930 int index;
89a55897 8931 int factor;
4f69cb98 8932 struct btrfs_caching_control *caching_ctl = NULL;
04216820 8933 bool remove_em;
ba2c4d4e 8934 bool remove_rsv = false;
1a40e23b 8935
6bccf3ab 8936 block_group = btrfs_lookup_block_group(fs_info, group_start);
1a40e23b 8937 BUG_ON(!block_group);
c146afad 8938 BUG_ON(!block_group->ro);
1a40e23b 8939
4ed0a7a3 8940 trace_btrfs_remove_block_group(block_group);
9f7c43c9 8941 /*
8942 * Free the reserved super bytes from this block group before
8943 * remove it.
8944 */
9e715da8 8945 free_excluded_extents(block_group);
fd708b81
JB
8946 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
8947 block_group->key.offset);
9f7c43c9 8948
1a40e23b 8949 memcpy(&key, &block_group->key, sizeof(key));
3e72ee88 8950 index = btrfs_bg_flags_to_raid_index(block_group->flags);
46df06b8 8951 factor = btrfs_bg_type_to_factor(block_group->flags);
1a40e23b 8952
44fb5511 8953 /* make sure this block group isn't part of an allocation cluster */
0b246afa 8954 cluster = &fs_info->data_alloc_cluster;
44fb5511
CM
8955 spin_lock(&cluster->refill_lock);
8956 btrfs_return_cluster_to_free_space(block_group, cluster);
8957 spin_unlock(&cluster->refill_lock);
8958
8959 /*
8960 * make sure this block group isn't part of a metadata
8961 * allocation cluster
8962 */
0b246afa 8963 cluster = &fs_info->meta_alloc_cluster;
44fb5511
CM
8964 spin_lock(&cluster->refill_lock);
8965 btrfs_return_cluster_to_free_space(block_group, cluster);
8966 spin_unlock(&cluster->refill_lock);
8967
1a40e23b 8968 path = btrfs_alloc_path();
d8926bb3
MF
8969 if (!path) {
8970 ret = -ENOMEM;
8971 goto out;
8972 }
1a40e23b 8973
1bbc621e
CM
8974 /*
8975 * get the inode first so any iput calls done for the io_list
8976 * aren't the final iput (no unlinks allowed now)
8977 */
7949f339 8978 inode = lookup_free_space_inode(block_group, path);
1bbc621e
CM
8979
8980 mutex_lock(&trans->transaction->cache_write_mutex);
8981 /*
52042d8e 8982 * Make sure our free space cache IO is done before removing the
1bbc621e
CM
8983 * free space inode
8984 */
8985 spin_lock(&trans->transaction->dirty_bgs_lock);
8986 if (!list_empty(&block_group->io_list)) {
8987 list_del_init(&block_group->io_list);
8988
8989 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
8990
8991 spin_unlock(&trans->transaction->dirty_bgs_lock);
afdb5718 8992 btrfs_wait_cache_io(trans, block_group, path);
1bbc621e
CM
8993 btrfs_put_block_group(block_group);
8994 spin_lock(&trans->transaction->dirty_bgs_lock);
8995 }
8996
8997 if (!list_empty(&block_group->dirty_list)) {
8998 list_del_init(&block_group->dirty_list);
ba2c4d4e 8999 remove_rsv = true;
1bbc621e
CM
9000 btrfs_put_block_group(block_group);
9001 }
9002 spin_unlock(&trans->transaction->dirty_bgs_lock);
9003 mutex_unlock(&trans->transaction->cache_write_mutex);
9004
0af3d00b 9005 if (!IS_ERR(inode)) {
73f2e545 9006 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
79787eaa
JM
9007 if (ret) {
9008 btrfs_add_delayed_iput(inode);
9009 goto out;
9010 }
0af3d00b
JB
9011 clear_nlink(inode);
9012 /* One for the block groups ref */
9013 spin_lock(&block_group->lock);
9014 if (block_group->iref) {
9015 block_group->iref = 0;
9016 block_group->inode = NULL;
9017 spin_unlock(&block_group->lock);
9018 iput(inode);
9019 } else {
9020 spin_unlock(&block_group->lock);
9021 }
9022 /* One for our lookup ref */
455757c3 9023 btrfs_add_delayed_iput(inode);
0af3d00b
JB
9024 }
9025
9026 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9027 key.offset = block_group->key.objectid;
9028 key.type = 0;
9029
9030 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9031 if (ret < 0)
9032 goto out;
9033 if (ret > 0)
b3b4aa74 9034 btrfs_release_path(path);
0af3d00b
JB
9035 if (ret == 0) {
9036 ret = btrfs_del_item(trans, tree_root, path);
9037 if (ret)
9038 goto out;
b3b4aa74 9039 btrfs_release_path(path);
0af3d00b
JB
9040 }
9041
0b246afa 9042 spin_lock(&fs_info->block_group_cache_lock);
1a40e23b 9043 rb_erase(&block_group->cache_node,
0b246afa 9044 &fs_info->block_group_cache_tree);
292cbd51 9045 RB_CLEAR_NODE(&block_group->cache_node);
a1897fdd 9046
0b246afa
JM
9047 if (fs_info->first_logical_byte == block_group->key.objectid)
9048 fs_info->first_logical_byte = (u64)-1;
9049 spin_unlock(&fs_info->block_group_cache_lock);
817d52f8 9050
80eb234a 9051 down_write(&block_group->space_info->groups_sem);
44fb5511
CM
9052 /*
9053 * we must use list_del_init so people can check to see if they
9054 * are still on the list after taking the semaphore
9055 */
9056 list_del_init(&block_group->list);
6ab0a202 9057 if (list_empty(&block_group->space_info->block_groups[index])) {
c1895442
JM
9058 kobj = block_group->space_info->block_group_kobjs[index];
9059 block_group->space_info->block_group_kobjs[index] = NULL;
0b246afa 9060 clear_avail_alloc_bits(fs_info, block_group->flags);
6ab0a202 9061 }
80eb234a 9062 up_write(&block_group->space_info->groups_sem);
6d58a55a 9063 clear_incompat_bg_bits(fs_info, block_group->flags);
c1895442
JM
9064 if (kobj) {
9065 kobject_del(kobj);
9066 kobject_put(kobj);
9067 }
1a40e23b 9068
4f69cb98
FM
9069 if (block_group->has_caching_ctl)
9070 caching_ctl = get_caching_control(block_group);
817d52f8 9071 if (block_group->cached == BTRFS_CACHE_STARTED)
11833d66 9072 wait_block_group_cache_done(block_group);
4f69cb98 9073 if (block_group->has_caching_ctl) {
0b246afa 9074 down_write(&fs_info->commit_root_sem);
4f69cb98
FM
9075 if (!caching_ctl) {
9076 struct btrfs_caching_control *ctl;
9077
9078 list_for_each_entry(ctl,
0b246afa 9079 &fs_info->caching_block_groups, list)
4f69cb98
FM
9080 if (ctl->block_group == block_group) {
9081 caching_ctl = ctl;
1e4f4714 9082 refcount_inc(&caching_ctl->count);
4f69cb98
FM
9083 break;
9084 }
9085 }
9086 if (caching_ctl)
9087 list_del_init(&caching_ctl->list);
0b246afa 9088 up_write(&fs_info->commit_root_sem);
4f69cb98
FM
9089 if (caching_ctl) {
9090 /* Once for the caching bgs list and once for us. */
9091 put_caching_control(caching_ctl);
9092 put_caching_control(caching_ctl);
9093 }
9094 }
817d52f8 9095
ce93ec54 9096 spin_lock(&trans->transaction->dirty_bgs_lock);
9a0ec83d
NB
9097 WARN_ON(!list_empty(&block_group->dirty_list));
9098 WARN_ON(!list_empty(&block_group->io_list));
ce93ec54 9099 spin_unlock(&trans->transaction->dirty_bgs_lock);
9a0ec83d 9100
817d52f8
JB
9101 btrfs_remove_free_space_cache(block_group);
9102
c146afad 9103 spin_lock(&block_group->space_info->lock);
75c68e9f 9104 list_del_init(&block_group->ro_list);
18d018ad 9105
0b246afa 9106 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
18d018ad
ZL
9107 WARN_ON(block_group->space_info->total_bytes
9108 < block_group->key.offset);
9109 WARN_ON(block_group->space_info->bytes_readonly
9110 < block_group->key.offset);
9111 WARN_ON(block_group->space_info->disk_total
9112 < block_group->key.offset * factor);
9113 }
c146afad
YZ
9114 block_group->space_info->total_bytes -= block_group->key.offset;
9115 block_group->space_info->bytes_readonly -= block_group->key.offset;
89a55897 9116 block_group->space_info->disk_total -= block_group->key.offset * factor;
18d018ad 9117
c146afad 9118 spin_unlock(&block_group->space_info->lock);
283bb197 9119
0af3d00b
JB
9120 memcpy(&key, &block_group->key, sizeof(key));
9121
34441361 9122 mutex_lock(&fs_info->chunk_mutex);
04216820
FM
9123 spin_lock(&block_group->lock);
9124 block_group->removed = 1;
9125 /*
9126 * At this point trimming can't start on this block group, because we
9127 * removed the block group from the tree fs_info->block_group_cache_tree
9128 * so no one can find it anymore and even if someone already got this
9129 * block group before we removed it from the rbtree, they have already
9130 * incremented block_group->trimming - if they didn't, they won't find
9131 * any free space entries because we already removed them all when we
9132 * called btrfs_remove_free_space_cache().
9133 *
9134 * And we must not remove the extent map from the fs_info->mapping_tree
9135 * to prevent the same logical address range and physical device space
9136 * ranges from being reused for a new block group. This is because our
9137 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9138 * completely transactionless, so while it is trimming a range the
9139 * currently running transaction might finish and a new one start,
9140 * allowing for new block groups to be created that can reuse the same
9141 * physical device locations unless we take this special care.
e33e17ee
JM
9142 *
9143 * There may also be an implicit trim operation if the file system
9144 * is mounted with -odiscard. The same protections must remain
9145 * in place until the extents have been discarded completely when
9146 * the transaction commit has completed.
04216820
FM
9147 */
9148 remove_em = (atomic_read(&block_group->trimming) == 0);
04216820 9149 spin_unlock(&block_group->lock);
04216820 9150
34441361 9151 mutex_unlock(&fs_info->chunk_mutex);
8dbcd10f 9152
f3f72779 9153 ret = remove_block_group_free_space(trans, block_group);
1e144fb8
OS
9154 if (ret)
9155 goto out;
9156
fa9c0d79
CM
9157 btrfs_put_block_group(block_group);
9158 btrfs_put_block_group(block_group);
1a40e23b
ZY
9159
9160 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9161 if (ret > 0)
9162 ret = -EIO;
9163 if (ret < 0)
9164 goto out;
9165
9166 ret = btrfs_del_item(trans, root, path);
8eaf40c0
FM
9167 if (ret)
9168 goto out;
9169
9170 if (remove_em) {
9171 struct extent_map_tree *em_tree;
9172
c8bf1b67 9173 em_tree = &fs_info->mapping_tree;
8eaf40c0
FM
9174 write_lock(&em_tree->lock);
9175 remove_extent_mapping(em_tree, em);
9176 write_unlock(&em_tree->lock);
9177 /* once for the tree */
9178 free_extent_map(em);
9179 }
1a40e23b 9180out:
ba2c4d4e
JB
9181 if (remove_rsv)
9182 btrfs_delayed_refs_rsv_release(fs_info, 1);
1a40e23b
ZY
9183 btrfs_free_path(path);
9184 return ret;
9185}
acce952b 9186
8eab77ff 9187struct btrfs_trans_handle *
7fd01182
FM
9188btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
9189 const u64 chunk_offset)
8eab77ff 9190{
c8bf1b67 9191 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7fd01182
FM
9192 struct extent_map *em;
9193 struct map_lookup *map;
9194 unsigned int num_items;
9195
9196 read_lock(&em_tree->lock);
9197 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
9198 read_unlock(&em_tree->lock);
9199 ASSERT(em && em->start == chunk_offset);
9200
8eab77ff 9201 /*
7fd01182
FM
9202 * We need to reserve 3 + N units from the metadata space info in order
9203 * to remove a block group (done at btrfs_remove_chunk() and at
9204 * btrfs_remove_block_group()), which are used for:
9205 *
8eab77ff
FM
9206 * 1 unit for adding the free space inode's orphan (located in the tree
9207 * of tree roots).
7fd01182
FM
9208 * 1 unit for deleting the block group item (located in the extent
9209 * tree).
9210 * 1 unit for deleting the free space item (located in tree of tree
9211 * roots).
9212 * N units for deleting N device extent items corresponding to each
9213 * stripe (located in the device tree).
9214 *
9215 * In order to remove a block group we also need to reserve units in the
9216 * system space info in order to update the chunk tree (update one or
9217 * more device items and remove one chunk item), but this is done at
9218 * btrfs_remove_chunk() through a call to check_system_chunk().
8eab77ff 9219 */
95617d69 9220 map = em->map_lookup;
7fd01182
FM
9221 num_items = 3 + map->num_stripes;
9222 free_extent_map(em);
9223
8eab77ff 9224 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
7fd01182 9225 num_items, 1);
8eab77ff
FM
9226}
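/*
 * Editor's note (worked example): removing a block group whose chunk maps
 * to two stripes (e.g. RAID1 across two devices) reserves
 * num_items = 3 + 2 = 5 metadata units: the free space inode orphan, the
 * block group item, the free space item, and one device extent item per
 * stripe.
 */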
9227
47ab2a6c
JB
9228/*
9229 * Process the unused_bgs list and remove any that don't have any allocated
9230 * space inside of them.
9231 */
9232void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9233{
9234 struct btrfs_block_group_cache *block_group;
9235 struct btrfs_space_info *space_info;
47ab2a6c
JB
9236 struct btrfs_trans_handle *trans;
9237 int ret = 0;
9238
afcdd129 9239 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
47ab2a6c
JB
9240 return;
9241
9242 spin_lock(&fs_info->unused_bgs_lock);
9243 while (!list_empty(&fs_info->unused_bgs)) {
9244 u64 start, end;
e33e17ee 9245 int trimming;
47ab2a6c
JB
9246
9247 block_group = list_first_entry(&fs_info->unused_bgs,
9248 struct btrfs_block_group_cache,
9249 bg_list);
47ab2a6c 9250 list_del_init(&block_group->bg_list);
aefbe9a6
ZL
9251
9252 space_info = block_group->space_info;
9253
47ab2a6c
JB
9254 if (ret || btrfs_mixed_space_info(space_info)) {
9255 btrfs_put_block_group(block_group);
9256 continue;
9257 }
9258 spin_unlock(&fs_info->unused_bgs_lock);
9259
d5f2e33b 9260 mutex_lock(&fs_info->delete_unused_bgs_mutex);
67c5e7d4 9261
47ab2a6c
JB
9262 /* Don't want to race with allocators so take the groups_sem */
9263 down_write(&space_info->groups_sem);
9264 spin_lock(&block_group->lock);
43794446 9265 if (block_group->reserved || block_group->pinned ||
47ab2a6c 9266 btrfs_block_group_used(&block_group->item) ||
19c4d2f9 9267 block_group->ro ||
aefbe9a6 9268 list_is_singular(&block_group->list)) {
47ab2a6c
JB
9269 /*
9270 * We want to bail if we made new allocations or have
9271 * outstanding allocations in this block group. We do
9272 * the ro check in case balance is currently acting on
9273 * this block group.
9274 */
4ed0a7a3 9275 trace_btrfs_skip_unused_block_group(block_group);
47ab2a6c
JB
9276 spin_unlock(&block_group->lock);
9277 up_write(&space_info->groups_sem);
9278 goto next;
9279 }
9280 spin_unlock(&block_group->lock);
9281
9282 /* We don't want to force the issue, only flip if it's ok. */
868f401a 9283 ret = inc_block_group_ro(block_group, 0);
47ab2a6c
JB
9284 up_write(&space_info->groups_sem);
9285 if (ret < 0) {
9286 ret = 0;
9287 goto next;
9288 }
9289
9290 /*
9291 * Want to do this before we do anything else so we can recover
9292 * properly if we fail to join the transaction.
9293 */
7fd01182
FM
9294 trans = btrfs_start_trans_remove_block_group(fs_info,
9295 block_group->key.objectid);
47ab2a6c 9296 if (IS_ERR(trans)) {
2ff7e61e 9297 btrfs_dec_block_group_ro(block_group);
47ab2a6c
JB
9298 ret = PTR_ERR(trans);
9299 goto next;
9300 }
9301
9302 /*
9303 * We could have pending pinned extents for this block group,
9304 * just delete them, we don't care about them anymore.
9305 */
9306 start = block_group->key.objectid;
9307 end = start + block_group->key.offset - 1;
d4b450cd
FM
9308 /*
9309 * Hold the unused_bg_unpin_mutex lock to avoid racing with
9310 * btrfs_finish_extent_commit(). If we are at transaction N,
9311 * another task might be running finish_extent_commit() for the
9312 * previous transaction N - 1, and have seen a range belonging
9313 * to the block group in freed_extents[] before we were able to
9314 * clear the whole block group range from freed_extents[]. This
9315 * means that task can lookup for the block group after we
9316 * unpinned it from freed_extents[] and removed it, leading to
9317 * a BUG_ON() at btrfs_unpin_extent_range().
9318 */
9319 mutex_lock(&fs_info->unused_bg_unpin_mutex);
758eb51e 9320 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
91166212 9321 EXTENT_DIRTY);
758eb51e 9322 if (ret) {
d4b450cd 9323 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2ff7e61e 9324 btrfs_dec_block_group_ro(block_group);
758eb51e
FM
9325 goto end_trans;
9326 }
9327 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
91166212 9328 EXTENT_DIRTY);
758eb51e 9329 if (ret) {
d4b450cd 9330 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2ff7e61e 9331 btrfs_dec_block_group_ro(block_group);
758eb51e
FM
9332 goto end_trans;
9333 }
d4b450cd 9334 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
47ab2a6c
JB
9335
9336 /* Reset pinned so btrfs_put_block_group doesn't complain */
c30666d4
ZL
9337 spin_lock(&space_info->lock);
9338 spin_lock(&block_group->lock);
9339
bb96c4e5
JB
9340 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
9341 -block_group->pinned);
c30666d4 9342 space_info->bytes_readonly += block_group->pinned;
dec59fa3
EL
9343 percpu_counter_add_batch(&space_info->total_bytes_pinned,
9344 -block_group->pinned,
9345 BTRFS_TOTAL_BYTES_PINNED_BATCH);
47ab2a6c
JB
9346 block_group->pinned = 0;
9347
c30666d4
ZL
9348 spin_unlock(&block_group->lock);
9349 spin_unlock(&space_info->lock);
9350
e33e17ee 9351 /* DISCARD can flip during remount */
0b246afa 9352 trimming = btrfs_test_opt(fs_info, DISCARD);
e33e17ee
JM
9353
9354 /* Implicit trim during transaction commit. */
9355 if (trimming)
9356 btrfs_get_block_group_trimming(block_group);
9357
47ab2a6c
JB
9358 /*
9359 * btrfs_remove_chunk() will abort the transaction if things go
9360 * horribly wrong.
9361 */
97aff912 9362 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
e33e17ee
JM
9363
9364 if (ret) {
9365 if (trimming)
9366 btrfs_put_block_group_trimming(block_group);
9367 goto end_trans;
9368 }
9369
9370 /*
9371 * If we're not mounted with -odiscard, we can just forget
9372 * about this block group. Otherwise we'll need to wait
9373 * until transaction commit to do the actual discard.
9374 */
9375 if (trimming) {
348a0013
FM
9376 spin_lock(&fs_info->unused_bgs_lock);
9377 /*
9378 * A concurrent scrub might have added us to the list
9379 * fs_info->unused_bgs, so use a list_move operation
9380 * to add the block group to the deleted_bgs list.
9381 */
e33e17ee
JM
9382 list_move(&block_group->bg_list,
9383 &trans->transaction->deleted_bgs);
348a0013 9384 spin_unlock(&fs_info->unused_bgs_lock);
e33e17ee
JM
9385 btrfs_get_block_group(block_group);
9386 }
758eb51e 9387end_trans:
3a45bb20 9388 btrfs_end_transaction(trans);
47ab2a6c 9389next:
d5f2e33b 9390 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
47ab2a6c
JB
9391 btrfs_put_block_group(block_group);
9392 spin_lock(&fs_info->unused_bgs_lock);
9393 }
9394 spin_unlock(&fs_info->unused_bgs_lock);
9395}
9396
2ff7e61e
JM
9397int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
9398 u64 start, u64 end)
acce952b 9399{
2ff7e61e 9400 return unpin_extent_range(fs_info, start, end, false);
acce952b 9401}
9402
499f377f
JM
9403/*
9404 * It used to be that old block groups would be left around forever.
9405 * Iterating over them would be enough to trim unused space. Since we
9406 * now automatically remove them, we also need to iterate over unallocated
9407 * space.
9408 *
9409 * We don't want a transaction for this since the discard may take a
9410 * substantial amount of time. We don't require that a transaction be
9411 * running, but we do need to take a running transaction into account
fee7acc3
JM
9412 * to ensure that we're not discarding chunks that were released or
9413 * allocated in the current transaction.
499f377f
JM
9414 *
9415 * Holding the chunks lock will prevent other threads from allocating
9416 * or releasing chunks, but it won't prevent a running transaction
9417 * from committing and releasing the memory that the pending chunks
9418 * list head uses. For that, we need to take a reference to the
fee7acc3
JM
9419 * transaction and hold the commit root sem. We only need to hold
9420 * it while performing the free space search since we have already
9421 * held back allocations.
499f377f 9422 */
8103d10b 9423static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
499f377f 9424{
8103d10b 9425 u64 start = SZ_1M, len = 0, end = 0;
499f377f
JM
9426 int ret;
9427
9428 *trimmed = 0;
9429
0be88e36
JM
9430 /* Discard not supported = nothing to do. */
9431 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
9432 return 0;
9433
52042d8e 9434 /* Not writable = nothing to do. */
ebbede42 9435 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
499f377f
JM
9436 return 0;
9437
9438 /* No free space = nothing to do. */
9439 if (device->total_bytes <= device->bytes_used)
9440 return 0;
9441
9442 ret = 0;
9443
9444 while (1) {
fb456252 9445 struct btrfs_fs_info *fs_info = device->fs_info;
499f377f
JM
9446 u64 bytes;
9447
9448 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
9449 if (ret)
fee7acc3 9450 break;
499f377f 9451
929be17a
NB
9452 find_first_clear_extent_bit(&device->alloc_state, start,
9453 &start, &end,
9454 CHUNK_TRIMMED | CHUNK_ALLOCATED);
53460a45
NB
9455
9456 /* Ensure we skip the reserved area in the first 1M */
9457 start = max_t(u64, start, SZ_1M);
9458
929be17a
NB
9459 /*
9460 * If find_first_clear_extent_bit find a range that spans the
9461 * end of the device it will set end to -1, in this case it's up
9462 * to the caller to trim the value to the size of the device.
9463 */
9464 end = min(end, device->total_bytes - 1);
53460a45 9465
929be17a 9466 len = end - start + 1;
499f377f 9467
929be17a
NB
9468 /* We didn't find any extents */
9469 if (!len) {
499f377f 9470 mutex_unlock(&fs_info->chunk_mutex);
929be17a 9471 ret = 0;
499f377f
JM
9472 break;
9473 }
9474
929be17a
NB
9475 ret = btrfs_issue_discard(device->bdev, start, len,
9476 &bytes);
9477 if (!ret)
9478 set_extent_bits(&device->alloc_state, start,
9479 start + bytes - 1,
9480 CHUNK_TRIMMED);
499f377f
JM
9481 mutex_unlock(&fs_info->chunk_mutex);
9482
9483 if (ret)
9484 break;
9485
9486 start += len;
9487 *trimmed += bytes;
9488
9489 if (fatal_signal_pending(current)) {
9490 ret = -ERESTARTSYS;
9491 break;
9492 }
9493
9494 cond_resched();
9495 }
9496
9497 return ret;
9498}
9499
93bba24d
QW
9500/*
9501 * Trim the whole filesystem by:
9502 * 1) trimming the free space in each block group
9503 * 2) trimming the unallocated space on each device
9504 *
9505 * This will also continue trimming even if a block group or device encounters
9506 * an error. The return value will be the last error, or 0 if nothing bad
9507 * happens.
9508 */
2ff7e61e 9509int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
f7039b1d 9510{
f7039b1d 9511 struct btrfs_block_group_cache *cache = NULL;
499f377f
JM
9512 struct btrfs_device *device;
9513 struct list_head *devices;
f7039b1d
LD
9514 u64 group_trimmed;
9515 u64 start;
9516 u64 end;
9517 u64 trimmed = 0;
93bba24d
QW
9518 u64 bg_failed = 0;
9519 u64 dev_failed = 0;
9520 int bg_ret = 0;
9521 int dev_ret = 0;
f7039b1d
LD
9522 int ret = 0;
9523
6ba9fc8e 9524 cache = btrfs_lookup_first_block_group(fs_info, range->start);
f87b7eb8 9525 for (; cache; cache = next_block_group(cache)) {
f7039b1d
LD
9526 if (cache->key.objectid >= (range->start + range->len)) {
9527 btrfs_put_block_group(cache);
9528 break;
9529 }
9530
9531 start = max(range->start, cache->key.objectid);
9532 end = min(range->start + range->len,
9533 cache->key.objectid + cache->key.offset);
9534
9535 if (end - start >= range->minlen) {
9536 if (!block_group_cache_done(cache)) {
f6373bf3 9537 ret = cache_block_group(cache, 0);
1be41b78 9538 if (ret) {
93bba24d
QW
9539 bg_failed++;
9540 bg_ret = ret;
9541 continue;
1be41b78
JB
9542 }
9543 ret = wait_block_group_cache_done(cache);
9544 if (ret) {
93bba24d
QW
9545 bg_failed++;
9546 bg_ret = ret;
9547 continue;
1be41b78 9548 }
f7039b1d
LD
9549 }
9550 ret = btrfs_trim_block_group(cache,
9551 &group_trimmed,
9552 start,
9553 end,
9554 range->minlen);
9555
9556 trimmed += group_trimmed;
9557 if (ret) {
93bba24d
QW
9558 bg_failed++;
9559 bg_ret = ret;
9560 continue;
f7039b1d
LD
9561 }
9562 }
f7039b1d
LD
9563 }
9564
93bba24d
QW
9565 if (bg_failed)
9566 btrfs_warn(fs_info,
9567 "failed to trim %llu block group(s), last error %d",
9568 bg_failed, bg_ret);
0b246afa 9569 mutex_lock(&fs_info->fs_devices->device_list_mutex);
d4e329de
JM
9570 devices = &fs_info->fs_devices->devices;
9571 list_for_each_entry(device, devices, dev_list) {
8103d10b 9572 ret = btrfs_trim_free_extents(device, &group_trimmed);
93bba24d
QW
9573 if (ret) {
9574 dev_failed++;
9575 dev_ret = ret;
499f377f 9576 break;
93bba24d 9577 }
499f377f
JM
9578
9579 trimmed += group_trimmed;
9580 }
0b246afa 9581 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
499f377f 9582
93bba24d
QW
9583 if (dev_failed)
9584 btrfs_warn(fs_info,
9585 "failed to trim %llu device(s), last error %d",
9586 dev_failed, dev_ret);
f7039b1d 9587 range->len = trimmed;
93bba24d
QW
9588 if (bg_ret)
9589 return bg_ret;
9590 return dev_ret;
f7039b1d 9591}
8257b2dc
MX
9592
9593/*
ea14b57f 9594 * btrfs_{start,end}_write_no_snapshotting() are similar to
9ea24bbe
FM
9595 * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
9596 * data into the page cache through nocow before the subvolume is snapshotted
9597 * and then flushing it to disk only after the snapshot creation, or to prevent
ea14b57f 9598 * operations while snapshotting is ongoing that would cause the snapshot to be
9ea24bbe 9599 * inconsistent (writes followed by expanding truncates, for example).
8257b2dc 9600 */
ea14b57f 9601void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
8257b2dc
MX
9602{
9603 percpu_counter_dec(&root->subv_writers->counter);
093258e6 9604 cond_wake_up(&root->subv_writers->wait);
8257b2dc
MX
9605}
9606
ea14b57f 9607int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
8257b2dc 9608{
ea14b57f 9609 if (atomic_read(&root->will_be_snapshotted))
8257b2dc
MX
9610 return 0;
9611
9612 percpu_counter_inc(&root->subv_writers->counter);
9613 /*
9614 * Make sure counter is updated before we check for snapshot creation.
9615 */
9616 smp_mb();
ea14b57f
DS
9617 if (atomic_read(&root->will_be_snapshotted)) {
9618 btrfs_end_write_no_snapshotting(root);
8257b2dc
MX
9619 return 0;
9620 }
9621 return 1;
9622}
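/*
 * Editor's note (usage sketch, not part of the original source): callers
 * treat this pair like a conditional read lock around nocow writes. If
 * the start call returns 0 a snapshot is pending and the caller falls
 * back to the normal COW path, roughly:
 *
 *	if (!btrfs_start_write_no_snapshotting(root))
 *		return -EAGAIN;
 *	... do the nocow write ...
 *	btrfs_end_write_no_snapshotting(root);
 */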
0bc19f90 9623
0bc19f90
ZL
9624void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
9625{
9626 while (true) {
9627 int ret;
9628
ea14b57f 9629 ret = btrfs_start_write_no_snapshotting(root);
0bc19f90
ZL
9630 if (ret)
9631 break;
4625956a
PZ
9632 wait_var_event(&root->will_be_snapshotted,
9633 !atomic_read(&root->will_be_snapshotted));
0bc19f90
ZL
9634 }
9635}
031f24da
QW
9636
9637void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
9638{
9639 struct btrfs_fs_info *fs_info = bg->fs_info;
9640
9641 spin_lock(&fs_info->unused_bgs_lock);
9642 if (list_empty(&bg->bg_list)) {
9643 btrfs_get_block_group(bg);
9644 trace_btrfs_add_unused_block_group(bg);
9645 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
9646 }
9647 spin_unlock(&fs_info->unused_bgs_lock);
9648}