fs/btrfs/extent-tree.c (linux-block.git)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"

#undef SCRAMBLE_DELAYED_REFS

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;

	set_extent_bits(&fs_info->excluded_extents, start, end,
			EXTENT_UPTODATE);
	return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->start;
	end = start + cache->length - 1;

	clear_extent_bits(&fs_info->excluded_extents, start, end,
			  EXTENT_UPTODATE);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. The head node may
 * also store the extent flags to set. This way you can check what the
 * reference count and extent flags will be once all of the queued delayed
 * refs are processed, without having to run them.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_root *extent_root;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different.
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	extent_root = btrfs_extent_root(fs_info, bytenr);
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and
			 * try again.
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}
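
/*
 * Usage sketch (illustrative only, mirroring the pattern of existing
 * callers): checking whether a tree block is shared, taking any queued
 * delayed-ref modifications into account:
 *
 *	u64 refs, flags;
 *	int ret;
 *
 *	ret = btrfs_lookup_extent_info(trans, fs_info, eb->start,
 *				       btrfs_header_level(eb), 1,
 *				       &refs, &flags);
 *	if (!ret && refs > 1)
 *		... the block is referenced by more than one tree ...
 */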

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually full back refs are generic, and can
 * be used in all cases where implicit back refs are used. The major
 * shortcoming of full back refs is their overhead: every time a tree
 * block gets COWed, we have to update the back refs entry for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */
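
/*
 * A concrete illustration (example values, not from the original file):
 * a data extent at bytenr 13631488 referenced by inode 257 at file
 * offset 0 in the subvolume whose root objectid is 5 gets an implicit
 * back ref keyed
 *
 *     (13631488, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * If the leaf holding that file extent item later becomes shared (e.g.
 * through relocation), the same extent is instead described by a full
 * back ref keyed by the parent leaf's bytenr, say 30408704:
 *
 *     (13631488, BTRFS_SHARED_DATA_REF_KEY, 30408704)
 */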

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA,  data type is required,
 * is_data == BTRFS_REF_TYPE_ANY,   either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has a parent tree block,
				 * which must be aligned to the sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has a parent tree block,
				 * which must be aligned to the sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info,
		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
		  eb->start, (unsigned long)iref, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}

u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}
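
/*
 * Note (for illustration): the hash above becomes the key offset of
 * BTRFS_EXTENT_DATA_REF_KEY items, and it does not need to be collision
 * free. If two different (root, owner, offset) triples hash to the same
 * value, insert_extent_data_ref() below resolves the collision by
 * bumping key.offset until it finds either the matching ref item or a
 * free slot.
 */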

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;

		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}
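
/*
 * Summarizing the mapping above: tree blocks (owner holds a level, below
 * BTRFS_FIRST_FREE_OBJECTID) get SHARED_BLOCK_REF when a parent is given
 * and TREE_BLOCK_REF otherwise; data extents get SHARED_DATA_REF with a
 * parent and EXTENT_DATA_REF otherwise. "Parent given" means the full
 * back ref variant, otherwise the implicit one is used.
 */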

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * Look for an inline back ref. If the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 * items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->search_for_extension = 1;
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			if (ptr > end) {
				err = -EUCLEAN;
				btrfs_print_leaf(path->nodes[0]);
				btrfs_crit(fs_info,
"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
					   path->slots[0], root_objectid, owner, offset, parent);
			}
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;

			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;

			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add a new inline back ref, we have to make sure that
		 * there is no corresponding back ref item. For simplicity,
		 * we just do not add a new inline back ref if there is any
		 * kind of item for this block.
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		path->search_for_extension = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * Helper to add a new inline back ref.
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;

		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;

		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}
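
/*
 * Resulting item layout, for illustration (example with two inline data
 * refs; not taken from the original file):
 *
 *	struct btrfs_extent_item      { refs, generation, flags }
 *	u8 type = BTRFS_EXTENT_DATA_REF_KEY
 *	struct btrfs_extent_data_ref  { root, objectid, offset, count }
 *	u8 type = BTRFS_SHARED_DATA_REF_KEY
 *	__le64 offset = parent block bytenr
 *	struct btrfs_shared_data_ref  { count }
 *
 * For non-skinny metadata extents a struct btrfs_tree_block_info sits
 * between the extent item and the first inline ref. Inline refs are kept
 * sorted in the same order as standalone back ref items.
 */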

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * Helper to update/remove an inline back ref.
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		/*
		 * We're adding refs to a tree block we already own, this
		 * should not happen at all.
		 */
		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
			btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
				   bytenr, num_bytes, root_objectid);
			if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
				WARN_ON(1);
				btrfs_crit(trans->fs_info,
			"path->slots[0]=%d path->nodes[0]:", path->slots[0]);
				btrfs_print_leaf(path->nodes[0]);
			}
			return -EUCLEAN;
		}
		update_inline_extent_backref(path, iref, refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref)
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
	else if (is_data)
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
	else
		ret = btrfs_del_item(trans, root, path);
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}
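
/*
 * For example (illustrative numbers): with the primary superblock mirror
 * at 64KiB and BTRFS_SUPER_INFO_SIZE of 4KiB, a discard of [0, 128KiB)
 * is issued by the loop above as two ranges, [0, 64KiB) and
 * [68KiB, 128KiB), so the superblock itself is never discarded.
 */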

static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
	struct btrfs_device *dev = stripe->dev;
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 phys = stripe->physical;
	u64 len = stripe->length;
	u64 discarded = 0;
	int ret = 0;

	/* Zone reset on a zoned filesystem */
	if (btrfs_can_zone_reset(dev, phys, len)) {
		u64 src_disc;

		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
		if (ret)
			goto out;

		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
		    dev != dev_replace->srcdev)
			goto out;

		src_disc = discarded;

		/* Send to replace target as well */
		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
					      &discarded);
		discarded += src_disc;
	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
	} else {
		ret = 0;
		*bytes = 0;
	}

out:
	*bytes = discarded;
	return ret;
}

int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;

	/*
	 * Avoid races with device replace and make sure the devices in the
	 * stripes don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_discard_stripe *stripes;
		unsigned int num_stripes;
		int i;

		num_bytes = end - cur;
		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
		if (IS_ERR(stripes)) {
			ret = PTR_ERR(stripes);
			if (ret == -EOPNOTSUPP)
				ret = 0;
			break;
		}

		for (i = 0; i < num_stripes; i++) {
			struct btrfs_discard_stripe *stripe = stripes + i;
			u64 bytes;

			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}

			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
				      &stripe->dev->dev_state))
				continue;

			ret = do_discard_extent(stripe, &bytes);
			if (ret) {
				/*
				 * Keep going if discard is not supported by
				 * the device.
				 */
				if (ret != -EOPNOTSUPP)
					break;
				ret = 0;
			} else {
				discarded_bytes += bytes;
			}
		}
		kfree(stripes);
		if (ret)
			break;
		cur += num_bytes;
	}
	btrfs_bio_counter_dec(fs_info);
	if (actual_bytes)
		*actual_bytes = discarded_bytes;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * The counterpart is in __btrfs_free_extent(), with examples and more
 * details of how it works.
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    the extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the
 *		    case when relocating a shared extent. In that case,
 *		    root_objectid will be BTRFS_TREE_RELOC_OBJECTID.
 *		    Otherwise, parent must be 0.
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the file offset
 *		    this extent belongs to.
 *
 * @refs_to_add:    Number of references to add
 *
 * @extent_op:	    Pointer to a structure, holding information necessary when
 *		    updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/* now insert the actual backref */
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
	}
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}
1493
5d4f98a2 1494static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1495 struct btrfs_delayed_ref_node *node,
1496 struct btrfs_delayed_extent_op *extent_op,
1497 int insert_reserved)
56bec294 1498{
5d4f98a2
YZ
1499 int ret = 0;
1500 struct btrfs_delayed_data_ref *ref;
1501 struct btrfs_key ins;
1502 u64 parent = 0;
1503 u64 ref_root = 0;
1504 u64 flags = 0;
1505
1506 ins.objectid = node->bytenr;
1507 ins.offset = node->num_bytes;
1508 ins.type = BTRFS_EXTENT_ITEM_KEY;
1509
1510 ref = btrfs_delayed_node_to_data_ref(node);
2bf98ef3 1511 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
599c75ec 1512
5d4f98a2
YZ
1513 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1514 parent = ref->parent;
fcebe456 1515 ref_root = ref->root;
5d4f98a2
YZ
1516
1517 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 1518 if (extent_op)
5d4f98a2 1519 flags |= extent_op->flags_to_set;
ef89b824
NB
1520 ret = alloc_reserved_file_extent(trans, parent, ref_root,
1521 flags, ref->objectid,
1522 ref->offset, &ins,
1523 node->ref_mod);
5d4f98a2 1524 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2590d0f1
NB
1525 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1526 ref->objectid, ref->offset,
1527 node->ref_mod, extent_op);
5d4f98a2 1528 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
e72cb923 1529 ret = __btrfs_free_extent(trans, node, parent,
5d4f98a2
YZ
1530 ref_root, ref->objectid,
1531 ref->offset, node->ref_mod,
c682f9b3 1532 extent_op);
5d4f98a2
YZ
1533 } else {
1534 BUG();
1535 }
1536 return ret;
1537}
1538
1539static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1540 struct extent_buffer *leaf,
1541 struct btrfs_extent_item *ei)
1542{
1543 u64 flags = btrfs_extent_flags(leaf, ei);
1544 if (extent_op->update_flags) {
1545 flags |= extent_op->flags_to_set;
1546 btrfs_set_extent_flags(leaf, ei, flags);
1547 }
1548
1549 if (extent_op->update_key) {
1550 struct btrfs_tree_block_info *bi;
1551 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1552 bi = (struct btrfs_tree_block_info *)(ei + 1);
1553 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1554 }
1555}
1556
1557static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
d278850e 1558 struct btrfs_delayed_ref_head *head,
5d4f98a2
YZ
1559 struct btrfs_delayed_extent_op *extent_op)
1560{
20b9a2d6 1561 struct btrfs_fs_info *fs_info = trans->fs_info;
29cbcf40 1562 struct btrfs_root *root;
5d4f98a2
YZ
1563 struct btrfs_key key;
1564 struct btrfs_path *path;
1565 struct btrfs_extent_item *ei;
1566 struct extent_buffer *leaf;
1567 u32 item_size;
56bec294 1568 int ret;
5d4f98a2 1569 int err = 0;
0e3696f8 1570 int metadata = 1;
5d4f98a2 1571
bf31f87f 1572 if (TRANS_ABORTED(trans))
79787eaa
JM
1573 return 0;
1574
0e3696f8 1575 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3173a18f
JB
1576 metadata = 0;
1577
5d4f98a2
YZ
1578 path = btrfs_alloc_path();
1579 if (!path)
1580 return -ENOMEM;
1581
d278850e 1582 key.objectid = head->bytenr;
5d4f98a2 1583
3173a18f 1584 if (metadata) {
3173a18f 1585 key.type = BTRFS_METADATA_ITEM_KEY;
b1c79e09 1586 key.offset = extent_op->level;
3173a18f
JB
1587 } else {
1588 key.type = BTRFS_EXTENT_ITEM_KEY;
d278850e 1589 key.offset = head->num_bytes;
3173a18f
JB
1590 }
1591
29cbcf40 1592 root = btrfs_extent_root(fs_info, key.objectid);
3173a18f 1593again:
29cbcf40 1594 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
5d4f98a2
YZ
1595 if (ret < 0) {
1596 err = ret;
1597 goto out;
1598 }
1599 if (ret > 0) {
3173a18f 1600 if (metadata) {
55994887
FDBM
1601 if (path->slots[0] > 0) {
1602 path->slots[0]--;
1603 btrfs_item_key_to_cpu(path->nodes[0], &key,
1604 path->slots[0]);
d278850e 1605 if (key.objectid == head->bytenr &&
55994887 1606 key.type == BTRFS_EXTENT_ITEM_KEY &&
d278850e 1607 key.offset == head->num_bytes)
55994887
FDBM
1608 ret = 0;
1609 }
1610 if (ret > 0) {
1611 btrfs_release_path(path);
1612 metadata = 0;
3173a18f 1613
d278850e
JB
1614 key.objectid = head->bytenr;
1615 key.offset = head->num_bytes;
55994887
FDBM
1616 key.type = BTRFS_EXTENT_ITEM_KEY;
1617 goto again;
1618 }
1619 } else {
1620 err = -EIO;
1621 goto out;
3173a18f 1622 }
5d4f98a2
YZ
1623 }
1624
1625 leaf = path->nodes[0];
3212fa14 1626 item_size = btrfs_item_size(leaf, path->slots[0]);
ba3c2b19 1627
6d8ff4e4 1628 if (unlikely(item_size < sizeof(*ei))) {
ba3c2b19
NB
1629 err = -EINVAL;
1630 btrfs_print_v0_err(fs_info);
1631 btrfs_abort_transaction(trans, err);
1632 goto out;
1633 }
1634
5d4f98a2
YZ
1635 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1636 __run_delayed_extent_op(extent_op, leaf, ei);
56bec294 1637
5d4f98a2
YZ
1638 btrfs_mark_buffer_dirty(leaf);
1639out:
1640 btrfs_free_path(path);
1641 return err;
56bec294
CM
1642}
1643
5d4f98a2 1644static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1645 struct btrfs_delayed_ref_node *node,
1646 struct btrfs_delayed_extent_op *extent_op,
1647 int insert_reserved)
56bec294
CM
1648{
1649 int ret = 0;
5d4f98a2 1650 struct btrfs_delayed_tree_ref *ref;
5d4f98a2
YZ
1651 u64 parent = 0;
1652 u64 ref_root = 0;
56bec294 1653
5d4f98a2 1654 ref = btrfs_delayed_node_to_tree_ref(node);
f97806f2 1655 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
599c75ec 1656
5d4f98a2
YZ
1657 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1658 parent = ref->parent;
fcebe456 1659 ref_root = ref->root;
5d4f98a2 1660
02794222 1661 if (node->ref_mod != 1) {
f97806f2 1662 btrfs_err(trans->fs_info,
02794222
LB
1663 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
1664 node->bytenr, node->ref_mod, node->action, ref_root,
1665 parent);
1666 return -EIO;
1667 }
5d4f98a2 1668 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 1669 BUG_ON(!extent_op || !extent_op->update_flags);
21ebfbe7 1670 ret = alloc_reserved_tree_block(trans, node, extent_op);
5d4f98a2 1671 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2590d0f1
NB
1672 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1673 ref->level, 0, 1, extent_op);
5d4f98a2 1674 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
e72cb923 1675 ret = __btrfs_free_extent(trans, node, parent, ref_root,
c682f9b3 1676 ref->level, 0, 1, extent_op);
5d4f98a2
YZ
1677 } else {
1678 BUG();
1679 }
56bec294
CM
1680 return ret;
1681}
1682
1683/* helper function to actually process a single delayed ref entry */
5d4f98a2 1684static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1685 struct btrfs_delayed_ref_node *node,
1686 struct btrfs_delayed_extent_op *extent_op,
1687 int insert_reserved)
56bec294 1688{
1689 int ret = 0;
1690
bf31f87f 1691 if (TRANS_ABORTED(trans)) {
857cc2fc 1692 if (insert_reserved)
b25c36f8 1693 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
79787eaa 1694 return 0;
857cc2fc 1695 }
79787eaa 1696
1697 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1698 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
f97806f2 1699 ret = run_delayed_tree_ref(trans, node, extent_op,
1700 insert_reserved);
1701 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1702 node->type == BTRFS_SHARED_DATA_REF_KEY)
2bf98ef3 1703 ret = run_delayed_data_ref(trans, node, extent_op,
1704 insert_reserved);
1705 else
1706 BUG();
80ee54bf 1707 if (ret && insert_reserved)
b25c36f8 1708 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
5d4f98a2 1709 return ret;
1710}
1711
c6fc2454 1712static inline struct btrfs_delayed_ref_node *
1713select_delayed_ref(struct btrfs_delayed_ref_head *head)
1714{
1715 struct btrfs_delayed_ref_node *ref;
1716
e3d03965 1717 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
c6fc2454 1718 return NULL;
d7df2c79 1719
1720 /*
1721 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1722 * This prevents a ref count from going down to zero, which would delete
1723 * the extent item from the extent tree while there are still references
1724 * to add; those adds would then fail to find the extent item.
1725 */
1726 if (!list_empty(&head->ref_add_list))
1727 return list_first_entry(&head->ref_add_list,
1728 struct btrfs_delayed_ref_node, add_list);
1729
e3d03965 1730 ref = rb_entry(rb_first_cached(&head->ref_tree),
0e0adbcf 1731 struct btrfs_delayed_ref_node, ref_node);
1732 ASSERT(list_empty(&ref->add_list));
1733 return ref;
1734}
1735
1736static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1737 struct btrfs_delayed_ref_head *head)
1738{
1739 spin_lock(&delayed_refs->lock);
1740 head->processing = 0;
1741 delayed_refs->num_heads_ready++;
1742 spin_unlock(&delayed_refs->lock);
1743 btrfs_delayed_ref_unlock(head);
1744}
1745
1746static struct btrfs_delayed_extent_op *cleanup_extent_op(
1747 struct btrfs_delayed_ref_head *head)
1748{
1749 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1750
1751 if (!extent_op)
1752 return NULL;
1753
b00e6250 1754 if (head->must_insert_reserved) {
bedc6617 1755 head->extent_op = NULL;
b00e6250 1756 btrfs_free_delayed_extent_op(extent_op);
bedc6617 1757 return NULL;
b00e6250 1758 }
1759 return extent_op;
1760}
1761
1762static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1763 struct btrfs_delayed_ref_head *head)
1764{
1765 struct btrfs_delayed_extent_op *extent_op;
1766 int ret;
1767
1768 extent_op = cleanup_extent_op(head);
1769 if (!extent_op)
1770 return 0;
1771 head->extent_op = NULL;
b00e6250 1772 spin_unlock(&head->lock);
20b9a2d6 1773 ret = run_delayed_extent_op(trans, head, extent_op);
1774 btrfs_free_delayed_extent_op(extent_op);
1775 return ret ? ret : 1;
1776}
1777
1778void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1779 struct btrfs_delayed_ref_root *delayed_refs,
1780 struct btrfs_delayed_ref_head *head)
07c47775 1781{
ba2c4d4e 1782 int nr_items = 1; /* Dropping this ref head update. */
07c47775 1783
1784 /*
1785 * We had csum deletions accounted for in our delayed refs rsv, so we
1786 * need to drop the csum leaves for this update from our delayed_refs_rsv.
1787 */
1788 if (head->total_ref_mod < 0 && head->is_data) {
1789 spin_lock(&delayed_refs->lock);
1790 delayed_refs->pending_csums -= head->num_bytes;
1791 spin_unlock(&delayed_refs->lock);
1792 nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1793 }
1794
ba2c4d4e 1795 btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1796}
1797
194ab0bc 1798static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1799 struct btrfs_delayed_ref_head *head)
1800{
1801
1802 struct btrfs_fs_info *fs_info = trans->fs_info;
1803 struct btrfs_delayed_ref_root *delayed_refs;
1804 int ret;
1805
1806 delayed_refs = &trans->transaction->delayed_refs;
1807
bedc6617 1808 ret = run_and_cleanup_extent_op(trans, head);
1809 if (ret < 0) {
1810 unselect_delayed_ref_head(delayed_refs, head);
1811 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1812 return ret;
1813 } else if (ret) {
1814 return ret;
1815 }
1816
1817 /*
1818 * Need to drop our head ref lock and re-acquire the delayed ref lock
1819 * and then re-check to make sure nobody got added.
1820 */
1821 spin_unlock(&head->lock);
1822 spin_lock(&delayed_refs->lock);
1823 spin_lock(&head->lock);
e3d03965 1824 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1825 spin_unlock(&head->lock);
1826 spin_unlock(&delayed_refs->lock);
1827 return 1;
1828 }
d7baffda 1829 btrfs_delete_ref_head(delayed_refs, head);
c1103f7a 1830 spin_unlock(&head->lock);
1e7a1421 1831 spin_unlock(&delayed_refs->lock);
c1103f7a 1832
c1103f7a 1833 if (head->must_insert_reserved) {
b25c36f8 1834 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
c1103f7a 1835 if (head->is_data) {
1836 struct btrfs_root *csum_root;
1837
1838 csum_root = btrfs_csum_root(fs_info, head->bytenr);
1839 ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1840 head->num_bytes);
1841 }
1842 }
1843
31890da0 1844 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1845
1846 trace_run_delayed_ref_head(fs_info, head, 0);
c1103f7a 1847 btrfs_delayed_ref_unlock(head);
d278850e 1848 btrfs_put_delayed_ref_head(head);
856bd270 1849 return ret;
1850}
1851
1852static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1853 struct btrfs_trans_handle *trans)
1854{
1855 struct btrfs_delayed_ref_root *delayed_refs =
1856 &trans->transaction->delayed_refs;
1857 struct btrfs_delayed_ref_head *head = NULL;
1858 int ret;
1859
1860 spin_lock(&delayed_refs->lock);
5637c74b 1861 head = btrfs_select_ref_head(delayed_refs);
1862 if (!head) {
1863 spin_unlock(&delayed_refs->lock);
1864 return head;
1865 }
1866
1867 /*
1868 * Grab the lock that says we are going to process all the refs for
1869 * this head
1870 */
9e920a6f 1871 ret = btrfs_delayed_ref_lock(delayed_refs, head);
1872 spin_unlock(&delayed_refs->lock);
1873
1874 /*
1875 * We may have dropped the spin lock to get the head mutex lock, and
1876 * that might have given someone else time to free the head. If that's
1877 * true, it has been removed from our list and we can move on.
1878 */
1879 if (ret == -EAGAIN)
1880 head = ERR_PTR(-EAGAIN);
1881
1882 return head;
1883}
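/*
 * Note for callers (descriptive, added for clarity): this helper has three
 * outcomes that must be distinguished, as __btrfs_run_delayed_refs() below
 * does:
 *
 *	NULL              - no more heads to process
 *	ERR_PTR(-EAGAIN)  - the head went away while locking it, retry
 *	anything else     - a locked head, ready for processing
 */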
1884
1885static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1886 struct btrfs_delayed_ref_head *locked_ref,
1887 unsigned long *run_refs)
1888{
1889 struct btrfs_fs_info *fs_info = trans->fs_info;
1890 struct btrfs_delayed_ref_root *delayed_refs;
1891 struct btrfs_delayed_extent_op *extent_op;
1892 struct btrfs_delayed_ref_node *ref;
1893 int must_insert_reserved = 0;
1894 int ret;
1895
1896 delayed_refs = &trans->transaction->delayed_refs;
1897
1898 lockdep_assert_held(&locked_ref->mutex);
1899 lockdep_assert_held(&locked_ref->lock);
1900
1901 while ((ref = select_delayed_ref(locked_ref))) {
1902 if (ref->seq &&
1903 btrfs_check_delayed_seq(fs_info, ref->seq)) {
1904 spin_unlock(&locked_ref->lock);
1905 unselect_delayed_ref_head(delayed_refs, locked_ref);
1906 return -EAGAIN;
1907 }
1908
1909 (*run_refs)++;
1910 ref->in_tree = 0;
1911 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1912 RB_CLEAR_NODE(&ref->ref_node);
1913 if (!list_empty(&ref->add_list))
1914 list_del(&ref->add_list);
1915 /*
1916 * When we play the delayed ref, also correct the ref_mod on
1917 * the head.
1918 */
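		/*
		 * Worked example (illustrative): a head whose ref_mod is +1
		 * because of a single queued ADD ref drops to 0 once that
		 * ADD is played here, since its ref_mod is subtracted from
		 * the head's.
		 */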
1919 switch (ref->action) {
1920 case BTRFS_ADD_DELAYED_REF:
1921 case BTRFS_ADD_DELAYED_EXTENT:
1922 locked_ref->ref_mod -= ref->ref_mod;
1923 break;
1924 case BTRFS_DROP_DELAYED_REF:
1925 locked_ref->ref_mod += ref->ref_mod;
1926 break;
1927 default:
1928 WARN_ON(1);
1929 }
1930 atomic_dec(&delayed_refs->num_entries);
1931
1932 /*
1933 * Record the must_insert_reserved flag before we drop the
1934 * spin lock.
1935 */
1936 must_insert_reserved = locked_ref->must_insert_reserved;
1937 locked_ref->must_insert_reserved = 0;
1938
1939 extent_op = locked_ref->extent_op;
1940 locked_ref->extent_op = NULL;
1941 spin_unlock(&locked_ref->lock);
1942
1943 ret = run_one_delayed_ref(trans, ref, extent_op,
1944 must_insert_reserved);
1945
1946 btrfs_free_delayed_extent_op(extent_op);
1947 if (ret) {
1948 unselect_delayed_ref_head(delayed_refs, locked_ref);
1949 btrfs_put_delayed_ref(ref);
1950 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
1951 ret);
1952 return ret;
1953 }
1954
1955 btrfs_put_delayed_ref(ref);
1956 cond_resched();
1957
1958 spin_lock(&locked_ref->lock);
1959 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
1960 }
1961
1962 return 0;
1963}
1964
1965/*
1966 * Returns 0 on success or if called with an already aborted transaction.
1967 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1968 */
d7df2c79 1969static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
d7df2c79 1970 unsigned long nr)
56bec294 1971{
0a1e458a 1972 struct btrfs_fs_info *fs_info = trans->fs_info;
56bec294 1973 struct btrfs_delayed_ref_root *delayed_refs;
56bec294 1974 struct btrfs_delayed_ref_head *locked_ref = NULL;
0a2b2a84 1975 ktime_t start = ktime_get();
56bec294 1976 int ret;
d7df2c79 1977 unsigned long count = 0;
0a2b2a84 1978 unsigned long actual_count = 0;
1979
1980 delayed_refs = &trans->transaction->delayed_refs;
0110a4c4 1981 do {
56bec294 1982 if (!locked_ref) {
b1cdbcb5 1983 locked_ref = btrfs_obtain_ref_head(trans);
1984 if (IS_ERR_OR_NULL(locked_ref)) {
1985 if (PTR_ERR(locked_ref) == -EAGAIN) {
1986 continue;
1987 } else {
1988 break;
1989 }
56bec294 1990 }
0110a4c4 1991 count++;
56bec294 1992 }
1993 /*
1994 * We need to try and merge add/drops of the same ref since we
1995 * can run into issues with relocate dropping the implicit ref
1996 * and then it being added back again before the drop can
1997 * finish. If we merged anything we need to re-loop so we can
1998 * get a good ref.
1999 * Or we can get node references of the same type that weren't
2000 * merged when created due to bumps in the tree mod seq, and
2001 * we need to merge them to prevent adding an inline extent
2002 * backref before dropping it (triggering a BUG_ON at
2003 * insert_inline_extent_backref()).
2004 */
d7df2c79 2005 spin_lock(&locked_ref->lock);
be97f133 2006 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
ae1e206b 2007
2008 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2009 &actual_count);
2010 if (ret < 0 && ret != -EAGAIN) {
2011 /*
2012 * Error, btrfs_run_delayed_refs_for_head already
2013 * unlocked everything so just bail out
2014 */
2015 return ret;
2016 } else if (!ret) {
2017 /*
2018 * Success, perform the usual cleanup of a processed
2019 * head
2020 */
f9871edd 2021 ret = cleanup_ref_head(trans, locked_ref);
194ab0bc 2022 if (ret > 0) {
2023 /* We dropped our lock, we need to loop. */
2024 ret = 0;
d7df2c79 2025 continue;
2026 } else if (ret) {
2027 return ret;
5d4f98a2 2028 }
22cd2e7d 2029 }
1ce7a5ec 2030
b00e6250 2031 /*
2032 * Either the success case above, or btrfs_run_delayed_refs_for_head
2033 * returned -EAGAIN; either way we need to select another head.
b00e6250 2034 */
b00e6250 2035
0110a4c4 2036 locked_ref = NULL;
c3e69d58 2037 cond_resched();
0110a4c4 2038 } while ((nr != -1 && count < nr) || locked_ref);
2039
2040 /*
2041 * We don't want to include ref heads since we can have empty ref heads
2042 * and those will drastically skew our runtime down since we just do
2043 * accounting, no actual extent tree updates.
2044 */
2045 if (actual_count > 0) {
2046 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2047 u64 avg;
2048
2049 /*
2050 * We weigh the current average higher than our current runtime
2051 * to avoid large swings in the average.
2052 */
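	/*
	 * Equivalently: avg_new = (3 * avg_old + runtime) / 4, an
	 * exponential moving average with weight 1/4 on the new sample.
	 * E.g. avg_old = 1000ns and runtime = 2000ns give
	 * avg_new = 5000 / 4 = 1250ns (illustrative arithmetic).
	 */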
2053 spin_lock(&delayed_refs->lock);
2054 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
f8c269d7 2055 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
2056 spin_unlock(&delayed_refs->lock);
2057 }
d7df2c79 2058 return 0;
2059}
2060
2061#ifdef SCRAMBLE_DELAYED_REFS
2062/*
2063 * Normally delayed refs get processed in ascending bytenr order. This
2064 * correlates in most cases to the order added. To expose dependencies on this
2065 * order, we start to process the tree in the middle instead of the beginning
2066 */
2067static u64 find_middle(struct rb_root *root)
2068{
2069 struct rb_node *n = root->rb_node;
2070 struct btrfs_delayed_ref_node *entry;
2071 int alt = 1;
2072 u64 middle;
2073 u64 first = 0, last = 0;
2074
2075 n = rb_first(root);
2076 if (n) {
2077 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2078 first = entry->bytenr;
2079 }
2080 n = rb_last(root);
2081 if (n) {
2082 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2083 last = entry->bytenr;
2084 }
2085 n = root->rb_node;
2086
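	/*
	 * Descend from the root, alternating between the left and right
	 * child at each level; the last node visited lands near the
	 * middle of the bytenr range (descriptive note).
	 */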
2087 while (n) {
2088 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2089 WARN_ON(!entry->in_tree);
2090
2091 middle = entry->bytenr;
2092
2093 if (alt)
2094 n = n->rb_left;
2095 else
2096 n = n->rb_right;
2097
2098 alt = 1 - alt;
2099 }
2100 return middle;
2101}
2102#endif
2103
2104/*
2105 * This starts processing the delayed reference count updates and
2106 * extent insertions we have queued up so far. count can be 0,
2107 * which means to process everything in the tree at the start of the
2108 * run (but not newly added entries), or it can be some target number
2109 * of entries to process.
2110 *
2111 * Returns 0 on success or if called with an aborted transaction
2112 * Returns <0 on error and aborts the transaction
2113 */
2114int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
c79a70b1 2115 unsigned long count)
c3e69d58 2116{
c79a70b1 2117 struct btrfs_fs_info *fs_info = trans->fs_info;
2118 struct rb_node *node;
2119 struct btrfs_delayed_ref_root *delayed_refs;
c46effa6 2120 struct btrfs_delayed_ref_head *head;
2121 int ret;
2122 int run_all = count == (unsigned long)-1;
c3e69d58 2123
79787eaa 2124 /* We'll clean this up in btrfs_cleanup_transaction */
bf31f87f 2125 if (TRANS_ABORTED(trans))
2126 return 0;
2127
0b246afa 2128 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2129 return 0;
2130
c3e69d58 2131 delayed_refs = &trans->transaction->delayed_refs;
26455d33 2132 if (count == 0)
61a56a99 2133 count = delayed_refs->num_heads_ready;
bb721703 2134
c3e69d58 2135again:
2136#ifdef SCRAMBLE_DELAYED_REFS
2137 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2138#endif
0a1e458a 2139 ret = __btrfs_run_delayed_refs(trans, count);
d7df2c79 2140 if (ret < 0) {
66642832 2141 btrfs_abort_transaction(trans, ret);
d7df2c79 2142 return ret;
eb099670 2143 }
c3e69d58 2144
56bec294 2145 if (run_all) {
119e80df 2146 btrfs_create_pending_block_groups(trans);
ea658bad 2147
d7df2c79 2148 spin_lock(&delayed_refs->lock);
5c9d028b 2149 node = rb_first_cached(&delayed_refs->href_root);
2150 if (!node) {
2151 spin_unlock(&delayed_refs->lock);
56bec294 2152 goto out;
d7df2c79 2153 }
2154 head = rb_entry(node, struct btrfs_delayed_ref_head,
2155 href_node);
2156 refcount_inc(&head->refs);
2157 spin_unlock(&delayed_refs->lock);
e9d0b13b 2158
2159 /* Mutex was contended, block until it's released and retry. */
2160 mutex_lock(&head->mutex);
2161 mutex_unlock(&head->mutex);
56bec294 2162
d278850e 2163 btrfs_put_delayed_ref_head(head);
d7df2c79 2164 cond_resched();
56bec294 2165 goto again;
5f39d397 2166 }
54aa1f4d 2167out:
2168 return 0;
2169}
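/*
 * Illustrative usage (a sketch, not taken from this file): a caller that
 * wants to drain everything queued so far, including heads added while
 * running, passes (unsigned long)-1:
 *
 *	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
 *	if (ret)
 *		return ret;	(the transaction is already aborted)
 *
 * whereas count == 0 only processes what was ready at the start.
 */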
2170
5d4f98a2 2171int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
42c9d0b5 2172 struct extent_buffer *eb, u64 flags,
2fe6a5a1 2173 int level)
2174{
2175 struct btrfs_delayed_extent_op *extent_op;
2176 int ret;
2177
78a6184a 2178 extent_op = btrfs_alloc_delayed_extent_op();
2179 if (!extent_op)
2180 return -ENOMEM;
2181
2182 extent_op->flags_to_set = flags;
2183 extent_op->update_flags = true;
2184 extent_op->update_key = false;
b1c79e09 2185 extent_op->level = level;
5d4f98a2 2186
42c9d0b5 2187 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
5d4f98a2 2188 if (ret)
78a6184a 2189 btrfs_free_delayed_extent_op(extent_op);
2190 return ret;
2191}
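/*
 * Illustrative call (an assumption about typical use, not shown in this
 * file): queueing a flags-only update for a tree block eb could look like
 *
 *	ret = btrfs_set_disk_extent_flags(trans, eb,
 *			BTRFS_BLOCK_FLAG_FULL_BACKREF,
 *			btrfs_header_level(eb));
 *
 * The flag change is applied later, when the delayed extent op runs.
 */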
2192
e4c3b2dc 2193static noinline int check_delayed_ref(struct btrfs_root *root,
2194 struct btrfs_path *path,
2195 u64 objectid, u64 offset, u64 bytenr)
2196{
2197 struct btrfs_delayed_ref_head *head;
2198 struct btrfs_delayed_ref_node *ref;
2199 struct btrfs_delayed_data_ref *data_ref;
2200 struct btrfs_delayed_ref_root *delayed_refs;
e4c3b2dc 2201 struct btrfs_transaction *cur_trans;
0e0adbcf 2202 struct rb_node *node;
2203 int ret = 0;
2204
998ac6d2 2205 spin_lock(&root->fs_info->trans_lock);
e4c3b2dc 2206 cur_trans = root->fs_info->running_transaction;
998ac6d2 2207 if (cur_trans)
2208 refcount_inc(&cur_trans->use_count);
2209 spin_unlock(&root->fs_info->trans_lock);
2210 if (!cur_trans)
2211 return 0;
2212
2213 delayed_refs = &cur_trans->delayed_refs;
5d4f98a2 2214 spin_lock(&delayed_refs->lock);
f72ad18e 2215 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2216 if (!head) {
2217 spin_unlock(&delayed_refs->lock);
998ac6d2 2218 btrfs_put_transaction(cur_trans);
d7df2c79
JB
2219 return 0;
2220 }
2221
2222 if (!mutex_trylock(&head->mutex)) {
d278850e 2223 refcount_inc(&head->refs);
2224 spin_unlock(&delayed_refs->lock);
2225
b3b4aa74 2226 btrfs_release_path(path);
5d4f98a2 2227
2228 /*
2229 * Mutex was contended; block until it's released and let the
2230 * caller try again.
2231 */
2232 mutex_lock(&head->mutex);
2233 mutex_unlock(&head->mutex);
d278850e 2234 btrfs_put_delayed_ref_head(head);
998ac6d2 2235 btrfs_put_transaction(cur_trans);
2236 return -EAGAIN;
2237 }
d7df2c79 2238 spin_unlock(&delayed_refs->lock);
5d4f98a2 2239
d7df2c79 2240 spin_lock(&head->lock);
2241 /*
2242 * XXX: We should replace this with a proper search function in the
2243 * future.
2244 */
2245 for (node = rb_first_cached(&head->ref_tree); node;
2246 node = rb_next(node)) {
0e0adbcf 2247 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2248 /* If it's a shared ref we know a cross reference exists */
2249 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2250 ret = 1;
2251 break;
2252 }
5d4f98a2 2253
d7df2c79 2254 data_ref = btrfs_delayed_node_to_data_ref(ref);
5d4f98a2 2255
2256 /*
2257 * If our ref doesn't match the one we're currently looking at
2258 * then we have a cross reference.
2259 */
2260 if (data_ref->root != root->root_key.objectid ||
2261 data_ref->objectid != objectid ||
2262 data_ref->offset != offset) {
2263 ret = 1;
2264 break;
2265 }
5d4f98a2 2266 }
d7df2c79 2267 spin_unlock(&head->lock);
5d4f98a2 2268 mutex_unlock(&head->mutex);
998ac6d2 2269 btrfs_put_transaction(cur_trans);
2270 return ret;
2271}
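/*
 * Return contract of check_delayed_ref() (descriptive summary): 0 means no
 * cross reference exists among the pending delayed refs, 1 means one was
 * found, and -EAGAIN means the head mutex was contended and the caller
 * should retry, which btrfs_cross_ref_exist() below does in a loop.
 */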
2272
e4c3b2dc 2273static noinline int check_committed_ref(struct btrfs_root *root,
5d4f98a2 2274 struct btrfs_path *path,
2275 u64 objectid, u64 offset, u64 bytenr,
2276 bool strict)
be20aa9d 2277{
0b246afa 2278 struct btrfs_fs_info *fs_info = root->fs_info;
29cbcf40 2279 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
f321e491 2280 struct extent_buffer *leaf;
2281 struct btrfs_extent_data_ref *ref;
2282 struct btrfs_extent_inline_ref *iref;
2283 struct btrfs_extent_item *ei;
f321e491 2284 struct btrfs_key key;
5d4f98a2 2285 u32 item_size;
3de28d57 2286 int type;
be20aa9d 2287 int ret;
925baedd 2288
be20aa9d 2289 key.objectid = bytenr;
31840ae1 2290 key.offset = (u64)-1;
f321e491 2291 key.type = BTRFS_EXTENT_ITEM_KEY;
be20aa9d 2292
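	/*
	 * Descriptive note: key.offset == (u64)-1 cannot match exactly,
	 * so the search lands just past the last item for this bytenr;
	 * stepping back one slot below finds the extent item itself if
	 * it exists.
	 */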
2293 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2294 if (ret < 0)
2295 goto out;
79787eaa 2296 BUG_ON(ret == 0); /* Corruption */
2297
2298 ret = -ENOENT;
2299 if (path->slots[0] == 0)
31840ae1 2300 goto out;
be20aa9d 2301
31840ae1 2302 path->slots[0]--;
f321e491 2303 leaf = path->nodes[0];
5d4f98a2 2304 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
be20aa9d 2305
5d4f98a2 2306 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
be20aa9d 2307 goto out;
f321e491 2308
5d4f98a2 2309 ret = 1;
3212fa14 2310 item_size = btrfs_item_size(leaf, path->slots[0]);
5d4f98a2 2311 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
bd09835d 2312
a6bd9cd1 2313 /* If extent item has more than 1 inline ref then it's shared */
2314 if (item_size != sizeof(*ei) +
2315 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2316 goto out;
be20aa9d 2317
2318 /*
2319 * If the extent was created before the last snapshot, it's shared unless
2320 * the snapshot has been deleted. Use the heuristic if strict is false.
2321 */
2322 if (!strict &&
2323 (btrfs_extent_generation(leaf, ei) <=
2324 btrfs_root_last_snapshot(&root->root_item)))
2325 goto out;
2326
2327 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3de28d57 2328
a6bd9cd1 2329 /* If this extent has SHARED_DATA_REF then it's shared */
2330 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2331 if (type != BTRFS_EXTENT_DATA_REF_KEY)
2332 goto out;
2333
2334 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2335 if (btrfs_extent_refs(leaf, ei) !=
2336 btrfs_extent_data_ref_count(leaf, ref) ||
2337 btrfs_extent_data_ref_root(leaf, ref) !=
2338 root->root_key.objectid ||
2339 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2340 btrfs_extent_data_ref_offset(leaf, ref) != offset)
2341 goto out;
2342
2343 ret = 0;
2344out:
2345 return ret;
2346}
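/*
 * Descriptive summary of the checks above: the extent is treated as not
 * shared only when the extent item carries exactly one inline
 * EXTENT_DATA_REF, that ref matches our root/objectid/offset and its count
 * matches the extent's ref count, and (unless @strict) the extent was
 * created after the last snapshot.
 */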
2347
e4c3b2dc 2348int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
1a89f173 2349 u64 bytenr, bool strict, struct btrfs_path *path)
5d4f98a2 2350{
5d4f98a2 2351 int ret;
5d4f98a2 2352
5d4f98a2 2353 do {
e4c3b2dc 2354 ret = check_committed_ref(root, path, objectid,
a84d5d42 2355 offset, bytenr, strict);
5d4f98a2 2356 if (ret && ret != -ENOENT)
f321e491 2357 goto out;
80ff3856 2358
2359 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2360 } while (ret == -EAGAIN);
5d4f98a2 2361
be20aa9d 2362out:
1a89f173 2363 btrfs_release_path(path);
37f00a6d 2364 if (btrfs_is_data_reloc_root(root))
f0486c68 2365 WARN_ON(ret > 0);
f321e491 2366 return ret;
be20aa9d 2367}
c5739bba 2368
5d4f98a2 2369static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
b7a9f29f 2370 struct btrfs_root *root,
5d4f98a2 2371 struct extent_buffer *buf,
e339a6b0 2372 int full_backref, int inc)
31840ae1 2373{
0b246afa 2374 struct btrfs_fs_info *fs_info = root->fs_info;
31840ae1 2375 u64 bytenr;
2376 u64 num_bytes;
2377 u64 parent;
31840ae1 2378 u64 ref_root;
31840ae1 2379 u32 nritems;
2380 struct btrfs_key key;
2381 struct btrfs_file_extent_item *fi;
2382 struct btrfs_ref generic_ref = { 0 };
2383 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
31840ae1 2384 int i;
82fa113f 2385 int action;
2386 int level;
2387 int ret = 0;
fccb84c9 2388
0b246afa 2389 if (btrfs_is_testing(fs_info))
faa2dbf0 2390 return 0;
fccb84c9 2391
31840ae1 2392 ref_root = btrfs_header_owner(buf);
2393 nritems = btrfs_header_nritems(buf);
2394 level = btrfs_header_level(buf);
2395
92a7cc42 2396 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
5d4f98a2 2397 return 0;
31840ae1 2398
2399 if (full_backref)
2400 parent = buf->start;
2401 else
2402 parent = 0;
2403 if (inc)
2404 action = BTRFS_ADD_DELAYED_REF;
2405 else
2406 action = BTRFS_DROP_DELAYED_REF;
2407
2408 for (i = 0; i < nritems; i++) {
31840ae1 2409 if (level == 0) {
5d4f98a2 2410 btrfs_item_key_to_cpu(buf, &key, i);
962a298f 2411 if (key.type != BTRFS_EXTENT_DATA_KEY)
31840ae1 2412 continue;
5d4f98a2 2413 fi = btrfs_item_ptr(buf, i,
2414 struct btrfs_file_extent_item);
2415 if (btrfs_file_extent_type(buf, fi) ==
2416 BTRFS_FILE_EXTENT_INLINE)
2417 continue;
2418 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2419 if (bytenr == 0)
2420 continue;
2421
2422 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2423 key.offset -= btrfs_file_extent_offset(buf, fi);
2424 btrfs_init_generic_ref(&generic_ref, action, bytenr,
2425 num_bytes, parent);
82fa113f 2426 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2427 key.offset, root->root_key.objectid,
2428 for_reloc);
dd28b6a5 2429 if (inc)
82fa113f 2430 ret = btrfs_inc_extent_ref(trans, &generic_ref);
dd28b6a5 2431 else
ffd4bb2a 2432 ret = btrfs_free_extent(trans, &generic_ref);
2433 if (ret)
2434 goto fail;
2435 } else {
5d4f98a2 2436 bytenr = btrfs_node_blockptr(buf, i);
0b246afa 2437 num_bytes = fs_info->nodesize;
2438 btrfs_init_generic_ref(&generic_ref, action, bytenr,
2439 num_bytes, parent);
2440 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
2441 root->root_key.objectid, for_reloc);
dd28b6a5 2442 if (inc)
82fa113f 2443 ret = btrfs_inc_extent_ref(trans, &generic_ref);
dd28b6a5 2444 else
ffd4bb2a 2445 ret = btrfs_free_extent(trans, &generic_ref);
2446 if (ret)
2447 goto fail;
2448 }
2449 }
2450 return 0;
2451fail:
2452 return ret;
2453}
2454
2455int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 2456 struct extent_buffer *buf, int full_backref)
5d4f98a2 2457{
e339a6b0 2458 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2459}
2460
2461int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 2462 struct extent_buffer *buf, int full_backref)
5d4f98a2 2463{
e339a6b0 2464 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2465}
2466
1b86826d 2467static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
9ed74f2d 2468{
0b246afa 2469 struct btrfs_fs_info *fs_info = root->fs_info;
b742bb82 2470 u64 flags;
53b381b3 2471 u64 ret;
9ed74f2d 2472
2473 if (data)
2474 flags = BTRFS_BLOCK_GROUP_DATA;
0b246afa 2475 else if (root == fs_info->chunk_root)
b742bb82 2476 flags = BTRFS_BLOCK_GROUP_SYSTEM;
9ed74f2d 2477 else
b742bb82 2478 flags = BTRFS_BLOCK_GROUP_METADATA;
9ed74f2d 2479
878d7b67 2480 ret = btrfs_get_alloc_profile(fs_info, flags);
53b381b3 2481 return ret;
6a63209f 2482}
9ed74f2d 2483
0eb997bf 2484static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
a061fc8d 2485{
2486 struct rb_node *leftmost;
2487 u64 bytenr = 0;
a1897fdd 2488
16b0c258 2489 read_lock(&fs_info->block_group_cache_lock);
0eb997bf 2490 /* Get the block group with the lowest logical start address. */
2491 leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2492 if (leftmost) {
2493 struct btrfs_block_group *bg;
0f9dd46c 2494
2495 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2496 bytenr = bg->start;
2497 }
16b0c258 2498 read_unlock(&fs_info->block_group_cache_lock);
2499
2500 return bytenr;
2501}
2502
2503static int pin_down_extent(struct btrfs_trans_handle *trans,
2504 struct btrfs_block_group *cache,
f0486c68 2505 u64 bytenr, u64 num_bytes, int reserved)
324ae4df 2506{
2507 struct btrfs_fs_info *fs_info = cache->fs_info;
2508
2509 spin_lock(&cache->space_info->lock);
2510 spin_lock(&cache->lock);
2511 cache->pinned += num_bytes;
2512 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2513 num_bytes);
2514 if (reserved) {
2515 cache->reserved -= num_bytes;
2516 cache->space_info->bytes_reserved -= num_bytes;
2517 }
2518 spin_unlock(&cache->lock);
2519 spin_unlock(&cache->space_info->lock);
68b38550 2520
fe119a6e 2521 set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
2522 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
2523 return 0;
2524}
68b38550 2525
b25c36f8 2526int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2527 u64 bytenr, u64 num_bytes, int reserved)
2528{
32da5386 2529 struct btrfs_block_group *cache;
68b38550 2530
b25c36f8 2531 cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
79787eaa 2532 BUG_ON(!cache); /* Logic error */
f0486c68 2533
6690d071 2534 pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2535
2536 btrfs_put_block_group(cache);
2537 return 0;
2538}
2539
f0486c68 2540/*
2541 * This function must be called within a transaction.
2542 */
9fce5704 2543int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2544 u64 bytenr, u64 num_bytes)
2545{
32da5386 2546 struct btrfs_block_group *cache;
b50c6e25 2547 int ret;
e688b725 2548
9fce5704 2549 cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2550 if (!cache)
2551 return -EINVAL;
2552
2553 /*
2554 * Fully cache the free space first so that our pin removes the free space
2555 * from the cache.
e688b725 2556 */
ced8ecf0 2557 ret = btrfs_cache_block_group(cache, true);
2558 if (ret)
2559 goto out;
e688b725 2560
6690d071 2561 pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2562
2563 /* remove us from the free space cache (if we're there at all) */
b50c6e25 2564 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
9ad6d91f 2565out:
e688b725 2566 btrfs_put_block_group(cache);
b50c6e25 2567 return ret;
2568}
2569
2570static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2571 u64 start, u64 num_bytes)
2572{
2573 int ret;
32da5386 2574 struct btrfs_block_group *block_group;
8c2a1a30 2575
0b246afa 2576 block_group = btrfs_lookup_block_group(fs_info, start);
2577 if (!block_group)
2578 return -EINVAL;
2579
ced8ecf0 2580 ret = btrfs_cache_block_group(block_group, true);
2581 if (ret)
2582 goto out;
8c2a1a30 2583
2584 ret = btrfs_remove_free_space(block_group, start, num_bytes);
2585out:
2586 btrfs_put_block_group(block_group);
2587 return ret;
2588}
2589
bcdc428c 2590int btrfs_exclude_logged_extents(struct extent_buffer *eb)
8c2a1a30 2591{
bcdc428c 2592 struct btrfs_fs_info *fs_info = eb->fs_info;
2593 struct btrfs_file_extent_item *item;
2594 struct btrfs_key key;
2595 int found_type;
2596 int i;
b89311ef 2597 int ret = 0;
8c2a1a30 2598
2ff7e61e 2599 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
8c2a1a30
JB
2600 return 0;
2601
2602 for (i = 0; i < btrfs_header_nritems(eb); i++) {
2603 btrfs_item_key_to_cpu(eb, &key, i);
2604 if (key.type != BTRFS_EXTENT_DATA_KEY)
2605 continue;
2606 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2607 found_type = btrfs_file_extent_type(eb, item);
2608 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2609 continue;
2610 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2611 continue;
2612 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2613 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
b89311ef
GJ
2614 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2615 if (ret)
2616 break;
2617 }
2618
b89311ef 2619 return ret;
2620}
2621
9cfa3e34 2622static void
32da5386 2623btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2624{
2625 atomic_inc(&bg->reservations);
2626}
2627
2628/*
2629 * Returns the free cluster for the given space info and sets empty_cluster to
2630 * what it should be based on the mount options.
2631 */
2632static struct btrfs_free_cluster *
2633fetch_cluster_info(struct btrfs_fs_info *fs_info,
2634 struct btrfs_space_info *space_info, u64 *empty_cluster)
2635{
2636 struct btrfs_free_cluster *ret = NULL;
2637
2638 *empty_cluster = 0;
2639 if (btrfs_mixed_space_info(space_info))
2640 return ret;
2641
c759c4e1 2642 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
0b246afa 2643 ret = &fs_info->meta_alloc_cluster;
2644 if (btrfs_test_opt(fs_info, SSD))
2645 *empty_cluster = SZ_2M;
2646 else
ee22184b 2647 *empty_cluster = SZ_64K;
2648 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2649 btrfs_test_opt(fs_info, SSD_SPREAD)) {
2650 *empty_cluster = SZ_2M;
0b246afa 2651 ret = &fs_info->data_alloc_cluster;
2652 }
2653
2654 return ret;
2655}
2656
2657static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2658 u64 start, u64 end,
678886bd 2659 const bool return_free_space)
ccd467d6 2660{
32da5386 2661 struct btrfs_block_group *cache = NULL;
2662 struct btrfs_space_info *space_info;
2663 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
c759c4e1 2664 struct btrfs_free_cluster *cluster = NULL;
11833d66 2665 u64 len;
2666 u64 total_unpinned = 0;
2667 u64 empty_cluster = 0;
7b398f8e 2668 bool readonly;
ccd467d6 2669
11833d66 2670 while (start <= end) {
7b398f8e 2671 readonly = false;
11833d66 2672 if (!cache ||
b3470b5d 2673 start >= cache->start + cache->length) {
2674 if (cache)
2675 btrfs_put_block_group(cache);
c759c4e1 2676 total_unpinned = 0;
11833d66 2677 cache = btrfs_lookup_block_group(fs_info, start);
79787eaa 2678 BUG_ON(!cache); /* Logic error */
c759c4e1 2679
2ff7e61e 2680 cluster = fetch_cluster_info(fs_info,
2681 cache->space_info,
2682 &empty_cluster);
2683 empty_cluster <<= 1;
2684 }
2685
b3470b5d 2686 len = cache->start + cache->length - start;
2687 len = min(len, end + 1 - start);
2688
2689 if (return_free_space)
2690 btrfs_add_free_space(cache, start, len);
11833d66 2691
f0486c68 2692 start += len;
c759c4e1 2693 total_unpinned += len;
7b398f8e 2694 space_info = cache->space_info;
f0486c68 2695
2696 /*
2697 * If this space cluster has been marked as fragmented and we've
2698 * unpinned enough in this block group to potentially allow a
2699 * cluster to be created inside of it go ahead and clear the
2700 * fragmented check.
2701 */
2702 if (cluster && cluster->fragmented &&
2703 total_unpinned > empty_cluster) {
2704 spin_lock(&cluster->lock);
2705 cluster->fragmented = 0;
2706 spin_unlock(&cluster->lock);
2707 }
2708
7b398f8e 2709 spin_lock(&space_info->lock);
2710 spin_lock(&cache->lock);
2711 cache->pinned -= len;
bb96c4e5 2712 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
4f4db217 2713 space_info->max_extent_size = 0;
2714 if (cache->ro) {
2715 space_info->bytes_readonly += len;
2716 readonly = true;
2717 } else if (btrfs_is_zoned(fs_info)) {
2718 /* Need reset before reusing in a zoned block group */
2719 space_info->bytes_zone_unusable += len;
2720 readonly = true;
7b398f8e 2721 }
11833d66 2722 spin_unlock(&cache->lock);
2723 if (!readonly && return_free_space &&
2724 global_rsv->space_info == space_info) {
2725 spin_lock(&global_rsv->lock);
2726 if (!global_rsv->full) {
2727 u64 to_add = min(len, global_rsv->size -
2728 global_rsv->reserved);
2729
957780eb 2730 global_rsv->reserved += to_add;
2731 btrfs_space_info_update_bytes_may_use(fs_info,
2732 space_info, to_add);
2733 if (global_rsv->reserved >= global_rsv->size)
2734 global_rsv->full = 1;
957780eb 2735 len -= to_add;
2736 }
2737 spin_unlock(&global_rsv->lock);
2738 }
2739 /* Add to any tickets we may have */
2740 if (!readonly && return_free_space && len)
2741 btrfs_try_granting_tickets(fs_info, space_info);
7b398f8e 2742 spin_unlock(&space_info->lock);
ccd467d6 2743 }
2744
2745 if (cache)
2746 btrfs_put_block_group(cache);
2747 return 0;
2748}
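/*
 * Descriptive summary of the unpin path above: each unpinned range is
 * returned to the free space cache (when requested), readonly and zoned
 * space is accounted separately, the global block reserve is topped up
 * first, and only then are waiting tickets granted.
 */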
2749
5ead2dd0 2750int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
a28ec197 2751{
5ead2dd0 2752 struct btrfs_fs_info *fs_info = trans->fs_info;
32da5386 2753 struct btrfs_block_group *block_group, *tmp;
e33e17ee 2754 struct list_head *deleted_bgs;
11833d66 2755 struct extent_io_tree *unpin;
2756 u64 start;
2757 u64 end;
a28ec197 2758 int ret;
a28ec197 2759
fe119a6e 2760 unpin = &trans->transaction->pinned_extents;
11833d66 2761
bf31f87f 2762 while (!TRANS_ABORTED(trans)) {
2763 struct extent_state *cached_state = NULL;
2764
d4b450cd 2765 mutex_lock(&fs_info->unused_bg_unpin_mutex);
1a5bc167 2766 ret = find_first_extent_bit(unpin, 0, &start, &end,
0e6ec385 2767 EXTENT_DIRTY, &cached_state);
2768 if (ret) {
2769 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
a28ec197 2770 break;
d4b450cd 2771 }
1f3c79a2 2772
46b27f50 2773 if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2ff7e61e 2774 ret = btrfs_discard_extent(fs_info, start,
5378e607 2775 end + 1 - start, NULL);
1f3c79a2 2776
0e6ec385 2777 clear_extent_dirty(unpin, start, end, &cached_state);
2ff7e61e 2778 unpin_extent_range(fs_info, start, end, true);
d4b450cd 2779 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
0e6ec385 2780 free_extent_state(cached_state);
b9473439 2781 cond_resched();
a28ec197 2782 }
817d52f8 2783
2784 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2785 btrfs_discard_calc_delay(&fs_info->discard_ctl);
b0643e59 2786 btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
a2309300 2787 }
b0643e59 2788
2789 /*
2790 * Transaction is finished. We don't need the lock anymore. We
2791 * do need to clean up the block groups in case of a transaction
2792 * abort.
2793 */
2794 deleted_bgs = &trans->transaction->deleted_bgs;
2795 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2796 u64 trimmed = 0;
2797
2798 ret = -EROFS;
bf31f87f 2799 if (!TRANS_ABORTED(trans))
2ff7e61e 2800 ret = btrfs_discard_extent(fs_info,
2801 block_group->start,
2802 block_group->length,
2803 &trimmed);
2804
2805 list_del_init(&block_group->bg_list);
6b7304af 2806 btrfs_unfreeze_block_group(block_group);
2807 btrfs_put_block_group(block_group);
2808
2809 if (ret) {
2810 const char *errstr = btrfs_decode_error(ret);
2811 btrfs_warn(fs_info,
913e1535 2812 "discard failed while removing blockgroup: errno=%d %s",
2813 ret, errstr);
2814 }
2815 }
2816
2817 return 0;
2818}
2819
2820static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2821 u64 bytenr, u64 num_bytes, bool is_data)
2822{
2823 int ret;
2824
2825 if (is_data) {
2826 struct btrfs_root *csum_root;
2827
2828 csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2829 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2830 if (ret) {
2831 btrfs_abort_transaction(trans, ret);
2832 return ret;
2833 }
2834 }
2835
2836 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2837 if (ret) {
2838 btrfs_abort_transaction(trans, ret);
2839 return ret;
2840 }
2841
2842 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2843 if (ret)
2844 btrfs_abort_transaction(trans, ret);
2845
2846 return ret;
2847}
2848
2849/*
2850 * Drop one or more refs of @node.
2851 *
2852 * 1. Locate the extent refs.
2853 * It's either inline in the EXTENT/METADATA_ITEM or in a keyed SHARED_* item.
2854 * Locate it, then reduce the refs number or remove the ref line completely.
2855 *
2856 * 2. Update the refs count in EXTENT/METADATA_ITEM
2857 *
2858 * Inline backref case:
2859 *
2860 * in extent tree we have:
2861 *
2862 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2863 * refs 2 gen 6 flags DATA
2864 * extent data backref root FS_TREE objectid 258 offset 0 count 1
2865 * extent data backref root FS_TREE objectid 257 offset 0 count 1
2866 *
2867 * This function gets called with:
2868 *
2869 * node->bytenr = 13631488
2870 * node->num_bytes = 1048576
2871 * root_objectid = FS_TREE
2872 * owner_objectid = 257
2873 * owner_offset = 0
2874 * refs_to_drop = 1
2875 *
2876 * Then we should get something like:
2877 *
2878 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2879 * refs 1 gen 6 flags DATA
2880 * extent data backref root FS_TREE objectid 258 offset 0 count 1
2881 *
2882 * Keyed backref case:
2883 *
2884 * in extent tree we have:
2885 *
2886 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2887 * refs 754 gen 6 flags DATA
2888 * [...]
2889 * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2890 * extent data backref root FS_TREE objectid 866 offset 0 count 1
2891 *
2892 * This function gets called with:
2893 *
2894 * node->bytenr = 13631488
2895 * node->num_bytes = 1048576
2896 * root_objectid = FS_TREE
2897 * owner_objectid = 866
2898 * owner_offset = 0
2899 * refs_to_drop = 1
2900 *
2901 * Then we should get something like:
2902 *
2903 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2904 * refs 753 gen 6 flags DATA
2905 *
2906 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2907 */
5d4f98a2 2908static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2909 struct btrfs_delayed_ref_node *node, u64 parent,
2910 u64 root_objectid, u64 owner_objectid,
2911 u64 owner_offset, int refs_to_drop,
2912 struct btrfs_delayed_extent_op *extent_op)
a28ec197 2913{
e72cb923 2914 struct btrfs_fs_info *info = trans->fs_info;
e2fa7227 2915 struct btrfs_key key;
5d4f98a2 2916 struct btrfs_path *path;
29cbcf40 2917 struct btrfs_root *extent_root;
5f39d397 2918 struct extent_buffer *leaf;
2919 struct btrfs_extent_item *ei;
2920 struct btrfs_extent_inline_ref *iref;
a28ec197 2921 int ret;
5d4f98a2 2922 int is_data;
2923 int extent_slot = 0;
2924 int found_extent = 0;
2925 int num_to_del = 1;
2926 u32 item_size;
2927 u64 refs;
2928 u64 bytenr = node->bytenr;
2929 u64 num_bytes = node->num_bytes;
0b246afa 2930 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
037e6390 2931
29cbcf40 2932 extent_root = btrfs_extent_root(info, bytenr);
abed4aaa 2933 ASSERT(extent_root);
29cbcf40 2934
5caf2a00 2935 path = btrfs_alloc_path();
2936 if (!path)
2937 return -ENOMEM;
5f26f772 2938
5d4f98a2 2939 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2940
2941 if (!is_data && refs_to_drop != 1) {
2942 btrfs_crit(info,
2943"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2944 node->bytenr, refs_to_drop);
2945 ret = -EINVAL;
2946 btrfs_abort_transaction(trans, ret);
2947 goto out;
2948 }
5d4f98a2 2949
3173a18f 2950 if (is_data)
897ca819 2951 skinny_metadata = false;
3173a18f 2952
2953 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
2954 parent, root_objectid, owner_objectid,
5d4f98a2 2955 owner_offset);
7bb86316 2956 if (ret == 0) {
2957 /*
2958 * Either the inline backref or the SHARED_DATA_REF/
2959 * SHARED_BLOCK_REF is found.
2960 *
2961 * Here is a quick path to locate the EXTENT/METADATA_ITEM:
2962 * it's possible the EXTENT/METADATA_ITEM is near the current slot.
2963 */
952fccac 2964 extent_slot = path->slots[0];
2965 while (extent_slot >= 0) {
2966 btrfs_item_key_to_cpu(path->nodes[0], &key,
952fccac 2967 extent_slot);
5d4f98a2 2968 if (key.objectid != bytenr)
952fccac 2969 break;
2970 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
2971 key.offset == num_bytes) {
952fccac
CM
2972 found_extent = 1;
2973 break;
2974 }
2975 if (key.type == BTRFS_METADATA_ITEM_KEY &&
2976 key.offset == owner_objectid) {
2977 found_extent = 1;
2978 break;
2979 }
2980
2981 /* Quick path didn't find the EXTENT/METADATA_ITEM */
952fccac
CM
2982 if (path->slots[0] - extent_slot > 5)
2983 break;
5d4f98a2 2984 extent_slot--;
952fccac 2985 }
a79865c6 2986
31840ae1 2987 if (!found_extent) {
2988 if (iref) {
2989 btrfs_crit(info,
2990"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
2991 btrfs_abort_transaction(trans, -EUCLEAN);
2992 goto err_dump;
2993 }
2994 /* Must be SHARED_* item, remove the backref first */
76d76e78 2995 ret = remove_extent_backref(trans, extent_root, path,
5b2a54bb 2996 NULL, refs_to_drop, is_data);
005d6427 2997 if (ret) {
66642832 2998 btrfs_abort_transaction(trans, ret);
2999 goto out;
3000 }
b3b4aa74 3001 btrfs_release_path(path);
5d4f98a2 3002
1c2a07f5 3003 /* Slow path to locate EXTENT/METADATA_ITEM */
3004 key.objectid = bytenr;
3005 key.type = BTRFS_EXTENT_ITEM_KEY;
3006 key.offset = num_bytes;
3007
3008 if (!is_data && skinny_metadata) {
3009 key.type = BTRFS_METADATA_ITEM_KEY;
3010 key.offset = owner_objectid;
3011 }
3012
3013 ret = btrfs_search_slot(trans, extent_root,
3014 &key, path, -1, 1);
3015 if (ret > 0 && skinny_metadata && path->slots[0]) {
3016 /*
3017 * Couldn't find our skinny metadata item,
3018 * see if we have ye olde extent item.
3019 */
3020 path->slots[0]--;
3021 btrfs_item_key_to_cpu(path->nodes[0], &key,
3022 path->slots[0]);
3023 if (key.objectid == bytenr &&
3024 key.type == BTRFS_EXTENT_ITEM_KEY &&
3025 key.offset == num_bytes)
3026 ret = 0;
3027 }
3028
3029 if (ret > 0 && skinny_metadata) {
3030 skinny_metadata = false;
9ce49a0b 3031 key.objectid = bytenr;
3032 key.type = BTRFS_EXTENT_ITEM_KEY;
3033 key.offset = num_bytes;
3034 btrfs_release_path(path);
3035 ret = btrfs_search_slot(trans, extent_root,
3036 &key, path, -1, 1);
3037 }
3038
f3465ca4 3039 if (ret) {
3040 btrfs_err(info,
3041 "umm, got %d back from search, was looking for %llu",
3042 ret, bytenr);
b783e62d 3043 if (ret > 0)
a4f78750 3044 btrfs_print_leaf(path->nodes[0]);
f3465ca4 3045 }
005d6427 3046 if (ret < 0) {
66642832 3047 btrfs_abort_transaction(trans, ret);
3048 goto out;
3049 }
3050 extent_slot = path->slots[0];
3051 }
fae7f21c 3052 } else if (WARN_ON(ret == -ENOENT)) {
a4f78750 3053 btrfs_print_leaf(path->nodes[0]);
3054 btrfs_err(info,
3055 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
3056 bytenr, parent, root_objectid, owner_objectid,
3057 owner_offset);
66642832 3058 btrfs_abort_transaction(trans, ret);
c4a050bb 3059 goto out;
79787eaa 3060 } else {
66642832 3061 btrfs_abort_transaction(trans, ret);
005d6427 3062 goto out;
7bb86316 3063 }
3064
3065 leaf = path->nodes[0];
3212fa14 3066 item_size = btrfs_item_size(leaf, extent_slot);
6d8ff4e4 3067 if (unlikely(item_size < sizeof(*ei))) {
3068 ret = -EINVAL;
3069 btrfs_print_v0_err(info);
3070 btrfs_abort_transaction(trans, ret);
3071 goto out;
3072 }
952fccac 3073 ei = btrfs_item_ptr(leaf, extent_slot,
123abc88 3074 struct btrfs_extent_item);
3075 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3076 key.type == BTRFS_EXTENT_ITEM_KEY) {
5d4f98a2 3077 struct btrfs_tree_block_info *bi;
3078 if (item_size < sizeof(*ei) + sizeof(*bi)) {
3079 btrfs_crit(info,
cad69d13 3080"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3081 key.objectid, key.type, key.offset,
3082 owner_objectid, item_size,
3083 sizeof(*ei) + sizeof(*bi));
3084 btrfs_abort_transaction(trans, -EUCLEAN);
3085 goto err_dump;
3086 }
3087 bi = (struct btrfs_tree_block_info *)(ei + 1);
3088 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3089 }
56bec294 3090
5d4f98a2 3091 refs = btrfs_extent_refs(leaf, ei);
32b02538 3092 if (refs < refs_to_drop) {
3093 btrfs_crit(info,
3094 "trying to drop %d refs but we only have %llu for bytenr %llu",
5d163e0e 3095 refs_to_drop, refs, bytenr);
3096 btrfs_abort_transaction(trans, -EUCLEAN);
3097 goto err_dump;
32b02538 3098 }
56bec294 3099 refs -= refs_to_drop;
5f39d397 3100
3101 if (refs > 0) {
3102 if (extent_op)
3103 __run_delayed_extent_op(extent_op, leaf, ei);
3104 /*
3105 * In the case of inline back ref, reference count will
3106 * be updated by remove_extent_backref
952fccac 3107 */
5d4f98a2 3108 if (iref) {
3109 if (!found_extent) {
3110 btrfs_crit(info,
3111"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3112 btrfs_abort_transaction(trans, -EUCLEAN);
3113 goto err_dump;
3114 }
3115 } else {
3116 btrfs_set_extent_refs(leaf, ei, refs);
3117 btrfs_mark_buffer_dirty(leaf);
3118 }
3119 if (found_extent) {
76d76e78 3120 ret = remove_extent_backref(trans, extent_root, path,
5b2a54bb 3121 iref, refs_to_drop, is_data);
005d6427 3122 if (ret) {
66642832 3123 btrfs_abort_transaction(trans, ret);
005d6427
DS
3124 goto out;
3125 }
952fccac 3126 }
5d4f98a2 3127 } else {
1c2a07f5 3128 /* In this branch refs == 1 */
5d4f98a2 3129 if (found_extent) {
3130 if (is_data && refs_to_drop !=
3131 extent_data_ref_count(path, iref)) {
3132 btrfs_crit(info,
3133 "invalid refs_to_drop, current refs %u refs_to_drop %u",
3134 extent_data_ref_count(path, iref),
3135 refs_to_drop);
3136 btrfs_abort_transaction(trans, -EUCLEAN);
3137 goto err_dump;
3138 }
5d4f98a2 3139 if (iref) {
3140 if (path->slots[0] != extent_slot) {
3141 btrfs_crit(info,
3142"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3143 key.objectid, key.type,
3144 key.offset);
3145 btrfs_abort_transaction(trans, -EUCLEAN);
3146 goto err_dump;
3147 }
5d4f98a2 3148 } else {
3149 /*
3150 * No inline ref; we must be at a SHARED_* item,
3151 * and since it's a single ref, the layout must be:
3152 * | extent_slot ||extent_slot + 1|
3153 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3154 */
3155 if (path->slots[0] != extent_slot + 1) {
3156 btrfs_crit(info,
3157 "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3158 btrfs_abort_transaction(trans, -EUCLEAN);
3159 goto err_dump;
3160 }
3161 path->slots[0] = extent_slot;
3162 num_to_del = 2;
3163 }
78fae27e 3164 }
b9473439 3165
3166 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3167 num_to_del);
005d6427 3168 if (ret) {
66642832 3169 btrfs_abort_transaction(trans, ret);
3170 goto out;
3171 }
b3b4aa74 3172 btrfs_release_path(path);
21af804c 3173
8f8aa4c7 3174 ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
a28ec197 3175 }
3176 btrfs_release_path(path);
3177
79787eaa 3178out:
5caf2a00 3179 btrfs_free_path(path);
a28ec197 3180 return ret;
3181err_dump:
3182 /*
3183 * Leaf dump can take up a lot of log buffer, so we only do a full leaf
3184 * dump for debug builds.
3185 */
3186 if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3187 btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3188 path->slots[0], extent_slot);
3189 btrfs_print_leaf(path->nodes[0]);
3190 }
3191
3192 btrfs_free_path(path);
3193 return -EUCLEAN;
3194}
3195
1887be66 3196/*
f0486c68 3197 * When we free a block, it is possible (and likely) that we free the last
3198 * delayed ref for that extent as well. This searches the delayed ref tree for
3199 * a given extent, and if there are no other delayed refs to be processed, it
3200 * removes it from the tree.
3201 */
3202static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2ff7e61e 3203 u64 bytenr)
3204{
3205 struct btrfs_delayed_ref_head *head;
3206 struct btrfs_delayed_ref_root *delayed_refs;
f0486c68 3207 int ret = 0;
3208
3209 delayed_refs = &trans->transaction->delayed_refs;
3210 spin_lock(&delayed_refs->lock);
f72ad18e 3211 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
1887be66 3212 if (!head)
cf93da7b 3213 goto out_delayed_unlock;
1887be66 3214
d7df2c79 3215 spin_lock(&head->lock);
e3d03965 3216 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3217 goto out;
3218
3219 if (cleanup_extent_op(head) != NULL)
3220 goto out;
5d4f98a2 3221
3222 /*
3223 * Waiting for the lock here would deadlock. If someone else has it
3224 * locked, they are already in the process of dropping it anyway.
3225 */
3226 if (!mutex_trylock(&head->mutex))
3227 goto out;
3228
d7baffda 3229 btrfs_delete_ref_head(delayed_refs, head);
d7df2c79 3230 head->processing = 0;
d7baffda 3231
d7df2c79 3232 spin_unlock(&head->lock);
3233 spin_unlock(&delayed_refs->lock);
3234
3235 BUG_ON(head->extent_op);
3236 if (head->must_insert_reserved)
3237 ret = 1;
3238
31890da0 3239 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
f0486c68 3240 mutex_unlock(&head->mutex);
d278850e 3241 btrfs_put_delayed_ref_head(head);
f0486c68 3242 return ret;
1887be66 3243out:
d7df2c79 3244 spin_unlock(&head->lock);
3245
3246out_delayed_unlock:
3247 spin_unlock(&delayed_refs->lock);
3248 return 0;
3249}
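/*
 * Descriptive note: a return of 1 means the last delayed ref head was
 * removed here and the space was never committed to the extent tree
 * (must_insert_reserved), so the caller can free the block directly;
 * 0 means delayed refs remain and normal processing will handle them.
 */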
3250
f0486c68 3251void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7a163608 3252 u64 root_id,
f0486c68 3253 struct extent_buffer *buf,
5581a51a 3254 u64 parent, int last_ref)
f0486c68 3255{
7a163608 3256 struct btrfs_fs_info *fs_info = trans->fs_info;
ed4f255b 3257 struct btrfs_ref generic_ref = { 0 };
3258 int ret;
3259
3260 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3261 buf->start, buf->len, parent);
3262 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
7a163608 3263 root_id, 0, false);
ed4f255b 3264
7a163608 3265 if (root_id != BTRFS_TREE_LOG_OBJECTID) {
8a5040f7 3266 btrfs_ref_tree_mod(fs_info, &generic_ref);
2187374f 3267 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
79787eaa 3268 BUG_ON(ret); /* -ENOMEM */
3269 }
3270
0a16c7d7 3271 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
32da5386 3272 struct btrfs_block_group *cache;
485df755 3273 bool must_pin = false;
6219872d 3274
7a163608 3275 if (root_id != BTRFS_TREE_LOG_OBJECTID) {
2ff7e61e 3276 ret = check_ref_cleanup(trans, buf->start);
3277 if (!ret) {
3278 btrfs_redirty_list_add(trans->transaction, buf);
37be25bc 3279 goto out;
d3575156 3280 }
3281 }
3282
0b246afa 3283 cache = btrfs_lookup_block_group(fs_info, buf->start);
6219872d 3284
f0486c68 3285 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6690d071 3286 pin_down_extent(trans, cache, buf->start, buf->len, 1);
6219872d 3287 btrfs_put_block_group(cache);
37be25bc 3288 goto out;
3289 }
3290
3291 /*
3292 * If this is a leaf and there are tree mod log users, we may
3293 * have recorded mod log operations that point to this leaf.
3294 * So we must make sure no one reuses this leaf's extent before
3295 * mod log operations are applied to a node, otherwise after
3296 * rewinding a node using the mod log operations we get an
3297 * inconsistent btree, as the leaf's extent may now be used as
3298 * a node or leaf for another different btree.
3299 * We are safe from races here because at this point no other
3300 * node or root points to this extent buffer, so if after this
3301 * check a new tree mod log user joins, it will not be able to
3302 * find a node pointing to this leaf and record operations that
3303 * point to this leaf.
3304 */
3305 if (btrfs_header_level(buf) == 0 &&
3306 test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
3307 must_pin = true;
3308
3309 if (must_pin || btrfs_is_zoned(fs_info)) {
3310 btrfs_redirty_list_add(trans->transaction, buf);
3311 pin_down_extent(trans, cache, buf->start, buf->len, 1);
3312 btrfs_put_block_group(cache);
3313 goto out;
3314 }
3315
3316 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3317
3318 btrfs_add_free_space(cache, buf->start, buf->len);
4824f1f4 3319 btrfs_free_reserved_bytes(cache, buf->len, 0);
6219872d 3320 btrfs_put_block_group(cache);
71ff6437 3321 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3322 }
3323out:
3324 if (last_ref) {
3325 /*
3326 * Deleting the buffer, clear the corrupt flag since it doesn't
3327 * matter anymore.
3328 */
3329 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3330 }
3331}
3332
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	if (btrfs_is_testing(fs_info))
		return 0;

	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
	if ((ref->type == BTRFS_REF_METADATA &&
	     ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
	    (ref->type == BTRFS_REF_DATA &&
	     ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
		/* unlocks the pinned mutex */
		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
		ret = 0;
	} else if (ref->type == BTRFS_REF_METADATA) {
		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
	} else {
		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
	}

	if (!((ref->type == BTRFS_REF_METADATA &&
	       ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
	      (ref->type == BTRFS_REF_DATA &&
	       ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
		btrfs_ref_tree_mod(fs_info, ref);

	return ret;
}

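/*
 * Stages of the free extent search, from cheapest to most expensive.
 * find_free_extent_update_loop() advances to the next stage whenever the
 * current one found nothing.
 */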
enum btrfs_loop_type {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

static inline void
btrfs_lock_block_group(struct btrfs_block_group *cache,
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
					  int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

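/*
 * Lock the block group that currently owns @cluster and return it with
 * cluster->refill_lock held.  A reference (and, for delalloc, the group's
 * data_rwsem) is taken when the owner differs from @block_group; the loop
 * retries if the cluster moves to another group while we sleep on the
 * rwsem.
 */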
static struct btrfs_block_group *btrfs_lock_cluster(
		   struct btrfs_block_group *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
	__acquires(&cluster->refill_lock)
{
	struct btrfs_block_group *used_bg = NULL;

	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
			return used_bg;

		btrfs_get_block_group(used_bg);

		if (!delalloc)
			return used_bg;

		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;

		spin_unlock(&cluster->refill_lock);

		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}

static inline void
btrfs_release_block_group(struct btrfs_block_group *cache,
			  int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

enum btrfs_extent_allocation_policy {
	BTRFS_EXTENT_ALLOC_CLUSTERED,
	BTRFS_EXTENT_ALLOC_ZONED,
};

/*
 * Structure used internally by find_free_extent(), wrapping the needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 ram_bytes;
	u64 num_bytes;
	u64 min_alloc_size;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;
	struct btrfs_free_cluster *last_ptr;
	bool use_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* Allocation is called for tree-log */
	bool for_treelog;

	/* Allocation is called for data relocation */
	bool for_data_reloc;

	/* RAID index, converted from flags */
	int index;

	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;

	/* Hint where to start looking for an empty space */
	u64 hint_byte;

	/* Allocation policy */
	enum btrfs_extent_allocation_policy policy;
};

/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return -EAGAIN to inform caller that we need to re-search this block group.
 * Return >0 to inform caller that we found nothing.
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
				      struct find_free_extent_ctl *ffe_ctl,
				      struct btrfs_block_group **cluster_bg_ret)
{
	struct btrfs_block_group *cluster_bg;
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->start,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find.  If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}

/*
 * Return >0 to inform caller that we found nothing.
 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
 * Return -EAGAIN to inform caller that we need to re-search this block group.
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
					struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to setup a cluster again until we get more
	 * space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

static int do_allocation_clustered(struct btrfs_block_group *block_group,
				   struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_block_group **bg_ret)
{
	int ret;

	/* We want to try and use the cluster allocator, so let's look there */
	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
		if (ret >= 0 || ret == -EAGAIN)
			return ret;
		/* ret == -ENOENT case falls through */
	}

	return find_free_extent_unclustered(block_group, ffe_ctl);
}

/*
 * Tree-log block group locking
 * ============================
 *
 * fs_info::treelog_bg_lock protects fs_info::treelog_bg, which holds the
 * start address of the block group that is reserved exclusively for
 * tree-log metadata.
 *
 * Lock nesting
 * ============
 *
 * space_info::lock
 *   block_group::lock
 *     fs_info::treelog_bg_lock
 */

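/*
 * A minimal sketch of that nesting as used by do_allocation_zoned()
 * below (released in the reverse order):
 *
 *	spin_lock(&space_info->lock);
 *	spin_lock(&block_group->lock);
 *	spin_lock(&fs_info->treelog_bg_lock);
 *	spin_lock(&fs_info->relocation_bg_lock);
 *	...
 *	spin_unlock(&fs_info->relocation_bg_lock);
 *	spin_unlock(&fs_info->treelog_bg_lock);
 *	spin_unlock(&block_group->lock);
 *	spin_unlock(&space_info->lock);
 */
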
/*
 * Simple allocator for sequential-only block groups.  It only allows
 * sequential allocation, so there is no need to play with trees.  This
 * function also reserves the bytes as in btrfs_add_reserved_bytes.
 */
static int do_allocation_zoned(struct btrfs_block_group *block_group,
			       struct find_free_extent_ctl *ffe_ctl,
			       struct btrfs_block_group **bg_ret)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_space_info *space_info = block_group->space_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 start = block_group->start;
	u64 num_bytes = ffe_ctl->num_bytes;
	u64 avail;
	u64 bytenr = block_group->start;
	u64 log_bytenr;
	u64 data_reloc_bytenr;
	int ret = 0;
	bool skip = false;

	ASSERT(btrfs_is_zoned(block_group->fs_info));

	/*
	 * Do not allow non-tree-log blocks in the dedicated tree-log block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->treelog_bg_lock);
	log_bytenr = fs_info->treelog_bg;
	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
		skip = true;
	spin_unlock(&fs_info->treelog_bg_lock);
	if (skip)
		return 1;

	/*
	 * Do not allow non-relocation blocks in the dedicated relocation block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->relocation_bg_lock);
	data_reloc_bytenr = fs_info->data_reloc_bg;
	if (data_reloc_bytenr &&
	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
		skip = true;
	spin_unlock(&fs_info->relocation_bg_lock);
	if (skip)
		return 1;

	/* Check RO and no space case before trying to activate it */
	spin_lock(&block_group->lock);
	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
		ret = 1;
		/*
		 * May need to clear fs_info->{treelog,data_reloc}_bg.
		 * Return the error after taking the locks.
		 */
	}
	spin_unlock(&block_group->lock);

	if (!ret && !btrfs_zone_activate(block_group)) {
		ret = 1;
		/*
		 * May need to clear fs_info->{treelog,data_reloc}_bg.
		 * Return the error after taking the locks.
		 */
	}

	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	spin_lock(&fs_info->treelog_bg_lock);
	spin_lock(&fs_info->relocation_bg_lock);

	if (ret)
		goto out;

	ASSERT(!ffe_ctl->for_treelog ||
	       block_group->start == fs_info->treelog_bg ||
	       fs_info->treelog_bg == 0);
	ASSERT(!ffe_ctl->for_data_reloc ||
	       block_group->start == fs_info->data_reloc_bg ||
	       fs_info->data_reloc_bg == 0);

	if (block_group->ro ||
	    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow a currently used block group to become the dedicated
	 * tree-log block group.
	 */
	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow a currently used block group to become the dedicated
	 * data relocation block group.
	 */
	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
	avail = block_group->zone_capacity - block_group->alloc_offset;
	if (avail < num_bytes) {
		if (ffe_ctl->max_extent_size < avail) {
			/*
			 * With sequential allocator, free space is always
			 * contiguous
			 */
			ffe_ctl->max_extent_size = avail;
			ffe_ctl->total_free_space = avail;
		}
		ret = 1;
		goto out;
	}

	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
		fs_info->treelog_bg = block_group->start;

	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
		fs_info->data_reloc_bg = block_group->start;

	ffe_ctl->found_offset = start + block_group->alloc_offset;
	block_group->alloc_offset += num_bytes;
	spin_lock(&ctl->tree_lock);
	ctl->free_space -= num_bytes;
	spin_unlock(&ctl->tree_lock);

	/*
	 * We do not check if found_offset is aligned to stripesize.  The
	 * address is anyway rewritten when using zone append writing.
	 */

	ffe_ctl->search_start = ffe_ctl->found_offset;

out:
	if (ret && ffe_ctl->for_treelog)
		fs_info->treelog_bg = 0;
	if (ret && ffe_ctl->for_data_reloc &&
	    fs_info->data_reloc_bg == block_group->start) {
		/*
		 * Do not allow further allocations from this block group.
		 * Compared to increasing the ->ro, setting the
		 * ->zoned_data_reloc_ongoing flag still allows nocow
		 * writers to come in.  See btrfs_inc_nocow_writers().
		 *
		 * We need to disable allocations here to avoid allocating a
		 * regular (non-relocation data) extent.  With a mix of
		 * relocation extents and regular extents, we can dispatch
		 * WRITE commands (for relocation extents) and ZONE APPEND
		 * commands (for regular extents) at the same time to the same
		 * zone, which easily breaks the write pointer.
		 */
		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
		fs_info->data_reloc_bg = 0;
	}
	spin_unlock(&fs_info->relocation_bg_lock);
	spin_unlock(&fs_info->treelog_bg_lock);
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);
	return ret;
}

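/* Dispatch to the allocator matching the current allocation policy. */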
static int do_allocation(struct btrfs_block_group *block_group,
			 struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_block_group **bg_ret)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
	case BTRFS_EXTENT_ALLOC_ZONED:
		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
	default:
		BUG();
	}
}

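/*
 * Per-policy cleanup when we are done with a block group: clear the
 * clustered retry state and drop the reference taken when the group was
 * grabbed for the search.
 */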
static void release_block_group(struct btrfs_block_group *block_group,
				struct find_free_extent_ctl *ffe_ctl,
				int delalloc)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		ffe_ctl->retry_clustered = false;
		ffe_ctl->retry_unclustered = false;
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}

	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
	       ffe_ctl->index);
	btrfs_release_block_group(block_group, delalloc);
}

static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_key *ins)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

	if (!ffe_ctl->use_cluster && last_ptr) {
		spin_lock(&last_ptr->lock);
		last_ptr->window_start = ins->objectid;
		spin_unlock(&last_ptr->lock);
	}
}

static void found_extent(struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		found_extent_clustered(ffe_ctl, ins);
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}
}

static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
				    struct find_free_extent_ctl *ffe_ctl)
{
	/* If we can activate a new zone, just allocate a chunk and use it */
	if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
		return 0;

	/*
	 * We already reached the max active zones.  Try to finish one block
	 * group to make room for a new block group.  This is only possible
	 * for a data block group because btrfs_zone_finish() may need to wait
	 * for a running transaction, which can cause a deadlock for metadata
	 * allocation.
	 */
	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
		int ret = btrfs_zone_finish_one_bg(fs_info);

		if (ret == 1)
			return 0;
		else if (ret < 0)
			return ret;
	}

	/*
	 * If we have enough free space left in an already active block group
	 * and we can't activate any other zone now, do not allow allocating a
	 * new chunk and let find_free_extent() retry with a smaller size.
	 */
	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
		return -ENOSPC;

	/*
	 * Not even min_alloc_size is left in any block group.  Since we cannot
	 * activate a new block group, allocating one may not help.  Let's tell
	 * the caller to try again and hope it makes progress by writing out
	 * some parts of the region.  That is only possible for data block
	 * groups, where a part of the region can be written.
	 */
	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
		return -EAGAIN;

	/*
	 * We cannot activate a new block group and there is not enough space
	 * left in any block group, so allocating a new block group may not
	 * help.  But there is nothing else to do anyway, so let's go with it.
	 */
	return 0;
}

static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
			      struct find_free_extent_ctl *ffe_ctl)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return 0;
	case BTRFS_EXTENT_ALLOC_ZONED:
		return can_allocate_chunk_zoned(fs_info, ffe_ctl);
	default:
		BUG();
	}
}

static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		/*
		 * If we can't allocate a new chunk we've already looped through
		 * at least once, move on to the NO_EMPTY_SIZE case.
		 */
		ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
		return 0;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Give up here */
		return -ENOSPC;
	default:
		BUG();
	}
}

/*
 * Return >0 means caller needs to re-search for a free extent.
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					bool full_search)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (ins->objectid) {
		found_extent(ffe_ctl, ins);
		return 0;
	}

	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
		return 1;

	ffe_ctl->index++;
	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
		return 1;

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			/* Check if the allocation policy allows creating a new chunk */
			ret = can_allocate_chunk(fs_info, ffe_ctl);
			if (ret)
				return ret;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE_FOR_EXTENT);

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret == -ENOSPC)
				ret = chunk_allocation_failed(ffe_ctl);
			else if (ret < 0)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
				return -ENOSPC;

			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}

static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
					struct find_free_extent_ctl *ffe_ctl,
					struct btrfs_space_info *space_info,
					struct btrfs_key *ins)
{
	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (space_info->max_extent_size) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    ffe_ctl->num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
					       &ffe_ctl->empty_cluster);
	if (ffe_ctl->last_ptr) {
		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			ffe_ctl->hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			ffe_ctl->hint_byte = last_ptr->window_start;
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	return 0;
}

static int prepare_allocation(struct btrfs_fs_info *fs_info,
			      struct find_free_extent_ctl *ffe_ctl,
			      struct btrfs_space_info *space_info,
			      struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return prepare_allocation_clustered(fs_info, ffe_ctl,
						    space_info, ins);
	case BTRFS_EXTENT_ALLOC_ZONED:
		if (ffe_ctl->for_treelog) {
			spin_lock(&fs_info->treelog_bg_lock);
			if (fs_info->treelog_bg)
				ffe_ctl->hint_byte = fs_info->treelog_bg;
			spin_unlock(&fs_info->treelog_bg_lock);
		}
		if (ffe_ctl->for_data_reloc) {
			spin_lock(&fs_info->relocation_bg_lock);
			if (fs_info->data_reloc_bg)
				ffe_ctl->hint_byte = fs_info->data_reloc_bg;
			spin_unlock(&fs_info->relocation_bg_lock);
		}
		return 0;
	default:
		BUG();
	}
}

/*
 * Walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we record the max size of the
 * largest free space extent currently available.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_root *root,
				     struct btrfs_key *ins,
				     struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int cache_block_group_error = 0;
	struct btrfs_block_group *block_group = NULL;
	struct btrfs_space_info *space_info;
	bool full_search = false;

	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);

	ffe_ctl->search_start = 0;
	/* For clustered allocation */
	ffe_ctl->empty_cluster = 0;
	ffe_ctl->last_ptr = NULL;
	ffe_ctl->use_cluster = true;
	ffe_ctl->have_caching_bg = false;
	ffe_ctl->orig_have_caching_bg = false;
	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
	ffe_ctl->loop = 0;
	/* For clustered allocation */
	ffe_ctl->retry_clustered = false;
	ffe_ctl->retry_unclustered = false;
	ffe_ctl->cached = 0;
	ffe_ctl->max_extent_size = 0;
	ffe_ctl->total_free_space = 0;
	ffe_ctl->found_offset = 0;
	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;

	if (btrfs_is_zoned(fs_info))
		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
			       ffe_ctl->flags);

	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
		return -ENOSPC;
	}

	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
	if (ret < 0)
		return ret;

	ffe_ctl->search_start = max(ffe_ctl->search_start,
				    first_logical_byte(fs_info));
	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl->search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
							block_group->flags);
				btrfs_lock_block_group(block_group,
						       ffe_ctl->delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	ffe_ctl->have_caching_bg = false;
	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
	    ffe_ctl->index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl->index], list) {
		struct btrfs_block_group *bg_ret;

		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro)) {
			if (ffe_ctl->for_treelog)
				btrfs_clear_treelog_bg(block_group);
			if (ffe_ctl->for_data_reloc)
				btrfs_clear_data_reloc_bg(block_group);
			continue;
		}

		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
		ffe_ctl->search_start = block_group->start;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, ffe_ctl->flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
			continue;
		}

have_block_group:
		ffe_ctl->cached = btrfs_block_group_done(block_group);
		if (unlikely(!ffe_ctl->cached)) {
			ffe_ctl->have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, false);

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;

		bg_ret = NULL;
		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
		if (ret == 0) {
			if (bg_ret && bg_ret != block_group) {
				btrfs_release_block_group(block_group,
							  ffe_ctl->delalloc);
				block_group = bg_ret;
			}
		} else if (ret == -EAGAIN) {
			goto have_block_group;
		} else if (ret > 0) {
			goto loop;
		}

		/* Checks */
		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
						 fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
		    block_group->start + block_group->length) {
			btrfs_add_free_space_unused(block_group,
					    ffe_ctl->found_offset,
					    ffe_ctl->num_bytes);
			goto loop;
		}

		if (ffe_ctl->found_offset < ffe_ctl->search_start)
			btrfs_add_free_space_unused(block_group,
					ffe_ctl->found_offset,
					ffe_ctl->search_start - ffe_ctl->found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
					       ffe_ctl->num_bytes,
					       ffe_ctl->delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space_unused(block_group,
					ffe_ctl->found_offset,
					ffe_ctl->num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, lets return */
		ins->objectid = ffe_ctl->search_start;
		ins->offset = ffe_ctl->num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
					   ffe_ctl->num_bytes);
		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
		break;
loop:
		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC && !cache_block_group_error) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl->max_extent_size)
			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl->max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl->max_extent_size;
	} else if (ret == -ENOSPC) {
		ret = cache_block_group_error;
	}
	return ret;
}

/*
 * btrfs_reserve_extent - entry point to the extent allocator.  Tries to find a
 * hole that is at least as big as @num_bytes.
 *
 * @root           - The root that will contain this extent
 *
 * @ram_bytes      - The amount of space in ram that @num_bytes takes.  This
 *                   is used for accounting purposes.  This value differs
 *                   from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      - Number of bytes to allocate on-disk.
 *
 * @min_alloc_size - Indicates the minimum amount of space that the
 *                   allocator should try to satisfy.  In some cases
 *                   @num_bytes may be larger than what is required, and if
 *                   the filesystem is fragmented then allocation fails.
 *                   However, the presence of @min_alloc_size gives a
 *                   chance to try and satisfy the smaller allocation.
 *
 * @empty_size     - A hint that you plan on doing more COW.  This is the
 *                   size in bytes the allocator should try to find free
 *                   next to the block it returns.  This is just a hint and
 *                   may be ignored by the allocator.
 *
 * @hint_byte      - Hint to the allocator to start searching above the byte
 *                   address passed.  It might be ignored.
 *
 * @ins            - This key is modified to record the found hole.  It will
 *                   have the following values:
 *                   ins->objectid == start position
 *                   ins->flags = BTRFS_EXTENT_ITEM_KEY
 *                   ins->offset == the size of the hole.
 *
 * @is_data        - Boolean flag indicating whether an extent is
 *                   allocated for data (true) or metadata (false)
 *
 * @delalloc       - Boolean flag indicating whether this allocation is for
 *                   delalloc or not.  If 'true', the data_rwsem of block
 *                   groups is going to be acquired.
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred.  In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct find_free_extent_ctl ffe_ctl = {};
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;
	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.ram_bytes = ram_bytes;
	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.min_alloc_size = min_alloc_size;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.hint_byte = hint_byte;
	ffe_ctl.for_treelog = for_treelog;
	ffe_ctl.for_data_reloc = for_data_reloc;

	ret = find_free_extent(root, ins, &ffe_ctl);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
				  flags, num_bytes, for_treelog, for_data_reloc);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}

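/*
 * A minimal usage sketch for btrfs_reserve_extent() above (hypothetical
 * sizes, not a call site in this file): ask for 1MiB of data space but
 * accept as little as 64KiB when free space is fragmented.  On -ENOSPC,
 * ins.offset reports the largest hole the allocator could find:
 *
 *	struct btrfs_key ins;
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(root, SZ_1M, SZ_1M, SZ_64K, 0, 0,
 *				   &ins, 1, 0);
 *	if (ret == -ENOSPC)
 *		btrfs_debug(fs_info, "largest hole: %llu", ins.offset);
 */
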
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	btrfs_add_free_space(cache, start, len);
	btrfs_free_reserved_bytes(cache, len, delalloc);
	trace_btrfs_reserved_extent_free(fs_info, start, len);

	btrfs_put_block_group(cache);
	return 0;
}

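/*
 * Pin a reserved extent down instead of freeing it (used, for example,
 * while replaying a log tree).  The range stays unavailable until the
 * pinned extents are dropped at transaction commit.
 */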
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
			      u64 len)
{
	struct btrfs_block_group *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(trans->fs_info, start);
	if (!cache) {
		btrfs_err(trans->fs_info, "unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	ret = pin_down_extent(trans, cache, start, len, 1);
	btrfs_put_block_group(cache);
	return ret;
}

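/*
 * Common tail for inserting a new extent item: remove the range from the
 * free space tree and mark it used in the block group accounting.
 */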
static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				 u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
	if (ret) {
		ASSERT(!ret);
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			  bytenr, num_bytes);
		return ret;
	}

	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
	return 0;
}

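/*
 * Insert the extent item for a newly allocated data extent.  The item is
 * a btrfs_extent_item immediately followed by one inline backref: a
 * shared data ref when @parent is set, otherwise a keyed extent data ref
 * for (@root_objectid, @owner, @offset).
 */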
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, ins->objectid);
	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}

static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
				      size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = { 0 };

	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
			       ins->objectid, ins->offset, 0);
	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
			    offset, 0, false);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);

	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}

/*
 * This is used by the tree logging recovery code.  It records that an extent
 * has been allocated and makes sure to clear the free space cache bits as
 * well.
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups are excluded before processing the log, so we
	 * only need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1);
	if (ret)
		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

48a3b636
ES
4847static struct extent_buffer *
4848btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
9631e4cc
JB
4849 u64 bytenr, int level, u64 owner,
4850 enum btrfs_lock_nesting nest)
65b51a00 4851{
0b246afa 4852 struct btrfs_fs_info *fs_info = root->fs_info;
65b51a00 4853 struct extent_buffer *buf;
b40130b2 4854 u64 lockdep_owner = owner;
65b51a00 4855
3fbaf258 4856 buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
c871b0f2
LB
4857 if (IS_ERR(buf))
4858 return buf;
4859
b72c3aba
QW
4860 /*
4861 * Extra safety check in case the extent tree is corrupted and extent
4862 * allocator chooses to use a tree block which is already used and
4863 * locked.
4864 */
4865 if (buf->lock_owner == current->pid) {
4866 btrfs_err_rl(fs_info,
4867"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4868 buf->start, btrfs_header_owner(buf), current->pid);
4869 free_extent_buffer(buf);
4870 return ERR_PTR(-EUCLEAN);
4871 }
4872
b40130b2
JB
4873 /*
4874 * The reloc trees are just snapshots, so we need them to appear to be
4875 * just like any other fs tree WRT lockdep.
4876 *
4877 * The exception however is in replace_path() in relocation, where we
4878 * hold the lock on the original fs root and then search for the reloc
4879 * root. At that point we need to make sure any reloc root buffers are
4880 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
4881 * lockdep happy.
4882 */
4883 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
4884 !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
4885 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
4886
e114c545
JB
4887 /*
4888 * This needs to stay, because we could allocate a freed block from an
4889 * old tree into a new tree, so we need to make sure this new block is
4890 * set to the appropriate level and owner.
4891 */
b40130b2
JB
4892 btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
4893
9631e4cc 4894 __btrfs_tree_lock(buf, nest);
6a884d7d 4895 btrfs_clean_tree_block(buf);
3083ee2e 4896 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
d3575156 4897 clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
b4ce94de 4898
4db8c528 4899 set_extent_buffer_uptodate(buf);
b4ce94de 4900
bc877d28
NB
4901 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4902 btrfs_set_header_level(buf, level);
4903 btrfs_set_header_bytenr(buf, buf->start);
4904 btrfs_set_header_generation(buf, trans->transid);
4905 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4906 btrfs_set_header_owner(buf, owner);
de37aa51 4907 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
bc877d28 4908 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
d0c803c4 4909 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
656f30db 4910 buf->log_index = root->log_transid % 2;
8cef4e16
YZ
4911 /*
 4912 * we allow two log transactions at a time, and use different
52042d8e 4913 * EXTENT bits to differentiate dirty pages.
8cef4e16 4914 */
656f30db 4915 if (buf->log_index == 0)
8cef4e16
YZ
4916 set_extent_dirty(&root->dirty_log_pages, buf->start,
4917 buf->start + buf->len - 1, GFP_NOFS);
4918 else
4919 set_extent_new(&root->dirty_log_pages, buf->start,
3744dbeb 4920 buf->start + buf->len - 1);
d0c803c4 4921 } else {
656f30db 4922 buf->log_index = -1;
d0c803c4 4923 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
65b51a00 4924 buf->start + buf->len - 1, GFP_NOFS);
d0c803c4 4925 }
b4ce94de 4926 /* this returns a buffer locked for blocking */
65b51a00
CM
4927 return buf;
4928}
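
/*
 * Note on the log_index convention set up above: log transaction 0 tracks
 * its buffers via the DIRTY bit in root->dirty_log_pages, log transaction 1
 * via the NEW bit, and log_index == -1 marks buffers outside any log tree,
 * which are tracked in the transaction's dirty_pages instead.
 */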
4929
fec577fb 4930/*
f0486c68 4931 * finds a free extent and does all the dirty work required for allocation;
67b7859e 4932 * returns the tree buffer or an ERR_PTR on error.
fec577fb 4933 */
4d75f8a9 4934struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
310712b2
OS
4935 struct btrfs_root *root,
4936 u64 parent, u64 root_objectid,
4937 const struct btrfs_disk_key *key,
4938 int level, u64 hint,
9631e4cc
JB
4939 u64 empty_size,
4940 enum btrfs_lock_nesting nest)
fec577fb 4941{
0b246afa 4942 struct btrfs_fs_info *fs_info = root->fs_info;
e2fa7227 4943 struct btrfs_key ins;
f0486c68 4944 struct btrfs_block_rsv *block_rsv;
5f39d397 4945 struct extent_buffer *buf;
67b7859e 4946 struct btrfs_delayed_extent_op *extent_op;
ed4f255b 4947 struct btrfs_ref generic_ref = { 0 };
f0486c68
YZ
4948 u64 flags = 0;
4949 int ret;
0b246afa
JM
4950 u32 blocksize = fs_info->nodesize;
4951 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
fec577fb 4952
05653ef3 4953#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0b246afa 4954 if (btrfs_is_testing(fs_info)) {
faa2dbf0 4955 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
9631e4cc 4956 level, root_objectid, nest);
faa2dbf0
JB
4957 if (!IS_ERR(buf))
4958 root->alloc_bytenr += blocksize;
4959 return buf;
4960 }
05653ef3 4961#endif
fccb84c9 4962
67f9c220 4963 block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
f0486c68
YZ
4964 if (IS_ERR(block_rsv))
4965 return ERR_CAST(block_rsv);
4966
18513091 4967 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
e570fd27 4968 empty_size, hint, &ins, 0, 0);
67b7859e
OS
4969 if (ret)
4970 goto out_unuse;
55c69072 4971
bc877d28 4972 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
9631e4cc 4973 root_objectid, nest);
67b7859e
OS
4974 if (IS_ERR(buf)) {
4975 ret = PTR_ERR(buf);
4976 goto out_free_reserved;
4977 }
f0486c68
YZ
4978
4979 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4980 if (parent == 0)
4981 parent = ins.objectid;
4982 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4983 } else
4984 BUG_ON(parent > 0);
4985
4986 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
78a6184a 4987 extent_op = btrfs_alloc_delayed_extent_op();
67b7859e
OS
4988 if (!extent_op) {
4989 ret = -ENOMEM;
4990 goto out_free_buf;
4991 }
f0486c68
YZ
4992 if (key)
4993 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4994 else
4995 memset(&extent_op->key, 0, sizeof(extent_op->key));
4996 extent_op->flags_to_set = flags;
35b3ad50
DS
4997 extent_op->update_key = skinny_metadata ? false : true;
4998 extent_op->update_flags = true;
b1c79e09 4999 extent_op->level = level;
f0486c68 5000
ed4f255b
QW
5001 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
5002 ins.objectid, ins.offset, parent);
f42c5da6
NB
5003 btrfs_init_tree_ref(&generic_ref, level, root_objectid,
5004 root->root_key.objectid, false);
8a5040f7 5005 btrfs_ref_tree_mod(fs_info, &generic_ref);
2187374f 5006 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
67b7859e
OS
5007 if (ret)
5008 goto out_free_delayed;
f0486c68 5009 }
fec577fb 5010 return buf;
67b7859e
OS
5011
5012out_free_delayed:
5013 btrfs_free_delayed_extent_op(extent_op);
5014out_free_buf:
19ea40dd 5015 btrfs_tree_unlock(buf);
67b7859e
OS
5016 free_extent_buffer(buf);
5017out_free_reserved:
2ff7e61e 5018 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
67b7859e 5019out_unuse:
67f9c220 5020 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
67b7859e 5021 return ERR_PTR(ret);
fec577fb 5022}
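
/*
 * Hedged usage sketch (illustrative, not a caller from this file): how a
 * caller might allocate a fresh keyless node with btrfs_alloc_tree_block()
 * and honor its ERR_PTR return convention. example_alloc_node() is a
 * made-up name; the argument list mirrors the definition above, and the
 * buffer comes back write-locked per btrfs_init_new_buffer().
 */
static struct extent_buffer *example_alloc_node(struct btrfs_trans_handle *trans,
						struct btrfs_root *root)
{
	struct extent_buffer *eb;

	/* parent == 0, key == NULL: an empty level-0 block for this root */
	eb = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				    NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
	if (IS_ERR(eb))
		return eb;	/* reservation or allocation failed */
	/* ... fill in items, then btrfs_mark_buffer_dirty(eb) ... */
	return eb;
}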
a28ec197 5023
2c47e605
YZ
5024struct walk_control {
5025 u64 refs[BTRFS_MAX_LEVEL];
5026 u64 flags[BTRFS_MAX_LEVEL];
5027 struct btrfs_key update_progress;
aea6f028
JB
5028 struct btrfs_key drop_progress;
5029 int drop_level;
2c47e605
YZ
5030 int stage;
5031 int level;
5032 int shared_level;
5033 int update_ref;
5034 int keep_locks;
1c4850e2
YZ
5035 int reada_slot;
5036 int reada_count;
78c52d9e 5037 int restarted;
2c47e605
YZ
5038};
5039
5040#define DROP_REFERENCE 1
5041#define UPDATE_BACKREF 2
5042
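/*
 * Stage machine in brief: a walk starts in DROP_REFERENCE. If
 * do_walk_down() meets a shared block whose subtree may still need backref
 * updates, it switches wc->stage to UPDATE_BACKREF and records the level in
 * wc->shared_level; once walk_up_proc() climbs back past that level it
 * restores DROP_REFERENCE and freeing resumes.
 */
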
1c4850e2
YZ
5043static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5044 struct btrfs_root *root,
5045 struct walk_control *wc,
5046 struct btrfs_path *path)
6407bf6d 5047{
0b246afa 5048 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
5049 u64 bytenr;
5050 u64 generation;
5051 u64 refs;
94fcca9f 5052 u64 flags;
5d4f98a2 5053 u32 nritems;
1c4850e2
YZ
5054 struct btrfs_key key;
5055 struct extent_buffer *eb;
6407bf6d 5056 int ret;
1c4850e2
YZ
5057 int slot;
5058 int nread = 0;
6407bf6d 5059
1c4850e2
YZ
5060 if (path->slots[wc->level] < wc->reada_slot) {
5061 wc->reada_count = wc->reada_count * 2 / 3;
5062 wc->reada_count = max(wc->reada_count, 2);
5063 } else {
5064 wc->reada_count = wc->reada_count * 3 / 2;
5065 wc->reada_count = min_t(int, wc->reada_count,
0b246afa 5066 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1c4850e2 5067 }
7bb86316 5068
1c4850e2
YZ
5069 eb = path->nodes[wc->level];
5070 nritems = btrfs_header_nritems(eb);
bd56b302 5071
1c4850e2
YZ
5072 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5073 if (nread >= wc->reada_count)
5074 break;
bd56b302 5075
2dd3e67b 5076 cond_resched();
1c4850e2
YZ
5077 bytenr = btrfs_node_blockptr(eb, slot);
5078 generation = btrfs_node_ptr_generation(eb, slot);
2dd3e67b 5079
1c4850e2
YZ
5080 if (slot == path->slots[wc->level])
5081 goto reada;
5d4f98a2 5082
1c4850e2
YZ
5083 if (wc->stage == UPDATE_BACKREF &&
5084 generation <= root->root_key.offset)
bd56b302
CM
5085 continue;
5086
94fcca9f 5087 /* We don't lock the tree block, it's OK to be racy here */
2ff7e61e 5088 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
3173a18f
JB
5089 wc->level - 1, 1, &refs,
5090 &flags);
79787eaa
JM
5091 /* We don't care about errors in readahead. */
5092 if (ret < 0)
5093 continue;
94fcca9f
YZ
5094 BUG_ON(refs == 0);
5095
1c4850e2 5096 if (wc->stage == DROP_REFERENCE) {
1c4850e2
YZ
5097 if (refs == 1)
5098 goto reada;
bd56b302 5099
94fcca9f
YZ
5100 if (wc->level == 1 &&
5101 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5102 continue;
1c4850e2
YZ
5103 if (!wc->update_ref ||
5104 generation <= root->root_key.offset)
5105 continue;
5106 btrfs_node_key_to_cpu(eb, &key, slot);
5107 ret = btrfs_comp_cpu_keys(&key,
5108 &wc->update_progress);
5109 if (ret < 0)
5110 continue;
94fcca9f
YZ
5111 } else {
5112 if (wc->level == 1 &&
5113 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5114 continue;
6407bf6d 5115 }
1c4850e2 5116reada:
bfb484d9 5117 btrfs_readahead_node_child(eb, slot);
1c4850e2 5118 nread++;
20524f02 5119 }
1c4850e2 5120 wc->reada_slot = slot;
20524f02 5121}
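
/*
 * Illustrative arithmetic for the adaptive window above (example numbers):
 * with wc->reada_count at 8, the window shrinks 8 -> 5 -> 3 -> 2 (clamped
 * at 2) while processing still trails the previous readahead cursor, and
 * grows 8 -> 12 -> 18 -> ... up to BTRFS_NODEPTRS_PER_BLOCK(fs_info) once
 * it has caught up.
 */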
2c47e605 5122
f82d02d9 5123/*
2c016dc2 5124 * helper to process a tree block while walking down the tree.
2c47e605 5125 *
2c47e605
YZ
5126 * when wc->stage == UPDATE_BACKREF, this function updates
5127 * back refs for pointers in the block.
5128 *
5129 * NOTE: return value 1 means we should stop walking down.
f82d02d9 5130 */
2c47e605 5131static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 5132 struct btrfs_root *root,
2c47e605 5133 struct btrfs_path *path,
94fcca9f 5134 struct walk_control *wc, int lookup_info)
f82d02d9 5135{
2ff7e61e 5136 struct btrfs_fs_info *fs_info = root->fs_info;
2c47e605
YZ
5137 int level = wc->level;
5138 struct extent_buffer *eb = path->nodes[level];
2c47e605 5139 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
5140 int ret;
5141
2c47e605
YZ
5142 if (wc->stage == UPDATE_BACKREF &&
5143 btrfs_header_owner(eb) != root->root_key.objectid)
5144 return 1;
f82d02d9 5145
2c47e605
YZ
5146 /*
 5147 * when the reference count of a tree block is 1, it won't increase
 5148 * again. once the full backref flag is set, we never clear it.
5149 */
94fcca9f
YZ
5150 if (lookup_info &&
5151 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5152 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
2c47e605 5153 BUG_ON(!path->locks[level]);
2ff7e61e 5154 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 5155 eb->start, level, 1,
2c47e605
YZ
5156 &wc->refs[level],
5157 &wc->flags[level]);
79787eaa
JM
5158 BUG_ON(ret == -ENOMEM);
5159 if (ret)
5160 return ret;
2c47e605
YZ
5161 BUG_ON(wc->refs[level] == 0);
5162 }
5d4f98a2 5163
2c47e605
YZ
5164 if (wc->stage == DROP_REFERENCE) {
5165 if (wc->refs[level] > 1)
5166 return 1;
f82d02d9 5167
2c47e605 5168 if (path->locks[level] && !wc->keep_locks) {
bd681513 5169 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
5170 path->locks[level] = 0;
5171 }
5172 return 0;
5173 }
f82d02d9 5174
2c47e605
YZ
5175 /* wc->stage == UPDATE_BACKREF */
5176 if (!(wc->flags[level] & flag)) {
5177 BUG_ON(!path->locks[level]);
e339a6b0 5178 ret = btrfs_inc_ref(trans, root, eb, 1);
79787eaa 5179 BUG_ON(ret); /* -ENOMEM */
e339a6b0 5180 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 5181 BUG_ON(ret); /* -ENOMEM */
42c9d0b5 5182 ret = btrfs_set_disk_extent_flags(trans, eb, flag,
2fe6a5a1 5183 btrfs_header_level(eb));
79787eaa 5184 BUG_ON(ret); /* -ENOMEM */
2c47e605
YZ
5185 wc->flags[level] |= flag;
5186 }
5187
5188 /*
5189 * the block is shared by multiple trees, so it's not good to
5190 * keep the tree lock
5191 */
5192 if (path->locks[level] && level > 0) {
bd681513 5193 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
5194 path->locks[level] = 0;
5195 }
5196 return 0;
5197}
5198
78c52d9e
JB
5199/*
5200 * This is used to verify a ref exists for this root to deal with a bug where we
5201 * would have a drop_progress key that hadn't been updated properly.
5202 */
5203static int check_ref_exists(struct btrfs_trans_handle *trans,
5204 struct btrfs_root *root, u64 bytenr, u64 parent,
5205 int level)
5206{
5207 struct btrfs_path *path;
5208 struct btrfs_extent_inline_ref *iref;
5209 int ret;
5210
5211 path = btrfs_alloc_path();
5212 if (!path)
5213 return -ENOMEM;
5214
5215 ret = lookup_extent_backref(trans, path, &iref, bytenr,
5216 root->fs_info->nodesize, parent,
5217 root->root_key.objectid, level, 0);
5218 btrfs_free_path(path);
5219 if (ret == -ENOENT)
5220 return 0;
5221 if (ret < 0)
5222 return ret;
5223 return 1;
5224}
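
/*
 * Return convention for check_ref_exists() above: 1 if the backref was
 * found, 0 if lookup_extent_backref() reported -ENOENT, negative errno on
 * any other failure.
 */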
5225
1c4850e2 5226/*
2c016dc2 5227 * helper to process a tree block pointer.
1c4850e2
YZ
5228 *
 5229 * when wc->stage == DROP_REFERENCE, this function checks the
 5230 * reference count of the block pointed to. if the block is
 5231 * shared and we need to update back refs for the subtree
 5232 * rooted at the block, this function changes wc->stage to
 5233 * UPDATE_BACKREF. if the block is shared and there is no
 5234 * need to update back refs, this function drops the reference
 5235 * to the block.
5236 *
5237 * NOTE: return value 1 means we should stop walking down.
5238 */
5239static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5240 struct btrfs_root *root,
5241 struct btrfs_path *path,
94fcca9f 5242 struct walk_control *wc, int *lookup_info)
1c4850e2 5243{
0b246afa 5244 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
5245 u64 bytenr;
5246 u64 generation;
5247 u64 parent;
1c4850e2 5248 struct btrfs_key key;
581c1760 5249 struct btrfs_key first_key;
ffd4bb2a 5250 struct btrfs_ref ref = { 0 };
1c4850e2
YZ
5251 struct extent_buffer *next;
5252 int level = wc->level;
5253 int reada = 0;
5254 int ret = 0;
1152651a 5255 bool need_account = false;
1c4850e2
YZ
5256
5257 generation = btrfs_node_ptr_generation(path->nodes[level],
5258 path->slots[level]);
5259 /*
5260 * if the lower level block was created before the snapshot
5261 * was created, we know there is no need to update back refs
5262 * for the subtree
5263 */
5264 if (wc->stage == UPDATE_BACKREF &&
94fcca9f
YZ
5265 generation <= root->root_key.offset) {
5266 *lookup_info = 1;
1c4850e2 5267 return 1;
94fcca9f 5268 }
1c4850e2
YZ
5269
5270 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
581c1760
QW
5271 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
5272 path->slots[level]);
1c4850e2 5273
0b246afa 5274 next = find_extent_buffer(fs_info, bytenr);
1c4850e2 5275 if (!next) {
3fbaf258
JB
5276 next = btrfs_find_create_tree_block(fs_info, bytenr,
5277 root->root_key.objectid, level - 1);
c871b0f2
LB
5278 if (IS_ERR(next))
5279 return PTR_ERR(next);
1c4850e2
YZ
5280 reada = 1;
5281 }
5282 btrfs_tree_lock(next);
1c4850e2 5283
2ff7e61e 5284 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
94fcca9f
YZ
5285 &wc->refs[level - 1],
5286 &wc->flags[level - 1]);
4867268c
JB
5287 if (ret < 0)
5288 goto out_unlock;
79787eaa 5289
c2cf52eb 5290 if (unlikely(wc->refs[level - 1] == 0)) {
0b246afa 5291 btrfs_err(fs_info, "Missing references.");
4867268c
JB
5292 ret = -EIO;
5293 goto out_unlock;
c2cf52eb 5294 }
94fcca9f 5295 *lookup_info = 0;
1c4850e2 5296
94fcca9f 5297 if (wc->stage == DROP_REFERENCE) {
1c4850e2 5298 if (wc->refs[level - 1] > 1) {
1152651a 5299 need_account = true;
94fcca9f
YZ
5300 if (level == 1 &&
5301 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5302 goto skip;
5303
1c4850e2
YZ
5304 if (!wc->update_ref ||
5305 generation <= root->root_key.offset)
5306 goto skip;
5307
5308 btrfs_node_key_to_cpu(path->nodes[level], &key,
5309 path->slots[level]);
5310 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5311 if (ret < 0)
5312 goto skip;
5313
5314 wc->stage = UPDATE_BACKREF;
5315 wc->shared_level = level - 1;
5316 }
94fcca9f
YZ
5317 } else {
5318 if (level == 1 &&
5319 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5320 goto skip;
1c4850e2
YZ
5321 }
5322
b9fab919 5323 if (!btrfs_buffer_uptodate(next, generation, 0)) {
1c4850e2
YZ
5324 btrfs_tree_unlock(next);
5325 free_extent_buffer(next);
5326 next = NULL;
94fcca9f 5327 *lookup_info = 1;
1c4850e2
YZ
5328 }
5329
5330 if (!next) {
5331 if (reada && level == 1)
5332 reada_walk_down(trans, root, wc, path);
1b7ec85e
JB
5333 next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
5334 generation, level - 1, &first_key);
64c043de
LB
5335 if (IS_ERR(next)) {
5336 return PTR_ERR(next);
5337 } else if (!extent_buffer_uptodate(next)) {
416bc658 5338 free_extent_buffer(next);
97d9a8a4 5339 return -EIO;
416bc658 5340 }
1c4850e2 5341 btrfs_tree_lock(next);
1c4850e2
YZ
5342 }
5343
5344 level--;
4867268c
JB
5345 ASSERT(level == btrfs_header_level(next));
5346 if (level != btrfs_header_level(next)) {
5347 btrfs_err(root->fs_info, "mismatched level");
5348 ret = -EIO;
5349 goto out_unlock;
5350 }
1c4850e2
YZ
5351 path->nodes[level] = next;
5352 path->slots[level] = 0;
ac5887c8 5353 path->locks[level] = BTRFS_WRITE_LOCK;
1c4850e2
YZ
5354 wc->level = level;
5355 if (wc->level == 1)
5356 wc->reada_slot = 0;
5357 return 0;
5358skip:
5359 wc->refs[level - 1] = 0;
5360 wc->flags[level - 1] = 0;
94fcca9f
YZ
5361 if (wc->stage == DROP_REFERENCE) {
5362 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5363 parent = path->nodes[level]->start;
5364 } else {
4867268c 5365 ASSERT(root->root_key.objectid ==
94fcca9f 5366 btrfs_header_owner(path->nodes[level]));
4867268c
JB
5367 if (root->root_key.objectid !=
5368 btrfs_header_owner(path->nodes[level])) {
5369 btrfs_err(root->fs_info,
5370 "mismatched block owner");
5371 ret = -EIO;
5372 goto out_unlock;
5373 }
94fcca9f
YZ
5374 parent = 0;
5375 }
1c4850e2 5376
78c52d9e
JB
5377 /*
5378 * If we had a drop_progress we need to verify the refs are set
5379 * as expected. If we find our ref then we know that from here
5380 * on out everything should be correct, and we can clear the
5381 * ->restarted flag.
5382 */
5383 if (wc->restarted) {
5384 ret = check_ref_exists(trans, root, bytenr, parent,
5385 level - 1);
5386 if (ret < 0)
5387 goto out_unlock;
5388 if (ret == 0)
5389 goto no_delete;
5390 ret = 0;
5391 wc->restarted = 0;
5392 }
5393
2cd86d30
QW
5394 /*
5395 * Reloc tree doesn't contribute to qgroup numbers, and we have
5396 * already accounted them at merge time (replace_path),
 5397 * thus we can skip the expensive subtree trace here.
5398 */
5399 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5400 need_account) {
deb40627 5401 ret = btrfs_qgroup_trace_subtree(trans, next,
33d1f05c 5402 generation, level - 1);
1152651a 5403 if (ret) {
0b246afa 5404 btrfs_err_rl(fs_info,
5d163e0e
JM
5405 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5406 ret);
1152651a
MF
5407 }
5408 }
aea6f028
JB
5409
5410 /*
5411 * We need to update the next key in our walk control so we can
5412 * update the drop_progress key accordingly. We don't care if
5413 * find_next_key doesn't find a key because that means we're at
5414 * the end and are going to clean up now.
5415 */
5416 wc->drop_level = level;
5417 find_next_key(path, level, &wc->drop_progress);
5418
ffd4bb2a
QW
5419 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5420 fs_info->nodesize, parent);
f42c5da6
NB
5421 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
5422 0, false);
ffd4bb2a 5423 ret = btrfs_free_extent(trans, &ref);
4867268c
JB
5424 if (ret)
5425 goto out_unlock;
1c4850e2 5426 }
78c52d9e 5427no_delete:
4867268c
JB
5428 *lookup_info = 1;
5429 ret = 1;
5430
5431out_unlock:
1c4850e2
YZ
5432 btrfs_tree_unlock(next);
5433 free_extent_buffer(next);
4867268c
JB
5434
5435 return ret;
1c4850e2
YZ
5436}
5437
2c47e605 5438/*
2c016dc2 5439 * helper to process a tree block while walking up the tree.
2c47e605
YZ
5440 *
5441 * when wc->stage == DROP_REFERENCE, this function drops
5442 * reference count on the block.
5443 *
5444 * when wc->stage == UPDATE_BACKREF, this function changes
5445 * wc->stage back to DROP_REFERENCE if we changed wc->stage
5446 * to UPDATE_BACKREF previously while processing the block.
5447 *
5448 * NOTE: return value 1 means we should stop walking up.
5449 */
5450static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5451 struct btrfs_root *root,
5452 struct btrfs_path *path,
5453 struct walk_control *wc)
5454{
0b246afa 5455 struct btrfs_fs_info *fs_info = root->fs_info;
f0486c68 5456 int ret;
2c47e605
YZ
5457 int level = wc->level;
5458 struct extent_buffer *eb = path->nodes[level];
5459 u64 parent = 0;
5460
5461 if (wc->stage == UPDATE_BACKREF) {
5462 BUG_ON(wc->shared_level < level);
5463 if (level < wc->shared_level)
5464 goto out;
5465
2c47e605
YZ
5466 ret = find_next_key(path, level + 1, &wc->update_progress);
5467 if (ret > 0)
5468 wc->update_ref = 0;
5469
5470 wc->stage = DROP_REFERENCE;
5471 wc->shared_level = -1;
5472 path->slots[level] = 0;
5473
5474 /*
5475 * check reference count again if the block isn't locked.
 5476 * we should start walking down the tree again if the
 5477 * reference count is one.
5478 */
5479 if (!path->locks[level]) {
5480 BUG_ON(level == 0);
5481 btrfs_tree_lock(eb);
ac5887c8 5482 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605 5483
2ff7e61e 5484 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 5485 eb->start, level, 1,
2c47e605
YZ
5486 &wc->refs[level],
5487 &wc->flags[level]);
79787eaa
JM
5488 if (ret < 0) {
5489 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 5490 path->locks[level] = 0;
79787eaa
JM
5491 return ret;
5492 }
2c47e605
YZ
5493 BUG_ON(wc->refs[level] == 0);
5494 if (wc->refs[level] == 1) {
bd681513 5495 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 5496 path->locks[level] = 0;
2c47e605
YZ
5497 return 1;
5498 }
f82d02d9 5499 }
2c47e605 5500 }
f82d02d9 5501
2c47e605
YZ
5502 /* wc->stage == DROP_REFERENCE */
5503 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5d4f98a2 5504
2c47e605
YZ
5505 if (wc->refs[level] == 1) {
5506 if (level == 0) {
5507 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
e339a6b0 5508 ret = btrfs_dec_ref(trans, root, eb, 1);
2c47e605 5509 else
e339a6b0 5510 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 5511 BUG_ON(ret); /* -ENOMEM */
c4140cbf
QW
5512 if (is_fstree(root->root_key.objectid)) {
5513 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5514 if (ret) {
5515 btrfs_err_rl(fs_info,
5516 "error %d accounting leaf items, quota is out of sync, rescan required",
5d163e0e 5517 ret);
c4140cbf 5518 }
1152651a 5519 }
2c47e605 5520 }
6a884d7d 5521 /* make block locked assertion in btrfs_clean_tree_block happy */
2c47e605
YZ
5522 if (!path->locks[level] &&
5523 btrfs_header_generation(eb) == trans->transid) {
5524 btrfs_tree_lock(eb);
ac5887c8 5525 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605 5526 }
6a884d7d 5527 btrfs_clean_tree_block(eb);
2c47e605
YZ
5528 }
5529
5530 if (eb == root->node) {
5531 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5532 parent = eb->start;
65c6e82b
QW
5533 else if (root->root_key.objectid != btrfs_header_owner(eb))
5534 goto owner_mismatch;
2c47e605
YZ
5535 } else {
5536 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5537 parent = path->nodes[level + 1]->start;
65c6e82b
QW
5538 else if (root->root_key.objectid !=
5539 btrfs_header_owner(path->nodes[level + 1]))
5540 goto owner_mismatch;
f82d02d9 5541 }
f82d02d9 5542
7a163608
FM
5543 btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5544 wc->refs[level] == 1);
2c47e605
YZ
5545out:
5546 wc->refs[level] = 0;
5547 wc->flags[level] = 0;
f0486c68 5548 return 0;
65c6e82b
QW
5549
5550owner_mismatch:
5551 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5552 btrfs_header_owner(eb), root->root_key.objectid);
5553 return -EUCLEAN;
2c47e605
YZ
5554}
5555
5556static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5557 struct btrfs_root *root,
5558 struct btrfs_path *path,
5559 struct walk_control *wc)
5560{
2c47e605 5561 int level = wc->level;
94fcca9f 5562 int lookup_info = 1;
2c47e605
YZ
5563 int ret;
5564
5565 while (level >= 0) {
94fcca9f 5566 ret = walk_down_proc(trans, root, path, wc, lookup_info);
2c47e605
YZ
5567 if (ret > 0)
5568 break;
5569
5570 if (level == 0)
5571 break;
5572
7a7965f8
YZ
5573 if (path->slots[level] >=
5574 btrfs_header_nritems(path->nodes[level]))
5575 break;
5576
94fcca9f 5577 ret = do_walk_down(trans, root, path, wc, &lookup_info);
1c4850e2
YZ
5578 if (ret > 0) {
5579 path->slots[level]++;
5580 continue;
90d2c51d
MX
5581 } else if (ret < 0)
5582 return ret;
1c4850e2 5583 level = wc->level;
f82d02d9 5584 }
f82d02d9
YZ
5585 return 0;
5586}
5587
d397712b 5588static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 5589 struct btrfs_root *root,
f82d02d9 5590 struct btrfs_path *path,
2c47e605 5591 struct walk_control *wc, int max_level)
20524f02 5592{
2c47e605 5593 int level = wc->level;
20524f02 5594 int ret;
9f3a7427 5595
2c47e605
YZ
5596 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5597 while (level < max_level && path->nodes[level]) {
5598 wc->level = level;
5599 if (path->slots[level] + 1 <
5600 btrfs_header_nritems(path->nodes[level])) {
5601 path->slots[level]++;
20524f02
CM
5602 return 0;
5603 } else {
2c47e605
YZ
5604 ret = walk_up_proc(trans, root, path, wc);
5605 if (ret > 0)
5606 return 0;
65c6e82b
QW
5607 if (ret < 0)
5608 return ret;
bd56b302 5609
2c47e605 5610 if (path->locks[level]) {
bd681513
CM
5611 btrfs_tree_unlock_rw(path->nodes[level],
5612 path->locks[level]);
2c47e605 5613 path->locks[level] = 0;
f82d02d9 5614 }
2c47e605
YZ
5615 free_extent_buffer(path->nodes[level]);
5616 path->nodes[level] = NULL;
5617 level++;
20524f02
CM
5618 }
5619 }
5620 return 1;
5621}
5622
9aca1d51 5623/*
2c47e605
YZ
5624 * drop a subvolume tree.
5625 *
 5626 * this function traverses the tree, freeing any blocks that are only
 5627 * referenced by the tree.
 5628 *
 5629 * when a shared tree block is found, this function decreases its
 5630 * reference count by one. if update_ref is true, this function
 5631 * also makes sure backrefs for the shared block and all lower level
 5632 * blocks are properly updated.
9d1a2a3a
DS
5633 *
5634 * If called with for_reloc == 0, may exit early with -EAGAIN
9aca1d51 5635 */
0078a9f9 5636int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
20524f02 5637{
12a824dc
FM
5638 const bool is_reloc_root = (root->root_key.objectid ==
5639 BTRFS_TREE_RELOC_OBJECTID);
ab8d0fc4 5640 struct btrfs_fs_info *fs_info = root->fs_info;
5caf2a00 5641 struct btrfs_path *path;
2c47e605 5642 struct btrfs_trans_handle *trans;
ab8d0fc4 5643 struct btrfs_root *tree_root = fs_info->tree_root;
9f3a7427 5644 struct btrfs_root_item *root_item = &root->root_item;
2c47e605
YZ
5645 struct walk_control *wc;
5646 struct btrfs_key key;
5647 int err = 0;
5648 int ret;
5649 int level;
d29a9f62 5650 bool root_dropped = false;
b4be6aef 5651 bool unfinished_drop = false;
20524f02 5652
4fd786e6 5653 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
1152651a 5654
5caf2a00 5655 path = btrfs_alloc_path();
cb1b69f4
TI
5656 if (!path) {
5657 err = -ENOMEM;
5658 goto out;
5659 }
20524f02 5660
2c47e605 5661 wc = kzalloc(sizeof(*wc), GFP_NOFS);
38a1a919
MF
5662 if (!wc) {
5663 btrfs_free_path(path);
cb1b69f4
TI
5664 err = -ENOMEM;
5665 goto out;
38a1a919 5666 }
2c47e605 5667
f3e3d9cc
QW
5668 /*
5669 * Use join to avoid potential EINTR from transaction start. See
5670 * wait_reserve_ticket and the whole reservation callchain.
5671 */
5672 if (for_reloc)
5673 trans = btrfs_join_transaction(tree_root);
5674 else
5675 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
5676 if (IS_ERR(trans)) {
5677 err = PTR_ERR(trans);
5678 goto out_free;
5679 }
98d5dc13 5680
0568e82d
JB
5681 err = btrfs_run_delayed_items(trans);
5682 if (err)
5683 goto out_end_trans;
5684
83354f07
JB
5685 /*
5686 * This will help us catch people modifying the fs tree while we're
5687 * dropping it. It is unsafe to mess with the fs tree while it's being
5688 * dropped as we unlock the root node and parent nodes as we walk down
5689 * the tree, assuming nothing will change. If something does change
5690 * then we'll have stale information and drop references to blocks we've
5691 * already dropped.
5692 */
5693 set_bit(BTRFS_ROOT_DELETING, &root->state);
b4be6aef
JB
5694 unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5695
9f3a7427 5696 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2c47e605 5697 level = btrfs_header_level(root->node);
5d4f98a2 5698 path->nodes[level] = btrfs_lock_root_node(root);
9f3a7427 5699 path->slots[level] = 0;
ac5887c8 5700 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605
YZ
5701 memset(&wc->update_progress, 0,
5702 sizeof(wc->update_progress));
9f3a7427 5703 } else {
9f3a7427 5704 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2c47e605
YZ
5705 memcpy(&wc->update_progress, &key,
5706 sizeof(wc->update_progress));
5707
c8422684 5708 level = btrfs_root_drop_level(root_item);
2c47e605 5709 BUG_ON(level == 0);
6702ed49 5710 path->lowest_level = level;
2c47e605
YZ
5711 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5712 path->lowest_level = 0;
5713 if (ret < 0) {
5714 err = ret;
79787eaa 5715 goto out_end_trans;
9f3a7427 5716 }
1c4850e2 5717 WARN_ON(ret > 0);
2c47e605 5718
7d9eb12c
CM
5719 /*
5720 * unlock our path, this is safe because only this
5721 * function is allowed to delete this snapshot
5722 */
5d4f98a2 5723 btrfs_unlock_up_safe(path, 0);
2c47e605
YZ
5724
5725 level = btrfs_header_level(root->node);
5726 while (1) {
5727 btrfs_tree_lock(path->nodes[level]);
ac5887c8 5728 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605 5729
2ff7e61e 5730 ret = btrfs_lookup_extent_info(trans, fs_info,
2c47e605 5731 path->nodes[level]->start,
3173a18f 5732 level, 1, &wc->refs[level],
2c47e605 5733 &wc->flags[level]);
79787eaa
JM
5734 if (ret < 0) {
5735 err = ret;
5736 goto out_end_trans;
5737 }
2c47e605
YZ
5738 BUG_ON(wc->refs[level] == 0);
5739
c8422684 5740 if (level == btrfs_root_drop_level(root_item))
2c47e605
YZ
5741 break;
5742
5743 btrfs_tree_unlock(path->nodes[level]);
fec386ac 5744 path->locks[level] = 0;
2c47e605
YZ
5745 WARN_ON(wc->refs[level] != 1);
5746 level--;
5747 }
9f3a7427 5748 }
2c47e605 5749
78c52d9e 5750 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
2c47e605
YZ
5751 wc->level = level;
5752 wc->shared_level = -1;
5753 wc->stage = DROP_REFERENCE;
5754 wc->update_ref = update_ref;
5755 wc->keep_locks = 0;
0b246afa 5756 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
2c47e605 5757
d397712b 5758 while (1) {
9d1a2a3a 5759
2c47e605
YZ
5760 ret = walk_down_tree(trans, root, path, wc);
5761 if (ret < 0) {
5762 err = ret;
20524f02 5763 break;
2c47e605 5764 }
9aca1d51 5765
2c47e605
YZ
5766 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5767 if (ret < 0) {
5768 err = ret;
20524f02 5769 break;
2c47e605
YZ
5770 }
5771
5772 if (ret > 0) {
5773 BUG_ON(wc->stage != DROP_REFERENCE);
e7a84565
CM
5774 break;
5775 }
2c47e605
YZ
5776
5777 if (wc->stage == DROP_REFERENCE) {
aea6f028
JB
5778 wc->drop_level = wc->level;
5779 btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5780 &wc->drop_progress,
5781 path->slots[wc->drop_level]);
5782 }
5783 btrfs_cpu_key_to_disk(&root_item->drop_progress,
5784 &wc->drop_progress);
c8422684 5785 btrfs_set_root_drop_level(root_item, wc->drop_level);
2c47e605
YZ
5786
5787 BUG_ON(wc->level == 0);
3a45bb20 5788 if (btrfs_should_end_transaction(trans) ||
2ff7e61e 5789 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
2c47e605
YZ
5790 ret = btrfs_update_root(trans, tree_root,
5791 &root->root_key,
5792 root_item);
79787eaa 5793 if (ret) {
66642832 5794 btrfs_abort_transaction(trans, ret);
79787eaa
JM
5795 err = ret;
5796 goto out_end_trans;
5797 }
2c47e605 5798
12a824dc
FM
5799 if (!is_reloc_root)
5800 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5801
3a45bb20 5802 btrfs_end_transaction_throttle(trans);
2ff7e61e 5803 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
ab8d0fc4
JM
5804 btrfs_debug(fs_info,
5805 "drop snapshot early exit");
3c8f2422
JB
5806 err = -EAGAIN;
5807 goto out_free;
5808 }
5809
18d3bff4
JB
5810 /*
5811 * Use join to avoid potential EINTR from transaction
5812 * start. See wait_reserve_ticket and the whole
5813 * reservation callchain.
5814 */
5815 if (for_reloc)
5816 trans = btrfs_join_transaction(tree_root);
5817 else
5818 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
5819 if (IS_ERR(trans)) {
5820 err = PTR_ERR(trans);
5821 goto out_free;
5822 }
c3e69d58 5823 }
20524f02 5824 }
b3b4aa74 5825 btrfs_release_path(path);
79787eaa
JM
5826 if (err)
5827 goto out_end_trans;
2c47e605 5828
ab9ce7d4 5829 ret = btrfs_del_root(trans, &root->root_key);
79787eaa 5830 if (ret) {
66642832 5831 btrfs_abort_transaction(trans, ret);
e19182c0 5832 err = ret;
79787eaa
JM
5833 goto out_end_trans;
5834 }
2c47e605 5835
12a824dc 5836 if (!is_reloc_root) {
cb517eab
MX
5837 ret = btrfs_find_root(tree_root, &root->root_key, path,
5838 NULL, NULL);
79787eaa 5839 if (ret < 0) {
66642832 5840 btrfs_abort_transaction(trans, ret);
79787eaa
JM
5841 err = ret;
5842 goto out_end_trans;
5843 } else if (ret > 0) {
84cd948c
JB
5844 /* if we fail to delete the orphan item this time
5845 * around, it'll get picked up the next time.
5846 *
5847 * The most common failure here is just -ENOENT.
5848 */
5849 btrfs_del_orphan_item(trans, tree_root,
5850 root->root_key.objectid);
76dda93c
YZ
5851 }
5852 }
5853
a3cf0e43
QW
5854 /*
5855 * This subvolume is going to be completely dropped, and won't be
5856 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5857 * commit transaction time. So free it here manually.
5858 */
5859 btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5860 btrfs_qgroup_free_meta_all_pertrans(root);
5861
fc7cbcd4 5862 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
2b9dbef2 5863 btrfs_add_dropped_root(trans, root);
8c38938c 5864 else
00246528 5865 btrfs_put_root(root);
d29a9f62 5866 root_dropped = true;
79787eaa 5867out_end_trans:
12a824dc
FM
5868 if (!is_reloc_root)
5869 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5870
3a45bb20 5871 btrfs_end_transaction_throttle(trans);
79787eaa 5872out_free:
2c47e605 5873 kfree(wc);
5caf2a00 5874 btrfs_free_path(path);
cb1b69f4 5875out:
b4be6aef
JB
5876 /*
 5877 * If we were an unfinished drop root, check to see if there are any
 5878 * pending drops left, and if not, clear the state and wake up any waiters.
5879 */
5880 if (!err && unfinished_drop)
5881 btrfs_maybe_wake_unfinished_drop(fs_info);
5882
d29a9f62
JB
5883 /*
5884 * So if we need to stop dropping the snapshot for whatever reason we
5885 * need to make sure to add it back to the dead root list so that we
5886 * keep trying to do the work later. This also cleans up roots if we
5887 * don't have it in the radix (like when we recover after a power fail
5888 * or unmount) so we don't leak memory.
5889 */
897ca819 5890 if (!for_reloc && !root_dropped)
d29a9f62 5891 btrfs_add_dead_root(root);
2c536799 5892 return err;
20524f02 5893}
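
/*
 * Resume sketch for btrfs_drop_snapshot() (condensed from its own code): at
 * each transaction boundary the walk position is persisted so an
 * interrupted drop can restart:
 *
 *	btrfs_cpu_key_to_disk(&root_item->drop_progress, &wc->drop_progress);
 *	btrfs_set_root_drop_level(root_item, wc->drop_level);
 *	btrfs_update_root(trans, tree_root, &root->root_key, root_item);
 *
 * On restart the saved key and level are read back via
 * btrfs_disk_key_to_cpu() and btrfs_root_drop_level() to re-seed the
 * search, and wc->restarted makes do_walk_down() re-verify refs with
 * check_ref_exists().
 */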
9078a3e1 5894
2c47e605
YZ
5895/*
5896 * drop subtree rooted at tree block 'node'.
5897 *
5898 * NOTE: this function will unlock and release tree block 'node'
66d7e7f0 5899 * only used by relocation code
2c47e605 5900 */
f82d02d9
YZ
5901int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5902 struct btrfs_root *root,
5903 struct extent_buffer *node,
5904 struct extent_buffer *parent)
5905{
0b246afa 5906 struct btrfs_fs_info *fs_info = root->fs_info;
f82d02d9 5907 struct btrfs_path *path;
2c47e605 5908 struct walk_control *wc;
f82d02d9
YZ
5909 int level;
5910 int parent_level;
5911 int ret = 0;
5912 int wret;
5913
2c47e605
YZ
5914 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5915
f82d02d9 5916 path = btrfs_alloc_path();
db5b493a
TI
5917 if (!path)
5918 return -ENOMEM;
f82d02d9 5919
2c47e605 5920 wc = kzalloc(sizeof(*wc), GFP_NOFS);
db5b493a
TI
5921 if (!wc) {
5922 btrfs_free_path(path);
5923 return -ENOMEM;
5924 }
2c47e605 5925
49d0c642 5926 btrfs_assert_tree_write_locked(parent);
f82d02d9 5927 parent_level = btrfs_header_level(parent);
67439dad 5928 atomic_inc(&parent->refs);
f82d02d9
YZ
5929 path->nodes[parent_level] = parent;
5930 path->slots[parent_level] = btrfs_header_nritems(parent);
5931
49d0c642 5932 btrfs_assert_tree_write_locked(node);
f82d02d9 5933 level = btrfs_header_level(node);
f82d02d9
YZ
5934 path->nodes[level] = node;
5935 path->slots[level] = 0;
ac5887c8 5936 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605
YZ
5937
5938 wc->refs[parent_level] = 1;
5939 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5940 wc->level = level;
5941 wc->shared_level = -1;
5942 wc->stage = DROP_REFERENCE;
5943 wc->update_ref = 0;
5944 wc->keep_locks = 1;
0b246afa 5945 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
f82d02d9
YZ
5946
5947 while (1) {
2c47e605
YZ
5948 wret = walk_down_tree(trans, root, path, wc);
5949 if (wret < 0) {
f82d02d9 5950 ret = wret;
f82d02d9 5951 break;
2c47e605 5952 }
f82d02d9 5953
2c47e605 5954 wret = walk_up_tree(trans, root, path, wc, parent_level);
f82d02d9
YZ
5955 if (wret < 0)
5956 ret = wret;
5957 if (wret != 0)
5958 break;
5959 }
5960
2c47e605 5961 kfree(wc);
f82d02d9
YZ
5962 btrfs_free_path(path);
5963 return ret;
5964}
5965
6d07bcec
MX
5966/*
 5967 * helper to account the unused space of all the readonly block groups
633c0aad 5968 * in the space_info. takes mirrors into account.
6d07bcec 5969 */
633c0aad 5970u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6d07bcec 5971{
32da5386 5972 struct btrfs_block_group *block_group;
6d07bcec
MX
5973 u64 free_bytes = 0;
5974 int factor;
5975
01327610 5976 /* It's df, we don't care if it's racy */
633c0aad
JB
5977 if (list_empty(&sinfo->ro_bgs))
5978 return 0;
5979
5980 spin_lock(&sinfo->lock);
5981 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
6d07bcec
MX
5982 spin_lock(&block_group->lock);
5983
5984 if (!block_group->ro) {
5985 spin_unlock(&block_group->lock);
5986 continue;
5987 }
5988
46df06b8 5989 factor = btrfs_bg_type_to_factor(block_group->flags);
b3470b5d 5990 free_bytes += (block_group->length -
bf38be65 5991 block_group->used) * factor;
6d07bcec
MX
5992
5993 spin_unlock(&block_group->lock);
5994 }
6d07bcec
MX
5995 spin_unlock(&sinfo->lock);
5996
5997 return free_bytes;
5998}
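
/*
 * Worked example (illustrative numbers): a read-only RAID1 block group of
 * length 1GiB with 256MiB used has btrfs_bg_type_to_factor() == 2, so it
 * contributes (1GiB - 256MiB) * 2 = 1.5GiB of raw free space to the
 * df-style total computed above.
 */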
5999
2ff7e61e
JM
6000int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
6001 u64 start, u64 end)
acce952b 6002{
2ff7e61e 6003 return unpin_extent_range(fs_info, start, end, false);
acce952b 6004}
6005
499f377f
JM
6006/*
6007 * It used to be that old block groups would be left around forever.
6008 * Iterating over them would be enough to trim unused space. Since we
6009 * now automatically remove them, we also need to iterate over unallocated
6010 * space.
6011 *
6012 * We don't want a transaction for this since the discard may take a
6013 * substantial amount of time. We don't require that a transaction be
6014 * running, but we do need to take a running transaction into account
fee7acc3
JM
6015 * to ensure that we're not discarding chunks that were released or
6016 * allocated in the current transaction.
499f377f
JM
6017 *
6018 * Holding the chunks lock will prevent other threads from allocating
6019 * or releasing chunks, but it won't prevent a running transaction
6020 * from committing and releasing the memory that the pending chunks
6021 * list head uses. For that, we need to take a reference to the
fee7acc3
JM
6022 * transaction and hold the commit root sem. We only need to hold
6023 * it while performing the free space search since we have already
6024 * held back allocations.
499f377f 6025 */
8103d10b 6026static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
499f377f 6027{
37f85ec3 6028 u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
499f377f
JM
6029 int ret;
6030
6031 *trimmed = 0;
6032
0be88e36 6033 /* Discard not supported = nothing to do. */
70200574 6034 if (!bdev_max_discard_sectors(device->bdev))
0be88e36
JM
6035 return 0;
6036
52042d8e 6037 /* Not writable = nothing to do. */
ebbede42 6038 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
499f377f
JM
6039 return 0;
6040
6041 /* No free space = nothing to do. */
6042 if (device->total_bytes <= device->bytes_used)
6043 return 0;
6044
6045 ret = 0;
6046
6047 while (1) {
fb456252 6048 struct btrfs_fs_info *fs_info = device->fs_info;
499f377f
JM
6049 u64 bytes;
6050
6051 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6052 if (ret)
fee7acc3 6053 break;
499f377f 6054
929be17a
NB
6055 find_first_clear_extent_bit(&device->alloc_state, start,
6056 &start, &end,
6057 CHUNK_TRIMMED | CHUNK_ALLOCATED);
53460a45 6058
c57dd1f2
QW
6059 /* Check if there are any CHUNK_* bits left */
6060 if (start > device->total_bytes) {
6061 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6062 btrfs_warn_in_rcu(fs_info,
6063"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6064 start, end - start + 1,
6065 rcu_str_deref(device->name),
6066 device->total_bytes);
6067 mutex_unlock(&fs_info->chunk_mutex);
6068 ret = 0;
6069 break;
6070 }
6071
37f85ec3
QW
6072 /* Ensure we skip the reserved space on each device. */
6073 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
53460a45 6074
929be17a
NB
6075 /*
 6076 * If find_first_clear_extent_bit finds a range that spans the
 6077 * end of the device, it will set end to -1; in this case it's up
 6078 * to the caller to trim the value to the size of the device.
6079 */
6080 end = min(end, device->total_bytes - 1);
53460a45 6081
929be17a 6082 len = end - start + 1;
499f377f 6083
929be17a
NB
6084 /* We didn't find any extents */
6085 if (!len) {
499f377f 6086 mutex_unlock(&fs_info->chunk_mutex);
929be17a 6087 ret = 0;
499f377f
JM
6088 break;
6089 }
6090
929be17a
NB
6091 ret = btrfs_issue_discard(device->bdev, start, len,
6092 &bytes);
6093 if (!ret)
6094 set_extent_bits(&device->alloc_state, start,
6095 start + bytes - 1,
6096 CHUNK_TRIMMED);
499f377f
JM
6097 mutex_unlock(&fs_info->chunk_mutex);
6098
6099 if (ret)
6100 break;
6101
6102 start += len;
6103 *trimmed += bytes;
6104
6105 if (fatal_signal_pending(current)) {
6106 ret = -ERESTARTSYS;
6107 break;
6108 }
6109
6110 cond_resched();
6111 }
6112
6113 return ret;
6114}
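
/*
 * Note on the loop above: each successfully discarded range is marked
 * CHUNK_TRIMMED in device->alloc_state, and the search only returns ranges
 * clear of both CHUNK_TRIMMED and CHUNK_ALLOCATED, so an unallocated range
 * that was already trimmed is skipped by subsequent FITRIM calls.
 */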
6115
93bba24d
QW
6116/*
6117 * Trim the whole filesystem by:
6118 * 1) trimming the free space in each block group
6119 * 2) trimming the unallocated space on each device
6120 *
6121 * This will also continue trimming even if a block group or device encounters
6122 * an error. The return value will be the last error, or 0 if nothing bad
6123 * happens.
6124 */
2ff7e61e 6125int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
f7039b1d 6126{
23608d51 6127 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
32da5386 6128 struct btrfs_block_group *cache = NULL;
499f377f 6129 struct btrfs_device *device;
f7039b1d 6130 u64 group_trimmed;
07301df7 6131 u64 range_end = U64_MAX;
f7039b1d
LD
6132 u64 start;
6133 u64 end;
6134 u64 trimmed = 0;
93bba24d
QW
6135 u64 bg_failed = 0;
6136 u64 dev_failed = 0;
6137 int bg_ret = 0;
6138 int dev_ret = 0;
f7039b1d
LD
6139 int ret = 0;
6140
f981fec1
JB
6141 if (range->start == U64_MAX)
6142 return -EINVAL;
6143
07301df7
QW
6144 /*
6145 * Check range overflow if range->len is set.
6146 * The default range->len is U64_MAX.
6147 */
6148 if (range->len != U64_MAX &&
6149 check_add_overflow(range->start, range->len, &range_end))
6150 return -EINVAL;
6151
6ba9fc8e 6152 cache = btrfs_lookup_first_block_group(fs_info, range->start);
2e405ad8 6153 for (; cache; cache = btrfs_next_block_group(cache)) {
b3470b5d 6154 if (cache->start >= range_end) {
f7039b1d
LD
6155 btrfs_put_block_group(cache);
6156 break;
6157 }
6158
b3470b5d
DS
6159 start = max(range->start, cache->start);
6160 end = min(range_end, cache->start + cache->length);
f7039b1d
LD
6161
6162 if (end - start >= range->minlen) {
32da5386 6163 if (!btrfs_block_group_done(cache)) {
ced8ecf0 6164 ret = btrfs_cache_block_group(cache, true);
1be41b78 6165 if (ret) {
93bba24d
QW
6166 bg_failed++;
6167 bg_ret = ret;
6168 continue;
1be41b78 6169 }
f7039b1d
LD
6170 }
6171 ret = btrfs_trim_block_group(cache,
6172 &group_trimmed,
6173 start,
6174 end,
6175 range->minlen);
6176
6177 trimmed += group_trimmed;
6178 if (ret) {
93bba24d
QW
6179 bg_failed++;
6180 bg_ret = ret;
6181 continue;
f7039b1d
LD
6182 }
6183 }
f7039b1d
LD
6184 }
6185
93bba24d
QW
6186 if (bg_failed)
6187 btrfs_warn(fs_info,
6188 "failed to trim %llu block group(s), last error %d",
6189 bg_failed, bg_ret);
23608d51
AJ
6190
6191 mutex_lock(&fs_devices->device_list_mutex);
6192 list_for_each_entry(device, &fs_devices->devices, dev_list) {
16a200f6
AJ
6193 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6194 continue;
6195
8103d10b 6196 ret = btrfs_trim_free_extents(device, &group_trimmed);
93bba24d
QW
6197 if (ret) {
6198 dev_failed++;
6199 dev_ret = ret;
499f377f 6200 break;
93bba24d 6201 }
499f377f
JM
6202
6203 trimmed += group_trimmed;
6204 }
23608d51 6205 mutex_unlock(&fs_devices->device_list_mutex);
499f377f 6206
93bba24d
QW
6207 if (dev_failed)
6208 btrfs_warn(fs_info,
6209 "failed to trim %llu device(s), last error %d",
6210 dev_failed, dev_ret);
f7039b1d 6211 range->len = trimmed;
93bba24d
QW
6212 if (bg_ret)
6213 return bg_ret;
6214 return dev_ret;
f7039b1d 6215}
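
/*
 * Caller-side sketch (assumed shape of the FITRIM path; the actual ioctl
 * plumbing lives in ioctl.c): the handler fills a struct fstrim_range and
 * hands it here, and range->len is rewritten to the bytes actually trimmed.
 *
 *	struct fstrim_range range = {
 *		.start = 0,
 *		.len = U64_MAX,
 *		.minlen = 0,
 *	};
 *	ret = btrfs_trim_fs(fs_info, &range);
 *	if (!ret)
 *		pr_info("trimmed %llu bytes\n", range.len);
 */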