btrfs: stop referencing btrfs_delayed_data_ref directly
[linux-2.6-block.git] / fs / btrfs / extent-tree.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
6cbd5570
CM
2/*
3 * Copyright (C) 2007 Oracle. All rights reserved.
6cbd5570 4 */
c1d7c514 5
ec6b910f 6#include <linux/sched.h>
f361bf4a 7#include <linux/sched/signal.h>
edbd8d4e 8#include <linux/pagemap.h>
ec44a35c 9#include <linux/writeback.h>
21af804c 10#include <linux/blkdev.h>
b7a9f29f 11#include <linux/sort.h>
4184ea7f 12#include <linux/rcupdate.h>
817d52f8 13#include <linux/kthread.h>
5a0e3ad6 14#include <linux/slab.h>
dff51cd1 15#include <linux/ratelimit.h>
b150a4f1 16#include <linux/percpu_counter.h>
69fe2d75 17#include <linux/lockdep.h>
9678c543 18#include <linux/crc32c.h>
cfc2de0f
BB
19#include "ctree.h"
20#include "extent-tree.h"
2b712e3b 21#include "transaction.h"
fec577fb
CM
22#include "disk-io.h"
23#include "print-tree.h"
0b86a832 24#include "volumes.h"
53b381b3 25#include "raid56.h"
925baedd 26#include "locking.h"
fa9c0d79 27#include "free-space-cache.h"
1e144fb8 28#include "free-space-tree.h"
fcebe456 29#include "qgroup.h"
fd708b81 30#include "ref-verify.h"
8719aaae 31#include "space-info.h"
d12ffdd1 32#include "block-rsv.h"
b0643e59 33#include "discard.h"
169e0da9 34#include "zoned.h"
6143c23c 35#include "dev-replace.h"
c7f13d42 36#include "fs.h"
07e81dc9 37#include "accessors.h"
45c40c8f 38#include "root-tree.h"
7c8ede16 39#include "file-item.h"
aa5d3003 40#include "orphan.h"
103c1972 41#include "tree-checker.h"
02c372e1 42#include "raid-stripe-tree.h"
fec577fb 43
709c0486
AJ
44#undef SCRAMBLE_DELAYED_REFS
45
9f9b8e8d 46
5d4f98a2 47static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
cecbb533 48 struct btrfs_delayed_ref_head *href,
85bb9f54 49 struct btrfs_delayed_ref_node *node,
e72cb923 50 struct btrfs_delayed_extent_op *extra_op);
5d4f98a2
YZ
51static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
52 struct extent_buffer *leaf,
53 struct btrfs_extent_item *ei);
54static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
55 u64 parent, u64 root_objectid,
56 u64 flags, u64 owner, u64 offset,
2672a051 57 struct btrfs_key *ins, int ref_mod, u64 oref_root);
5d4f98a2 58static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4e6bd4e0 59 struct btrfs_delayed_ref_node *node,
21ebfbe7 60 struct btrfs_delayed_extent_op *extent_op);
11833d66
YZ
61static int find_next_key(struct btrfs_path *path, int level,
62 struct btrfs_key *key);
6a63209f 63
32da5386 64static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
0f9dd46c
JB
65{
66 return (cache->flags & bits) == bits;
67}
68
1a4ed8fd 69/* simple helper to search for an existing data extent at a given offset */
2ff7e61e 70int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
e02119d5 71{
29cbcf40 72 struct btrfs_root *root = btrfs_extent_root(fs_info, start);
e02119d5
CM
73 int ret;
74 struct btrfs_key key;
31840ae1 75 struct btrfs_path *path;
e02119d5 76
31840ae1 77 path = btrfs_alloc_path();
d8926bb3
MF
78 if (!path)
79 return -ENOMEM;
80
e02119d5
CM
81 key.objectid = start;
82 key.offset = len;
3173a18f 83 key.type = BTRFS_EXTENT_ITEM_KEY;
29cbcf40 84 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
31840ae1 85 btrfs_free_path(path);
7bb86316
CM
86 return ret;
87}
88
/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 *
 * @trans:       transaction handle; may be NULL, in which case the commit
 *               root is searched without locking and delayed refs are skipped
 * @fs_info:     the filesystem
 * @bytenr:      logical start address of the extent
 * @offset:      key offset: the extent length, or (for skinny metadata) the
 *               metadata item offset — presumably the level; see callers
 * @metadata:    non-zero if @bytenr is a tree block rather than a data extent
 * @refs:        optional out parameter for the summed reference count
 * @flags:       optional out parameter for the extent flags
 * @owning_root: optional out parameter for the owning root (simple quotas)
 *
 * Returns 0 on success (with num_refs == 0 meaning "no such extent item"),
 * or a negative errno.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags,
			     u64 *owning_root)
{
	struct btrfs_root *extent_root;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	u64 owner = 0;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different: old-style metadata uses a regular EXTENT_ITEM keyed on
	 * the nodesize.
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Without a transaction we read the commit root, lock-free. */
	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	extent_root = btrfs_extent_root(fs_info, bytenr);
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	/*
	 * A skinny METADATA_ITEM was not found; the extent may still be
	 * recorded as an old-style fat EXTENT_ITEM in the previous slot.
	 */
	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
			owner = btrfs_get_extent_owner_root(fs_info, leaf,
							    path->slots[0]);
		} else {
			/* Item too small to be a valid extent item: corruption. */
			ret = -EUCLEAN;
			btrfs_err(fs_info,
			"unexpected extent item size, has %u expect >= %zu",
				  item_size, sizeof(*ei));
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		/* No extent item on disk; delayed refs may still exist. */
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	/* Fold in any pending modifications from the delayed ref head. */
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
	if (owning_root)
		*owning_root = owner;
out_free:
	btrfs_free_path(path);
	return ret;
}
234
d8d5f3e1
CM
235/*
236 * Back reference rules. Back refs have three main goals:
237 *
238 * 1) differentiate between all holders of references to an extent so that
239 * when a reference is dropped we can make sure it was a valid reference
240 * before freeing the extent.
241 *
242 * 2) Provide enough information to quickly find the holders of an extent
243 * if we notice a given block is corrupted or bad.
244 *
245 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
246 * maintenance. This is actually the same as #2, but with a slightly
247 * different use case.
248 *
5d4f98a2
YZ
249 * There are two kinds of back refs. The implicit back refs is optimized
250 * for pointers in non-shared tree blocks. For a given pointer in a block,
251 * back refs of this kind provide information about the block's owner tree
 252 * and the pointer's key. This information allows us to find the block by
253 * b-tree searching. The full back refs is for pointers in tree blocks not
254 * referenced by their owner trees. The location of tree block is recorded
255 * in the back refs. Actually the full back refs is generic, and can be
256 * used in all cases the implicit back refs is used. The major shortcoming
257 * of the full back refs is its overhead. Every time a tree block gets
258 * COWed, we have to update back refs entry for all pointers in it.
259 *
260 * For a newly allocated tree block, we use implicit back refs for
261 * pointers in it. This means most tree related operations only involve
262 * implicit back refs. For a tree block created in old transaction, the
263 * only way to drop a reference to it is COW it. So we can detect the
264 * event that tree block loses its owner tree's reference and do the
265 * back refs conversion.
266 *
01327610 267 * When a tree block is COWed through a tree, there are four cases:
5d4f98a2
YZ
268 *
269 * The reference count of the block is one and the tree is the block's
270 * owner tree. Nothing to do in this case.
271 *
272 * The reference count of the block is one and the tree is not the
273 * block's owner tree. In this case, full back refs is used for pointers
274 * in the block. Remove these full back refs, add implicit back refs for
275 * every pointers in the new block.
276 *
277 * The reference count of the block is greater than one and the tree is
278 * the block's owner tree. In this case, implicit back refs is used for
279 * pointers in the block. Add full back refs for every pointers in the
280 * block, increase lower level extents' reference counts. The original
281 * implicit back refs are entailed to the new block.
282 *
283 * The reference count of the block is greater than one and the tree is
284 * not the block's owner tree. Add implicit back refs for every pointer in
285 * the new block, increase lower level extents' reference count.
286 *
287 * Back Reference Key composing:
288 *
289 * The key objectid corresponds to the first byte in the extent,
290 * The key type is used to differentiate between types of back refs.
291 * There are different meanings of the key offset for different types
292 * of back refs.
293 *
d8d5f3e1
CM
294 * File extents can be referenced by:
295 *
296 * - multiple snapshots, subvolumes, or different generations in one subvol
31840ae1 297 * - different files inside a single subvolume
d8d5f3e1
CM
298 * - different offsets inside a file (bookend extents in file.c)
299 *
5d4f98a2 300 * The extent ref structure for the implicit back refs has fields for:
d8d5f3e1
CM
301 *
302 * - Objectid of the subvolume root
d8d5f3e1 303 * - objectid of the file holding the reference
5d4f98a2
YZ
304 * - original offset in the file
305 * - how many bookend extents
d8d5f3e1 306 *
5d4f98a2
YZ
307 * The key offset for the implicit back refs is hash of the first
308 * three fields.
d8d5f3e1 309 *
5d4f98a2 310 * The extent ref structure for the full back refs has field for:
d8d5f3e1 311 *
5d4f98a2 312 * - number of pointers in the tree leaf
d8d5f3e1 313 *
5d4f98a2
YZ
314 * The key offset for the implicit back refs is the first byte of
315 * the tree leaf
d8d5f3e1 316 *
5d4f98a2
YZ
317 * When a file extent is allocated, The implicit back refs is used.
318 * the fields are filled in:
d8d5f3e1 319 *
5d4f98a2 320 * (root_key.objectid, inode objectid, offset in file, 1)
d8d5f3e1 321 *
5d4f98a2
YZ
 322 * When a file extent is removed during file truncation, we find the
323 * corresponding implicit back refs and check the following fields:
d8d5f3e1 324 *
5d4f98a2 325 * (btrfs_header_owner(leaf), inode objectid, offset in file)
d8d5f3e1 326 *
5d4f98a2 327 * Btree extents can be referenced by:
d8d5f3e1 328 *
5d4f98a2 329 * - Different subvolumes
d8d5f3e1 330 *
5d4f98a2
YZ
331 * Both the implicit back refs and the full back refs for tree blocks
332 * only consist of key. The key offset for the implicit back refs is
333 * objectid of block's owner tree. The key offset for the full back refs
334 * is the first byte of parent block.
d8d5f3e1 335 *
5d4f98a2
YZ
336 * When implicit back refs is used, information about the lowest key and
337 * level of the tree block are required. These information are stored in
338 * tree block info structure.
d8d5f3e1 339 */
31840ae1 340
/*
 * Read and validate the type of an inline extent backref.
 *
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 *
 * Returns the on-disk type on success, or BTRFS_REF_TYPE_INVALID when the
 * stored type is unknown, does not match @is_data, or carries an obviously
 * bogus offset; in that case the leaf is dumped for debugging.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	/* Owner refs only exist with the simple-quota incompat feature. */
	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
		return type;
	}

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, fs_info->sectorsize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	/* Unknown type or type/offset mismatch: treat as corruption. */
	WARN_ON(1);
	btrfs_print_leaf(eb);
	btrfs_err(fs_info,
		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
		  eb->start, (unsigned long)iref, type);

	return BTRFS_REF_TYPE_INVALID;
}
402
0785a9aa 403u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
5d4f98a2
YZ
404{
405 u32 high_crc = ~(u32)0;
406 u32 low_crc = ~(u32)0;
407 __le64 lenum;
408
409 lenum = cpu_to_le64(root_objectid);
03e86348 410 high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
5d4f98a2 411 lenum = cpu_to_le64(owner);
03e86348 412 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2 413 lenum = cpu_to_le64(offset);
03e86348 414 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2
YZ
415
416 return ((u64)high_crc << 31) ^ (u64)low_crc;
417}
418
419static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
420 struct btrfs_extent_data_ref *ref)
421{
422 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
423 btrfs_extent_data_ref_objectid(leaf, ref),
424 btrfs_extent_data_ref_offset(leaf, ref));
425}
426
427static int match_extent_data_ref(struct extent_buffer *leaf,
428 struct btrfs_extent_data_ref *ref,
429 u64 root_objectid, u64 owner, u64 offset)
430{
431 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
432 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
433 btrfs_extent_data_ref_offset(leaf, ref) != offset)
434 return 0;
435 return 1;
436}
437
/*
 * Look up the on-disk data backref item for an extent.
 *
 * Shared refs (parent != 0) are a direct key lookup.  Non-shared refs are
 * keyed by hash_extent_data_ref(), so on a hash collision we walk forward
 * through the leaf until a matching item, a different key, or the end of
 * the tree is hit; if btrfs_next_leaf() had to re-read (COW may have moved
 * things), the whole search is restarted so @path is positioned freshly.
 *
 * Returns 0 with @path pointing at the matching item, -ENOENT if no match
 * exists, or a negative errno on search failure.
 */
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	/* Shared refs have a unique key: exact hit or nothing. */
	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	/* Hash-keyed ref: scan forward over possible collisions. */
	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			/* Tree may have changed; remember to re-search on match. */
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}
513
/*
 * Insert (or bump the count of) the on-disk data backref described by the
 * delayed ref @node for the extent at @bytenr.
 *
 * node->parent set: a SHARED_DATA_REF keyed on the parent block, whose item
 * holds just a count.  Otherwise: an EXTENT_DATA_REF keyed on the hash of
 * (ref_root, owner, offset); hash collisions are resolved by linear probing,
 * incrementing key.offset until a matching or free slot is found.
 *
 * Returns 0 on success or a negative errno.  @path is released on return.
 */
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   struct btrfs_delayed_ref_node *node,
					   u64 bytenr)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u64 owner = btrfs_delayed_ref_owner(node);
	u64 offset = btrfs_delayed_ref_offset(node);
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (node->parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = node->parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	/* -EEXIST is expected when the ref item already exists. */
	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (node->parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			/* Freshly inserted item: set the initial count. */
			btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
		} else {
			/* Existing item: add to the stored count. */
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += node->ref_mod;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		/* Linear probe past hash collisions from other triples. */
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, node->ref_root,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			/* New item: fill in the full backref payload. */
			btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += node->ref_mod;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(trans, leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}
591
/*
 * Drop @refs_to_drop references from the data backref item that @path
 * currently points to, deleting the item entirely when its count reaches
 * zero.
 *
 * Returns 0 on success, -EUCLEAN (after aborting the transaction) when the
 * item is not a recognized data backref key type, or a negative errno from
 * item deletion.
 */
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* Read the current count from whichever backref flavor this is. */
	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		btrfs_err(trans->fs_info,
			  "unrecognized backref key (%llu %u %llu)",
			  key.objectid, key.type, key.offset);
		btrfs_abort_transaction(trans, -EUCLEAN);
		return -EUCLEAN;
	}

	/* Dropping more refs than the item holds means corrupted metadata. */
	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(trans, leaf);
	}
	return ret;
}
637
/*
 * Return the reference count stored in the data backref that @path points
 * at, or — when @iref is non-NULL — in that inline backref within the item.
 * Returns 0 (after a WARN) if the key type is not a data backref.
 */
static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			/* Inline EXTENT_DATA_REF payload starts at iref->offset. */
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			/* SHARED_DATA_REF payload follows the iref header. */
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}
15916de8 678
5d4f98a2 679static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
680 struct btrfs_path *path,
681 u64 bytenr, u64 parent,
682 u64 root_objectid)
1f3c79a2 683{
29cbcf40 684 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
5d4f98a2 685 struct btrfs_key key;
1f3c79a2 686 int ret;
1f3c79a2 687
5d4f98a2
YZ
688 key.objectid = bytenr;
689 if (parent) {
690 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
691 key.offset = parent;
692 } else {
693 key.type = BTRFS_TREE_BLOCK_REF_KEY;
694 key.offset = root_objectid;
1f3c79a2
LH
695 }
696
5d4f98a2
YZ
697 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
698 if (ret > 0)
699 ret = -ENOENT;
5d4f98a2 700 return ret;
1f3c79a2
LH
701}
702
5d4f98a2 703static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2 704 struct btrfs_path *path,
b4b5934a
JB
705 struct btrfs_delayed_ref_node *node,
706 u64 bytenr)
31840ae1 707{
29cbcf40 708 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
5d4f98a2 709 struct btrfs_key key;
31840ae1 710 int ret;
31840ae1 711
5d4f98a2 712 key.objectid = bytenr;
b4b5934a 713 if (node->parent) {
5d4f98a2 714 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
b4b5934a 715 key.offset = node->parent;
5d4f98a2
YZ
716 } else {
717 key.type = BTRFS_TREE_BLOCK_REF_KEY;
b4b5934a 718 key.offset = node->ref_root;
5d4f98a2
YZ
719 }
720
29cbcf40 721 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
b3b4aa74 722 btrfs_release_path(path);
31840ae1
ZY
723 return ret;
724}
725
5d4f98a2 726static inline int extent_ref_type(u64 parent, u64 owner)
31840ae1 727{
5d4f98a2
YZ
728 int type;
729 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
730 if (parent > 0)
731 type = BTRFS_SHARED_BLOCK_REF_KEY;
732 else
733 type = BTRFS_TREE_BLOCK_REF_KEY;
734 } else {
735 if (parent > 0)
736 type = BTRFS_SHARED_DATA_REF_KEY;
737 else
738 type = BTRFS_EXTENT_DATA_REF_KEY;
739 }
740 return type;
31840ae1 741}
56bec294 742
2c47e605
YZ
743static int find_next_key(struct btrfs_path *path, int level,
744 struct btrfs_key *key)
56bec294 745
02217ed2 746{
2c47e605 747 for (; level < BTRFS_MAX_LEVEL; level++) {
5d4f98a2
YZ
748 if (!path->nodes[level])
749 break;
5d4f98a2
YZ
750 if (path->slots[level] + 1 >=
751 btrfs_header_nritems(path->nodes[level]))
752 continue;
753 if (level == 0)
754 btrfs_item_key_to_cpu(path->nodes[level], key,
755 path->slots[level] + 1);
756 else
757 btrfs_node_key_to_cpu(path->nodes[level], key,
758 path->slots[level] + 1);
759 return 0;
760 }
761 return 1;
762}
037e6390 763
5d4f98a2
YZ
/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 * items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		/* Ask the search for room to grow the item by one inline ref. */
		extra_size = btrfs_extent_inline_ref_size(want);
		path->search_for_extension = 1;
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0)
		goto out;

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			/* Retry once with the fat EXTENT_ITEM key. */
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		ret = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		/* For insert the extent item must already exist. */
		btrfs_print_leaf(path->nodes[0]);
		btrfs_err(fs_info,
"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
			  bytenr, num_bytes, parent, root_objectid, owner,
			  offset);
		ret = -EUCLEAN;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
			  "unexpected extent item size, has %llu expect >= %zu",
			  item_size, sizeof(*ei));
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	/* Inline refs start right after the extent item. */
	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	/* Fat metadata items carry a tree_block_info before the inline refs. */
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	/* Walk the sorted inline refs looking for @want at the right spot. */
	ret = -ENOENT;
	while (ptr < end) {
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
			ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}
		if (type == BTRFS_REF_TYPE_INVALID) {
			ret = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				ret = 0;
				break;
			}
			/* Refs are hash-ordered; passing our hash means absent. */
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					ret = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					ret = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}

	/* A ref size walking past the item end means a corrupted item. */
	if (unlikely(ptr > end)) {
		ret = -EUCLEAN;
		btrfs_print_leaf(path->nodes[0]);
		btrfs_crit(fs_info,
"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
			   path->slots[0], root_objectid, owner, offset, parent);
		goto out;
	}

	if (ret == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			ret = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		path->search_for_extension = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return ret;
}
983
/*
 * Helper to add a new inline back reference to an existing extent item.
 *
 * Called when lookup_inline_extent_backref() returned -ENOENT with @iref
 * pointing at the insertion position inside the item.  The item is grown by
 * the size of the new inline ref, existing refs past the insertion point are
 * shifted up, and the new ref is written in place.  Also bumps the item's
 * total ref count by @refs_to_add and applies any pending @extent_op.
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	/* Remember where inside the item the new ref goes, relative to ei. */
	item_offset = (unsigned long)iref - (unsigned long)ei;

	/* Shared refs key on parent bytenr, keyed refs on root/owner. */
	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(trans, path, size);

	/* Re-read ei: extending the item may have moved it within the leaf. */
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
	/* Make room: shift any refs after the insertion point upwards. */
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		/* Tree block ref: offset field holds the owning root id. */
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(trans, leaf);
}
1047
1048static int lookup_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1049 struct btrfs_path *path,
1050 struct btrfs_extent_inline_ref **ref_ret,
1051 u64 bytenr, u64 num_bytes, u64 parent,
1052 u64 root_objectid, u64 owner, u64 offset)
1053{
1054 int ret;
1055
867cc1fb
NB
1056 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1057 num_bytes, parent, root_objectid,
1058 owner, offset, 0);
5d4f98a2 1059 if (ret != -ENOENT)
54aa1f4d 1060 return ret;
5d4f98a2 1061
b3b4aa74 1062 btrfs_release_path(path);
5d4f98a2
YZ
1063 *ref_ret = NULL;
1064
1065 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
b8582eea
NB
1066 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1067 root_objectid);
5d4f98a2 1068 } else {
bd1d53ef
NB
1069 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1070 root_objectid, owner, offset);
b9473439 1071 }
5d4f98a2
YZ
1072 return ret;
1073}
31840ae1 1074
5d4f98a2
YZ
/*
 * Helper to update or remove an inline back reference.
 *
 * Adjusts the extent item's total ref count by @refs_to_mod and the count
 * stored in the inline ref @iref.  If the inline ref's own count drops to
 * zero the ref is cut out of the item and the item is truncated.
 * Returns 0 on success, -EUCLEAN on any on-disk inconsistency.
 */
static noinline_for_stack int update_inline_extent_backref(
				  struct btrfs_trans_handle *trans,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	/* Dropping more refs than the item holds means corruption. */
	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
		struct btrfs_key key;
		u32 extent_size;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.type == BTRFS_METADATA_ITEM_KEY)
			extent_size = fs_info->nodesize;
		else
			extent_size = key.offset;
		btrfs_print_leaf(leaf);
		btrfs_err(fs_info,
	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
			  key.objectid, extent_size, refs_to_mod, refs);
		return -EUCLEAN;
	}
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	/*
	 * Function btrfs_get_extent_inline_ref_type() has already printed
	 * error messages.
	 */
	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
		return -EUCLEAN;

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		/*
		 * For tree blocks we can only drop one ref for it, and tree
		 * blocks should not have refs > 1.
		 *
		 * Furthermore if we're inserting a new inline backref, we
		 * won't reach this path either. That would be
		 * setup_inline_extent_backref().
		 */
		if (unlikely(refs_to_mod != -1)) {
			struct btrfs_key key;

			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

			btrfs_print_leaf(leaf);
			btrfs_err(fs_info,
			"invalid refs_to_mod for tree block %llu, has %d expect -1",
				  key.objectid, refs_to_mod);
			return -EUCLEAN;
		}
	}

	/* Same sanity check, but against this inline ref's own count. */
	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
		struct btrfs_key key;
		u32 extent_size;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.type == BTRFS_METADATA_ITEM_KEY)
			extent_size = fs_info->nodesize;
		else
			extent_size = key.offset;
		btrfs_print_leaf(leaf);
		btrfs_err(fs_info,
"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
			  (unsigned long)iref, key.objectid, extent_size,
			  refs_to_mod, refs);
		return -EUCLEAN;
	}
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		/* Ref count hit zero: splice the inline ref out of the item. */
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(trans, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(trans, leaf);
	return 0;
}
1193
/*
 * Insert an inline back reference for an extent.
 *
 * If a matching inline ref already exists its count is bumped via
 * update_inline_extent_backref(); if it does not exist (-ENOENT from the
 * lookup) a fresh inline ref is inserted at the position the lookup left
 * in @path.  Adding refs to an already-referenced tree block is treated as
 * corruption (-EUCLEAN): tree blocks never legitimately take this path twice.
 */
static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		/*
		 * We're adding refs to a tree block we already own, this
		 * should not happen at all.
		 */
		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
			btrfs_print_leaf(path->nodes[0]);
			btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
				   bytenr, num_bytes, root_objectid,
				   path->slots[0]);
			return -EUCLEAN;
		}
		ret = update_inline_extent_backref(trans, path, iref,
						   refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}
31840ae1 1230
5d4f98a2 1231static int remove_extent_backref(struct btrfs_trans_handle *trans,
76d76e78 1232 struct btrfs_root *root,
5d4f98a2
YZ
1233 struct btrfs_path *path,
1234 struct btrfs_extent_inline_ref *iref,
5b2a54bb 1235 int refs_to_drop, int is_data)
5d4f98a2 1236{
143bede5 1237 int ret = 0;
b9473439 1238
5d4f98a2 1239 BUG_ON(!is_data && refs_to_drop != 1);
5b2a54bb 1240 if (iref)
50564b65
FM
1241 ret = update_inline_extent_backref(trans, path, iref,
1242 -refs_to_drop, NULL);
5b2a54bb
JB
1243 else if (is_data)
1244 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1245 else
76d76e78 1246 ret = btrfs_del_item(trans, root, path);
5d4f98a2
YZ
1247 return ret;
1248}
1249
d04c6b88
JM
/*
 * Issue a discard for [start, start + len) on @bdev, skipping any btrfs
 * superblock copies that fall inside the range so they are never trimmed.
 * The number of bytes actually discarded is returned in *discarded_bytes.
 * Returns 0 on success (including -EOPNOTSUPP from the device, which is
 * swallowed for all but the final sub-range), negative errno otherwise.
 */
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);

	/* Adjust the range to be aligned to 512B sectors if necessary. */
	if (start != aligned_start) {
		len -= aligned_start - start;
		len = round_down(len, 1 << SECTOR_SHIFT);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		/* Superblock copy entirely outside the remaining range. */
		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		/* Discard the chunk before this superblock copy. */
		if (size) {
			ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
						   size >> SECTOR_SHIFT,
						   GFP_NOFS);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		/* Resume past the superblock copy. */
		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	/* Discard whatever is left after the last superblock copy. */
	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
					   bytes_left >> SECTOR_SHIFT,
					   GFP_NOFS);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}
5d4f98a2 1324
/*
 * Discard (or zone-reset, on zoned filesystems) a single physical stripe.
 * On a zoned device the reset is mirrored to the dev-replace target when a
 * replace is in flight.  *bytes is set to the number of bytes discarded.
 */
static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
	struct btrfs_device *dev = stripe->dev;
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 phys = stripe->physical;
	u64 len = stripe->length;
	u64 discarded = 0;
	int ret = 0;

	/* Zone reset on a zoned filesystem */
	if (btrfs_can_zone_reset(dev, phys, len)) {
		u64 src_disc;

		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
		if (ret)
			goto out;

		/* Mirror the reset only while this device is the replace source. */
		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
		    dev != dev_replace->srcdev)
			goto out;

		src_disc = discarded;

		/* Send to replace target as well */
		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
					      &discarded);
		discarded += src_disc;
	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
	} else {
		/* Device does not support discard: report zero, not an error. */
		ret = 0;
		*bytes = 0;
	}

out:
	*bytes = discarded;
	return ret;
}
1364
/*
 * Discard the logical range [bytenr, bytenr + num_bytes): map it to physical
 * stripes chunk by chunk and discard each stripe on every writeable device.
 * -EOPNOTSUPP from a device or from the mapping is tolerated and treated as
 * "nothing discarded".  If @actual_bytes is non-NULL it receives the total
 * number of bytes actually discarded.
 */
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;

	/*
	 * Avoid races with device replace and make sure the devices in the
	 * stripes don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_discard_stripe *stripes;
		unsigned int num_stripes;
		int i;

		/* Map at most the remainder; the call shrinks num_bytes to fit. */
		num_bytes = end - cur;
		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
		if (IS_ERR(stripes)) {
			ret = PTR_ERR(stripes);
			if (ret == -EOPNOTSUPP)
				ret = 0;
			break;
		}

		for (i = 0; i < num_stripes; i++) {
			struct btrfs_discard_stripe *stripe = stripes + i;
			u64 bytes;

			/* Missing device is only legal in degraded mounts. */
			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}

			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
				      &stripe->dev->dev_state))
				continue;

			ret = do_discard_extent(stripe, &bytes);
			if (ret) {
				/*
				 * Keep going if discard is not supported by the
				 * device.
				 */
				if (ret != -EOPNOTSUPP)
					break;
				ret = 0;
			} else {
				discarded_bytes += bytes;
			}
		}
		kfree(stripes);
		if (ret)
			break;
		cur += num_bytes;
	}
	btrfs_bio_counter_dec(fs_info);
	if (actual_bytes)
		*actual_bytes = discarded_bytes;
	return ret;
}
1428
79787eaa 1429/* Can return -ENOMEM */
5d4f98a2 1430int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
82fa113f 1431 struct btrfs_ref *generic_ref)
5d4f98a2 1432{
82fa113f 1433 struct btrfs_fs_info *fs_info = trans->fs_info;
5d4f98a2 1434 int ret;
66d7e7f0 1435
82fa113f
QW
1436 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1437 generic_ref->action);
1438 BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
f2e69a77 1439 generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 1440
82fa113f 1441 if (generic_ref->type == BTRFS_REF_METADATA)
2187374f 1442 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
82fa113f 1443 else
2187374f 1444 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
d7eae340 1445
82fa113f 1446 btrfs_ref_tree_mod(fs_info, generic_ref);
8a5040f7 1447
5d4f98a2
YZ
1448 return ret;
1449}
1450
/*
 * Insert backreference for a given extent.
 *
 * The counterpart is in __btrfs_free_extent(), with examples and more details
 * how it works.
 *
 * @trans:	Handle of transaction
 *
 * @node:	The delayed ref node used to get the bytenr/length for
 *		extent whose references are incremented.
 *
 * @extent_op	Pointer to a structure, holding information necessary when
 *		updating a tree block's flags
 *
 * First tries to add the ref inline into the extent item; if the item has no
 * room (-EAGAIN) the extent item's ref count is bumped and a separate keyed
 * backref item is inserted instead.
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 owner = btrfs_delayed_ref_owner(node);
	u64 offset = btrfs_delayed_ref_offset(node);
	u64 refs;
	int refs_to_add = node->ref_mod;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   node->parent, node->ref_root, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert and
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	/* now insert the actual backref */
	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		ret = insert_tree_block_ref(trans, path, node, bytenr);
	else
		ret = insert_extent_data_ref(trans, path, node, bytenr);

	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}
1521
e85a0ada
BB
1522static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
1523 struct btrfs_delayed_ref_head *href)
1524{
1525 u64 root = href->owning_root;
1526
1527 /*
1528 * Don't check must_insert_reserved, as this is called from contexts
1529 * where it has already been unset.
1530 */
1531 if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
1532 !href->is_data || !is_fstree(root))
1533 return;
1534
1535 btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
1536 BTRFS_QGROUP_RSV_DATA);
1537}
1538
/*
 * Run one delayed ref node for a data extent.  A first ADD on a freshly
 * reserved extent inserts the extent item itself (and records the simple
 * quota delta); later ADDs just add backrefs, DROPs free references.
 */
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_head *href,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				bool insert_reserved)
{
	int ret = 0;
	u64 parent = 0;
	u64 flags = 0;

	trace_run_delayed_data_ref(trans->fs_info, node);

	/* Shared refs record the referencing tree block as parent. */
	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = node->parent;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		struct btrfs_key key;
		struct btrfs_squota_delta delta = {
			.root = href->owning_root,
			.num_bytes = node->num_bytes,
			.is_data = true,
			.is_inc	= true,
			.generation = trans->transid,
		};
		u64 owner = btrfs_delayed_ref_owner(node);
		u64 offset = btrfs_delayed_ref_offset(node);

		if (extent_op)
			flags |= extent_op->flags_to_set;

		key.objectid = node->bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = node->num_bytes;

		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
						 flags, owner, offset, &key,
						 node->ref_mod,
						 href->owning_root);
		/* The squota reservation is consumed here regardless of ret. */
		free_head_ref_squota_rsv(trans->fs_info, href);
		if (!ret)
			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, href, node, extent_op);
	} else {
		BUG();
	}
	return ret;
}
1589
1590static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1591 struct extent_buffer *leaf,
1592 struct btrfs_extent_item *ei)
1593{
1594 u64 flags = btrfs_extent_flags(leaf, ei);
1595 if (extent_op->update_flags) {
1596 flags |= extent_op->flags_to_set;
1597 btrfs_set_extent_flags(leaf, ei, flags);
1598 }
1599
1600 if (extent_op->update_key) {
1601 struct btrfs_tree_block_info *bi;
1602 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1603 bi = (struct btrfs_tree_block_info *)(ei + 1);
1604 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1605 }
1606}
1607
/*
 * Run a delayed extent op against the extent item of @head's extent.
 *
 * With skinny metadata the item is first looked up as a METADATA_ITEM keyed
 * on the block's level; if that misses (older items may predate the skinny
 * format) the search falls back to a classic EXTENT_ITEM keyed on length.
 * Returns 0 on success, -EUCLEAN on a missing/undersized item.
 */
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int metadata = 1;

	if (TRANS_ABORTED(trans))
		return 0;

	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

	root = btrfs_extent_root(fs_info, key.objectid);
again:
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		goto out;
	} else if (ret > 0) {
		if (metadata) {
			/*
			 * The previous slot may hold the EXTENT_ITEM form of
			 * this block (METADATA_ITEM sorts after it).
			 */
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				/* Retry the search as a non-skinny item. */
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			ret = -EUCLEAN;
			btrfs_err(fs_info,
		  "missing extent item for extent %llu num_bytes %llu level %d",
				  head->bytenr, head->num_bytes, extent_op->level);
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);

	if (unlikely(item_size < sizeof(*ei))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
			  "unexpected extent item size, has %u expect >= %zu",
			  item_size, sizeof(*ei));
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(trans, leaf);
out:
	btrfs_free_path(path);
	return ret;
}
1696
/*
 * Run one delayed ref node for a tree block.  Tree blocks always carry
 * exactly one reference per node (ref_mod == 1); anything else is flagged
 * as corruption.  A first ADD on a freshly reserved block inserts the
 * extent item, later ADDs add backrefs, DROPs free references.
 */
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_head *href,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				bool insert_reserved)
{
	int ret = 0;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 parent = 0;
	u64 ref_root = 0;

	trace_run_delayed_tree_ref(trans->fs_info, node);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = node->parent;
	ref_root = node->ref_root;

	if (unlikely(node->ref_mod != 1)) {
		btrfs_err(trans->fs_info,
	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EUCLEAN;
	}
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		struct btrfs_squota_delta delta = {
			.root = href->owning_root,
			.num_bytes = fs_info->nodesize,
			.is_data = false,
			.is_inc = true,
			.generation = trans->transid,
		};

		/* Fresh tree blocks always come with flags to set. */
		BUG_ON(!extent_op || !extent_op->update_flags);
		ret = alloc_reserved_tree_block(trans, node, extent_op);
		if (!ret)
			btrfs_record_squota_delta(fs_info, &delta);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, href, node, extent_op);
	} else {
		BUG();
	}
	return ret;
}
1743
/*
 * Helper function to actually process a single delayed ref entry.
 *
 * Dispatches on the ref type to the tree/data handlers.  If the transaction
 * has aborted, or the handler fails, an extent that was freshly reserved
 * (insert_reserved) is pinned so the space is reclaimed on cleanup instead
 * of leaking.
 */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       bool insert_reserved)
{
	int ret = 0;

	if (TRANS_ABORTED(trans)) {
		if (insert_reserved) {
			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
			free_head_ref_squota_rsv(trans->fs_info, href);
		}
		return 0;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, href, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, href, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
		/* Owner refs are consumed when the extent item is written. */
		ret = 0;
	else
		BUG();
	if (ret && insert_reserved)
		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
	if (ret < 0)
		btrfs_err(trans->fs_info,
"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
			  node->bytenr, node->num_bytes, node->type,
			  node->action, node->ref_mod, ret);
	return ret;
}
1782
c6fc2454 1783static inline struct btrfs_delayed_ref_node *
56bec294
CM
1784select_delayed_ref(struct btrfs_delayed_ref_head *head)
1785{
cffc3374
FM
1786 struct btrfs_delayed_ref_node *ref;
1787
e3d03965 1788 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
c6fc2454 1789 return NULL;
d7df2c79 1790
cffc3374
FM
1791 /*
1792 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1793 * This is to prevent a ref count from going down to zero, which deletes
1794 * the extent item from the extent tree, when there still are references
1795 * to add, which would fail because they would not find the extent item.
1796 */
1d57ee94
WX
1797 if (!list_empty(&head->ref_add_list))
1798 return list_first_entry(&head->ref_add_list,
1799 struct btrfs_delayed_ref_node, add_list);
1800
e3d03965 1801 ref = rb_entry(rb_first_cached(&head->ref_tree),
0e0adbcf 1802 struct btrfs_delayed_ref_node, ref_node);
1d57ee94
WX
1803 ASSERT(list_empty(&ref->add_list));
1804 return ref;
56bec294
CM
1805}
1806
2eadaa22
JB
/*
 * Return a delayed ref head to the pool without processing it: clear the
 * in-flight flag, make it selectable again, and drop its mutex.  The
 * delayed_refs spinlock guards the processing flag and ready counter.
 */
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = false;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}
1816
bedc6617
JB
1817static struct btrfs_delayed_extent_op *cleanup_extent_op(
1818 struct btrfs_delayed_ref_head *head)
b00e6250
JB
1819{
1820 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
b00e6250
JB
1821
1822 if (!extent_op)
bedc6617
JB
1823 return NULL;
1824
b00e6250 1825 if (head->must_insert_reserved) {
bedc6617 1826 head->extent_op = NULL;
b00e6250 1827 btrfs_free_delayed_extent_op(extent_op);
bedc6617 1828 return NULL;
b00e6250 1829 }
bedc6617
JB
1830 return extent_op;
1831}
1832
/*
 * Run the head's pending extent op, if any.
 *
 * Called with head->lock held.  If an op is actually run, head->lock is
 * DROPPED before running it and not re-taken.
 *
 * Returns 0 if there was nothing to do (lock still held), 1 on success
 * (lock dropped), or a negative error (lock dropped).
 */
static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	/* Map success to 1 so callers can tell "ran" from "nothing to do". */
	return ret ? ret : 1;
}
1848
/*
 * Release the space reservations tied to a delayed ref head that is being
 * retired (either run or thrown away on transaction abort).
 *
 * Returns the number of bytes released from the delayed refs rsv for csum
 * deletions (0 if none), so the caller can account processed bytes.
 */
u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	u64 ret = 0;

	/*
	 * We had csum deletions accounted for in our delayed refs rsv, we need
	 * to drop the csum leaves for this update from our delayed_refs_rsv.
	 * A net-negative ref_mod on a data head means the extent is going
	 * away, taking its csum items with it.
	 */
	if (head->total_ref_mod < 0 && head->is_data) {
		int nr_csums;

		spin_lock(&delayed_refs->lock);
		delayed_refs->pending_csums -= head->num_bytes;
		spin_unlock(&delayed_refs->lock);
		nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);

		btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);

		ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
	}
	/* must_insert_reserved can be set only if we didn't run the head ref. */
	if (head->must_insert_reserved)
		free_head_ref_squota_rsv(fs_info, head);

	return ret;
}
1877
/*
 * Final processing of a delayed ref head whose individual refs have all been
 * run: apply any pending extent op, remove the head from the delayed refs
 * tree, pin never-used reserved extents, and release the accounting.
 *
 * Called with head->lock held (it is released on all paths).  Returns 0 on
 * success, 1 if the head gained new refs and must be reprocessed, <0 on
 * error (head unselected).
 */
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head,
			    u64 *bytes_released)
{

	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		/* Extent op ran and dropped head->lock; caller must loop. */
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		/*
		 * The reserved extent was never written; pin it for reuse
		 * after commit and drop any csums covering it.
		 */
		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
		if (head->is_data) {
			struct btrfs_root *csum_root;

			csum_root = btrfs_csum_root(fs_info, head->bytenr);
			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
					      head->num_bytes);
		}
	}

	*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	/* Drop the ref the delayed refs tree held on the head. */
	btrfs_put_delayed_ref_head(head);
	return ret;
}
1932
b1cdbcb5
NB
/*
 * Pick the next delayed ref head to process and lock it.
 *
 * Returns the locked head, NULL if there is nothing ready, or
 * ERR_PTR(-EAGAIN) if the chosen head vanished while we waited for its
 * mutex (caller should simply retry).
 */
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
		struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head. If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}
1965
/*
 * Run all currently-selectable delayed refs attached to one head.
 *
 * Called with both locked_ref->mutex and locked_ref->lock held; the spinlock
 * is dropped while each ref runs and re-taken afterwards.  Returns 0 when
 * the head's ref tree is drained, -EAGAIN if a ref is blocked on a tree mod
 * log seq (head unselected), or a negative error (head unselected).
 */
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
					   struct btrfs_delayed_ref_head *locked_ref,
					   u64 *bytes_released)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	bool must_insert_reserved;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		/* A pending tree mod log user may still need this ref. */
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		/*
		 * Unsetting this on the head ref relinquishes ownership of
		 * the rsv_bytes, so it is critical that every possible code
		 * path from here forward frees all reserves including qgroup
		 * reserve.
		 */
		locked_ref->must_insert_reserved = false;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
					  must_insert_reserved);
		/* One head's worth of reservation is consumed per ref run. */
		btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
		*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
	}

	return 0;
}
2049
79787eaa
JM
/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 *
 * Processing budget: @min_bytes == 0 means "run the heads that are ready
 * right now" (converted to a head-count budget below); otherwise keep going
 * until at least @min_bytes of reserved delayed-ref bytes were processed,
 * with U64_MAX meaning "run everything".
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     u64 min_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	int ret;
	unsigned long count = 0;
	unsigned long max_count = 0;
	u64 bytes_processed = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	if (min_bytes == 0) {
		max_count = delayed_refs->num_heads_ready;
		min_bytes = U64_MAX;
	}

	do {
		if (!locked_ref) {
			locked_ref = btrfs_obtain_ref_head(trans);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					/* Head vanished under us; pick another. */
					continue;
				} else {
					/* Nothing left that is ready. */
					break;
				}
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish. If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
			if (ret > 0 ) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
		 (max_count > 0 && count < max_count) ||
		 locked_ref);

	return 0;
}
2133
709c0486
AJ
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 *
 * Returns the bytenr of a node roughly in the middle of the tree, or 0 if
 * the tree is empty.
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	/*
	 * Initialize to 0: previously this was read uninitialized (undefined
	 * behavior) when the tree was empty and the walk below never ran.
	 */
	u64 middle = 0;
	u64 first = 0, last = 0;

	/* first/last are computed for debugging visibility only. */
	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	/* Descend alternating left/right; the last visited bytenr lands
	 * roughly at the median of the tree. */
	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif
2176
/*
 * Start processing the delayed reference count updates and extent insertions
 * we have queued up so far.
 *
 * @trans:	Transaction handle.
 * @min_bytes:	How many bytes of delayed references to process. After this
 *		many bytes we stop processing delayed references if there are
 *		any more. If 0 it means to run all existing delayed references,
 *		but not new ones added after running all existing ones.
 *		Use (u64)-1 (U64_MAX) to run all existing delayed references
 *		plus any new ones that are added.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (TRANS_ABORTED(trans))
		return 0;

	/* Free space tree creation handles its refs itself. */
	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	ret = __btrfs_run_delayed_refs(trans, min_bytes);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (min_bytes == U64_MAX) {
		/*
		 * "Run everything" mode: creating pending block groups can
		 * queue more delayed refs, so loop until the tree is empty.
		 */
		btrfs_create_pending_block_groups(trans);

		spin_lock(&delayed_refs->lock);
		if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
			spin_unlock(&delayed_refs->lock);
			return 0;
		}
		spin_unlock(&delayed_refs->lock);

		cond_resched();
		goto again;
	}

	return 0;
}
2232
5d4f98a2 2233int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
4aec05fa 2234 struct extent_buffer *eb, u64 flags)
5d4f98a2
YZ
2235{
2236 struct btrfs_delayed_extent_op *extent_op;
4aec05fa 2237 int level = btrfs_header_level(eb);
5d4f98a2
YZ
2238 int ret;
2239
78a6184a 2240 extent_op = btrfs_alloc_delayed_extent_op();
5d4f98a2
YZ
2241 if (!extent_op)
2242 return -ENOMEM;
2243
2244 extent_op->flags_to_set = flags;
35b3ad50
DS
2245 extent_op->update_flags = true;
2246 extent_op->update_key = false;
b1c79e09 2247 extent_op->level = level;
5d4f98a2 2248
42c9d0b5 2249 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
5d4f98a2 2250 if (ret)
78a6184a 2251 btrfs_free_delayed_extent_op(extent_op);
5d4f98a2
YZ
2252 return ret;
2253}
2254
/*
 * Check whether the delayed refs of the currently running transaction hold a
 * reference to @bytenr from anywhere other than (@root, @objectid, @offset).
 *
 * Returns 1 if a cross reference exists, 0 if not (or no running
 * transaction / no head for this bytenr), -EAGAIN if the head's mutex was
 * contended and the caller must retry (path released).
 */
static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_transaction *cur_trans;
	struct rb_node *node;
	int ret = 0;

	/* Pin the running transaction so delayed_refs stays valid. */
	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		btrfs_put_transaction(cur_trans);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		if (path->nowait) {
			/* Nowait mode: never block on the head mutex. */
			spin_unlock(&delayed_refs->lock);
			btrfs_put_transaction(cur_trans);
			return -EAGAIN;
		}

		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		btrfs_put_transaction(cur_trans);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		u64 ref_owner;
		u64 ref_offset;

		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		ref_owner = btrfs_delayed_ref_owner(ref);
		ref_offset = btrfs_delayed_ref_offset(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (ref->ref_root != root->root_key.objectid ||
		    ref_owner != objectid || ref_offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	btrfs_put_transaction(cur_trans);
	return ret;
}
2342
/*
 * Check whether the committed extent tree shows any reference to @bytenr
 * other than the single inline data ref from (@root, @objectid, @offset).
 *
 * Returns 0 if the extent is exclusively referenced, 1 if it is (or may be)
 * shared, -ENOENT if no extent item was found, or another negative errno on
 * search failure.  @strict disables the last-snapshot generation heuristic.
 */
static noinline int check_committed_ref(struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr,
					bool strict)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	u32 expected_size;
	int type;
	int ret;

	/* Search just past (bytenr, EXTENT_ITEM, -1) and step back one slot. */
	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		ret = -EUCLEAN;
		goto out;
	}

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	/* From here on, default to "shared" unless everything matches. */
	ret = 1;
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);

	/* No inline refs; we need to bail before checking for owner ref. */
	if (item_size == sizeof(*ei))
		goto out;

	/* Check for an owner ref; skip over it to the real inline refs. */
	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
		iref = (struct btrfs_extent_inline_ref *)(iref + 1);
	}

	/* If extent item has more than 1 inline ref then it's shared */
	if (item_size != expected_size)
		goto out;

	/*
	 * If extent created before last snapshot => it's shared unless the
	 * snapshot has been deleted. Use the heuristic if strict is false.
	 */
	if (!strict &&
	    (btrfs_extent_generation(leaf, ei) <=
	     btrfs_root_last_snapshot(&root->root_item)))
		goto out;

	/* If this extent has SHARED_DATA_REF then it's shared */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	/* The single inline ref must account for all refs and match ours. */
	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}
2435
e4c3b2dc 2436int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
1a89f173 2437 u64 bytenr, bool strict, struct btrfs_path *path)
5d4f98a2 2438{
5d4f98a2 2439 int ret;
5d4f98a2 2440
5d4f98a2 2441 do {
e4c3b2dc 2442 ret = check_committed_ref(root, path, objectid,
a84d5d42 2443 offset, bytenr, strict);
5d4f98a2 2444 if (ret && ret != -ENOENT)
f321e491 2445 goto out;
80ff3856 2446
380fd066
MT
2447 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2448 } while (ret == -EAGAIN);
5d4f98a2 2449
be20aa9d 2450out:
1a89f173 2451 btrfs_release_path(path);
37f00a6d 2452 if (btrfs_is_data_reloc_root(root))
f0486c68 2453 WARN_ON(ret > 0);
f321e491 2454 return ret;
be20aa9d 2455}
c5739bba 2456
/*
 * Add or drop (depending on @inc) one delayed reference for every extent
 * pointed to by @buf: file extents when @buf is a leaf of a shareable root,
 * child nodes otherwise.  @full_backref selects full backrefs (parent =
 * buf->start) instead of implicit ones.
 *
 * Returns 0 on success or the first error from queueing a delayed ref.
 */
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   int full_backref, int inc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
	int action;
	int level;
	int ret = 0;

	if (btrfs_is_testing(fs_info))
		return 0;

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

	/* Non-shareable roots never take per-file-extent refs on leaves. */
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
		return 0;

	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;

	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = {
			.action = action,
			.parent = parent,
			.ref_root = ref_root,
		};

		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, i);
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			/* Inline extents live in the leaf, nothing to ref. */
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			/* bytenr 0 marks a hole. */
			if (ref.bytenr == 0)
				continue;

			ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			ref.owning_root = ref_root;

			/* Convert the file offset to the extent's own offset. */
			key.offset -= btrfs_file_extent_offset(buf, fi);
			btrfs_init_data_ref(&ref, key.objectid, key.offset,
					    root->root_key.objectid, for_reloc);
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &ref);
			else
				ret = btrfs_free_extent(trans, &ref);
			if (ret)
				goto fail;
		} else {
			/* We don't know the owning_root, leave as 0. */
			ref.bytenr = btrfs_node_blockptr(buf, i);
			ref.num_bytes = fs_info->nodesize;

			btrfs_init_tree_ref(&ref, level - 1,
					    root->root_key.objectid, for_reloc);
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &ref);
			else
				ret = btrfs_free_extent(trans, &ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
	return ret;
}
2544
/* Add one reference for every extent pointed to by @buf. */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}
2550
/* Drop one reference for every extent pointed to by @buf. */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
2556
1b86826d 2557static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
9ed74f2d 2558{
0b246afa 2559 struct btrfs_fs_info *fs_info = root->fs_info;
b742bb82 2560 u64 flags;
53b381b3 2561 u64 ret;
9ed74f2d 2562
b742bb82
YZ
2563 if (data)
2564 flags = BTRFS_BLOCK_GROUP_DATA;
0b246afa 2565 else if (root == fs_info->chunk_root)
b742bb82 2566 flags = BTRFS_BLOCK_GROUP_SYSTEM;
9ed74f2d 2567 else
b742bb82 2568 flags = BTRFS_BLOCK_GROUP_METADATA;
9ed74f2d 2569
878d7b67 2570 ret = btrfs_get_alloc_profile(fs_info, flags);
53b381b3 2571 return ret;
6a63209f 2572}
9ed74f2d 2573
0eb997bf 2574static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
a061fc8d 2575{
08dddb29
FM
2576 struct rb_node *leftmost;
2577 u64 bytenr = 0;
a1897fdd 2578
16b0c258 2579 read_lock(&fs_info->block_group_cache_lock);
0eb997bf 2580 /* Get the block group with the lowest logical start address. */
08dddb29
FM
2581 leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2582 if (leftmost) {
2583 struct btrfs_block_group *bg;
0f9dd46c 2584
08dddb29
FM
2585 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2586 bytenr = bg->start;
2587 }
16b0c258 2588 read_unlock(&fs_info->block_group_cache_lock);
d2fb3437
YZ
2589
2590 return bytenr;
a061fc8d
CM
2591}
2592
6690d071
NB
/*
 * Pin [bytenr, bytenr + num_bytes) inside @cache: move the bytes into the
 * block group's and space_info's pinned counters (releasing them from
 * "reserved" if @reserved) and mark the range in the transaction's
 * pinned_extents tree so it is freed only after commit.
 *
 * Lock order: space_info->lock outside cache->lock, matching the rest of
 * the space accounting code.  Always returns 0.
 */
static int pin_down_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_block_group *cache,
			   u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	set_extent_bit(&trans->transaction->pinned_extents, bytenr,
		       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
	return 0;
}
68b38550 2615
b25c36f8 2616int btrfs_pin_extent(struct btrfs_trans_handle *trans,
f0486c68
YZ
2617 u64 bytenr, u64 num_bytes, int reserved)
2618{
32da5386 2619 struct btrfs_block_group *cache;
68b38550 2620
b25c36f8 2621 cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
79787eaa 2622 BUG_ON(!cache); /* Logic error */
f0486c68 2623
6690d071 2624 pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
f0486c68
YZ
2625
2626 btrfs_put_block_group(cache);
11833d66
YZ
2627 return 0;
2628}
2629
/*
 * Pin the extent backing @eb during log replay and remove it from the free
 * space cache, so replay never allocates over a tree block it still needs.
 *
 * Returns 0 on success, -EINVAL if no block group covers @eb, or an error
 * from caching/removing free space.
 */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
				    const struct extent_buffer *eb)
{
	struct btrfs_block_group *cache;
	int ret;

	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
	if (!cache)
		return -EINVAL;

	/*
	 * Fully cache the free space first so that our pin removes the free space
	 * from the cache.
	 */
	ret = btrfs_cache_block_group(cache, true);
	if (ret)
		goto out;

	pin_down_extent(trans, cache, eb->start, eb->len, 0);

	/* remove us from the free space cache (if we're there at all) */
	ret = btrfs_remove_free_space(cache, eb->start, eb->len);
out:
	btrfs_put_block_group(cache);
	return ret;
}
2656
2ff7e61e
JM
2657static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2658 u64 start, u64 num_bytes)
8c2a1a30
JB
2659{
2660 int ret;
32da5386 2661 struct btrfs_block_group *block_group;
8c2a1a30 2662
0b246afa 2663 block_group = btrfs_lookup_block_group(fs_info, start);
8c2a1a30
JB
2664 if (!block_group)
2665 return -EINVAL;
2666
ced8ecf0 2667 ret = btrfs_cache_block_group(block_group, true);
9ad6d91f
FM
2668 if (ret)
2669 goto out;
8c2a1a30 2670
9ad6d91f
FM
2671 ret = btrfs_remove_free_space(block_group, start, num_bytes);
2672out:
8c2a1a30
JB
2673 btrfs_put_block_group(block_group);
2674 return ret;
2675}
2676
bcdc428c 2677int btrfs_exclude_logged_extents(struct extent_buffer *eb)
8c2a1a30 2678{
bcdc428c 2679 struct btrfs_fs_info *fs_info = eb->fs_info;
8c2a1a30
JB
2680 struct btrfs_file_extent_item *item;
2681 struct btrfs_key key;
2682 int found_type;
2683 int i;
b89311ef 2684 int ret = 0;
8c2a1a30 2685
2ff7e61e 2686 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
8c2a1a30
JB
2687 return 0;
2688
2689 for (i = 0; i < btrfs_header_nritems(eb); i++) {
2690 btrfs_item_key_to_cpu(eb, &key, i);
2691 if (key.type != BTRFS_EXTENT_DATA_KEY)
2692 continue;
2693 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2694 found_type = btrfs_file_extent_type(eb, item);
2695 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2696 continue;
2697 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2698 continue;
2699 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2700 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
b89311ef
GJ
2701 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2702 if (ret)
2703 break;
8c2a1a30
JB
2704 }
2705
b89311ef 2706 return ret;
8c2a1a30
JB
2707}
2708
/* Take one reservation count on @bg (paired with a matching release elsewhere). */
static void
btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
	atomic_inc(&bg->reservations);
}
2714
c759c4e1
JB
2715/*
2716 * Returns the free cluster for the given space info and sets empty_cluster to
2717 * what it should be based on the mount options.
2718 */
2719static struct btrfs_free_cluster *
2ff7e61e
JM
2720fetch_cluster_info(struct btrfs_fs_info *fs_info,
2721 struct btrfs_space_info *space_info, u64 *empty_cluster)
c759c4e1
JB
2722{
2723 struct btrfs_free_cluster *ret = NULL;
c759c4e1
JB
2724
2725 *empty_cluster = 0;
2726 if (btrfs_mixed_space_info(space_info))
2727 return ret;
2728
c759c4e1 2729 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
0b246afa 2730 ret = &fs_info->meta_alloc_cluster;
583b7231
HK
2731 if (btrfs_test_opt(fs_info, SSD))
2732 *empty_cluster = SZ_2M;
2733 else
ee22184b 2734 *empty_cluster = SZ_64K;
583b7231
HK
2735 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2736 btrfs_test_opt(fs_info, SSD_SPREAD)) {
2737 *empty_cluster = SZ_2M;
0b246afa 2738 ret = &fs_info->data_alloc_cluster;
c759c4e1
JB
2739 }
2740
2741 return ret;
2742}
2743
/*
 * Return the pinned byte range [start, end] (inclusive) to the space
 * accounting, optionally re-adding it to the free space cache.
 *
 * @fs_info:           the filesystem
 * @start, @end:       inclusive byte range; may span several block groups
 * @return_free_space: if true, add the range back to the in-memory free
 *                     space cache and let it refill the global block
 *                     reserve and any pending space-info tickets
 *
 * Returns 0 on success, or -EUCLEAN if part of the range does not belong to
 * any block group (a logic error).
 */
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
			      const bool return_free_space)
{
	struct btrfs_block_group *cache = NULL;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	struct btrfs_free_cluster *cluster = NULL;
	u64 len;
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
	bool readonly;
	int ret = 0;

	while (start <= end) {
		readonly = false;
		/* Cross into the next block group when the current one is exhausted. */
		if (!cache ||
		    start >= cache->start + cache->length) {
			if (cache)
				btrfs_put_block_group(cache);
			total_unpinned = 0;
			cache = btrfs_lookup_block_group(fs_info, start);
			if (cache == NULL) {
				/* Logic error, something removed the block group. */
				ret = -EUCLEAN;
				goto out;
			}

			cluster = fetch_cluster_info(fs_info,
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
		}

		/* Clamp the chunk to both the block group end and the range end. */
		len = cache->start + cache->length - start;
		len = min(len, end + 1 - start);

		if (return_free_space)
			btrfs_add_free_space(cache, start, len);

		start += len;
		total_unpinned += len;
		space_info = cache->space_info;

		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

		/* Lock order: space_info->lock before cache->lock. */
		spin_lock(&space_info->lock);
		spin_lock(&cache->lock);
		cache->pinned -= len;
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
		space_info->max_extent_size = 0;
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
		} else if (btrfs_is_zoned(fs_info)) {
			/* Need reset before reusing in a zoned block group */
			space_info->bytes_zone_unusable += len;
			readonly = true;
		}
		spin_unlock(&cache->lock);
		/* Let freed space top up the global reserve before anything else. */
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
				u64 to_add = min(len, global_rsv->size -
						      global_rsv->reserved);

				global_rsv->reserved += to_add;
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
				len -= to_add;
			}
			spin_unlock(&global_rsv->lock);
		}
		/* Add to any tickets we may have */
		if (!readonly && return_free_space && len)
			btrfs_try_granting_tickets(fs_info, space_info);
		spin_unlock(&space_info->lock);
	}

	if (cache)
		btrfs_put_block_group(cache);
out:
	return ret;
}
2842
/*
 * Called when a transaction commit finishes: unpin every extent that was
 * pinned during the transaction (returning it to the free space accounting),
 * optionally discarding the ranges, and then discard/release any block
 * groups that were deleted during the transaction.
 *
 * Always returns 0; per-range unpin failures are fatal (BUG_ON) and discard
 * failures are only logged.
 */
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *block_group, *tmp;
	struct list_head *deleted_bgs;
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;

	unpin = &trans->transaction->pinned_extents;

	/* Stop unpinning as soon as the transaction is aborted. */
	while (!TRANS_ABORTED(trans)) {
		struct extent_state *cached_state = NULL;

		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		if (!find_first_extent_bit(unpin, 0, &start, &end,
					   EXTENT_DIRTY, &cached_state)) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		/* Synchronous discard is best effort: the result is not acted on. */
		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
			ret = btrfs_discard_extent(fs_info, start,
						   end + 1 - start, NULL);

		clear_extent_dirty(unpin, start, end, &cached_state);
		ret = unpin_extent_range(fs_info, start, end, true);
		BUG_ON(ret);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		free_extent_state(cached_state);
		cond_resched();
	}

	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_calc_delay(&fs_info->discard_ctl);
		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
	}

	/*
	 * Transaction is finished. We don't need the lock anymore. We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
		if (!TRANS_ABORTED(trans))
			ret = btrfs_discard_extent(fs_info,
						   block_group->start,
						   block_group->length,
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_unfreeze_block_group(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
			   "discard failed while removing blockgroup: errno=%d %s",
				   ret, errstr);
		}
	}

	return 0;
}
2912
8d299091
BB
2913/*
2914 * Parse an extent item's inline extents looking for a simple quotas owner ref.
2915 *
2916 * @fs_info: the btrfs_fs_info for this mount
2917 * @leaf: a leaf in the extent tree containing the extent item
2918 * @slot: the slot in the leaf where the extent item is found
2919 *
2920 * Returns the objectid of the root that originally allocated the extent item
2921 * if the inline owner ref is expected and present, otherwise 0.
2922 *
2923 * If an extent item has an owner ref item, it will be the first inline ref
2924 * item. Therefore the logic is to check whether there are any inline ref
2925 * items, then check the type of the first one.
2926 */
2927u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
2928 struct extent_buffer *leaf, int slot)
2929{
2930 struct btrfs_extent_item *ei;
2931 struct btrfs_extent_inline_ref *iref;
2932 struct btrfs_extent_owner_ref *oref;
2933 unsigned long ptr;
2934 unsigned long end;
2935 int type;
2936
2937 if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
2938 return 0;
2939
2940 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
2941 ptr = (unsigned long)(ei + 1);
2942 end = (unsigned long)ei + btrfs_item_size(leaf, slot);
2943
2944 /* No inline ref items of any kind, can't check type. */
2945 if (ptr == end)
2946 return 0;
2947
2948 iref = (struct btrfs_extent_inline_ref *)ptr;
2949 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
2950
2951 /* We found an owner ref, get the root out of it. */
2952 if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
2953 oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2954 return btrfs_extent_owner_ref_root_id(leaf, oref);
2955 }
2956
2957 /* We have inline refs, but not an owner ref. */
2958 return 0;
2959}
2960
8f8aa4c7 2961static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
cecbb533 2962 u64 bytenr, struct btrfs_squota_delta *delta)
8f8aa4c7
JB
2963{
2964 int ret;
cecbb533 2965 u64 num_bytes = delta->num_bytes;
8f8aa4c7 2966
cecbb533 2967 if (delta->is_data) {
8f8aa4c7
JB
2968 struct btrfs_root *csum_root;
2969
2970 csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2971 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2972 if (ret) {
2973 btrfs_abort_transaction(trans, ret);
2974 return ret;
2975 }
ca41504e
JT
2976
2977 ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
2978 if (ret) {
2979 btrfs_abort_transaction(trans, ret);
2980 return ret;
2981 }
8f8aa4c7
JB
2982 }
2983
cecbb533
BB
2984 ret = btrfs_record_squota_delta(trans->fs_info, delta);
2985 if (ret) {
2986 btrfs_abort_transaction(trans, ret);
2987 return ret;
2988 }
2989
8f8aa4c7
JB
2990 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2991 if (ret) {
2992 btrfs_abort_transaction(trans, ret);
2993 return ret;
2994 }
2995
2996 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2997 if (ret)
2998 btrfs_abort_transaction(trans, ret);
2999
3000 return ret;
3001}
3002
/*
 * Abort the transaction with -EUCLEAN, dump the leaf at path->nodes[0] and
 * log the given message. Used when on-disk extent references are found to be
 * inconsistent with what the caller expected.
 */
#define abort_and_dump(trans, path, fmt, args...)	\
({							\
	btrfs_abort_transaction(trans, -EUCLEAN);	\
	btrfs_print_leaf(path->nodes[0]);		\
	btrfs_crit(trans->fs_info, fmt, ##args);	\
})
3009
/*
 * Drop one or more refs of @node.
 *
 * 1. Locate the extent refs.
 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
 *    Locate it, then reduce the refs number or remove the ref line completely.
 *
 * 2. Update the refs count in EXTENT/METADATA_ITEM
 *
 * Inline backref case:
 *
 * in extent tree we have:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 2 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *		extent data backref root FS_TREE objectid 257 offset 0 count 1
 *
 * This function gets called with:
 *
 * 	node->bytenr = 13631488
 * 	node->num_bytes = 1048576
 *	root_objectid = FS_TREE
 *	owner_objectid = 257
 *	owner_offset = 0
 *	refs_to_drop = 1
 *
 * Then we should get some like:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 1 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *
 * Keyed backref case:
 *
 * in extent tree we have:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 754 gen 6 flags DATA
 *	[...]
 *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
 *		extent data backref root FS_TREE objectid 866 offset 0 count 1
 *
 * This function gets called with:
 *
 * 	node->bytenr = 13631488
 * 	node->num_bytes = 1048576
 *	root_objectid = FS_TREE
 *	owner_objectid = 866
 *	owner_offset = 0
 *	refs_to_drop = 1
 *
 * Then we should get some like:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 753 gen 6 flags DATA
 *
 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_root *extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	int is_data;
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
	int refs_to_drop = node->ref_mod;
	u32 item_size;
	u64 refs;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 owner_objectid = btrfs_delayed_ref_owner(node);
	u64 owner_offset = btrfs_delayed_ref_offset(node);
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
	u64 delayed_ref_root = href->owning_root;

	extent_root = btrfs_extent_root(info, bytenr);
	ASSERT(extent_root);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Data extents are distinguished by owner >= first free objectid. */
	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;

	/* Tree blocks only ever drop a single ref at a time. */
	if (!is_data && refs_to_drop != 1) {
		btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
			   node->bytenr, refs_to_drop);
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (is_data)
		skinny_metadata = false;

	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    node->parent, node->ref_root, owner_objectid,
				    owner_offset);
	if (ret == 0) {
		/*
		 * Either the inline backref or the SHARED_DATA_REF/
		 * SHARED_BLOCK_REF is found
		 *
		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
		 */
		extent_slot = path->slots[0];
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      extent_slot);
			if (key.objectid != bytenr)
				break;
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
				found_extent = 1;
				break;
			}
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}

			/* Quick path didn't find the EXTENT/METADATA_ITEM */
			if (path->slots[0] - extent_slot > 5)
				break;
			extent_slot--;
		}

		if (!found_extent) {
			/* An inline ref must live in the extent item itself. */
			if (iref) {
				abort_and_dump(trans, path,
"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
					   path->slots[0]);
				ret = -EUCLEAN;
				goto out;
			}
			/* Must be SHARED_* item, remove the backref first */
			ret = remove_extent_backref(trans, extent_root, path,
						    NULL, refs_to_drop, is_data);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_release_path(path);

			/* Slow path to locate EXTENT/METADATA_ITEM */
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			/* Retry once more with the non-skinny extent item key. */
			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
				key.objectid = bytenr;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

			if (ret) {
				if (ret > 0)
					btrfs_print_leaf(path->nodes[0]);
				btrfs_err(info,
			"umm, got %d back from search, was looking for %llu, slot %d",
					  ret, bytenr, path->slots[0]);
			}
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			extent_slot = path->slots[0];
		}
	} else if (WARN_ON(ret == -ENOENT)) {
		abort_and_dump(trans, path,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
			       bytenr, node->parent, node->ref_root, owner_objectid,
			       owner_offset, path->slots[0]);
		goto out;
	} else {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, extent_slot);
	if (unlikely(item_size < sizeof(*ei))) {
		ret = -EUCLEAN;
		btrfs_err(trans->fs_info,
			  "unexpected extent item size, has %u expect >= %zu",
			  item_size, sizeof(*ei));
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
		/* Non-skinny tree block items carry a btrfs_tree_block_info. */
		struct btrfs_tree_block_info *bi;

		if (item_size < sizeof(*ei) + sizeof(*bi)) {
			abort_and_dump(trans, path,
"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
				       key.objectid, key.type, key.offset,
				       path->slots[0], owner_objectid, item_size,
				       sizeof(*ei) + sizeof(*bi));
			ret = -EUCLEAN;
			goto out;
		}
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}

	refs = btrfs_extent_refs(leaf, ei);
	if (refs < refs_to_drop) {
		abort_and_dump(trans, path,
		"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
			       refs_to_drop, refs, bytenr, path->slots[0]);
		ret = -EUCLEAN;
		goto out;
	}
	refs -= refs_to_drop;

	if (refs > 0) {
		/* Other refs remain: just decrement the count in place. */
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
		 */
		if (iref) {
			if (!found_extent) {
				abort_and_dump(trans, path,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
					       path->slots[0]);
				ret = -EUCLEAN;
				goto out;
			}
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(trans, leaf);
		}
		if (found_extent) {
			ret = remove_extent_backref(trans, extent_root, path,
						    iref, refs_to_drop, is_data);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	} else {
		/* Last ref gone: delete the extent item(s) and do accounting. */
		struct btrfs_squota_delta delta = {
			.root = delayed_ref_root,
			.num_bytes = num_bytes,
			.is_data = is_data,
			.is_inc = false,
			.generation = btrfs_extent_generation(leaf, ei),
		};

		/* In this branch refs == 1 */
		if (found_extent) {
			if (is_data && refs_to_drop !=
			    extent_data_ref_count(path, iref)) {
				abort_and_dump(trans, path,
		"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
					       extent_data_ref_count(path, iref),
					       refs_to_drop, path->slots[0]);
				ret = -EUCLEAN;
				goto out;
			}
			if (iref) {
				if (path->slots[0] != extent_slot) {
					abort_and_dump(trans, path,
"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
						       key.objectid, key.type,
						       key.offset, path->slots[0]);
					ret = -EUCLEAN;
					goto out;
				}
			} else {
				/*
				 * No inline ref, we must be at SHARED_* item,
				 * And it's single ref, it must be:
				 * | extent_slot ||extent_slot + 1|
				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
				 */
				if (path->slots[0] != extent_slot + 1) {
					abort_and_dump(trans, path,
	"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
						       path->slots[0]);
					ret = -EUCLEAN;
					goto out;
				}
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}
		/*
		 * We can't infer the data owner from the delayed ref, so we need
		 * to try to get it from the owning ref item.
		 *
		 * If it is not present, then that extent was not written under
		 * simple quotas mode, so we don't need to account for its deletion.
		 */
		if (is_data)
			delta.root = btrfs_get_extent_owner_root(trans->fs_info,
								 leaf, extent_slot);

		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_release_path(path);

		ret = do_free_extent_accounting(trans, bytenr, &delta);
	}
	btrfs_release_path(path);

out:
	btrfs_free_path(path);
	return ret;
}
3368
/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well. This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 *
 * Returns 1 if the head existed with must_insert_reserved set (the caller
 * then owns the reserved extent), 0 otherwise.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				      u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head)
		goto out_delayed_unlock;

	spin_lock(&head->lock);
	/* Other delayed refs still queued for this extent: leave the head. */
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		goto out;

	/* A pending extent op means the head still has work to do. */
	if (cleanup_extent_op(head) != NULL)
		goto out;

	/*
	 * waiting for the lock here would deadlock. If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

	btrfs_delete_ref_head(delayed_refs, head);
	head->processing = false;

	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
	mutex_unlock(&head->mutex);
	btrfs_put_delayed_ref_head(head);
	return ret;
out:
	spin_unlock(&head->lock);

out_delayed_unlock:
	spin_unlock(&delayed_refs->lock);
	return 0;
}
3423
/*
 * Free a tree block: queue a delayed ref drop for non-log trees and, when
 * this was the last reference, either pin the block (if it was written or
 * must not be reused yet) or return it straight to the free space cache.
 *
 * @root_id:  objectid of the root the dropped reference belongs to
 * @buf:      the extent buffer being freed
 * @parent:   passed through as the delayed ref's parent bytenr
 * @last_ref: non-zero when the caller is dropping the last reference
 */
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   u64 root_id,
			   struct extent_buffer *buf,
			   u64 parent, int last_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *bg;
	int ret;

	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
		struct btrfs_ref generic_ref = {
			.action = BTRFS_DROP_DELAYED_REF,
			.bytenr = buf->start,
			.num_bytes = buf->len,
			.parent = parent,
			.owning_root = btrfs_header_owner(buf),
			.ref_root = root_id,
		};

		/*
		 * Assert that the extent buffer is not cleared due to
		 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
		 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
		 * detail.
		 */
		ASSERT(btrfs_header_bytenr(buf) != 0);

		btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
		BUG_ON(ret); /* -ENOMEM */
	}

	if (!last_ref)
		return;

	/* Blocks from older transactions cannot be reclaimed directly here. */
	if (btrfs_header_generation(buf) != trans->transid)
		goto out;

	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
		/* ret == 0 means other delayed refs remain; nothing more to do. */
		ret = check_ref_cleanup(trans, buf->start);
		if (!ret)
			goto out;
	}

	bg = btrfs_lookup_block_group(fs_info, buf->start);

	/* Already written blocks must be pinned until the commit finishes. */
	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
		pin_down_extent(trans, bg, buf->start, buf->len, 1);
		btrfs_put_block_group(bg);
		goto out;
	}

	/*
	 * If there are tree mod log users we may have recorded mod log
	 * operations for this node. If we re-allocate this node we
	 * could replay operations on this node that happened when it
	 * existed in a completely different root. For example if it
	 * was part of root A, then was reallocated to root B, and we
	 * are doing a btrfs_old_search_slot(root b), we could replay
	 * operations that happened when the block was part of root A,
	 * giving us an inconsistent view of the btree.
	 *
	 * We are safe from races here because at this point no other
	 * node or root points to this extent buffer, so if after this
	 * check a new tree mod log user joins we will not have an
	 * existing log of operations on this node that we have to
	 * contend with.
	 */

	if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
	    || btrfs_is_zoned(fs_info)) {
		pin_down_extent(trans, bg, buf->start, buf->len, 1);
		btrfs_put_block_group(bg);
		goto out;
	}

	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

	/* Never written out: the space can be reused immediately. */
	btrfs_add_free_space(bg, buf->start, buf->len);
	btrfs_free_reserved_bytes(bg, buf->len, 0);
	btrfs_put_block_group(bg);
	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);

out:

	/*
	 * Deleting the buffer, clear the corrupt flag since it doesn't
	 * matter anymore.
	 */
	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}
3516
79787eaa 3517/* Can return -ENOMEM */
ffd4bb2a 3518int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
925baedd 3519{
ffd4bb2a 3520 struct btrfs_fs_info *fs_info = trans->fs_info;
925baedd
CM
3521 int ret;
3522
f5ee5c9a 3523 if (btrfs_is_testing(fs_info))
faa2dbf0 3524 return 0;
fccb84c9 3525
56bec294
CM
3526 /*
3527 * tree log blocks never actually go into the extent allocation
3528 * tree, just update pinning info and exit early.
56bec294 3529 */
f2e69a77 3530 if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
12390e42 3531 btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
56bec294 3532 ret = 0;
ffd4bb2a 3533 } else if (ref->type == BTRFS_REF_METADATA) {
2187374f 3534 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
5d4f98a2 3535 } else {
2187374f 3536 ret = btrfs_add_delayed_data_ref(trans, ref, 0);
56bec294 3537 }
d7eae340 3538
f2e69a77 3539 if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
ffd4bb2a 3540 btrfs_ref_tree_mod(fs_info, ref);
8a5040f7 3541
925baedd
CM
3542 return ret;
3543}
3544
/*
 * Stages of the find_free_extent() search. The order is significant: the
 * allocator compares its current stage with >= (e.g. against
 * LOOP_NO_EMPTY_SIZE), so each later stage relaxes more constraints than the
 * previous one.
 */
enum btrfs_loop_type {
	/*
	 * Start caching block groups but do not wait for progress or for them
	 * to be done.
	 */
	LOOP_CACHING_NOWAIT,

	/*
	 * Wait for the block group free_space >= the space we're waiting for if
	 * the block group isn't cached.
	 */
	LOOP_CACHING_WAIT,

	/*
	 * Allow allocations to happen from block groups that do not yet have a
	 * size classification.
	 */
	LOOP_UNSET_SIZE_CLASS,

	/*
	 * Allocate a chunk and then retry the allocation.
	 */
	LOOP_ALLOC_CHUNK,

	/*
	 * Ignore the size class restrictions for this allocation.
	 */
	LOOP_WRONG_SIZE_CLASS,

	/*
	 * Ignore the empty size, only try to allocate the number of bytes
	 * needed for this allocation.
	 */
	LOOP_NO_EMPTY_SIZE,
};
3580
/*
 * Take the block group's data_rwsem for reading when the allocation is for
 * delalloc; a plain allocation needs no lock.
 */
static inline void
btrfs_lock_block_group(struct btrfs_block_group *cache,
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}
3588
/*
 * Grab a reference on the block group and, for delalloc allocations, also
 * take its data_rwsem for reading. Pairs with btrfs_release_block_group().
 */
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
					  int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}
3596
/*
 * Return the block group that currently owns @cluster, holding a reference
 * on it (and, for delalloc, its data_rwsem read-locked) — or NULL if the
 * cluster has no block group. Always returns with cluster->refill_lock held.
 *
 * The loop re-checks the owner after any lock drop, because the cluster's
 * block group can change while refill_lock is released.
 */
static struct btrfs_block_group *btrfs_lock_cluster(
		   struct btrfs_block_group *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
	__acquires(&cluster->refill_lock)
{
	struct btrfs_block_group *used_bg = NULL;

	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		/* Caller already holds block_group's ref/lock. */
		if (used_bg == block_group)
			return used_bg;

		btrfs_get_block_group(used_bg);

		if (!delalloc)
			return used_bg;

		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;

		/* Must drop refill_lock to block on the rwsem. */
		spin_unlock(&cluster->refill_lock);

		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		/* Owner changed while we slept; undo and retry. */
		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}
3635
/*
 * Drop the reference taken by btrfs_grab_block_group(), releasing the
 * data_rwsem first when it was taken for a delalloc allocation.
 */
static inline void
btrfs_release_block_group(struct btrfs_block_group *cache,
			  int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}
3644
/*
 * Helper function for find_free_extent().
 *
 * Try to allocate from the free cluster, first from whatever block group the
 * cluster currently points at, then (if allowed) by refilling the cluster
 * from @bg.
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return >0 to inform caller that we find nothing
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
				      struct find_free_extent_ctl *ffe_ctl,
				      struct btrfs_block_group **cluster_bg_ret)
{
	struct btrfs_block_group *cluster_bg;
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	/* Returns with last_ptr->refill_lock held in all cases. */
	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->start,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
	 * lets just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			ffe_ctl->found_offset = offset;
			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
			return 0;
		}
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster. Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}
3739
e1a41848
QW
/*
 * Return >0 to inform caller that we find nothing
 * Return 0 when we found a free extent and set ffe_ctl->found_offset
 */
32da5386 3744static int find_free_extent_unclustered(struct btrfs_block_group *bg,
897cae79 3745 struct find_free_extent_ctl *ffe_ctl)
e1a41848 3746{
897cae79 3747 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
e1a41848
QW
3748 u64 offset;
3749
3750 /*
3751 * We are doing an unclustered allocation, set the fragmented flag so
3752 * we don't bother trying to setup a cluster again until we get more
3753 * space.
3754 */
3755 if (unlikely(last_ptr)) {
3756 spin_lock(&last_ptr->lock);
3757 last_ptr->fragmented = 1;
3758 spin_unlock(&last_ptr->lock);
3759 }
3760 if (ffe_ctl->cached) {
3761 struct btrfs_free_space_ctl *free_space_ctl;
3762
3763 free_space_ctl = bg->free_space_ctl;
3764 spin_lock(&free_space_ctl->tree_lock);
3765 if (free_space_ctl->free_space <
3766 ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3767 ffe_ctl->empty_size) {
3768 ffe_ctl->total_free_space = max_t(u64,
3769 ffe_ctl->total_free_space,
3770 free_space_ctl->free_space);
3771 spin_unlock(&free_space_ctl->tree_lock);
3772 return 1;
3773 }
3774 spin_unlock(&free_space_ctl->tree_lock);
3775 }
3776
3777 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3778 ffe_ctl->num_bytes, ffe_ctl->empty_size,
3779 &ffe_ctl->max_extent_size);
cd361199 3780 if (!offset)
e1a41848 3781 return 1;
e1a41848
QW
3782 ffe_ctl->found_offset = offset;
3783 return 0;
3784}
3785
c668690d
NA
3786static int do_allocation_clustered(struct btrfs_block_group *block_group,
3787 struct find_free_extent_ctl *ffe_ctl,
3788 struct btrfs_block_group **bg_ret)
3789{
3790 int ret;
3791
3792 /* We want to try and use the cluster allocator, so lets look there */
3793 if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
897cae79 3794 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
cd361199 3795 if (ret >= 0)
c668690d
NA
3796 return ret;
3797 /* ret == -ENOENT case falls through */
3798 }
3799
897cae79 3800 return find_free_extent_unclustered(block_group, ffe_ctl);
c668690d
NA
3801}
3802
40ab3be1
NA
3803/*
3804 * Tree-log block group locking
3805 * ============================
3806 *
3807 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3808 * indicates the starting address of a block group, which is reserved only
3809 * for tree-log metadata.
3810 *
3811 * Lock nesting
3812 * ============
3813 *
3814 * space_info::lock
3815 * block_group::lock
3816 * fs_info::treelog_bg_lock
3817 */
3818
2eda5708
NA
/*
 * Simple allocator for sequential-only block group. It only allows sequential
 * allocation. No need to play with trees. This function also reserves the
 * bytes as in btrfs_add_reserved_bytes.
 *
 * Returns 0 on success (ffe_ctl->found_offset/search_start set and the bytes
 * reserved from the block group), or 1 to tell the caller to move on to the
 * next block group.
 */
static int do_allocation_zoned(struct btrfs_block_group *block_group,
			       struct find_free_extent_ctl *ffe_ctl,
			       struct btrfs_block_group **bg_ret)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_space_info *space_info = block_group->space_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 start = block_group->start;
	u64 num_bytes = ffe_ctl->num_bytes;
	u64 avail;
	u64 bytenr = block_group->start;
	u64 log_bytenr;
	u64 data_reloc_bytenr;
	int ret = 0;
	bool skip = false;

	ASSERT(btrfs_is_zoned(block_group->fs_info));

	/*
	 * Do not allow non-tree-log blocks in the dedicated tree-log block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->treelog_bg_lock);
	log_bytenr = fs_info->treelog_bg;
	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
		skip = true;
	spin_unlock(&fs_info->treelog_bg_lock);
	if (skip)
		return 1;

	/*
	 * Do not allow non-relocation blocks in the dedicated relocation block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->relocation_bg_lock);
	data_reloc_bytenr = fs_info->data_reloc_bg;
	if (data_reloc_bytenr &&
	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
		skip = true;
	spin_unlock(&fs_info->relocation_bg_lock);
	if (skip)
		return 1;

	/* Check RO and no space case before trying to activate it */
	spin_lock(&block_group->lock);
	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
		ret = 1;
		/*
		 * May need to clear fs_info->{treelog,data_reloc}_bg.
		 * Return the error after taking the locks.
		 */
	}
	spin_unlock(&block_group->lock);

	/* Metadata block group is activated at write time. */
	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
	    !btrfs_zone_activate(block_group)) {
		ret = 1;
		/*
		 * May need to clear fs_info->{treelog,data_reloc}_bg.
		 * Return the error after taking the locks.
		 */
	}

	/* Acquisition order follows the "Lock nesting" comment above. */
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	spin_lock(&fs_info->treelog_bg_lock);
	spin_lock(&fs_info->relocation_bg_lock);

	if (ret)
		goto out;

	ASSERT(!ffe_ctl->for_treelog ||
	       block_group->start == fs_info->treelog_bg ||
	       fs_info->treelog_bg == 0);
	ASSERT(!ffe_ctl->for_data_reloc ||
	       block_group->start == fs_info->data_reloc_bg ||
	       fs_info->data_reloc_bg == 0);

	if (block_group->ro ||
	    (!ffe_ctl->for_data_reloc &&
	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow currently using block group to be tree-log dedicated
	 * block group.
	 */
	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow currently used block group to be the data relocation
	 * dedicated block group.
	 */
	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
	avail = block_group->zone_capacity - block_group->alloc_offset;
	if (avail < num_bytes) {
		if (ffe_ctl->max_extent_size < avail) {
			/*
			 * With sequential allocator, free space is always
			 * contiguous
			 */
			ffe_ctl->max_extent_size = avail;
			ffe_ctl->total_free_space = avail;
		}
		ret = 1;
		goto out;
	}

	/* Claim this block group as the dedicated tree-log group if needed. */
	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
		fs_info->treelog_bg = block_group->start;

	if (ffe_ctl->for_data_reloc) {
		if (!fs_info->data_reloc_bg)
			fs_info->data_reloc_bg = block_group->start;
		/*
		 * Do not allow allocations from this block group, unless it is
		 * for data relocation. Compared to increasing the ->ro, setting
		 * the ->zoned_data_reloc_ongoing flag still allows nocow
		 * writers to come in. See btrfs_inc_nocow_writers().
		 *
		 * We need to disable an allocation to avoid an allocation of
		 * regular (non-relocation data) extent. With mix of relocation
		 * extents and regular extents, we can dispatch WRITE commands
		 * (for relocation extents) and ZONE APPEND commands (for
		 * regular extents) at the same time to the same zone, which
		 * easily break the write pointer.
		 *
		 * Also, this flag avoids this block group to be zone finished.
		 */
		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
	}

	/* Sequential allocation: take the bytes at the current write pointer. */
	ffe_ctl->found_offset = start + block_group->alloc_offset;
	block_group->alloc_offset += num_bytes;
	spin_lock(&ctl->tree_lock);
	ctl->free_space -= num_bytes;
	spin_unlock(&ctl->tree_lock);

	/*
	 * We do not check if found_offset is aligned to stripesize. The
	 * address is anyway rewritten when using zone append writing.
	 */

	ffe_ctl->search_start = ffe_ctl->found_offset;

out:
	/* On failure, drop any dedicated-group claim made for this request. */
	if (ret && ffe_ctl->for_treelog)
		fs_info->treelog_bg = 0;
	if (ret && ffe_ctl->for_data_reloc)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
	spin_unlock(&fs_info->treelog_bg_lock);
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);
	return ret;
}
3995
c668690d
NA
3996static int do_allocation(struct btrfs_block_group *block_group,
3997 struct find_free_extent_ctl *ffe_ctl,
3998 struct btrfs_block_group **bg_ret)
3999{
4000 switch (ffe_ctl->policy) {
4001 case BTRFS_EXTENT_ALLOC_CLUSTERED:
4002 return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
2eda5708
NA
4003 case BTRFS_EXTENT_ALLOC_ZONED:
4004 return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
c668690d
NA
4005 default:
4006 BUG();
4007 }
4008}
4009
baba5062
NA
4010static void release_block_group(struct btrfs_block_group *block_group,
4011 struct find_free_extent_ctl *ffe_ctl,
4012 int delalloc)
4013{
4014 switch (ffe_ctl->policy) {
4015 case BTRFS_EXTENT_ALLOC_CLUSTERED:
cd361199 4016 ffe_ctl->retry_uncached = false;
baba5062 4017 break;
2eda5708
NA
4018 case BTRFS_EXTENT_ALLOC_ZONED:
4019 /* Nothing to do */
4020 break;
baba5062
NA
4021 default:
4022 BUG();
4023 }
4024
4025 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4026 ffe_ctl->index);
4027 btrfs_release_block_group(block_group, delalloc);
4028}
4029
0ab9724b
NA
4030static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
4031 struct btrfs_key *ins)
4032{
4033 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4034
4035 if (!ffe_ctl->use_cluster && last_ptr) {
4036 spin_lock(&last_ptr->lock);
4037 last_ptr->window_start = ins->objectid;
4038 spin_unlock(&last_ptr->lock);
4039 }
4040}
4041
4042static void found_extent(struct find_free_extent_ctl *ffe_ctl,
4043 struct btrfs_key *ins)
4044{
4045 switch (ffe_ctl->policy) {
4046 case BTRFS_EXTENT_ALLOC_CLUSTERED:
4047 found_extent_clustered(ffe_ctl, ins);
4048 break;
2eda5708
NA
4049 case BTRFS_EXTENT_ALLOC_ZONED:
4050 /* Nothing to do */
4051 break;
0ab9724b
NA
4052 default:
4053 BUG();
4054 }
4055}
4056
393f646e
NA
4057static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
4058 struct find_free_extent_ctl *ffe_ctl)
4059{
5a7d107e
NA
4060 /* Block group's activeness is not a requirement for METADATA block groups. */
4061 if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
4062 return 0;
4063
393f646e
NA
4064 /* If we can activate new zone, just allocate a chunk and use it */
4065 if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
4066 return 0;
4067
4068 /*
4069 * We already reached the max active zones. Try to finish one block
4070 * group to make a room for a new block group. This is only possible
4071 * for a data block group because btrfs_zone_finish() may need to wait
4072 * for a running transaction which can cause a deadlock for metadata
4073 * allocation.
4074 */
4075 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4076 int ret = btrfs_zone_finish_one_bg(fs_info);
4077
4078 if (ret == 1)
4079 return 0;
4080 else if (ret < 0)
4081 return ret;
4082 }
4083
4084 /*
4085 * If we have enough free space left in an already active block group
4086 * and we can't activate any other zone now, do not allow allocating a
4087 * new chunk and let find_free_extent() retry with a smaller size.
4088 */
4089 if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4090 return -ENOSPC;
4091
898793d9
NA
4092 /*
4093 * Even min_alloc_size is not left in any block groups. Since we cannot
4094 * activate a new block group, allocating it may not help. Let's tell a
4095 * caller to try again and hope it progress something by writing some
4096 * parts of the region. That is only possible for data block groups,
4097 * where a part of the region can be written.
4098 */
4099 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4100 return -EAGAIN;
4101
393f646e
NA
4102 /*
4103 * We cannot activate a new block group and no enough space left in any
4104 * block groups. So, allocating a new block group may not help. But,
4105 * there is nothing to do anyway, so let's go with it.
4106 */
4107 return 0;
4108}
4109
bb9950d3
NA
4110static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4111 struct find_free_extent_ctl *ffe_ctl)
50475cd5
NA
4112{
4113 switch (ffe_ctl->policy) {
4114 case BTRFS_EXTENT_ALLOC_CLUSTERED:
bb9950d3 4115 return 0;
50475cd5 4116 case BTRFS_EXTENT_ALLOC_ZONED:
393f646e 4117 return can_allocate_chunk_zoned(fs_info, ffe_ctl);
50475cd5
NA
4118 default:
4119 BUG();
4120 }
4121}
4122
e72d79d6
QW
/*
 * Return >0 means caller needs to re-search for free extent
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					bool full_search)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (ins->objectid) {
		/* An allocation succeeded; record it and we are done. */
		found_extent(ffe_ctl, ins);
		return 0;
	}

	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
		return 1;

	/* Try the next raid index before escalating the loop phase. */
	ffe_ctl->index++;
	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
		return 1;

	/* See the comments for btrfs_loop_type for an explanation of the phases. */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		/*
		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
		 * any uncached bgs and we've already done a full search
		 * through.
		 */
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
		    (!ffe_ctl->orig_have_caching_bg && full_search))
			ffe_ctl->loop++;
		ffe_ctl->loop++;

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			/* Check if allocation policy allows to create a new chunk */
			ret = can_allocate_chunk(fs_info, ffe_ctl);
			if (ret)
				return ret;

			/*
			 * Reuse a transaction this task already runs; only
			 * end the transaction below if we started it here.
			 */
			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE_FOR_EXTENT);

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret == -ENOSPC) {
				ret = 0;
				ffe_ctl->loop++;
			} else if (ret < 0)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			/* Only the clustered policy has a NO_EMPTY_SIZE phase. */
			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
				return -ENOSPC;

			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}
4221
52bb7a21
BB
4222static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4223 struct btrfs_block_group *bg)
4224{
4225 if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4226 return true;
cb0922f2 4227 if (!btrfs_block_group_should_use_size_class(bg))
52bb7a21
BB
4228 return true;
4229 if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4230 return true;
4231 if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4232 bg->size_class == BTRFS_BG_SZ_NONE)
4233 return true;
4234 return ffe_ctl->size_class == bg->size_class;
4235}
4236
7e895409
NA
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
					struct find_free_extent_ctl *ffe_ctl,
					struct btrfs_space_info *space_info,
					struct btrfs_key *ins)
{
	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (space_info->max_extent_size) {
		spin_lock(&space_info->lock);
		/* Re-check under the lock; the test above was an unlocked fast path. */
		if (space_info->max_extent_size &&
		    ffe_ctl->num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
					       &ffe_ctl->empty_cluster);
	if (ffe_ctl->last_ptr) {
		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			ffe_ctl->hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			ffe_ctl->hint_byte = last_ptr->window_start;
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	return 0;
}
4287
b271fee9
NA
static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
				    struct find_free_extent_ctl *ffe_ctl)
{
	if (ffe_ctl->for_treelog) {
		/* Hint at the dedicated tree-log block group, if one is set. */
		spin_lock(&fs_info->treelog_bg_lock);
		if (fs_info->treelog_bg)
			ffe_ctl->hint_byte = fs_info->treelog_bg;
		spin_unlock(&fs_info->treelog_bg_lock);
	} else if (ffe_ctl->for_data_reloc) {
		/* Hint at the dedicated data relocation block group. */
		spin_lock(&fs_info->relocation_bg_lock);
		if (fs_info->data_reloc_bg)
			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
		spin_unlock(&fs_info->relocation_bg_lock);
	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
		struct btrfs_block_group *block_group;

		/* Hint at the first already-active block group that fits. */
		spin_lock(&fs_info->zone_active_bgs_lock);
		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
			/*
			 * No lock is OK here because avail is monotonically
			 * decreasing, and this is just a hint.
			 */
			u64 avail = block_group->zone_capacity - block_group->alloc_offset;

			if (block_group_bits(block_group, ffe_ctl->flags) &&
			    avail >= ffe_ctl->num_bytes) {
				ffe_ctl->hint_byte = block_group->start;
				break;
			}
		}
		spin_unlock(&fs_info->zone_active_bgs_lock);
	}

	return 0;
}
4323
7e895409
NA
4324static int prepare_allocation(struct btrfs_fs_info *fs_info,
4325 struct find_free_extent_ctl *ffe_ctl,
4326 struct btrfs_space_info *space_info,
4327 struct btrfs_key *ins)
4328{
4329 switch (ffe_ctl->policy) {
4330 case BTRFS_EXTENT_ALLOC_CLUSTERED:
4331 return prepare_allocation_clustered(fs_info, ffe_ctl,
4332 space_info, ins);
2eda5708 4333 case BTRFS_EXTENT_ALLOC_ZONED:
b271fee9 4334 return prepare_allocation_zoned(fs_info, ffe_ctl);
7e895409
NA
4335 default:
4336 BUG();
4337 }
4338}
4339
fec577fb
CM
/*
 * walks the btree of allocated extents and find a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_root *root,
				     struct btrfs_key *ins,
				     struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int cache_block_group_error = 0;
	struct btrfs_block_group *block_group = NULL;
	struct btrfs_space_info *space_info;
	bool full_search = false;

	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);

	/* Reset the search state; callers only fill in the request fields. */
	ffe_ctl->search_start = 0;
	/* For clustered allocation */
	ffe_ctl->empty_cluster = 0;
	ffe_ctl->last_ptr = NULL;
	ffe_ctl->use_cluster = true;
	ffe_ctl->have_caching_bg = false;
	ffe_ctl->orig_have_caching_bg = false;
	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
	ffe_ctl->loop = 0;
	ffe_ctl->retry_uncached = false;
	ffe_ctl->cached = 0;
	ffe_ctl->max_extent_size = 0;
	ffe_ctl->total_free_space = 0;
	ffe_ctl->found_offset = 0;
	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
	ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);

	if (btrfs_is_zoned(fs_info))
		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(root, ffe_ctl);

	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
		return -ENOSPC;
	}

	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
	if (ret < 0)
		return ret;

	ffe_ctl->search_start = max(ffe_ctl->search_start,
				    first_logical_byte(fs_info));
	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
	/* Fast path: try the hinted block group before the full search. */
	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl->search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if its not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
							block_group->flags);
				btrfs_lock_block_group(block_group,
						       ffe_ctl->delalloc);
				ffe_ctl->hinted = true;
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	trace_find_free_extent_search_loop(root, ffe_ctl);
	ffe_ctl->have_caching_bg = false;
	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
	    ffe_ctl->index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl->index], list) {
		struct btrfs_block_group *bg_ret;

		ffe_ctl->hinted = false;
		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro)) {
			if (ffe_ctl->for_treelog)
				btrfs_clear_treelog_bg(block_group);
			if (ffe_ctl->for_data_reloc)
				btrfs_clear_data_reloc_bg(block_group);
			continue;
		}

		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
		ffe_ctl->search_start = block_group->start;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, ffe_ctl->flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
			continue;
		}

have_block_group:
		trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
		ffe_ctl->cached = btrfs_block_group_done(block_group);
		if (unlikely(!ffe_ctl->cached)) {
			ffe_ctl->have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, false);

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
			if (!cache_block_group_error)
				cache_block_group_error = -EIO;
			goto loop;
		}

		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
			goto loop;

		bg_ret = NULL;
		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
		if (ret > 0)
			goto loop;

		/* The allocator may have switched us to a different block group. */
		if (bg_ret && bg_ret != block_group) {
			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
			block_group = bg_ret;
		}

		/* Checks */
		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
						 fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
		    block_group->start + block_group->length) {
			btrfs_add_free_space_unused(block_group,
					    ffe_ctl->found_offset,
					    ffe_ctl->num_bytes);
			goto loop;
		}

		/* Give back the bytes skipped by the stripesize round-up. */
		if (ffe_ctl->found_offset < ffe_ctl->search_start)
			btrfs_add_free_space_unused(block_group,
					ffe_ctl->found_offset,
					ffe_ctl->search_start - ffe_ctl->found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
					       ffe_ctl->num_bytes,
					       ffe_ctl->delalloc,
					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
		if (ret == -EAGAIN) {
			btrfs_add_free_space_unused(block_group,
					ffe_ctl->found_offset,
					ffe_ctl->num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, lets return */
		ins->objectid = ffe_ctl->search_start;
		ins->offset = ffe_ctl->num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl);
		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
		break;
loop:
		/*
		 * Retry an uncached block group once after waiting for some
		 * caching progress, before giving up on it.
		 */
		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		    !ffe_ctl->retry_uncached) {
			ffe_ctl->retry_uncached = true;
			btrfs_wait_block_group_cache_progress(block_group,
						ffe_ctl->num_bytes +
						ffe_ctl->empty_cluster +
						ffe_ctl->empty_size);
			goto have_block_group;
		}
		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC && !cache_block_group_error) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl->max_extent_size)
			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl->max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl->max_extent_size;
	} else if (ret == -ENOSPC) {
		/* Report the caching error we stashed instead of plain ENOSPC. */
		ret = cache_block_group_error;
	}
	return ret;
}
ec44a35c 4618
6f47c706 4619/*
9580503b
DS
4620 * Entry point to the extent allocator. Tries to find a hole that is at least
4621 * as big as @num_bytes.
6f47c706
NB
4622 *
4623 * @root - The root that will contain this extent
4624 *
4625 * @ram_bytes - The amount of space in ram that @num_bytes take. This
4626 * is used for accounting purposes. This value differs
4627 * from @num_bytes only in the case of compressed extents.
4628 *
4629 * @num_bytes - Number of bytes to allocate on-disk.
4630 *
4631 * @min_alloc_size - Indicates the minimum amount of space that the
4632 * allocator should try to satisfy. In some cases
4633 * @num_bytes may be larger than what is required and if
4634 * the filesystem is fragmented then allocation fails.
4635 * However, the presence of @min_alloc_size gives a
4636 * chance to try and satisfy the smaller allocation.
4637 *
4638 * @empty_size - A hint that you plan on doing more COW. This is the
4639 * size in bytes the allocator should try to find free
4640 * next to the block it returns. This is just a hint and
4641 * may be ignored by the allocator.
4642 *
4643 * @hint_byte - Hint to the allocator to start searching above the byte
4644 * address passed. It might be ignored.
4645 *
4646 * @ins - This key is modified to record the found hole. It will
4647 * have the following values:
4648 * ins->objectid == start position
4649 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4650 * ins->offset == the size of the hole.
4651 *
4652 * @is_data - Boolean flag indicating whether an extent is
4653 * allocated for data (true) or metadata (false)
4654 *
4655 * @delalloc - Boolean flag indicating whether this allocation is for
4656 * delalloc or not. If 'true' data_rwsem of block groups
4657 * is going to be acquired.
4658 *
4659 *
4660 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4661 * case -ENOSPC is returned then @ins->offset will contain the size of the
4662 * largest available hole the allocator managed to find.
4663 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct find_free_extent_ctl ffe_ctl = {};
	/* If the caller already asked for the minimum size there is nothing to retry. */
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;
	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);

	/* Pack the search parameters into the find_free_extent() control struct. */
	ffe_ctl.ram_bytes = ram_bytes;
	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.min_alloc_size = min_alloc_size;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.hint_byte = hint_byte;
	ffe_ctl.for_treelog = for_treelog;
	ffe_ctl.for_data_reloc = for_data_reloc;

	ret = find_free_extent(root, ins, &ffe_ctl);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		/*
		 * On ENOSPC, retry with progressively smaller sizes (halved,
		 * capped below by ins->offset — the largest hole found — and
		 * by min_alloc_size) until the minimum has been tried.
		 */
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			/* Final failure: optionally dump space info for debugging. */
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
				  flags, num_bytes, for_treelog, for_data_reloc);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}
4719
a0fbf736
NB
4720int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4721 u64 start, u64 len, int delalloc)
65b51a00 4722{
32da5386 4723 struct btrfs_block_group *cache;
0f9dd46c 4724
0b246afa 4725 cache = btrfs_lookup_block_group(fs_info, start);
0f9dd46c 4726 if (!cache) {
0b246afa
JM
4727 btrfs_err(fs_info, "Unable to find block group for %llu",
4728 start);
0f9dd46c
JB
4729 return -ENOSPC;
4730 }
1f3c79a2 4731
a0fbf736
NB
4732 btrfs_add_free_space(cache, start, len);
4733 btrfs_free_reserved_bytes(cache, len, delalloc);
4734 trace_btrfs_reserved_extent_free(fs_info, start, len);
4735
fa9c0d79 4736 btrfs_put_block_group(cache);
a0fbf736 4737 return 0;
e6dcd2dc
CM
4738}
4739
f863c502
DS
4740int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4741 const struct extent_buffer *eb)
e688b725 4742{
7ef54d54 4743 struct btrfs_block_group *cache;
a0fbf736 4744 int ret = 0;
7ef54d54 4745
f863c502 4746 cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
7ef54d54 4747 if (!cache) {
7bfc1007 4748 btrfs_err(trans->fs_info, "unable to find block group for %llu",
f863c502 4749 eb->start);
7ef54d54
NB
4750 return -ENOSPC;
4751 }
4752
f863c502 4753 ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
7ef54d54 4754 btrfs_put_block_group(cache);
a0fbf736 4755 return ret;
e688b725
CM
4756}
4757
34666705
JB
4758static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4759 u64 num_bytes)
4760{
4761 struct btrfs_fs_info *fs_info = trans->fs_info;
4762 int ret;
4763
4764 ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4765 if (ret)
4766 return ret;
4767
4768 ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4769 if (ret) {
4770 ASSERT(!ret);
4771 btrfs_err(fs_info, "update block group failed for %llu %llu",
4772 bytenr, num_bytes);
4773 return ret;
4774 }
4775
4776 trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4777 return 0;
4778}
4779
5d4f98a2 4780static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
4781 u64 parent, u64 root_objectid,
4782 u64 flags, u64 owner, u64 offset,
2672a051 4783 struct btrfs_key *ins, int ref_mod, u64 oref_root)
e6dcd2dc 4784{
ef89b824 4785 struct btrfs_fs_info *fs_info = trans->fs_info;
29cbcf40 4786 struct btrfs_root *extent_root;
e6dcd2dc 4787 int ret;
e6dcd2dc 4788 struct btrfs_extent_item *extent_item;
d9a620f7 4789 struct btrfs_extent_owner_ref *oref;
5d4f98a2 4790 struct btrfs_extent_inline_ref *iref;
e6dcd2dc 4791 struct btrfs_path *path;
5d4f98a2
YZ
4792 struct extent_buffer *leaf;
4793 int type;
4794 u32 size;
d9a620f7 4795 const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
26b8003f 4796
5d4f98a2
YZ
4797 if (parent > 0)
4798 type = BTRFS_SHARED_DATA_REF_KEY;
4799 else
4800 type = BTRFS_EXTENT_DATA_REF_KEY;
58176a96 4801
d9a620f7
BB
4802 size = sizeof(*extent_item);
4803 if (simple_quota)
4804 size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
4805 size += btrfs_extent_inline_ref_size(type);
7bb86316
CM
4806
4807 path = btrfs_alloc_path();
db5b493a
TI
4808 if (!path)
4809 return -ENOMEM;
47e4bb98 4810
29cbcf40
JB
4811 extent_root = btrfs_extent_root(fs_info, ins->objectid);
4812 ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
79787eaa
JM
4813 if (ret) {
4814 btrfs_free_path(path);
4815 return ret;
4816 }
0f9dd46c 4817
5d4f98a2
YZ
4818 leaf = path->nodes[0];
4819 extent_item = btrfs_item_ptr(leaf, path->slots[0],
47e4bb98 4820 struct btrfs_extent_item);
5d4f98a2
YZ
4821 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4822 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4823 btrfs_set_extent_flags(leaf, extent_item,
4824 flags | BTRFS_EXTENT_FLAG_DATA);
4825
4826 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
d9a620f7
BB
4827 if (simple_quota) {
4828 btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
4829 oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2672a051 4830 btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
d9a620f7
BB
4831 iref = (struct btrfs_extent_inline_ref *)(oref + 1);
4832 }
5d4f98a2 4833 btrfs_set_extent_inline_ref_type(leaf, iref, type);
d9a620f7 4834
5d4f98a2
YZ
4835 if (parent > 0) {
4836 struct btrfs_shared_data_ref *ref;
4837 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4838 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4839 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4840 } else {
4841 struct btrfs_extent_data_ref *ref;
4842 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4843 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4844 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4845 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4846 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4847 }
47e4bb98 4848
50564b65 4849 btrfs_mark_buffer_dirty(trans, path->nodes[0]);
7bb86316 4850 btrfs_free_path(path);
f510cfec 4851
34666705 4852 return alloc_reserved_extent(trans, ins->objectid, ins->offset);
e6dcd2dc
CM
4853}
4854
5d4f98a2 4855static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4e6bd4e0 4856 struct btrfs_delayed_ref_node *node,
21ebfbe7 4857 struct btrfs_delayed_extent_op *extent_op)
e6dcd2dc 4858{
9dcdbe01 4859 struct btrfs_fs_info *fs_info = trans->fs_info;
29cbcf40 4860 struct btrfs_root *extent_root;
e6dcd2dc 4861 int ret;
5d4f98a2 4862 struct btrfs_extent_item *extent_item;
4e6bd4e0 4863 struct btrfs_key extent_key;
5d4f98a2
YZ
4864 struct btrfs_tree_block_info *block_info;
4865 struct btrfs_extent_inline_ref *iref;
4866 struct btrfs_path *path;
4867 struct extent_buffer *leaf;
4e6bd4e0 4868 struct btrfs_delayed_tree_ref *ref;
3173a18f 4869 u32 size = sizeof(*extent_item) + sizeof(*iref);
21ebfbe7 4870 u64 flags = extent_op->flags_to_set;
0b246afa 4871 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3173a18f 4872
4e6bd4e0
NB
4873 ref = btrfs_delayed_node_to_tree_ref(node);
4874
4e6bd4e0
NB
4875 extent_key.objectid = node->bytenr;
4876 if (skinny_metadata) {
4877 extent_key.offset = ref->level;
4878 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4e6bd4e0
NB
4879 } else {
4880 extent_key.offset = node->num_bytes;
4881 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
3173a18f 4882 size += sizeof(*block_info);
4e6bd4e0 4883 }
1c2308f8 4884
5d4f98a2 4885 path = btrfs_alloc_path();
80ee54bf 4886 if (!path)
d8926bb3 4887 return -ENOMEM;
56bec294 4888
29cbcf40
JB
4889 extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4890 ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4891 size);
79787eaa 4892 if (ret) {
dd825259 4893 btrfs_free_path(path);
79787eaa
JM
4894 return ret;
4895 }
5d4f98a2
YZ
4896
4897 leaf = path->nodes[0];
4898 extent_item = btrfs_item_ptr(leaf, path->slots[0],
4899 struct btrfs_extent_item);
4900 btrfs_set_extent_refs(leaf, extent_item, 1);
4901 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4902 btrfs_set_extent_flags(leaf, extent_item,
4903 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5d4f98a2 4904
3173a18f
JB
4905 if (skinny_metadata) {
4906 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4907 } else {
4908 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
21ebfbe7 4909 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4e6bd4e0 4910 btrfs_set_tree_block_level(leaf, block_info, ref->level);
3173a18f
JB
4911 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4912 }
5d4f98a2 4913
d4b20733 4914 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
5d4f98a2
YZ
4915 btrfs_set_extent_inline_ref_type(leaf, iref,
4916 BTRFS_SHARED_BLOCK_REF_KEY);
cf4f0432 4917 btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
5d4f98a2
YZ
4918 } else {
4919 btrfs_set_extent_inline_ref_type(leaf, iref,
4920 BTRFS_TREE_BLOCK_REF_KEY);
cf4f0432 4921 btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
5d4f98a2
YZ
4922 }
4923
50564b65 4924 btrfs_mark_buffer_dirty(trans, leaf);
5d4f98a2
YZ
4925 btrfs_free_path(path);
4926
34666705 4927 return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
5d4f98a2
YZ
4928}
4929
/*
 * Queue a delayed ref adding the first reference for a freshly reserved data
 * extent @ins owned by @root (file @owner at file @offset).  @ram_bytes is
 * the uncompressed size used for accounting.
 */
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = {
		.action = BTRFS_ADD_DELAYED_EXTENT,
		.bytenr = ins->objectid,
		.num_bytes = ins->offset,
		.owning_root = root->root_key.objectid,
		.ref_root = root->root_key.objectid,
	};

	/* Log tree extents are never allocated through this path. */
	ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);

	/* Data relocation keeps the extent owned by its original source root. */
	if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
		generic_ref.owning_root = root->relocation_src_root;

	btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);

	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
e02119d5
CM
4953
4954/*
4955 * this is used by the tree logging recovery code. It records that
4956 * an extent has been allocated and makes sure to clear the free
4957 * space cache bits as well
4958 */
5d4f98a2 4959int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
4960 u64 root_objectid, u64 owner, u64 offset,
4961 struct btrfs_key *ins)
e02119d5 4962{
61da2abf 4963 struct btrfs_fs_info *fs_info = trans->fs_info;
e02119d5 4964 int ret;
32da5386 4965 struct btrfs_block_group *block_group;
ed7a6948 4966 struct btrfs_space_info *space_info;
cecbb533
BB
4967 struct btrfs_squota_delta delta = {
4968 .root = root_objectid,
4969 .num_bytes = ins->offset,
bd7c1ea3 4970 .generation = trans->transid,
cecbb533
BB
4971 .is_data = true,
4972 .is_inc = true,
4973 };
11833d66 4974
8c2a1a30
JB
4975 /*
4976 * Mixed block groups will exclude before processing the log so we only
01327610 4977 * need to do the exclude dance if this fs isn't mixed.
8c2a1a30 4978 */
0b246afa 4979 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
2ff7e61e
JM
4980 ret = __exclude_logged_extent(fs_info, ins->objectid,
4981 ins->offset);
b50c6e25 4982 if (ret)
8c2a1a30 4983 return ret;
11833d66
YZ
4984 }
4985
0b246afa 4986 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8c2a1a30
JB
4987 if (!block_group)
4988 return -EINVAL;
4989
ed7a6948
WX
4990 space_info = block_group->space_info;
4991 spin_lock(&space_info->lock);
4992 spin_lock(&block_group->lock);
4993 space_info->bytes_reserved += ins->offset;
4994 block_group->reserved += ins->offset;
4995 spin_unlock(&block_group->lock);
4996 spin_unlock(&space_info->lock);
4997
ef89b824 4998 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
2672a051 4999 offset, ins, 1, root_objectid);
bd727173 5000 if (ret)
ab9b2c7b 5001 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
cecbb533 5002 ret = btrfs_record_squota_delta(fs_info, &delta);
b50c6e25 5003 btrfs_put_block_group(block_group);
e02119d5
CM
5004 return ret;
5005}
5006
150cce2d
DS
5007#ifdef CONFIG_BTRFS_DEBUG
5008/*
5009 * Extra safety check in case the extent tree is corrupted and extent allocator
5010 * chooses to use a tree block which is already used and locked.
5011 */
5012static bool check_eb_lock_owner(const struct extent_buffer *eb)
5013{
5014 if (eb->lock_owner == current->pid) {
5015 btrfs_err_rl(eb->fs_info,
5016"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
5017 eb->start, btrfs_header_owner(eb), current->pid);
5018 return true;
5019 }
5020 return false;
5021}
5022#else
5023static bool check_eb_lock_owner(struct extent_buffer *eb)
5024{
5025 return false;
5026}
5027#endif
5028
/*
 * Initialize a newly allocated tree block at @bytenr: create/lock the extent
 * buffer, wipe its dirty state, stamp the header fields (level, owner,
 * generation, fsid, chunk tree uuid) and tag it dirty in the appropriate
 * extent io tree.  Returns the locked buffer or an ERR_PTR.
 */
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner,
		      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;
	u64 lockdep_owner = owner;

	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
	if (IS_ERR(buf))
		return buf;

	/* Allocator handed out a block we already hold locked: corruption. */
	if (check_eb_lock_owner(buf)) {
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	/*
	 * The reloc trees are just snapshots, so we need them to appear to be
	 * just like any other fs tree WRT lockdep.
	 *
	 * The exception however is in replace_path() in relocation, where we
	 * hold the lock on the original fs root and then search for the reloc
	 * root.  At that point we need to make sure any reloc root buffers are
	 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
	 * lockdep happy.
	 */
	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
	    !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
		lockdep_owner = BTRFS_FS_TREE_OBJECTID;

	/* btrfs_clear_buffer_dirty() accesses generation field. */
	btrfs_set_header_generation(buf, trans->transid);

	/*
	 * This needs to stay, because we could allocate a freed block from an
	 * old tree into a new tree, so we need to make sure this new block is
	 * set to the appropriate level and owner.
	 */
	btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);

	btrfs_tree_lock_nested(buf, nest);
	btrfs_clear_buffer_dirty(trans, buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
	clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);

	set_extent_buffer_uptodate(buf);

	/* Stamp a fresh header for the new block. */
	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_bit(&root->dirty_log_pages, buf->start,
				       buf->start + buf->len - 1,
				       EXTENT_DIRTY, NULL);
		else
			set_extent_bit(&root->dirty_log_pages, buf->start,
				       buf->start + buf->len - 1,
				       EXTENT_NEW, NULL);
	} else {
		buf->log_index = -1;
		set_extent_bit(&trans->transaction->dirty_pages, buf->start,
			       buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
	}
	/* this returns a buffer locked for blocking */
	return buf;
}
5108
fec577fb 5109/*
f0486c68 5110 * finds a free extent and does all the dirty work required for allocation
67b7859e 5111 * returns the tree buffer or an ERR_PTR on error.
fec577fb 5112 */
4d75f8a9 5113struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
310712b2
OS
5114 struct btrfs_root *root,
5115 u64 parent, u64 root_objectid,
5116 const struct btrfs_disk_key *key,
5117 int level, u64 hint,
9631e4cc 5118 u64 empty_size,
60ea105a 5119 u64 reloc_src_root,
9631e4cc 5120 enum btrfs_lock_nesting nest)
fec577fb 5121{
0b246afa 5122 struct btrfs_fs_info *fs_info = root->fs_info;
e2fa7227 5123 struct btrfs_key ins;
f0486c68 5124 struct btrfs_block_rsv *block_rsv;
5f39d397 5125 struct extent_buffer *buf;
67b7859e 5126 struct btrfs_delayed_extent_op *extent_op;
f0486c68
YZ
5127 u64 flags = 0;
5128 int ret;
0b246afa
JM
5129 u32 blocksize = fs_info->nodesize;
5130 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
60ea105a 5131 u64 owning_root;
fec577fb 5132
05653ef3 5133#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0b246afa 5134 if (btrfs_is_testing(fs_info)) {
faa2dbf0 5135 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
9631e4cc 5136 level, root_objectid, nest);
faa2dbf0
JB
5137 if (!IS_ERR(buf))
5138 root->alloc_bytenr += blocksize;
5139 return buf;
5140 }
05653ef3 5141#endif
fccb84c9 5142
67f9c220 5143 block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
f0486c68
YZ
5144 if (IS_ERR(block_rsv))
5145 return ERR_CAST(block_rsv);
5146
18513091 5147 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
e570fd27 5148 empty_size, hint, &ins, 0, 0);
67b7859e
OS
5149 if (ret)
5150 goto out_unuse;
55c69072 5151
bc877d28 5152 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
9631e4cc 5153 root_objectid, nest);
67b7859e
OS
5154 if (IS_ERR(buf)) {
5155 ret = PTR_ERR(buf);
5156 goto out_free_reserved;
5157 }
60ea105a 5158 owning_root = btrfs_header_owner(buf);
f0486c68
YZ
5159
5160 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5161 if (parent == 0)
5162 parent = ins.objectid;
5163 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
60ea105a 5164 owning_root = reloc_src_root;
f0486c68
YZ
5165 } else
5166 BUG_ON(parent > 0);
5167
5168 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4d09b4e9
JB
5169 struct btrfs_ref generic_ref = {
5170 .action = BTRFS_ADD_DELAYED_EXTENT,
5171 .bytenr = ins.objectid,
12390e42 5172 .num_bytes = ins.offset,
4d09b4e9
JB
5173 .parent = parent,
5174 .owning_root = owning_root,
f2e69a77 5175 .ref_root = root_objectid,
4d09b4e9 5176 };
78a6184a 5177 extent_op = btrfs_alloc_delayed_extent_op();
67b7859e
OS
5178 if (!extent_op) {
5179 ret = -ENOMEM;
5180 goto out_free_buf;
5181 }
f0486c68
YZ
5182 if (key)
5183 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5184 else
5185 memset(&extent_op->key, 0, sizeof(extent_op->key));
5186 extent_op->flags_to_set = flags;
35b3ad50
DS
5187 extent_op->update_key = skinny_metadata ? false : true;
5188 extent_op->update_flags = true;
b1c79e09 5189 extent_op->level = level;
f0486c68 5190
f2e69a77 5191 btrfs_init_tree_ref(&generic_ref, level,
f42c5da6 5192 root->root_key.objectid, false);
8a5040f7 5193 btrfs_ref_tree_mod(fs_info, &generic_ref);
2187374f 5194 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
67b7859e
OS
5195 if (ret)
5196 goto out_free_delayed;
f0486c68 5197 }
fec577fb 5198 return buf;
67b7859e
OS
5199
5200out_free_delayed:
5201 btrfs_free_delayed_extent_op(extent_op);
5202out_free_buf:
19ea40dd 5203 btrfs_tree_unlock(buf);
67b7859e
OS
5204 free_extent_buffer(buf);
5205out_free_reserved:
2ff7e61e 5206 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
67b7859e 5207out_unuse:
67f9c220 5208 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
67b7859e 5209 return ERR_PTR(ret);
fec577fb 5210}
a28ec197 5211
/* State carried while walking a subtree during snapshot/subvolume drop. */
struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];		/* refcount of the node at each level */
	u64 flags[BTRFS_MAX_LEVEL];		/* extent flags of the node at each level */
	struct btrfs_key update_progress;	/* resume point for UPDATE_BACKREF stage */
	struct btrfs_key drop_progress;		/* resume point for DROP_REFERENCE stage */
	int drop_level;
	int stage;				/* DROP_REFERENCE or UPDATE_BACKREF */
	int level;				/* current level in the walk */
	int shared_level;			/* level where a shared block was found */
	int update_ref;
	int keep_locks;
	int reada_slot;				/* next slot to read ahead from */
	int reada_count;			/* adaptive readahead window size */
	int restarted;				/* walk resumed from a saved drop_progress */
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2
5230
/*
 * Issue readahead for child blocks of the node at the current walk level,
 * skipping children that the walk will not descend into.  The readahead
 * window (wc->reada_count) shrinks or grows depending on whether the walk
 * has caught up with the previous readahead position.
 */
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	/* Shrink the window when we are behind the last readahead, grow otherwise. */
	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		/* Always read the block the walk is about to visit. */
		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags, NULL);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			/* Mirror the skip conditions used by do_walk_down(). */
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		btrfs_readahead_node_child(eb, slot);
		nread++;
	}
	wc->reada_slot = slot;
}
2c47e605 5310
f82d02d9 5311/*
2c016dc2 5312 * helper to process tree block while walking down the tree.
2c47e605 5313 *
2c47e605
YZ
5314 * when wc->stage == UPDATE_BACKREF, this function updates
5315 * back refs for pointers in the block.
5316 *
5317 * NOTE: return value 1 means we should stop walking down.
f82d02d9 5318 */
2c47e605 5319static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 5320 struct btrfs_root *root,
2c47e605 5321 struct btrfs_path *path,
94fcca9f 5322 struct walk_control *wc, int lookup_info)
f82d02d9 5323{
2ff7e61e 5324 struct btrfs_fs_info *fs_info = root->fs_info;
2c47e605
YZ
5325 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level];
2c47e605 5327 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
5328 int ret;
5329
2c47e605
YZ
5330 if (wc->stage == UPDATE_BACKREF &&
5331 btrfs_header_owner(eb) != root->root_key.objectid)
5332 return 1;
f82d02d9 5333
2c47e605
YZ
5334 /*
5335 * when reference count of tree block is 1, it won't increase
5336 * again. once full backref flag is set, we never clear it.
5337 */
94fcca9f
YZ
5338 if (lookup_info &&
5339 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5340 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
2c47e605 5341 BUG_ON(!path->locks[level]);
2ff7e61e 5342 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 5343 eb->start, level, 1,
2c47e605 5344 &wc->refs[level],
d8ba2a91
JB
5345 &wc->flags[level],
5346 NULL);
79787eaa
JM
5347 BUG_ON(ret == -ENOMEM);
5348 if (ret)
5349 return ret;
2c47e605
YZ
5350 BUG_ON(wc->refs[level] == 0);
5351 }
5d4f98a2 5352
2c47e605
YZ
5353 if (wc->stage == DROP_REFERENCE) {
5354 if (wc->refs[level] > 1)
5355 return 1;
f82d02d9 5356
2c47e605 5357 if (path->locks[level] && !wc->keep_locks) {
bd681513 5358 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
5359 path->locks[level] = 0;
5360 }
5361 return 0;
5362 }
f82d02d9 5363
2c47e605
YZ
5364 /* wc->stage == UPDATE_BACKREF */
5365 if (!(wc->flags[level] & flag)) {
5366 BUG_ON(!path->locks[level]);
e339a6b0 5367 ret = btrfs_inc_ref(trans, root, eb, 1);
79787eaa 5368 BUG_ON(ret); /* -ENOMEM */
e339a6b0 5369 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 5370 BUG_ON(ret); /* -ENOMEM */
4aec05fa 5371 ret = btrfs_set_disk_extent_flags(trans, eb, flag);
79787eaa 5372 BUG_ON(ret); /* -ENOMEM */
2c47e605
YZ
5373 wc->flags[level] |= flag;
5374 }
5375
5376 /*
5377 * the block is shared by multiple trees, so it's not good to
5378 * keep the tree lock
5379 */
5380 if (path->locks[level] && level > 0) {
bd681513 5381 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
5382 path->locks[level] = 0;
5383 }
5384 return 0;
5385}
5386
78c52d9e
JB
5387/*
5388 * This is used to verify a ref exists for this root to deal with a bug where we
5389 * would have a drop_progress key that hadn't been updated properly.
5390 */
5391static int check_ref_exists(struct btrfs_trans_handle *trans,
5392 struct btrfs_root *root, u64 bytenr, u64 parent,
5393 int level)
5394{
5395 struct btrfs_path *path;
5396 struct btrfs_extent_inline_ref *iref;
5397 int ret;
5398
5399 path = btrfs_alloc_path();
5400 if (!path)
5401 return -ENOMEM;
5402
5403 ret = lookup_extent_backref(trans, path, &iref, bytenr,
5404 root->fs_info->nodesize, parent,
5405 root->root_key.objectid, level, 0);
5406 btrfs_free_path(path);
5407 if (ret == -ENOENT)
5408 return 0;
5409 if (ret < 0)
5410 return ret;
5411 return 1;
5412}
5413
/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * On success the child block is locked and installed at
 * path->nodes[level - 1] (return 0), or skipped entirely with
 * *lookup_info set (return 1).  On error the child is unlocked
 * and released before returning a negative errno.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 owner_root = 0;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_key key;
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);

	/* Parent-check data used to validate the child if we must read it. */
	check.level = level - 1;
	check.transid = generation;
	check.owner_root = root->root_key.objectid;
	check.has_first_key = true;
	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
			      path->slots[level]);

	/*
	 * Use a cached extent buffer if one exists, otherwise create a
	 * placeholder; a freshly created buffer was not in cache, which is
	 * the cue to try readahead before reading it below.
	 */
	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr,
				root->root_key.objectid, level - 1);
		if (IS_ERR(next))
			return PTR_ERR(next);
		reada = 1;
	}
	btrfs_tree_lock(next);

	/* Fetch refcount/flags of the child so we can tell if it's shared. */
	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1],
				       &owner_root);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			/* Shared subtree: may need qgroup accounting below. */
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			/* Switch stages to fix up backrefs under this node. */
			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	/* Stale or unread buffer: drop it and force a fresh read below. */
	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, &check);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
	}

	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	/* Descend: hand the locked child over to the path. */
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		struct btrfs_ref ref = {
			.action = BTRFS_DROP_DELAYED_REF,
			.bytenr = bytenr,
			.num_bytes = fs_info->nodesize,
			.owning_root = owner_root,
			.ref_root = root->root_key.objectid,
		};
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			ref.parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
					  "mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
		}

		/*
		 * If we had a drop_progress we need to verify the refs are set
		 * as expected. If we find our ref then we know that from here
		 * on out everything should be correct, and we can clear the
		 * ->restarted flag.
		 */
		if (wc->restarted) {
			ret = check_ref_exists(trans, root, bytenr, ref.parent,
					       level - 1);
			if (ret < 0)
				goto out_unlock;
			if (ret == 0)
				goto no_delete;
			ret = 0;
			wc->restarted = 0;
		}

		/*
		 * Reloc tree doesn't contribute to qgroup numbers, and we have
		 * already accounted them at merge time (replace_path),
		 * thus we could skip expensive subtree trace here.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
		    need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, next,
							 generation, level - 1);
			if (ret) {
				/* Accounting error is logged, not fatal. */
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}

		/*
		 * We need to update the next key in our walk control so we can
		 * update the drop_progress key accordingly. We don't care if
		 * find_next_key doesn't find a key because that means we're at
		 * the end and are going to clean up now.
		 */
		wc->drop_level = level;
		find_next_key(path, level, &wc->drop_progress);

		btrfs_init_tree_ref(&ref, level - 1, 0, false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret)
			goto out_unlock;
	}
no_delete:
	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}
5632
/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		/* No next key means the backref walk covered everything. */
		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level],
						       NULL);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				/* Only ref is ours: re-walk down to drop it. */
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			/* Leaf: drop refs on the items it points to. */
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			if (is_fstree(root->root_key.objectid)) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
						     "error %d accounting leaf items, quota is out of sync, rescan required",
						     ret);
				}
			}
		}
		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
		if (!path->locks[level]) {
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;
		}
		btrfs_clear_buffer_dirty(trans, eb);
	}

	/* Work out the parent bytenr needed to free this block's backref. */
	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (root->root_key.objectid != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (root->root_key.objectid !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
			      wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
}
5750
5751static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5752 struct btrfs_root *root,
5753 struct btrfs_path *path,
5754 struct walk_control *wc)
5755{
2c47e605 5756 int level = wc->level;
94fcca9f 5757 int lookup_info = 1;
4e194384 5758 int ret = 0;
2c47e605
YZ
5759
5760 while (level >= 0) {
94fcca9f 5761 ret = walk_down_proc(trans, root, path, wc, lookup_info);
4e194384 5762 if (ret)
2c47e605
YZ
5763 break;
5764
5765 if (level == 0)
5766 break;
5767
7a7965f8
YZ
5768 if (path->slots[level] >=
5769 btrfs_header_nritems(path->nodes[level]))
5770 break;
5771
94fcca9f 5772 ret = do_walk_down(trans, root, path, wc, &lookup_info);
1c4850e2
YZ
5773 if (ret > 0) {
5774 path->slots[level]++;
5775 continue;
90d2c51d 5776 } else if (ret < 0)
4e194384 5777 break;
1c4850e2 5778 level = wc->level;
f82d02d9 5779 }
4e194384 5780 return (ret == 1) ? 0 : ret;
f82d02d9
YZ
5781}
5782
d397712b 5783static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 5784 struct btrfs_root *root,
f82d02d9 5785 struct btrfs_path *path,
2c47e605 5786 struct walk_control *wc, int max_level)
20524f02 5787{
2c47e605 5788 int level = wc->level;
20524f02 5789 int ret;
9f3a7427 5790
2c47e605
YZ
5791 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5792 while (level < max_level && path->nodes[level]) {
5793 wc->level = level;
5794 if (path->slots[level] + 1 <
5795 btrfs_header_nritems(path->nodes[level])) {
5796 path->slots[level]++;
20524f02
CM
5797 return 0;
5798 } else {
2c47e605
YZ
5799 ret = walk_up_proc(trans, root, path, wc);
5800 if (ret > 0)
5801 return 0;
65c6e82b
QW
5802 if (ret < 0)
5803 return ret;
bd56b302 5804
2c47e605 5805 if (path->locks[level]) {
bd681513
CM
5806 btrfs_tree_unlock_rw(path->nodes[level],
5807 path->locks[level]);
2c47e605 5808 path->locks[level] = 0;
f82d02d9 5809 }
2c47e605
YZ
5810 free_extent_buffer(path->nodes[level]);
5811 path->nodes[level] = NULL;
5812 level++;
20524f02
CM
5813 }
5814 }
5815 return 1;
5816}
5817
/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that only
 * referenced by the tree.
 *
 * when a shared tree block is found. this function decreases its
 * reference count by one. if update_ref is true, this function
 * also make sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 *
 * The walk is restartable: drop_progress/drop_level are persisted in the
 * root item each time the transaction is ended, so an interrupted drop
 * resumes where it left off on the next attempt.
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
	const bool is_reloc_root = (root->root_key.objectid ==
				    BTRFS_TREE_RELOC_OBJECTID);
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;
	bool unfinished_drop = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Use join to avoid potential EINTR from transaction start. See
	 * wait_reserve_ticket and the whole reservation callchain.
	 */
	if (for_reloc)
		trans = btrfs_join_transaction(tree_root);
	else
		trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it. It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change. If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		/* Fresh drop: start at the root node. */
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		/* Resuming a previous drop: seek back to the saved key. */
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		/* Re-lock and re-read refcounts from the top down to the
		 * resume level so wc->refs/wc->flags are populated. */
		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level], NULL);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == btrfs_root_drop_level(root_item))
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			break;
		}

		if (ret > 0) {
			/* The whole tree has been walked. */
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		/* Persist resume point so an interrupted drop can restart. */
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		btrfs_set_root_drop_level(root_item, wc->drop_level);

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			if (!is_reloc_root)
				btrfs_set_last_root_drop_gen(fs_info, trans->transid);

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			/*
			 * Use join to avoid potential EINTR from transaction
			 * start. See wait_reserve_ticket and the whole
			 * reservation callchain.
			 */
			if (for_reloc)
				trans = btrfs_join_transaction(tree_root);
			else
				trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (!is_reloc_root) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	/*
	 * This subvolume is going to be completely dropped, and won't be
	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
	 * commit transaction time. So free it here manually.
	 */
	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
	btrfs_qgroup_free_meta_all_pertrans(root);

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_add_dropped_root(trans, root);
	else
		btrfs_put_root(root);
	root_dropped = true;
out_end_trans:
	if (!is_reloc_root)
		btrfs_set_last_root_drop_gen(fs_info, trans->transid);

	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * We were an unfinished drop root, check to see if there are any
	 * pending, and if not clear and wake up any waiters.
	 */
	if (!err && unfinished_drop)
		btrfs_maybe_wake_unfinished_drop(fs_info);

	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later. This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	return err;
}
9078a3e1 6091
2c47e605
YZ
6092/*
6093 * drop subtree rooted at tree block 'node'.
6094 *
6095 * NOTE: this function will unlock and release tree block 'node'
66d7e7f0 6096 * only used by relocation code
2c47e605 6097 */
f82d02d9
YZ
6098int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6099 struct btrfs_root *root,
6100 struct extent_buffer *node,
6101 struct extent_buffer *parent)
6102{
0b246afa 6103 struct btrfs_fs_info *fs_info = root->fs_info;
f82d02d9 6104 struct btrfs_path *path;
2c47e605 6105 struct walk_control *wc;
f82d02d9
YZ
6106 int level;
6107 int parent_level;
6108 int ret = 0;
6109 int wret;
6110
2c47e605
YZ
6111 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6112
f82d02d9 6113 path = btrfs_alloc_path();
db5b493a
TI
6114 if (!path)
6115 return -ENOMEM;
f82d02d9 6116
2c47e605 6117 wc = kzalloc(sizeof(*wc), GFP_NOFS);
db5b493a
TI
6118 if (!wc) {
6119 btrfs_free_path(path);
6120 return -ENOMEM;
6121 }
2c47e605 6122
49d0c642 6123 btrfs_assert_tree_write_locked(parent);
f82d02d9 6124 parent_level = btrfs_header_level(parent);
67439dad 6125 atomic_inc(&parent->refs);
f82d02d9
YZ
6126 path->nodes[parent_level] = parent;
6127 path->slots[parent_level] = btrfs_header_nritems(parent);
6128
49d0c642 6129 btrfs_assert_tree_write_locked(node);
f82d02d9 6130 level = btrfs_header_level(node);
f82d02d9
YZ
6131 path->nodes[level] = node;
6132 path->slots[level] = 0;
ac5887c8 6133 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605
YZ
6134
6135 wc->refs[parent_level] = 1;
6136 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6137 wc->level = level;
6138 wc->shared_level = -1;
6139 wc->stage = DROP_REFERENCE;
6140 wc->update_ref = 0;
6141 wc->keep_locks = 1;
0b246afa 6142 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
f82d02d9
YZ
6143
6144 while (1) {
2c47e605
YZ
6145 wret = walk_down_tree(trans, root, path, wc);
6146 if (wret < 0) {
f82d02d9 6147 ret = wret;
f82d02d9 6148 break;
2c47e605 6149 }
f82d02d9 6150
2c47e605 6151 wret = walk_up_tree(trans, root, path, wc, parent_level);
f82d02d9
YZ
6152 if (wret < 0)
6153 ret = wret;
6154 if (wret != 0)
6155 break;
6156 }
6157
2c47e605 6158 kfree(wc);
f82d02d9
YZ
6159 btrfs_free_path(path);
6160 return ret;
6161}
6162
/*
 * Unpin the extent range in an error context and don't add the space back.
 * Errors are not propagated further.
 */
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
	/* false: do not return the range to free-space accounting. */
	unpin_extent_range(fs_info, start, end, false);
}
6171
/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time. We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses. For that, we need to take a reference to the
 * transaction and hold the commit root sem. We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	/* Skip the reserved space at the start of each device. */
	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!bdev_max_discard_sectors(device->bdev))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	/*
	 * Each iteration finds the next untrimmed, unallocated range and
	 * discards it.  chunk_mutex is held only around the search and
	 * discard of one range and dropped before every exit from the loop.
	 */
	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Check if there are any CHUNK_* bits left */
		if (start > device->total_bytes) {
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
					  start, end - start + 1,
					  btrfs_dev_name(device),
					  device->total_bytes);
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		/* Ensure we skip the reserved space on each device. */
		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);

		/*
		 * If find_first_clear_extent_bit find a range that spans the
		 * end of the device it will set end to -1, in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			/* Remember the range so we don't re-trim it later. */
			set_extent_bit(&device->alloc_state, start,
				       start + bytes - 1, CHUNK_TRIMMED, NULL);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}
6280
/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error. The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	if (range->start == U64_MAX)
		return -EINVAL;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	/*
	 * Pass 1: trim free space inside each block group overlapping the
	 * requested range.  btrfs_next_block_group() releases the previous
	 * group's reference, so only the early-break path needs an explicit
	 * put.
	 */
	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_done(cache)) {
				/* Free space must be cached before trimming. */
				ret = btrfs_cache_block_group(cache, true);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				/* Record the failure but keep going. */
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			   "failed to trim %llu block group(s), last error %d",
			   bg_failed, bg_ret);

	/* Pass 2: trim the unallocated space on every present device. */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
			continue;

		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			   "failed to trim %llu device(s), last error %d",
			   dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}