Merge tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj...
[linux-2.6-block.git] / fs / btrfs / extent-tree.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
6cbd5570
CM
2/*
3 * Copyright (C) 2007 Oracle. All rights reserved.
6cbd5570 4 */
c1d7c514 5
ec6b910f 6#include <linux/sched.h>
f361bf4a 7#include <linux/sched/signal.h>
edbd8d4e 8#include <linux/pagemap.h>
ec44a35c 9#include <linux/writeback.h>
21af804c 10#include <linux/blkdev.h>
b7a9f29f 11#include <linux/sort.h>
4184ea7f 12#include <linux/rcupdate.h>
817d52f8 13#include <linux/kthread.h>
5a0e3ad6 14#include <linux/slab.h>
dff51cd1 15#include <linux/ratelimit.h>
b150a4f1 16#include <linux/percpu_counter.h>
69fe2d75 17#include <linux/lockdep.h>
9678c543 18#include <linux/crc32c.h>
cfc2de0f
BB
19#include "ctree.h"
20#include "extent-tree.h"
2b712e3b 21#include "transaction.h"
fec577fb
CM
22#include "disk-io.h"
23#include "print-tree.h"
0b86a832 24#include "volumes.h"
53b381b3 25#include "raid56.h"
925baedd 26#include "locking.h"
fa9c0d79 27#include "free-space-cache.h"
1e144fb8 28#include "free-space-tree.h"
fcebe456 29#include "qgroup.h"
fd708b81 30#include "ref-verify.h"
8719aaae 31#include "space-info.h"
d12ffdd1 32#include "block-rsv.h"
b0643e59 33#include "discard.h"
169e0da9 34#include "zoned.h"
6143c23c 35#include "dev-replace.h"
c7f13d42 36#include "fs.h"
07e81dc9 37#include "accessors.h"
45c40c8f 38#include "root-tree.h"
7c8ede16 39#include "file-item.h"
aa5d3003 40#include "orphan.h"
103c1972 41#include "tree-checker.h"
02c372e1 42#include "raid-stripe-tree.h"
fec577fb 43
709c0486
AJ
44#undef SCRAMBLE_DELAYED_REFS
45
9f9b8e8d 46
5d4f98a2 47static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
cecbb533 48 struct btrfs_delayed_ref_head *href,
85bb9f54 49 struct btrfs_delayed_ref_node *node,
e72cb923 50 struct btrfs_delayed_extent_op *extra_op);
5d4f98a2
YZ
51static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
52 struct extent_buffer *leaf,
53 struct btrfs_extent_item *ei);
54static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
55 u64 parent, u64 root_objectid,
56 u64 flags, u64 owner, u64 offset,
2672a051 57 struct btrfs_key *ins, int ref_mod, u64 oref_root);
5d4f98a2 58static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4e6bd4e0 59 struct btrfs_delayed_ref_node *node,
21ebfbe7 60 struct btrfs_delayed_extent_op *extent_op);
11833d66
YZ
61static int find_next_key(struct btrfs_path *path, int level,
62 struct btrfs_key *key);
6a63209f 63
32da5386 64static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
0f9dd46c
JB
65{
66 return (cache->flags & bits) == bits;
67}
68
1a4ed8fd 69/* simple helper to search for an existing data extent at a given offset */
2ff7e61e 70int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
e02119d5 71{
29cbcf40 72 struct btrfs_root *root = btrfs_extent_root(fs_info, start);
e02119d5
CM
73 int ret;
74 struct btrfs_key key;
31840ae1 75 struct btrfs_path *path;
e02119d5 76
31840ae1 77 path = btrfs_alloc_path();
d8926bb3
MF
78 if (!path)
79 return -ENOMEM;
80
e02119d5
CM
81 key.objectid = start;
82 key.offset = len;
3173a18f 83 key.type = BTRFS_EXTENT_ITEM_KEY;
29cbcf40 84 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
31840ae1 85 btrfs_free_path(path);
7bb86316
CM
86 return ret;
87}
88
a22285a6 89/*
3173a18f 90 * helper function to lookup reference count and flags of a tree block.
a22285a6
YZ
91 *
92 * the head node for delayed ref is used to store the sum of all the
93 * reference count modifications queued up in the rbtree. the head
94 * node may also store the extent flags to set. This way you can check
95 * to see what the reference count and extent flags would be if all of
96 * the delayed refs are not processed.
97 */
98int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2ff7e61e 99 struct btrfs_fs_info *fs_info, u64 bytenr,
d8ba2a91
JB
100 u64 offset, int metadata, u64 *refs, u64 *flags,
101 u64 *owning_root)
a22285a6 102{
29cbcf40 103 struct btrfs_root *extent_root;
a22285a6
YZ
104 struct btrfs_delayed_ref_head *head;
105 struct btrfs_delayed_ref_root *delayed_refs;
106 struct btrfs_path *path;
a22285a6 107 struct btrfs_key key;
a22285a6
YZ
108 u64 num_refs;
109 u64 extent_flags;
d8ba2a91 110 u64 owner = 0;
a22285a6
YZ
111 int ret;
112
3173a18f
JB
113 /*
114 * If we don't have skinny metadata, don't bother doing anything
115 * different
116 */
0b246afa
JM
117 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
118 offset = fs_info->nodesize;
3173a18f
JB
119 metadata = 0;
120 }
121
a22285a6
YZ
122 path = btrfs_alloc_path();
123 if (!path)
124 return -ENOMEM;
125
639eefc8
FDBM
126search_again:
127 key.objectid = bytenr;
128 key.offset = offset;
129 if (metadata)
130 key.type = BTRFS_METADATA_ITEM_KEY;
131 else
132 key.type = BTRFS_EXTENT_ITEM_KEY;
133
29cbcf40
JB
134 extent_root = btrfs_extent_root(fs_info, bytenr);
135 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
a22285a6
YZ
136 if (ret < 0)
137 goto out_free;
138
c65967ac 139 if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
74be9510
FDBM
140 if (path->slots[0]) {
141 path->slots[0]--;
142 btrfs_item_key_to_cpu(path->nodes[0], &key,
143 path->slots[0]);
144 if (key.objectid == bytenr &&
145 key.type == BTRFS_EXTENT_ITEM_KEY &&
0b246afa 146 key.offset == fs_info->nodesize)
74be9510
FDBM
147 ret = 0;
148 }
3173a18f
JB
149 }
150
a22285a6 151 if (ret == 0) {
5c83b3be
FM
152 struct extent_buffer *leaf = path->nodes[0];
153 struct btrfs_extent_item *ei;
154 const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
155
156 if (unlikely(item_size < sizeof(*ei))) {
182741d2
QW
157 ret = -EUCLEAN;
158 btrfs_err(fs_info,
159 "unexpected extent item size, has %u expect >= %zu",
160 item_size, sizeof(*ei));
119474bd 161 btrfs_abort_transaction(trans, ret);
ba3c2b19 162 goto out_free;
a22285a6 163 }
ba3c2b19 164
5c83b3be
FM
165 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
166 num_refs = btrfs_extent_refs(leaf, ei);
28cb13f2
FM
167 if (unlikely(num_refs == 0)) {
168 ret = -EUCLEAN;
169 btrfs_err(fs_info,
170 "unexpected zero reference count for extent item (%llu %u %llu)",
171 key.objectid, key.type, key.offset);
172 btrfs_abort_transaction(trans, ret);
173 goto out_free;
174 }
5c83b3be
FM
175 extent_flags = btrfs_extent_flags(leaf, ei);
176 owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
a22285a6
YZ
177 } else {
178 num_refs = 0;
179 extent_flags = 0;
180 ret = 0;
181 }
182
a22285a6
YZ
183 delayed_refs = &trans->transaction->delayed_refs;
184 spin_lock(&delayed_refs->lock);
f72ad18e 185 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
a22285a6
YZ
186 if (head) {
187 if (!mutex_trylock(&head->mutex)) {
d278850e 188 refcount_inc(&head->refs);
a22285a6
YZ
189 spin_unlock(&delayed_refs->lock);
190
b3b4aa74 191 btrfs_release_path(path);
a22285a6 192
8cc33e5c
DS
193 /*
194 * Mutex was contended, block until it's released and try
195 * again
196 */
a22285a6
YZ
197 mutex_lock(&head->mutex);
198 mutex_unlock(&head->mutex);
d278850e 199 btrfs_put_delayed_ref_head(head);
639eefc8 200 goto search_again;
a22285a6 201 }
d7df2c79 202 spin_lock(&head->lock);
eba1469f 203 if (head->extent_op && head->extent_op->update_flags)
a22285a6 204 extent_flags |= head->extent_op->flags_to_set;
a22285a6 205
d278850e 206 num_refs += head->ref_mod;
d7df2c79 207 spin_unlock(&head->lock);
a22285a6
YZ
208 mutex_unlock(&head->mutex);
209 }
210 spin_unlock(&delayed_refs->lock);
119474bd 211
a22285a6
YZ
212 WARN_ON(num_refs == 0);
213 if (refs)
214 *refs = num_refs;
215 if (flags)
216 *flags = extent_flags;
d8ba2a91
JB
217 if (owning_root)
218 *owning_root = owner;
a22285a6
YZ
219out_free:
220 btrfs_free_path(path);
221 return ret;
222}
223
d8d5f3e1
CM
224/*
225 * Back reference rules. Back refs have three main goals:
226 *
227 * 1) differentiate between all holders of references to an extent so that
228 * when a reference is dropped we can make sure it was a valid reference
229 * before freeing the extent.
230 *
231 * 2) Provide enough information to quickly find the holders of an extent
232 * if we notice a given block is corrupted or bad.
233 *
234 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
235 * maintenance. This is actually the same as #2, but with a slightly
236 * different use case.
237 *
5d4f98a2
YZ
238 * There are two kinds of back refs. The implicit back refs is optimized
239 * for pointers in non-shared tree blocks. For a given pointer in a block,
240 * back refs of this kind provide information about the block's owner tree
241 * and the pointer's key. These information allow us to find the block by
242 * b-tree searching. The full back refs is for pointers in tree blocks not
243 * referenced by their owner trees. The location of tree block is recorded
244 * in the back refs. Actually the full back refs is generic, and can be
245 * used in all cases the implicit back refs is used. The major shortcoming
246 * of the full back refs is its overhead. Every time a tree block gets
247 * COWed, we have to update back refs entry for all pointers in it.
248 *
249 * For a newly allocated tree block, we use implicit back refs for
250 * pointers in it. This means most tree related operations only involve
251 * implicit back refs. For a tree block created in old transaction, the
252 * only way to drop a reference to it is COW it. So we can detect the
253 * event that tree block loses its owner tree's reference and do the
254 * back refs conversion.
255 *
01327610 256 * When a tree block is COWed through a tree, there are four cases:
5d4f98a2
YZ
257 *
258 * The reference count of the block is one and the tree is the block's
259 * owner tree. Nothing to do in this case.
260 *
261 * The reference count of the block is one and the tree is not the
262 * block's owner tree. In this case, full back refs is used for pointers
263 * in the block. Remove these full back refs, add implicit back refs for
264 * every pointers in the new block.
265 *
266 * The reference count of the block is greater than one and the tree is
267 * the block's owner tree. In this case, implicit back refs is used for
268 * pointers in the block. Add full back refs for every pointers in the
269 * block, increase lower level extents' reference counts. The original
270 * implicit back refs are entailed to the new block.
271 *
272 * The reference count of the block is greater than one and the tree is
273 * not the block's owner tree. Add implicit back refs for every pointer in
274 * the new block, increase lower level extents' reference count.
275 *
276 * Back Reference Key composing:
277 *
278 * The key objectid corresponds to the first byte in the extent,
279 * The key type is used to differentiate between types of back refs.
280 * There are different meanings of the key offset for different types
281 * of back refs.
282 *
d8d5f3e1
CM
283 * File extents can be referenced by:
284 *
285 * - multiple snapshots, subvolumes, or different generations in one subvol
31840ae1 286 * - different files inside a single subvolume
d8d5f3e1
CM
287 * - different offsets inside a file (bookend extents in file.c)
288 *
5d4f98a2 289 * The extent ref structure for the implicit back refs has fields for:
d8d5f3e1
CM
290 *
291 * - Objectid of the subvolume root
d8d5f3e1 292 * - objectid of the file holding the reference
5d4f98a2
YZ
293 * - original offset in the file
294 * - how many bookend extents
d8d5f3e1 295 *
5d4f98a2
YZ
296 * The key offset for the implicit back refs is hash of the first
297 * three fields.
d8d5f3e1 298 *
5d4f98a2 299 * The extent ref structure for the full back refs has field for:
d8d5f3e1 300 *
5d4f98a2 301 * - number of pointers in the tree leaf
d8d5f3e1 302 *
5d4f98a2
YZ
303 * The key offset for the implicit back refs is the first byte of
304 * the tree leaf
d8d5f3e1 305 *
5d4f98a2
YZ
306 * When a file extent is allocated, The implicit back refs is used.
307 * the fields are filled in:
d8d5f3e1 308 *
5d4f98a2 309 * (root_key.objectid, inode objectid, offset in file, 1)
d8d5f3e1 310 *
5d4f98a2
YZ
311 * When a file extent is removed file truncation, we find the
312 * corresponding implicit back refs and check the following fields:
d8d5f3e1 313 *
5d4f98a2 314 * (btrfs_header_owner(leaf), inode objectid, offset in file)
d8d5f3e1 315 *
5d4f98a2 316 * Btree extents can be referenced by:
d8d5f3e1 317 *
5d4f98a2 318 * - Different subvolumes
d8d5f3e1 319 *
5d4f98a2
YZ
320 * Both the implicit back refs and the full back refs for tree blocks
321 * only consist of key. The key offset for the implicit back refs is
322 * objectid of block's owner tree. The key offset for the full back refs
323 * is the first byte of parent block.
d8d5f3e1 324 *
5d4f98a2
YZ
325 * When implicit back refs is used, information about the lowest key and
326 * level of the tree block are required. These information are stored in
327 * tree block info structure.
d8d5f3e1 328 */
31840ae1 329
167ce953
LB
330/*
331 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
52042d8e 332 * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
167ce953
LB
333 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
334 */
335int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
336 struct btrfs_extent_inline_ref *iref,
337 enum btrfs_inline_ref_type is_data)
338{
d9a620f7 339 struct btrfs_fs_info *fs_info = eb->fs_info;
167ce953 340 int type = btrfs_extent_inline_ref_type(eb, iref);
64ecdb64 341 u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
167ce953 342
d9a620f7
BB
343 if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
344 ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
345 return type;
346 }
347
167ce953
LB
348 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
349 type == BTRFS_SHARED_BLOCK_REF_KEY ||
350 type == BTRFS_SHARED_DATA_REF_KEY ||
351 type == BTRFS_EXTENT_DATA_REF_KEY) {
352 if (is_data == BTRFS_REF_TYPE_BLOCK) {
64ecdb64 353 if (type == BTRFS_TREE_BLOCK_REF_KEY)
167ce953 354 return type;
64ecdb64 355 if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
d9a620f7 356 ASSERT(fs_info);
64ecdb64 357 /*
ea57788e
QW
358 * Every shared one has parent tree block,
359 * which must be aligned to sector size.
64ecdb64 360 */
d9a620f7 361 if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
64ecdb64
LB
362 return type;
363 }
167ce953 364 } else if (is_data == BTRFS_REF_TYPE_DATA) {
64ecdb64 365 if (type == BTRFS_EXTENT_DATA_REF_KEY)
167ce953 366 return type;
64ecdb64 367 if (type == BTRFS_SHARED_DATA_REF_KEY) {
d9a620f7 368 ASSERT(fs_info);
64ecdb64 369 /*
ea57788e
QW
370 * Every shared one has parent tree block,
371 * which must be aligned to sector size.
64ecdb64
LB
372 */
373 if (offset &&
d9a620f7 374 IS_ALIGNED(offset, fs_info->sectorsize))
64ecdb64
LB
375 return type;
376 }
167ce953
LB
377 } else {
378 ASSERT(is_data == BTRFS_REF_TYPE_ANY);
379 return type;
380 }
381 }
382
25761430 383 WARN_ON(1);
6c75a589 384 btrfs_print_leaf(eb);
d9a620f7 385 btrfs_err(fs_info,
ea57788e
QW
386 "eb %llu iref 0x%lx invalid extent inline ref type %d",
387 eb->start, (unsigned long)iref, type);
167ce953
LB
388
389 return BTRFS_REF_TYPE_INVALID;
390}
391
0785a9aa 392u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
5d4f98a2
YZ
393{
394 u32 high_crc = ~(u32)0;
395 u32 low_crc = ~(u32)0;
396 __le64 lenum;
397
398 lenum = cpu_to_le64(root_objectid);
03e86348 399 high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
5d4f98a2 400 lenum = cpu_to_le64(owner);
03e86348 401 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2 402 lenum = cpu_to_le64(offset);
03e86348 403 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2
YZ
404
405 return ((u64)high_crc << 31) ^ (u64)low_crc;
406}
407
408static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
409 struct btrfs_extent_data_ref *ref)
410{
411 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
412 btrfs_extent_data_ref_objectid(leaf, ref),
413 btrfs_extent_data_ref_offset(leaf, ref));
414}
415
416static int match_extent_data_ref(struct extent_buffer *leaf,
417 struct btrfs_extent_data_ref *ref,
418 u64 root_objectid, u64 owner, u64 offset)
419{
420 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
421 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
422 btrfs_extent_data_ref_offset(leaf, ref) != offset)
423 return 0;
424 return 1;
425}
426
427static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
428 struct btrfs_path *path,
429 u64 bytenr, u64 parent,
430 u64 root_objectid,
431 u64 owner, u64 offset)
432{
29cbcf40 433 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
5d4f98a2
YZ
434 struct btrfs_key key;
435 struct btrfs_extent_data_ref *ref;
31840ae1 436 struct extent_buffer *leaf;
5d4f98a2 437 u32 nritems;
5d4f98a2 438 int recow;
1618aa3c 439 int ret;
74493f7a 440
31840ae1 441 key.objectid = bytenr;
5d4f98a2
YZ
442 if (parent) {
443 key.type = BTRFS_SHARED_DATA_REF_KEY;
444 key.offset = parent;
445 } else {
446 key.type = BTRFS_EXTENT_DATA_REF_KEY;
447 key.offset = hash_extent_data_ref(root_objectid,
448 owner, offset);
449 }
450again:
451 recow = 0;
452 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1618aa3c
AJ
453 if (ret < 0)
454 return ret;
31840ae1 455
5d4f98a2 456 if (parent) {
1618aa3c
AJ
457 if (ret)
458 return -ENOENT;
459 return 0;
31840ae1
ZY
460 }
461
1618aa3c 462 ret = -ENOENT;
31840ae1 463 leaf = path->nodes[0];
5d4f98a2
YZ
464 nritems = btrfs_header_nritems(leaf);
465 while (1) {
466 if (path->slots[0] >= nritems) {
467 ret = btrfs_next_leaf(root, path);
1618aa3c 468 if (ret) {
dddff821 469 if (ret > 0)
1618aa3c
AJ
470 return -ENOENT;
471 return ret;
472 }
5d4f98a2
YZ
473
474 leaf = path->nodes[0];
475 nritems = btrfs_header_nritems(leaf);
476 recow = 1;
477 }
478
479 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
480 if (key.objectid != bytenr ||
481 key.type != BTRFS_EXTENT_DATA_REF_KEY)
482 goto fail;
483
484 ref = btrfs_item_ptr(leaf, path->slots[0],
485 struct btrfs_extent_data_ref);
486
487 if (match_extent_data_ref(leaf, ref, root_objectid,
488 owner, offset)) {
489 if (recow) {
b3b4aa74 490 btrfs_release_path(path);
5d4f98a2
YZ
491 goto again;
492 }
1618aa3c 493 ret = 0;
5d4f98a2
YZ
494 break;
495 }
496 path->slots[0]++;
31840ae1 497 }
5d4f98a2 498fail:
1618aa3c 499 return ret;
31840ae1
ZY
500}
501
5d4f98a2 502static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2 503 struct btrfs_path *path,
b4b5934a
JB
504 struct btrfs_delayed_ref_node *node,
505 u64 bytenr)
31840ae1 506{
29cbcf40 507 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
31840ae1
ZY
508 struct btrfs_key key;
509 struct extent_buffer *leaf;
b4b5934a
JB
510 u64 owner = btrfs_delayed_ref_owner(node);
511 u64 offset = btrfs_delayed_ref_offset(node);
5d4f98a2 512 u32 size;
31840ae1
ZY
513 u32 num_refs;
514 int ret;
74493f7a 515
74493f7a 516 key.objectid = bytenr;
b4b5934a 517 if (node->parent) {
5d4f98a2 518 key.type = BTRFS_SHARED_DATA_REF_KEY;
b4b5934a 519 key.offset = node->parent;
5d4f98a2
YZ
520 size = sizeof(struct btrfs_shared_data_ref);
521 } else {
522 key.type = BTRFS_EXTENT_DATA_REF_KEY;
b4b5934a 523 key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
5d4f98a2
YZ
524 size = sizeof(struct btrfs_extent_data_ref);
525 }
74493f7a 526
5d4f98a2
YZ
527 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
528 if (ret && ret != -EEXIST)
529 goto fail;
530
531 leaf = path->nodes[0];
b4b5934a 532 if (node->parent) {
5d4f98a2 533 struct btrfs_shared_data_ref *ref;
31840ae1 534 ref = btrfs_item_ptr(leaf, path->slots[0],
5d4f98a2
YZ
535 struct btrfs_shared_data_ref);
536 if (ret == 0) {
b4b5934a 537 btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
5d4f98a2
YZ
538 } else {
539 num_refs = btrfs_shared_data_ref_count(leaf, ref);
b4b5934a 540 num_refs += node->ref_mod;
5d4f98a2 541 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
31840ae1 542 }
5d4f98a2
YZ
543 } else {
544 struct btrfs_extent_data_ref *ref;
545 while (ret == -EEXIST) {
546 ref = btrfs_item_ptr(leaf, path->slots[0],
547 struct btrfs_extent_data_ref);
b4b5934a 548 if (match_extent_data_ref(leaf, ref, node->ref_root,
5d4f98a2
YZ
549 owner, offset))
550 break;
b3b4aa74 551 btrfs_release_path(path);
5d4f98a2
YZ
552 key.offset++;
553 ret = btrfs_insert_empty_item(trans, root, path, &key,
554 size);
555 if (ret && ret != -EEXIST)
556 goto fail;
31840ae1 557
5d4f98a2
YZ
558 leaf = path->nodes[0];
559 }
560 ref = btrfs_item_ptr(leaf, path->slots[0],
561 struct btrfs_extent_data_ref);
562 if (ret == 0) {
b4b5934a 563 btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
5d4f98a2
YZ
564 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
565 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
b4b5934a 566 btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
5d4f98a2
YZ
567 } else {
568 num_refs = btrfs_extent_data_ref_count(leaf, ref);
b4b5934a 569 num_refs += node->ref_mod;
5d4f98a2 570 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
31840ae1 571 }
31840ae1 572 }
50564b65 573 btrfs_mark_buffer_dirty(trans, leaf);
5d4f98a2
YZ
574 ret = 0;
575fail:
b3b4aa74 576 btrfs_release_path(path);
7bb86316 577 return ret;
74493f7a
CM
578}
579
5d4f98a2 580static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
76d76e78 581 struct btrfs_root *root,
5d4f98a2 582 struct btrfs_path *path,
5b2a54bb 583 int refs_to_drop)
31840ae1 584{
5d4f98a2
YZ
585 struct btrfs_key key;
586 struct btrfs_extent_data_ref *ref1 = NULL;
587 struct btrfs_shared_data_ref *ref2 = NULL;
31840ae1 588 struct extent_buffer *leaf;
5d4f98a2 589 u32 num_refs = 0;
31840ae1
ZY
590 int ret = 0;
591
592 leaf = path->nodes[0];
5d4f98a2
YZ
593 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
594
595 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
596 ref1 = btrfs_item_ptr(leaf, path->slots[0],
597 struct btrfs_extent_data_ref);
598 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
599 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
600 ref2 = btrfs_item_ptr(leaf, path->slots[0],
601 struct btrfs_shared_data_ref);
602 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
5d4f98a2 603 } else {
182741d2
QW
604 btrfs_err(trans->fs_info,
605 "unrecognized backref key (%llu %u %llu)",
606 key.objectid, key.type, key.offset);
607 btrfs_abort_transaction(trans, -EUCLEAN);
608 return -EUCLEAN;
5d4f98a2
YZ
609 }
610
56bec294
CM
611 BUG_ON(num_refs < refs_to_drop);
612 num_refs -= refs_to_drop;
5d4f98a2 613
31840ae1 614 if (num_refs == 0) {
76d76e78 615 ret = btrfs_del_item(trans, root, path);
31840ae1 616 } else {
5d4f98a2
YZ
617 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
618 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
619 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
620 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
50564b65 621 btrfs_mark_buffer_dirty(trans, leaf);
31840ae1 622 }
31840ae1
ZY
623 return ret;
624}
625
9ed0dea0 626static noinline u32 extent_data_ref_count(struct btrfs_path *path,
5d4f98a2 627 struct btrfs_extent_inline_ref *iref)
15916de8 628{
5d4f98a2
YZ
629 struct btrfs_key key;
630 struct extent_buffer *leaf;
631 struct btrfs_extent_data_ref *ref1;
632 struct btrfs_shared_data_ref *ref2;
633 u32 num_refs = 0;
3de28d57 634 int type;
5d4f98a2
YZ
635
636 leaf = path->nodes[0];
637 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ba3c2b19 638
5d4f98a2 639 if (iref) {
3de28d57
LB
640 /*
641 * If type is invalid, we should have bailed out earlier than
642 * this call.
643 */
644 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
645 ASSERT(type != BTRFS_REF_TYPE_INVALID);
646 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
5d4f98a2
YZ
647 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
648 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
649 } else {
650 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
651 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
652 }
653 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
654 ref1 = btrfs_item_ptr(leaf, path->slots[0],
655 struct btrfs_extent_data_ref);
656 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
657 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
658 ref2 = btrfs_item_ptr(leaf, path->slots[0],
659 struct btrfs_shared_data_ref);
660 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
5d4f98a2
YZ
661 } else {
662 WARN_ON(1);
663 }
664 return num_refs;
665}
15916de8 666
5d4f98a2 667static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
668 struct btrfs_path *path,
669 u64 bytenr, u64 parent,
670 u64 root_objectid)
1f3c79a2 671{
29cbcf40 672 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
5d4f98a2 673 struct btrfs_key key;
1f3c79a2 674 int ret;
1f3c79a2 675
5d4f98a2
YZ
676 key.objectid = bytenr;
677 if (parent) {
678 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
679 key.offset = parent;
680 } else {
681 key.type = BTRFS_TREE_BLOCK_REF_KEY;
682 key.offset = root_objectid;
1f3c79a2
LH
683 }
684
5d4f98a2
YZ
685 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
686 if (ret > 0)
687 ret = -ENOENT;
5d4f98a2 688 return ret;
1f3c79a2
LH
689}
690
5d4f98a2 691static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2 692 struct btrfs_path *path,
b4b5934a
JB
693 struct btrfs_delayed_ref_node *node,
694 u64 bytenr)
31840ae1 695{
29cbcf40 696 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
5d4f98a2 697 struct btrfs_key key;
31840ae1 698 int ret;
31840ae1 699
5d4f98a2 700 key.objectid = bytenr;
b4b5934a 701 if (node->parent) {
5d4f98a2 702 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
b4b5934a 703 key.offset = node->parent;
5d4f98a2
YZ
704 } else {
705 key.type = BTRFS_TREE_BLOCK_REF_KEY;
b4b5934a 706 key.offset = node->ref_root;
5d4f98a2
YZ
707 }
708
29cbcf40 709 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
b3b4aa74 710 btrfs_release_path(path);
31840ae1
ZY
711 return ret;
712}
713
5d4f98a2 714static inline int extent_ref_type(u64 parent, u64 owner)
31840ae1 715{
5d4f98a2
YZ
716 int type;
717 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
718 if (parent > 0)
719 type = BTRFS_SHARED_BLOCK_REF_KEY;
720 else
721 type = BTRFS_TREE_BLOCK_REF_KEY;
722 } else {
723 if (parent > 0)
724 type = BTRFS_SHARED_DATA_REF_KEY;
725 else
726 type = BTRFS_EXTENT_DATA_REF_KEY;
727 }
728 return type;
31840ae1 729}
56bec294 730
2c47e605
YZ
731static int find_next_key(struct btrfs_path *path, int level,
732 struct btrfs_key *key)
56bec294 733
02217ed2 734{
2c47e605 735 for (; level < BTRFS_MAX_LEVEL; level++) {
5d4f98a2
YZ
736 if (!path->nodes[level])
737 break;
5d4f98a2
YZ
738 if (path->slots[level] + 1 >=
739 btrfs_header_nritems(path->nodes[level]))
740 continue;
741 if (level == 0)
742 btrfs_item_key_to_cpu(path->nodes[level], key,
743 path->slots[level] + 1);
744 else
745 btrfs_node_key_to_cpu(path->nodes[level], key,
746 path->slots[level] + 1);
747 return 0;
748 }
749 return 1;
750}
037e6390 751
5d4f98a2
YZ
752/*
753 * look for inline back ref. if back ref is found, *ref_ret is set
754 * to the address of inline back ref, and 0 is returned.
755 *
756 * if back ref isn't found, *ref_ret is set to the address where it
757 * should be inserted, and -ENOENT is returned.
758 *
759 * if insert is true and there are too many inline back refs, the path
760 * points to the extent item, and -EAGAIN is returned.
761 *
762 * NOTE: inline back refs are ordered in the same way that back ref
763 * items in the tree are ordered.
764 */
765static noinline_for_stack
766int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
767 struct btrfs_path *path,
768 struct btrfs_extent_inline_ref **ref_ret,
769 u64 bytenr, u64 num_bytes,
770 u64 parent, u64 root_objectid,
771 u64 owner, u64 offset, int insert)
772{
867cc1fb 773 struct btrfs_fs_info *fs_info = trans->fs_info;
29cbcf40 774 struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
5d4f98a2
YZ
775 struct btrfs_key key;
776 struct extent_buffer *leaf;
777 struct btrfs_extent_item *ei;
778 struct btrfs_extent_inline_ref *iref;
779 u64 flags;
780 u64 item_size;
781 unsigned long ptr;
782 unsigned long end;
783 int extra_size;
784 int type;
785 int want;
786 int ret;
0b246afa 787 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3de28d57 788 int needed;
26b8003f 789
db94535d 790 key.objectid = bytenr;
31840ae1 791 key.type = BTRFS_EXTENT_ITEM_KEY;
56bec294 792 key.offset = num_bytes;
31840ae1 793
5d4f98a2
YZ
794 want = extent_ref_type(parent, owner);
795 if (insert) {
796 extra_size = btrfs_extent_inline_ref_size(want);
9a664971 797 path->search_for_extension = 1;
85d4198e 798 path->keep_locks = 1;
5d4f98a2
YZ
799 } else
800 extra_size = -1;
3173a18f
JB
801
802 /*
16d1c062
NB
803 * Owner is our level, so we can just add one to get the level for the
804 * block we are interested in.
3173a18f
JB
805 */
806 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
807 key.type = BTRFS_METADATA_ITEM_KEY;
808 key.offset = owner;
809 }
810
811again:
5d4f98a2 812 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
cc925b96 813 if (ret < 0)
5d4f98a2 814 goto out;
3173a18f
JB
815
816 /*
817 * We may be a newly converted file system which still has the old fat
818 * extent entries for metadata, so try and see if we have one of those.
819 */
820 if (ret > 0 && skinny_metadata) {
821 skinny_metadata = false;
822 if (path->slots[0]) {
823 path->slots[0]--;
824 btrfs_item_key_to_cpu(path->nodes[0], &key,
825 path->slots[0]);
826 if (key.objectid == bytenr &&
827 key.type == BTRFS_EXTENT_ITEM_KEY &&
828 key.offset == num_bytes)
829 ret = 0;
830 }
831 if (ret) {
9ce49a0b 832 key.objectid = bytenr;
3173a18f
JB
833 key.type = BTRFS_EXTENT_ITEM_KEY;
834 key.offset = num_bytes;
835 btrfs_release_path(path);
836 goto again;
837 }
838 }
839
79787eaa 840 if (ret && !insert) {
cc925b96 841 ret = -ENOENT;
79787eaa 842 goto out;
fae7f21c 843 } else if (WARN_ON(ret)) {
7f72f505
QW
844 btrfs_print_leaf(path->nodes[0]);
845 btrfs_err(fs_info,
846"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
847 bytenr, num_bytes, parent, root_objectid, owner,
848 offset);
eba444f1 849 ret = -EUCLEAN;
492104c8 850 goto out;
79787eaa 851 }
5d4f98a2
YZ
852
853 leaf = path->nodes[0];
3212fa14 854 item_size = btrfs_item_size(leaf, path->slots[0]);
6d8ff4e4 855 if (unlikely(item_size < sizeof(*ei))) {
cc925b96 856 ret = -EUCLEAN;
182741d2
QW
857 btrfs_err(fs_info,
858 "unexpected extent item size, has %llu expect >= %zu",
859 item_size, sizeof(*ei));
cc925b96 860 btrfs_abort_transaction(trans, ret);
ba3c2b19
NB
861 goto out;
862 }
5d4f98a2 863
5d4f98a2
YZ
864 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
865 flags = btrfs_extent_flags(leaf, ei);
866
867 ptr = (unsigned long)(ei + 1);
868 end = (unsigned long)ei + item_size;
869
3173a18f 870 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
5d4f98a2
YZ
871 ptr += sizeof(struct btrfs_tree_block_info);
872 BUG_ON(ptr > end);
5d4f98a2
YZ
873 }
874
3de28d57
LB
875 if (owner >= BTRFS_FIRST_FREE_OBJECTID)
876 needed = BTRFS_REF_TYPE_DATA;
877 else
878 needed = BTRFS_REF_TYPE_BLOCK;
879
cc925b96 880 ret = -ENOENT;
da8848ac 881 while (ptr < end) {
5d4f98a2 882 iref = (struct btrfs_extent_inline_ref *)ptr;
3de28d57 883 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
d9a620f7
BB
884 if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
885 ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
886 ptr += btrfs_extent_inline_ref_size(type);
887 continue;
888 }
3de28d57 889 if (type == BTRFS_REF_TYPE_INVALID) {
cc925b96 890 ret = -EUCLEAN;
3de28d57
LB
891 goto out;
892 }
893
5d4f98a2
YZ
894 if (want < type)
895 break;
896 if (want > type) {
897 ptr += btrfs_extent_inline_ref_size(type);
898 continue;
899 }
900
901 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
902 struct btrfs_extent_data_ref *dref;
903 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
904 if (match_extent_data_ref(leaf, dref, root_objectid,
905 owner, offset)) {
cc925b96 906 ret = 0;
5d4f98a2
YZ
907 break;
908 }
909 if (hash_extent_data_ref_item(leaf, dref) <
910 hash_extent_data_ref(root_objectid, owner, offset))
911 break;
912 } else {
913 u64 ref_offset;
914 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
915 if (parent > 0) {
916 if (parent == ref_offset) {
cc925b96 917 ret = 0;
5d4f98a2
YZ
918 break;
919 }
920 if (ref_offset < parent)
921 break;
922 } else {
923 if (root_objectid == ref_offset) {
cc925b96 924 ret = 0;
5d4f98a2
YZ
925 break;
926 }
927 if (ref_offset < root_objectid)
928 break;
929 }
930 }
931 ptr += btrfs_extent_inline_ref_size(type);
932 }
da8848ac
FM
933
934 if (unlikely(ptr > end)) {
935 ret = -EUCLEAN;
936 btrfs_print_leaf(path->nodes[0]);
937 btrfs_crit(fs_info,
938"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
939 path->slots[0], root_objectid, owner, offset, parent);
940 goto out;
941 }
942
cc925b96 943 if (ret == -ENOENT && insert) {
5d4f98a2
YZ
944 if (item_size + extra_size >=
945 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
cc925b96 946 ret = -EAGAIN;
5d4f98a2
YZ
947 goto out;
948 }
949 /*
950 * To add new inline back ref, we have to make sure
951 * there is no corresponding back ref item.
952 * For simplicity, we just do not add new inline back
953 * ref if there is any kind of item for this block
954 */
2c47e605
YZ
955 if (find_next_key(path, 0, &key) == 0 &&
956 key.objectid == bytenr &&
85d4198e 957 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
cc925b96 958 ret = -EAGAIN;
5d4f98a2
YZ
959 goto out;
960 }
961 }
962 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
963out:
85d4198e 964 if (insert) {
5d4f98a2 965 path->keep_locks = 0;
9a664971 966 path->search_for_extension = 0;
5d4f98a2
YZ
967 btrfs_unlock_up_safe(path, 1);
968 }
cc925b96 969 return ret;
5d4f98a2
YZ
970}
971
972/*
973 * helper to add new inline back ref
974 */
975static noinline_for_stack
50564b65 976void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
143bede5
JM
977 struct btrfs_path *path,
978 struct btrfs_extent_inline_ref *iref,
979 u64 parent, u64 root_objectid,
980 u64 owner, u64 offset, int refs_to_add,
981 struct btrfs_delayed_extent_op *extent_op)
5d4f98a2
YZ
982{
983 struct extent_buffer *leaf;
984 struct btrfs_extent_item *ei;
985 unsigned long ptr;
986 unsigned long end;
987 unsigned long item_offset;
988 u64 refs;
989 int size;
990 int type;
5d4f98a2
YZ
991
992 leaf = path->nodes[0];
993 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
994 item_offset = (unsigned long)iref - (unsigned long)ei;
995
996 type = extent_ref_type(parent, owner);
997 size = btrfs_extent_inline_ref_size(type);
998
50564b65 999 btrfs_extend_item(trans, path, size);
5d4f98a2
YZ
1000
1001 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1002 refs = btrfs_extent_refs(leaf, ei);
1003 refs += refs_to_add;
1004 btrfs_set_extent_refs(leaf, ei, refs);
1005 if (extent_op)
1006 __run_delayed_extent_op(extent_op, leaf, ei);
1007
1008 ptr = (unsigned long)ei + item_offset;
3212fa14 1009 end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
5d4f98a2
YZ
1010 if (ptr < end - size)
1011 memmove_extent_buffer(leaf, ptr + size, ptr,
1012 end - size - ptr);
1013
1014 iref = (struct btrfs_extent_inline_ref *)ptr;
1015 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1016 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1017 struct btrfs_extent_data_ref *dref;
1018 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1019 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1020 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1021 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1022 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1023 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1024 struct btrfs_shared_data_ref *sref;
1025 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1026 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1027 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1028 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1029 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1030 } else {
1031 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1032 }
50564b65 1033 btrfs_mark_buffer_dirty(trans, leaf);
5d4f98a2
YZ
1034}
1035
1036static int lookup_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1037 struct btrfs_path *path,
1038 struct btrfs_extent_inline_ref **ref_ret,
1039 u64 bytenr, u64 num_bytes, u64 parent,
1040 u64 root_objectid, u64 owner, u64 offset)
1041{
1042 int ret;
1043
867cc1fb
NB
1044 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1045 num_bytes, parent, root_objectid,
1046 owner, offset, 0);
5d4f98a2 1047 if (ret != -ENOENT)
54aa1f4d 1048 return ret;
5d4f98a2 1049
b3b4aa74 1050 btrfs_release_path(path);
5d4f98a2
YZ
1051 *ref_ret = NULL;
1052
1053 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
b8582eea
NB
1054 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1055 root_objectid);
5d4f98a2 1056 } else {
bd1d53ef
NB
1057 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1058 root_objectid, owner, offset);
b9473439 1059 }
5d4f98a2
YZ
1060 return ret;
1061}
31840ae1 1062
5d4f98a2
YZ
1063/*
1064 * helper to update/remove inline back ref
1065 */
50564b65
FM
1066static noinline_for_stack int update_inline_extent_backref(
1067 struct btrfs_trans_handle *trans,
1068 struct btrfs_path *path,
143bede5
JM
1069 struct btrfs_extent_inline_ref *iref,
1070 int refs_to_mod,
5b2a54bb 1071 struct btrfs_delayed_extent_op *extent_op)
5d4f98a2 1072{
61a18f1c 1073 struct extent_buffer *leaf = path->nodes[0];
25761430 1074 struct btrfs_fs_info *fs_info = leaf->fs_info;
5d4f98a2
YZ
1075 struct btrfs_extent_item *ei;
1076 struct btrfs_extent_data_ref *dref = NULL;
1077 struct btrfs_shared_data_ref *sref = NULL;
1078 unsigned long ptr;
1079 unsigned long end;
1080 u32 item_size;
1081 int size;
1082 int type;
5d4f98a2
YZ
1083 u64 refs;
1084
5d4f98a2
YZ
1085 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1086 refs = btrfs_extent_refs(leaf, ei);
25761430
QW
1087 if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1088 struct btrfs_key key;
1089 u32 extent_size;
1090
1091 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1092 if (key.type == BTRFS_METADATA_ITEM_KEY)
1093 extent_size = fs_info->nodesize;
1094 else
1095 extent_size = key.offset;
1096 btrfs_print_leaf(leaf);
1097 btrfs_err(fs_info,
1098 "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1099 key.objectid, extent_size, refs_to_mod, refs);
1100 return -EUCLEAN;
1101 }
5d4f98a2
YZ
1102 refs += refs_to_mod;
1103 btrfs_set_extent_refs(leaf, ei, refs);
1104 if (extent_op)
1105 __run_delayed_extent_op(extent_op, leaf, ei);
1106
25761430 1107 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
3de28d57 1108 /*
25761430
QW
1109 * Function btrfs_get_extent_inline_ref_type() has already printed
1110 * error messages.
3de28d57 1111 */
25761430
QW
1112 if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1113 return -EUCLEAN;
5d4f98a2
YZ
1114
1115 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1116 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1117 refs = btrfs_extent_data_ref_count(leaf, dref);
1118 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1119 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1120 refs = btrfs_shared_data_ref_count(leaf, sref);
1121 } else {
1122 refs = 1;
25761430
QW
1123 /*
1124 * For tree blocks we can only drop one ref for it, and tree
1125 * blocks should not have refs > 1.
1126 *
1127 * Furthermore if we're inserting a new inline backref, we
1128 * won't reach this path either. That would be
1129 * setup_inline_extent_backref().
1130 */
1131 if (unlikely(refs_to_mod != -1)) {
1132 struct btrfs_key key;
1133
1134 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135
1136 btrfs_print_leaf(leaf);
1137 btrfs_err(fs_info,
1138 "invalid refs_to_mod for tree block %llu, has %d expect -1",
1139 key.objectid, refs_to_mod);
1140 return -EUCLEAN;
1141 }
56bec294 1142 }
31840ae1 1143
25761430
QW
1144 if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1145 struct btrfs_key key;
1146 u32 extent_size;
1147
1148 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1149 if (key.type == BTRFS_METADATA_ITEM_KEY)
1150 extent_size = fs_info->nodesize;
1151 else
1152 extent_size = key.offset;
1153 btrfs_print_leaf(leaf);
1154 btrfs_err(fs_info,
1155"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1156 (unsigned long)iref, key.objectid, extent_size,
1157 refs_to_mod, refs);
1158 return -EUCLEAN;
1159 }
5d4f98a2
YZ
1160 refs += refs_to_mod;
1161
1162 if (refs > 0) {
1163 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1164 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1165 else
1166 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1167 } else {
1168 size = btrfs_extent_inline_ref_size(type);
3212fa14 1169 item_size = btrfs_item_size(leaf, path->slots[0]);
5d4f98a2
YZ
1170 ptr = (unsigned long)iref;
1171 end = (unsigned long)ei + item_size;
1172 if (ptr + size < end)
1173 memmove_extent_buffer(leaf, ptr, ptr + size,
1174 end - ptr - size);
1175 item_size -= size;
50564b65 1176 btrfs_truncate_item(trans, path, item_size, 1);
5d4f98a2 1177 }
50564b65 1178 btrfs_mark_buffer_dirty(trans, leaf);
25761430 1179 return 0;
5d4f98a2
YZ
1180}
1181
1182static noinline_for_stack
1183int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1184 struct btrfs_path *path,
1185 u64 bytenr, u64 num_bytes, u64 parent,
1186 u64 root_objectid, u64 owner,
1187 u64 offset, int refs_to_add,
1188 struct btrfs_delayed_extent_op *extent_op)
1189{
1190 struct btrfs_extent_inline_ref *iref;
1191 int ret;
1192
867cc1fb
NB
1193 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1194 num_bytes, parent, root_objectid,
1195 owner, offset, 1);
5d4f98a2 1196 if (ret == 0) {
07cce5cf
QW
1197 /*
1198 * We're adding refs to a tree block we already own, this
1199 * should not happen at all.
1200 */
1201 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
eee3b811 1202 btrfs_print_leaf(path->nodes[0]);
07cce5cf 1203 btrfs_crit(trans->fs_info,
eee3b811
QW
1204"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1205 bytenr, num_bytes, root_objectid, path->slots[0]);
07cce5cf
QW
1206 return -EUCLEAN;
1207 }
50564b65
FM
1208 ret = update_inline_extent_backref(trans, path, iref,
1209 refs_to_add, extent_op);
5d4f98a2 1210 } else if (ret == -ENOENT) {
50564b65 1211 setup_inline_extent_backref(trans, path, iref, parent,
143bede5
JM
1212 root_objectid, owner, offset,
1213 refs_to_add, extent_op);
1214 ret = 0;
771ed689 1215 }
5d4f98a2
YZ
1216 return ret;
1217}
31840ae1 1218
5d4f98a2 1219static int remove_extent_backref(struct btrfs_trans_handle *trans,
76d76e78 1220 struct btrfs_root *root,
5d4f98a2
YZ
1221 struct btrfs_path *path,
1222 struct btrfs_extent_inline_ref *iref,
5b2a54bb 1223 int refs_to_drop, int is_data)
5d4f98a2 1224{
143bede5 1225 int ret = 0;
b9473439 1226
5d4f98a2 1227 BUG_ON(!is_data && refs_to_drop != 1);
5b2a54bb 1228 if (iref)
50564b65
FM
1229 ret = update_inline_extent_backref(trans, path, iref,
1230 -refs_to_drop, NULL);
5b2a54bb
JB
1231 else if (is_data)
1232 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1233 else
76d76e78 1234 ret = btrfs_del_item(trans, root, path);
5d4f98a2
YZ
1235 return ret;
1236}
1237
d04c6b88
JM
1238static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1239 u64 *discarded_bytes)
5d4f98a2 1240{
86557861
JM
1241 int j, ret = 0;
1242 u64 bytes_left, end;
adbe7e38 1243 u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
d04c6b88 1244
a208b3f1
DS
1245 /* Adjust the range to be aligned to 512B sectors if necessary. */
1246 if (start != aligned_start) {
4d89d377 1247 len -= aligned_start - start;
adbe7e38 1248 len = round_down(len, 1 << SECTOR_SHIFT);
4d89d377
JM
1249 start = aligned_start;
1250 }
d04c6b88 1251
4d89d377 1252 *discarded_bytes = 0;
86557861
JM
1253
1254 if (!len)
1255 return 0;
1256
1257 end = start + len;
1258 bytes_left = len;
1259
1260 /* Skip any superblocks on this device. */
1261 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1262 u64 sb_start = btrfs_sb_offset(j);
1263 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1264 u64 size = sb_start - start;
1265
1266 if (!in_range(sb_start, start, bytes_left) &&
1267 !in_range(sb_end, start, bytes_left) &&
1268 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1269 continue;
1270
1271 /*
1272 * Superblock spans beginning of range. Adjust start and
1273 * try again.
1274 */
1275 if (sb_start <= start) {
1276 start += sb_end - start;
1277 if (start > end) {
1278 bytes_left = 0;
1279 break;
1280 }
1281 bytes_left = end - start;
1282 continue;
1283 }
1284
1285 if (size) {
29e70be2
AJ
1286 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1287 size >> SECTOR_SHIFT,
44abff2c 1288 GFP_NOFS);
86557861
JM
1289 if (!ret)
1290 *discarded_bytes += size;
1291 else if (ret != -EOPNOTSUPP)
1292 return ret;
1293 }
1294
1295 start = sb_end;
1296 if (start > end) {
1297 bytes_left = 0;
1298 break;
1299 }
1300 bytes_left = end - start;
1301 }
1302
1303 if (bytes_left) {
29e70be2
AJ
1304 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1305 bytes_left >> SECTOR_SHIFT,
44abff2c 1306 GFP_NOFS);
4d89d377 1307 if (!ret)
86557861 1308 *discarded_bytes += bytes_left;
4d89d377 1309 }
d04c6b88 1310 return ret;
5d4f98a2 1311}
5d4f98a2 1312
a4012f06 1313static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
6143c23c
NA
1314{
1315 struct btrfs_device *dev = stripe->dev;
1316 struct btrfs_fs_info *fs_info = dev->fs_info;
1317 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1318 u64 phys = stripe->physical;
1319 u64 len = stripe->length;
1320 u64 discarded = 0;
1321 int ret = 0;
1322
1323 /* Zone reset on a zoned filesystem */
1324 if (btrfs_can_zone_reset(dev, phys, len)) {
1325 u64 src_disc;
1326
1327 ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1328 if (ret)
1329 goto out;
1330
1331 if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1332 dev != dev_replace->srcdev)
1333 goto out;
1334
1335 src_disc = discarded;
1336
1337 /* Send to replace target as well */
1338 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1339 &discarded);
1340 discarded += src_disc;
70200574 1341 } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
6143c23c
NA
1342 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1343 } else {
1344 ret = 0;
1345 *bytes = 0;
1346 }
1347
1348out:
1349 *bytes = discarded;
1350 return ret;
1351}
1352
2ff7e61e 1353int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1edb647b 1354 u64 num_bytes, u64 *actual_bytes)
5d4f98a2 1355{
6b7faadd 1356 int ret = 0;
5378e607 1357 u64 discarded_bytes = 0;
6b7faadd
QW
1358 u64 end = bytenr + num_bytes;
1359 u64 cur = bytenr;
e244a0ae 1360
2999241d 1361 /*
a4012f06
CH
1362 * Avoid races with device replace and make sure the devices in the
1363 * stripes don't go away while we are discarding.
2999241d 1364 */
0b246afa 1365 btrfs_bio_counter_inc_blocked(fs_info);
6b7faadd 1366 while (cur < end) {
a4012f06
CH
1367 struct btrfs_discard_stripe *stripes;
1368 unsigned int num_stripes;
5d4f98a2
YZ
1369 int i;
1370
6b7faadd 1371 num_bytes = end - cur;
a4012f06
CH
1372 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1373 if (IS_ERR(stripes)) {
1374 ret = PTR_ERR(stripes);
1375 if (ret == -EOPNOTSUPP)
1376 ret = 0;
1377 break;
1378 }
5d4f98a2 1379
a4012f06
CH
1380 for (i = 0; i < num_stripes; i++) {
1381 struct btrfs_discard_stripe *stripe = stripes + i;
d04c6b88 1382 u64 bytes;
38b5f68e 1383
a4012f06 1384 if (!stripe->dev->bdev) {
627e0873
FM
1385 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1386 continue;
1387 }
dcba6e48 1388
a4012f06
CH
1389 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1390 &stripe->dev->dev_state))
5e753a81
AJ
1391 continue;
1392
6143c23c 1393 ret = do_discard_extent(stripe, &bytes);
a4012f06 1394 if (ret) {
6b7faadd 1395 /*
a4012f06
CH
1396 * Keep going if discard is not supported by the
1397 * device.
6b7faadd 1398 */
a4012f06
CH
1399 if (ret != -EOPNOTSUPP)
1400 break;
1401 ret = 0;
1402 } else {
1403 discarded_bytes += bytes;
6b7faadd 1404 }
5d4f98a2 1405 }
a4012f06
CH
1406 kfree(stripes);
1407 if (ret)
1408 break;
6b7faadd 1409 cur += num_bytes;
5d4f98a2 1410 }
0b246afa 1411 btrfs_bio_counter_dec(fs_info);
5378e607
LD
1412 if (actual_bytes)
1413 *actual_bytes = discarded_bytes;
5d4f98a2 1414 return ret;
5d4f98a2
YZ
1415}
1416
79787eaa 1417/* Can return -ENOMEM */
5d4f98a2 1418int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
82fa113f 1419 struct btrfs_ref *generic_ref)
5d4f98a2 1420{
82fa113f 1421 struct btrfs_fs_info *fs_info = trans->fs_info;
5d4f98a2 1422 int ret;
66d7e7f0 1423
82fa113f
QW
1424 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1425 generic_ref->action);
1426 BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
f2e69a77 1427 generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 1428
82fa113f 1429 if (generic_ref->type == BTRFS_REF_METADATA)
2187374f 1430 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
82fa113f 1431 else
2187374f 1432 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
d7eae340 1433
82fa113f 1434 btrfs_ref_tree_mod(fs_info, generic_ref);
8a5040f7 1435
5d4f98a2
YZ
1436 return ret;
1437}
1438
bd3c685e 1439/*
9580503b 1440 * Insert backreference for a given extent.
bd3c685e 1441 *
07cce5cf
QW
1442 * The counterpart is in __btrfs_free_extent(), with examples and more details
1443 * how it works.
1444 *
bd3c685e
NB
1445 * @trans: Handle of transaction
1446 *
1447 * @node: The delayed ref node used to get the bytenr/length for
1448 * extent whose references are incremented.
1449 *
bd3c685e
NB
1450 * @extent_op Pointer to a structure, holding information necessary when
1451 * updating a tree block's flags
1452 *
1453 */
5d4f98a2 1454static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
c682f9b3 1455 struct btrfs_delayed_ref_node *node,
5d4f98a2
YZ
1456 struct btrfs_delayed_extent_op *extent_op)
1457{
1458 struct btrfs_path *path;
1459 struct extent_buffer *leaf;
1460 struct btrfs_extent_item *item;
fcebe456 1461 struct btrfs_key key;
c682f9b3
QW
1462 u64 bytenr = node->bytenr;
1463 u64 num_bytes = node->num_bytes;
a502f112
JB
1464 u64 owner = btrfs_delayed_ref_owner(node);
1465 u64 offset = btrfs_delayed_ref_offset(node);
5d4f98a2 1466 u64 refs;
88b2d088 1467 int refs_to_add = node->ref_mod;
5d4f98a2 1468 int ret;
5d4f98a2
YZ
1469
1470 path = btrfs_alloc_path();
1471 if (!path)
1472 return -ENOMEM;
1473
5d4f98a2 1474 /* this will setup the path even if it fails to insert the back ref */
a639cdeb 1475 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
a502f112 1476 node->parent, node->ref_root, owner,
a639cdeb 1477 offset, refs_to_add, extent_op);
0ed4792a 1478 if ((ret < 0 && ret != -EAGAIN) || !ret)
5d4f98a2 1479 goto out;
fcebe456
JB
1480
1481 /*
1482 * Ok we had -EAGAIN which means we didn't have space to insert and
1483 * inline extent ref, so just update the reference count and add a
1484 * normal backref.
1485 */
5d4f98a2 1486 leaf = path->nodes[0];
fcebe456 1487 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5d4f98a2
YZ
1488 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1489 refs = btrfs_extent_refs(leaf, item);
1490 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1491 if (extent_op)
1492 __run_delayed_extent_op(extent_op, leaf, item);
56bec294 1493
50564b65 1494 btrfs_mark_buffer_dirty(trans, leaf);
b3b4aa74 1495 btrfs_release_path(path);
56bec294 1496
56bec294 1497 /* now insert the actual backref */
d2f79e63 1498 if (owner < BTRFS_FIRST_FREE_OBJECTID)
b4b5934a 1499 ret = insert_tree_block_ref(trans, path, node, bytenr);
d2f79e63 1500 else
b4b5934a 1501 ret = insert_extent_data_ref(trans, path, node, bytenr);
d2f79e63 1502
79787eaa 1503 if (ret)
66642832 1504 btrfs_abort_transaction(trans, ret);
5d4f98a2 1505out:
56bec294 1506 btrfs_free_path(path);
30d133fc 1507 return ret;
56bec294
CM
1508}
1509
e85a0ada
BB
1510static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
1511 struct btrfs_delayed_ref_head *href)
1512{
1513 u64 root = href->owning_root;
1514
1515 /*
1516 * Don't check must_insert_reserved, as this is called from contexts
1517 * where it has already been unset.
1518 */
1519 if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
1520 !href->is_data || !is_fstree(root))
1521 return;
1522
1523 btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
1524 BTRFS_QGROUP_RSV_DATA);
1525}
1526
5d4f98a2 1527static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
cecbb533 1528 struct btrfs_delayed_ref_head *href,
5d4f98a2
YZ
1529 struct btrfs_delayed_ref_node *node,
1530 struct btrfs_delayed_extent_op *extent_op,
61c681fe 1531 bool insert_reserved)
56bec294 1532{
5d4f98a2 1533 int ret = 0;
5d4f98a2 1534 u64 parent = 0;
5d4f98a2
YZ
1535 u64 flags = 0;
1536
1bff6d4f 1537 trace_run_delayed_data_ref(trans->fs_info, node);
599c75ec 1538
5d4f98a2 1539 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
cf4f0432 1540 parent = node->parent;
5d4f98a2
YZ
1541
1542 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
7cce0d69 1543 struct btrfs_key key;
cecbb533
BB
1544 struct btrfs_squota_delta delta = {
1545 .root = href->owning_root,
1546 .num_bytes = node->num_bytes,
cecbb533
BB
1547 .is_data = true,
1548 .is_inc = true,
bd7c1ea3 1549 .generation = trans->transid,
cecbb533 1550 };
44cc2e38
JB
1551 u64 owner = btrfs_delayed_ref_owner(node);
1552 u64 offset = btrfs_delayed_ref_offset(node);
7cce0d69 1553
3173a18f 1554 if (extent_op)
5d4f98a2 1555 flags |= extent_op->flags_to_set;
7cce0d69
FM
1556
1557 key.objectid = node->bytenr;
1558 key.type = BTRFS_EXTENT_ITEM_KEY;
1559 key.offset = node->num_bytes;
1560
cf4f0432 1561 ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
44cc2e38
JB
1562 flags, owner, offset, &key,
1563 node->ref_mod,
1564 href->owning_root);
e85a0ada 1565 free_head_ref_squota_rsv(trans->fs_info, href);
cecbb533
BB
1566 if (!ret)
1567 ret = btrfs_record_squota_delta(trans->fs_info, &delta);
5d4f98a2 1568 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
a502f112 1569 ret = __btrfs_inc_extent_ref(trans, node, extent_op);
5d4f98a2 1570 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
85bb9f54 1571 ret = __btrfs_free_extent(trans, href, node, extent_op);
5d4f98a2
YZ
1572 } else {
1573 BUG();
1574 }
1575 return ret;
1576}
1577
1578static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1579 struct extent_buffer *leaf,
1580 struct btrfs_extent_item *ei)
1581{
1582 u64 flags = btrfs_extent_flags(leaf, ei);
1583 if (extent_op->update_flags) {
1584 flags |= extent_op->flags_to_set;
1585 btrfs_set_extent_flags(leaf, ei, flags);
1586 }
1587
1588 if (extent_op->update_key) {
1589 struct btrfs_tree_block_info *bi;
1590 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1591 bi = (struct btrfs_tree_block_info *)(ei + 1);
1592 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1593 }
1594}
1595
1596static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
d278850e 1597 struct btrfs_delayed_ref_head *head,
5d4f98a2
YZ
1598 struct btrfs_delayed_extent_op *extent_op)
1599{
20b9a2d6 1600 struct btrfs_fs_info *fs_info = trans->fs_info;
29cbcf40 1601 struct btrfs_root *root;
5d4f98a2
YZ
1602 struct btrfs_key key;
1603 struct btrfs_path *path;
1604 struct btrfs_extent_item *ei;
1605 struct extent_buffer *leaf;
1606 u32 item_size;
56bec294 1607 int ret;
0e3696f8 1608 int metadata = 1;
5d4f98a2 1609
bf31f87f 1610 if (TRANS_ABORTED(trans))
79787eaa
JM
1611 return 0;
1612
0e3696f8 1613 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3173a18f
JB
1614 metadata = 0;
1615
5d4f98a2
YZ
1616 path = btrfs_alloc_path();
1617 if (!path)
1618 return -ENOMEM;
1619
d278850e 1620 key.objectid = head->bytenr;
5d4f98a2 1621
3173a18f 1622 if (metadata) {
3173a18f 1623 key.type = BTRFS_METADATA_ITEM_KEY;
eba1469f 1624 key.offset = head->level;
3173a18f
JB
1625 } else {
1626 key.type = BTRFS_EXTENT_ITEM_KEY;
d278850e 1627 key.offset = head->num_bytes;
3173a18f
JB
1628 }
1629
29cbcf40 1630 root = btrfs_extent_root(fs_info, key.objectid);
3173a18f 1631again:
29cbcf40 1632 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
5d4f98a2 1633 if (ret < 0) {
5d4f98a2 1634 goto out;
20fb05a6 1635 } else if (ret > 0) {
3173a18f 1636 if (metadata) {
55994887
FDBM
1637 if (path->slots[0] > 0) {
1638 path->slots[0]--;
1639 btrfs_item_key_to_cpu(path->nodes[0], &key,
1640 path->slots[0]);
d278850e 1641 if (key.objectid == head->bytenr &&
55994887 1642 key.type == BTRFS_EXTENT_ITEM_KEY &&
d278850e 1643 key.offset == head->num_bytes)
55994887
FDBM
1644 ret = 0;
1645 }
1646 if (ret > 0) {
1647 btrfs_release_path(path);
1648 metadata = 0;
3173a18f 1649
d278850e
JB
1650 key.objectid = head->bytenr;
1651 key.offset = head->num_bytes;
55994887
FDBM
1652 key.type = BTRFS_EXTENT_ITEM_KEY;
1653 goto again;
1654 }
1655 } else {
20fb05a6 1656 ret = -EUCLEAN;
8ec0a4a5
FM
1657 btrfs_err(fs_info,
1658 "missing extent item for extent %llu num_bytes %llu level %d",
eba1469f 1659 head->bytenr, head->num_bytes, head->level);
55994887 1660 goto out;
3173a18f 1661 }
5d4f98a2
YZ
1662 }
1663
1664 leaf = path->nodes[0];
3212fa14 1665 item_size = btrfs_item_size(leaf, path->slots[0]);
ba3c2b19 1666
6d8ff4e4 1667 if (unlikely(item_size < sizeof(*ei))) {
20fb05a6 1668 ret = -EUCLEAN;
182741d2
QW
1669 btrfs_err(fs_info,
1670 "unexpected extent item size, has %u expect >= %zu",
1671 item_size, sizeof(*ei));
20fb05a6 1672 btrfs_abort_transaction(trans, ret);
ba3c2b19
NB
1673 goto out;
1674 }
1675
5d4f98a2
YZ
1676 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1677 __run_delayed_extent_op(extent_op, leaf, ei);
56bec294 1678
50564b65 1679 btrfs_mark_buffer_dirty(trans, leaf);
5d4f98a2
YZ
1680out:
1681 btrfs_free_path(path);
20fb05a6 1682 return ret;
56bec294
CM
1683}
1684
5d4f98a2 1685static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
cecbb533 1686 struct btrfs_delayed_ref_head *href,
5d4f98a2
YZ
1687 struct btrfs_delayed_ref_node *node,
1688 struct btrfs_delayed_extent_op *extent_op,
61c681fe 1689 bool insert_reserved)
56bec294
CM
1690{
1691 int ret = 0;
cecbb533 1692 struct btrfs_fs_info *fs_info = trans->fs_info;
5d4f98a2
YZ
1693 u64 parent = 0;
1694 u64 ref_root = 0;
56bec294 1695
1bff6d4f 1696 trace_run_delayed_tree_ref(trans->fs_info, node);
599c75ec 1697
5d4f98a2 1698 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
cf4f0432
JB
1699 parent = node->parent;
1700 ref_root = node->ref_root;
5d4f98a2 1701
1bf76df3 1702 if (unlikely(node->ref_mod != 1)) {
f97806f2 1703 btrfs_err(trans->fs_info,
1bf76df3 1704 "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
02794222
LB
1705 node->bytenr, node->ref_mod, node->action, ref_root,
1706 parent);
1bf76df3 1707 return -EUCLEAN;
02794222 1708 }
5d4f98a2 1709 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
cecbb533
BB
1710 struct btrfs_squota_delta delta = {
1711 .root = href->owning_root,
1712 .num_bytes = fs_info->nodesize,
cecbb533
BB
1713 .is_data = false,
1714 .is_inc = true,
bd7c1ea3 1715 .generation = trans->transid,
cecbb533
BB
1716 };
1717
21ebfbe7 1718 ret = alloc_reserved_tree_block(trans, node, extent_op);
cecbb533
BB
1719 if (!ret)
1720 btrfs_record_squota_delta(fs_info, &delta);
5d4f98a2 1721 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
a502f112 1722 ret = __btrfs_inc_extent_ref(trans, node, extent_op);
5d4f98a2 1723 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
85bb9f54 1724 ret = __btrfs_free_extent(trans, href, node, extent_op);
5d4f98a2
YZ
1725 } else {
1726 BUG();
1727 }
56bec294
CM
1728 return ret;
1729}
1730
1731/* helper function to actually process a single delayed ref entry */
5d4f98a2 1732static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
cecbb533 1733 struct btrfs_delayed_ref_head *href,
5d4f98a2
YZ
1734 struct btrfs_delayed_ref_node *node,
1735 struct btrfs_delayed_extent_op *extent_op,
61c681fe 1736 bool insert_reserved)
56bec294 1737{
79787eaa
JM
1738 int ret = 0;
1739
bf31f87f 1740 if (TRANS_ABORTED(trans)) {
e85a0ada 1741 if (insert_reserved) {
b25c36f8 1742 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
e85a0ada
BB
1743 free_head_ref_squota_rsv(trans->fs_info, href);
1744 }
79787eaa 1745 return 0;
857cc2fc 1746 }
79787eaa 1747
5d4f98a2
YZ
1748 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1749 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
cecbb533 1750 ret = run_delayed_tree_ref(trans, href, node, extent_op,
5d4f98a2
YZ
1751 insert_reserved);
1752 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1753 node->type == BTRFS_SHARED_DATA_REF_KEY)
cecbb533 1754 ret = run_delayed_data_ref(trans, href, node, extent_op,
5d4f98a2 1755 insert_reserved);
d9a620f7
BB
1756 else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
1757 ret = 0;
5d4f98a2
YZ
1758 else
1759 BUG();
80ee54bf 1760 if (ret && insert_reserved)
b25c36f8 1761 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
39f501d6
QW
1762 if (ret < 0)
1763 btrfs_err(trans->fs_info,
1764"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1765 node->bytenr, node->num_bytes, node->type,
1766 node->action, node->ref_mod, ret);
5d4f98a2 1767 return ret;
56bec294
CM
1768}
1769
c6fc2454 1770static inline struct btrfs_delayed_ref_node *
56bec294
CM
1771select_delayed_ref(struct btrfs_delayed_ref_head *head)
1772{
cffc3374
FM
1773 struct btrfs_delayed_ref_node *ref;
1774
e3d03965 1775 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
c6fc2454 1776 return NULL;
d7df2c79 1777
cffc3374
FM
1778 /*
1779 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1780 * This is to prevent a ref count from going down to zero, which deletes
1781 * the extent item from the extent tree, when there still are references
1782 * to add, which would fail because they would not find the extent item.
1783 */
1d57ee94
WX
1784 if (!list_empty(&head->ref_add_list))
1785 return list_first_entry(&head->ref_add_list,
1786 struct btrfs_delayed_ref_node, add_list);
1787
e3d03965 1788 ref = rb_entry(rb_first_cached(&head->ref_tree),
0e0adbcf 1789 struct btrfs_delayed_ref_node, ref_node);
1d57ee94
WX
1790 ASSERT(list_empty(&ref->add_list));
1791 return ref;
56bec294
CM
1792}
1793
2eadaa22
JB
1794static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1795 struct btrfs_delayed_ref_head *head)
1796{
1797 spin_lock(&delayed_refs->lock);
61c681fe 1798 head->processing = false;
2eadaa22
JB
1799 delayed_refs->num_heads_ready++;
1800 spin_unlock(&delayed_refs->lock);
1801 btrfs_delayed_ref_unlock(head);
1802}
1803
bedc6617
JB
1804static struct btrfs_delayed_extent_op *cleanup_extent_op(
1805 struct btrfs_delayed_ref_head *head)
b00e6250
JB
1806{
1807 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
b00e6250
JB
1808
1809 if (!extent_op)
bedc6617
JB
1810 return NULL;
1811
b00e6250 1812 if (head->must_insert_reserved) {
bedc6617 1813 head->extent_op = NULL;
b00e6250 1814 btrfs_free_delayed_extent_op(extent_op);
bedc6617 1815 return NULL;
b00e6250 1816 }
bedc6617
JB
1817 return extent_op;
1818}
1819
1820static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1821 struct btrfs_delayed_ref_head *head)
1822{
1823 struct btrfs_delayed_extent_op *extent_op;
1824 int ret;
1825
1826 extent_op = cleanup_extent_op(head);
1827 if (!extent_op)
1828 return 0;
1829 head->extent_op = NULL;
b00e6250 1830 spin_unlock(&head->lock);
20b9a2d6 1831 ret = run_delayed_extent_op(trans, head, extent_op);
b00e6250
JB
1832 btrfs_free_delayed_extent_op(extent_op);
1833 return ret ? ret : 1;
1834}
1835
8a526c44 1836u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
31890da0
JB
1837 struct btrfs_delayed_ref_root *delayed_refs,
1838 struct btrfs_delayed_ref_head *head)
07c47775 1839{
e85a0ada
BB
1840 u64 ret = 0;
1841
81e75ac7
JB
1842 /*
1843 * We had csum deletions accounted for in our delayed refs rsv, we need
1844 * to drop the csum leaves for this update from our delayed_refs_rsv.
1845 */
1846 if (head->total_ref_mod < 0 && head->is_data) {
adb86dbe 1847 int nr_csums;
3ee56a58 1848
81e75ac7
JB
1849 spin_lock(&delayed_refs->lock);
1850 delayed_refs->pending_csums -= head->num_bytes;
1851 spin_unlock(&delayed_refs->lock);
adb86dbe 1852 nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
3ee56a58 1853
adb86dbe 1854 btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
81e75ac7 1855
e85a0ada 1856 ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
3ee56a58 1857 }
e85a0ada
BB
1858 /* must_insert_reserved can be set only if we didn't run the head ref. */
1859 if (head->must_insert_reserved)
1860 free_head_ref_squota_rsv(fs_info, head);
8a526c44 1861
e85a0ada 1862 return ret;
07c47775
JB
1863}
1864
194ab0bc 1865static int cleanup_ref_head(struct btrfs_trans_handle *trans,
8a526c44
FM
1866 struct btrfs_delayed_ref_head *head,
1867 u64 *bytes_released)
194ab0bc 1868{
f9871edd
NB
1869
1870 struct btrfs_fs_info *fs_info = trans->fs_info;
194ab0bc
JB
1871 struct btrfs_delayed_ref_root *delayed_refs;
1872 int ret;
1873
1874 delayed_refs = &trans->transaction->delayed_refs;
1875
bedc6617 1876 ret = run_and_cleanup_extent_op(trans, head);
194ab0bc
JB
1877 if (ret < 0) {
1878 unselect_delayed_ref_head(delayed_refs, head);
1879 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1880 return ret;
1881 } else if (ret) {
1882 return ret;
1883 }
1884
1885 /*
1886 * Need to drop our head ref lock and re-acquire the delayed ref lock
1887 * and then re-check to make sure nobody got added.
1888 */
1889 spin_unlock(&head->lock);
1890 spin_lock(&delayed_refs->lock);
1891 spin_lock(&head->lock);
e3d03965 1892 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
194ab0bc
JB
1893 spin_unlock(&head->lock);
1894 spin_unlock(&delayed_refs->lock);
1895 return 1;
1896 }
d7baffda 1897 btrfs_delete_ref_head(delayed_refs, head);
c1103f7a 1898 spin_unlock(&head->lock);
1e7a1421 1899 spin_unlock(&delayed_refs->lock);
c1103f7a 1900
c1103f7a 1901 if (head->must_insert_reserved) {
b25c36f8 1902 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
c1103f7a 1903 if (head->is_data) {
fc28b25e
JB
1904 struct btrfs_root *csum_root;
1905
1906 csum_root = btrfs_csum_root(fs_info, head->bytenr);
1907 ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1908 head->num_bytes);
c1103f7a
JB
1909 }
1910 }
1911
3ee56a58 1912 *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
07c47775
JB
1913
1914 trace_run_delayed_ref_head(fs_info, head, 0);
c1103f7a 1915 btrfs_delayed_ref_unlock(head);
d278850e 1916 btrfs_put_delayed_ref_head(head);
856bd270 1917 return ret;
194ab0bc
JB
1918}
1919
b1cdbcb5
NB
1920static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1921 struct btrfs_trans_handle *trans)
1922{
1923 struct btrfs_delayed_ref_root *delayed_refs =
1924 &trans->transaction->delayed_refs;
1925 struct btrfs_delayed_ref_head *head = NULL;
1926 int ret;
1927
1928 spin_lock(&delayed_refs->lock);
5637c74b 1929 head = btrfs_select_ref_head(delayed_refs);
b1cdbcb5
NB
1930 if (!head) {
1931 spin_unlock(&delayed_refs->lock);
1932 return head;
1933 }
1934
1935 /*
1936 * Grab the lock that says we are going to process all the refs for
1937 * this head
1938 */
9e920a6f 1939 ret = btrfs_delayed_ref_lock(delayed_refs, head);
b1cdbcb5
NB
1940 spin_unlock(&delayed_refs->lock);
1941
1942 /*
1943 * We may have dropped the spin lock to get the head mutex lock, and
1944 * that might have given someone else time to free the head. If that's
1945 * true, it has been removed from our list and we can move on.
1946 */
1947 if (ret == -EAGAIN)
1948 head = ERR_PTR(-EAGAIN);
1949
1950 return head;
1951}
1952
e7261386 1953static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
3ee56a58
FM
1954 struct btrfs_delayed_ref_head *locked_ref,
1955 u64 *bytes_released)
e7261386
NB
1956{
1957 struct btrfs_fs_info *fs_info = trans->fs_info;
1958 struct btrfs_delayed_ref_root *delayed_refs;
1959 struct btrfs_delayed_extent_op *extent_op;
1960 struct btrfs_delayed_ref_node *ref;
61c681fe 1961 bool must_insert_reserved;
e7261386
NB
1962 int ret;
1963
1964 delayed_refs = &trans->transaction->delayed_refs;
1965
0110a4c4
NB
1966 lockdep_assert_held(&locked_ref->mutex);
1967 lockdep_assert_held(&locked_ref->lock);
1968
e7261386
NB
1969 while ((ref = select_delayed_ref(locked_ref))) {
1970 if (ref->seq &&
1971 btrfs_check_delayed_seq(fs_info, ref->seq)) {
1972 spin_unlock(&locked_ref->lock);
1973 unselect_delayed_ref_head(delayed_refs, locked_ref);
1974 return -EAGAIN;
1975 }
1976
e7261386
NB
1977 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1978 RB_CLEAR_NODE(&ref->ref_node);
1979 if (!list_empty(&ref->add_list))
1980 list_del(&ref->add_list);
1981 /*
1982 * When we play the delayed ref, also correct the ref_mod on
1983 * head
1984 */
1985 switch (ref->action) {
1986 case BTRFS_ADD_DELAYED_REF:
1987 case BTRFS_ADD_DELAYED_EXTENT:
1988 locked_ref->ref_mod -= ref->ref_mod;
1989 break;
1990 case BTRFS_DROP_DELAYED_REF:
1991 locked_ref->ref_mod += ref->ref_mod;
1992 break;
1993 default:
1994 WARN_ON(1);
1995 }
1996 atomic_dec(&delayed_refs->num_entries);
1997
1998 /*
1999 * Record the must_insert_reserved flag before we drop the
2000 * spin lock.
2001 */
2002 must_insert_reserved = locked_ref->must_insert_reserved;
e85a0ada
BB
2003 /*
2004 * Unsetting this on the head ref relinquishes ownership of
2005 * the rsv_bytes, so it is critical that every possible code
2006 * path from here forward frees all reserves including qgroup
2007 * reserve.
2008 */
61c681fe 2009 locked_ref->must_insert_reserved = false;
e7261386
NB
2010
2011 extent_op = locked_ref->extent_op;
2012 locked_ref->extent_op = NULL;
2013 spin_unlock(&locked_ref->lock);
2014
cecbb533 2015 ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
e7261386 2016 must_insert_reserved);
adb86dbe 2017 btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
3ee56a58 2018 *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
cecbb533 2019
e7261386
NB
2020 btrfs_free_delayed_extent_op(extent_op);
2021 if (ret) {
2022 unselect_delayed_ref_head(delayed_refs, locked_ref);
2023 btrfs_put_delayed_ref(ref);
e7261386
NB
2024 return ret;
2025 }
2026
2027 btrfs_put_delayed_ref(ref);
2028 cond_resched();
2029
2030 spin_lock(&locked_ref->lock);
0c555c97 2031 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
e7261386
NB
2032 }
2033
2034 return 0;
2035}
2036
79787eaa
JM
2037/*
2038 * Returns 0 on success or if called with an already aborted transaction.
2039 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2040 */
d7df2c79 2041static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
8a526c44 2042 u64 min_bytes)
56bec294 2043{
0a1e458a 2044 struct btrfs_fs_info *fs_info = trans->fs_info;
56bec294 2045 struct btrfs_delayed_ref_root *delayed_refs;
56bec294
CM
2046 struct btrfs_delayed_ref_head *locked_ref = NULL;
2047 int ret;
d7df2c79 2048 unsigned long count = 0;
8a526c44
FM
2049 unsigned long max_count = 0;
2050 u64 bytes_processed = 0;
56bec294
CM
2051
2052 delayed_refs = &trans->transaction->delayed_refs;
8a526c44
FM
2053 if (min_bytes == 0) {
2054 max_count = delayed_refs->num_heads_ready;
2055 min_bytes = U64_MAX;
2056 }
2057
0110a4c4 2058 do {
56bec294 2059 if (!locked_ref) {
b1cdbcb5 2060 locked_ref = btrfs_obtain_ref_head(trans);
0110a4c4
NB
2061 if (IS_ERR_OR_NULL(locked_ref)) {
2062 if (PTR_ERR(locked_ref) == -EAGAIN) {
2063 continue;
2064 } else {
2065 break;
2066 }
56bec294 2067 }
0110a4c4 2068 count++;
56bec294 2069 }
2c3cf7d5
FM
2070 /*
2071 * We need to try and merge add/drops of the same ref since we
2072 * can run into issues with relocate dropping the implicit ref
2073 * and then it being added back again before the drop can
2074 * finish. If we merged anything we need to re-loop so we can
2075 * get a good ref.
2076 * Or we can get node references of the same type that weren't
2077 * merged when created due to bumps in the tree mod seq, and
2078 * we need to merge them to prevent adding an inline extent
2079 * backref before dropping it (triggering a BUG_ON at
2080 * insert_inline_extent_backref()).
2081 */
d7df2c79 2082 spin_lock(&locked_ref->lock);
0c555c97 2083 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
ae1e206b 2084
3ee56a58 2085 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
0110a4c4
NB
2086 if (ret < 0 && ret != -EAGAIN) {
2087 /*
2088 * Error, btrfs_run_delayed_refs_for_head already
2089 * unlocked everything so just bail out
2090 */
2091 return ret;
2092 } else if (!ret) {
2093 /*
2094 * Success, perform the usual cleanup of a processed
2095 * head
2096 */
3ee56a58 2097 ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
194ab0bc 2098 if (ret > 0 ) {
b00e6250
JB
2099 /* We dropped our lock, we need to loop. */
2100 ret = 0;
d7df2c79 2101 continue;
194ab0bc
JB
2102 } else if (ret) {
2103 return ret;
5d4f98a2 2104 }
22cd2e7d 2105 }
1ce7a5ec 2106
b00e6250 2107 /*
0110a4c4
NB
2108 * Either success case or btrfs_run_delayed_refs_for_head
2109 * returned -EAGAIN, meaning we need to select another head
b00e6250 2110 */
b00e6250 2111
0110a4c4 2112 locked_ref = NULL;
c3e69d58 2113 cond_resched();
8a526c44
FM
2114 } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
2115 (max_count > 0 && count < max_count) ||
2116 locked_ref);
0a2b2a84 2117
d7df2c79 2118 return 0;
c3e69d58
CM
2119}
2120
709c0486
AJ
2121#ifdef SCRAMBLE_DELAYED_REFS
2122/*
2123 * Normally delayed refs get processed in ascending bytenr order. This
2124 * correlates in most cases to the order added. To expose dependencies on this
2125 * order, we start to process the tree in the middle instead of the beginning
2126 */
2127static u64 find_middle(struct rb_root *root)
2128{
2129 struct rb_node *n = root->rb_node;
2130 struct btrfs_delayed_ref_node *entry;
2131 int alt = 1;
2132 u64 middle;
2133 u64 first = 0, last = 0;
2134
2135 n = rb_first(root);
2136 if (n) {
2137 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2138 first = entry->bytenr;
2139 }
2140 n = rb_last(root);
2141 if (n) {
2142 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2143 last = entry->bytenr;
2144 }
2145 n = root->rb_node;
2146
2147 while (n) {
2148 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2149 WARN_ON(!entry->in_tree);
2150
2151 middle = entry->bytenr;
2152
2153 if (alt)
2154 n = n->rb_left;
2155 else
2156 n = n->rb_right;
2157
2158 alt = 1 - alt;
2159 }
2160 return middle;
2161}
2162#endif
2163
c3e69d58 2164/*
8a526c44
FM
2165 * Start processing the delayed reference count updates and extent insertions
2166 * we have queued up so far.
2167 *
2168 * @trans: Transaction handle.
2169 * @min_bytes: How many bytes of delayed references to process. After this
2170 * many bytes we stop processing delayed references if there are
2171 * any more. If 0 it means to run all existing delayed references,
2172 * but not new ones added after running all existing ones.
2173 * Use (u64)-1 (U64_MAX) to run all existing delayed references
2174 * plus any new ones that are added.
79787eaa
JM
2175 *
2176 * Returns 0 on success or if called with an aborted transaction
2177 * Returns <0 on error and aborts the transaction
c3e69d58 2178 */
8a526c44 2179int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
c3e69d58 2180{
c79a70b1 2181 struct btrfs_fs_info *fs_info = trans->fs_info;
c3e69d58 2182 struct btrfs_delayed_ref_root *delayed_refs;
c3e69d58 2183 int ret;
c3e69d58 2184
79787eaa 2185 /* We'll clean this up in btrfs_cleanup_transaction */
bf31f87f 2186 if (TRANS_ABORTED(trans))
79787eaa
JM
2187 return 0;
2188
0b246afa 2189 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
511711af
CM
2190 return 0;
2191
c3e69d58 2192 delayed_refs = &trans->transaction->delayed_refs;
c3e69d58 2193again:
709c0486
AJ
2194#ifdef SCRAMBLE_DELAYED_REFS
2195 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2196#endif
8a526c44 2197 ret = __btrfs_run_delayed_refs(trans, min_bytes);
d7df2c79 2198 if (ret < 0) {
66642832 2199 btrfs_abort_transaction(trans, ret);
d7df2c79 2200 return ret;
eb099670 2201 }
c3e69d58 2202
8a526c44 2203 if (min_bytes == U64_MAX) {
119e80df 2204 btrfs_create_pending_block_groups(trans);
ea658bad 2205
d7df2c79 2206 spin_lock(&delayed_refs->lock);
3cbb9f51 2207 if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
d7df2c79 2208 spin_unlock(&delayed_refs->lock);
3cbb9f51 2209 return 0;
d7df2c79 2210 }
d278850e 2211 spin_unlock(&delayed_refs->lock);
e9d0b13b 2212
d7df2c79 2213 cond_resched();
56bec294 2214 goto again;
5f39d397 2215 }
3cbb9f51 2216
a28ec197
CM
2217 return 0;
2218}
2219
5d4f98a2 2220int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
4aec05fa 2221 struct extent_buffer *eb, u64 flags)
5d4f98a2
YZ
2222{
2223 struct btrfs_delayed_extent_op *extent_op;
2224 int ret;
2225
78a6184a 2226 extent_op = btrfs_alloc_delayed_extent_op();
5d4f98a2
YZ
2227 if (!extent_op)
2228 return -ENOMEM;
2229
2230 extent_op->flags_to_set = flags;
35b3ad50
DS
2231 extent_op->update_flags = true;
2232 extent_op->update_key = false;
5d4f98a2 2233
eba1469f
FM
2234 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
2235 btrfs_header_level(eb), extent_op);
5d4f98a2 2236 if (ret)
78a6184a 2237 btrfs_free_delayed_extent_op(extent_op);
5d4f98a2
YZ
2238 return ret;
2239}
2240
e4c3b2dc 2241static noinline int check_delayed_ref(struct btrfs_root *root,
5d4f98a2
YZ
2242 struct btrfs_path *path,
2243 u64 objectid, u64 offset, u64 bytenr)
2244{
2245 struct btrfs_delayed_ref_head *head;
2246 struct btrfs_delayed_ref_node *ref;
5d4f98a2 2247 struct btrfs_delayed_ref_root *delayed_refs;
e4c3b2dc 2248 struct btrfs_transaction *cur_trans;
0e0adbcf 2249 struct rb_node *node;
5d4f98a2
YZ
2250 int ret = 0;
2251
998ac6d2 2252 spin_lock(&root->fs_info->trans_lock);
e4c3b2dc 2253 cur_trans = root->fs_info->running_transaction;
998ac6d2 2254 if (cur_trans)
2255 refcount_inc(&cur_trans->use_count);
2256 spin_unlock(&root->fs_info->trans_lock);
e4c3b2dc
LB
2257 if (!cur_trans)
2258 return 0;
2259
2260 delayed_refs = &cur_trans->delayed_refs;
5d4f98a2 2261 spin_lock(&delayed_refs->lock);
f72ad18e 2262 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
d7df2c79
JB
2263 if (!head) {
2264 spin_unlock(&delayed_refs->lock);
998ac6d2 2265 btrfs_put_transaction(cur_trans);
d7df2c79
JB
2266 return 0;
2267 }
5d4f98a2
YZ
2268
2269 if (!mutex_trylock(&head->mutex)) {
26ce9114
JB
2270 if (path->nowait) {
2271 spin_unlock(&delayed_refs->lock);
2272 btrfs_put_transaction(cur_trans);
2273 return -EAGAIN;
2274 }
2275
d278850e 2276 refcount_inc(&head->refs);
5d4f98a2
YZ
2277 spin_unlock(&delayed_refs->lock);
2278
b3b4aa74 2279 btrfs_release_path(path);
5d4f98a2 2280
8cc33e5c
DS
2281 /*
2282 * Mutex was contended, block until it's released and let
2283 * caller try again
2284 */
5d4f98a2
YZ
2285 mutex_lock(&head->mutex);
2286 mutex_unlock(&head->mutex);
d278850e 2287 btrfs_put_delayed_ref_head(head);
998ac6d2 2288 btrfs_put_transaction(cur_trans);
5d4f98a2
YZ
2289 return -EAGAIN;
2290 }
d7df2c79 2291 spin_unlock(&delayed_refs->lock);
5d4f98a2 2292
d7df2c79 2293 spin_lock(&head->lock);
0e0adbcf
JB
2294 /*
2295 * XXX: We should replace this with a proper search function in the
2296 * future.
2297 */
e3d03965
LB
2298 for (node = rb_first_cached(&head->ref_tree); node;
2299 node = rb_next(node)) {
44cc2e38
JB
2300 u64 ref_owner;
2301 u64 ref_offset;
2302
0e0adbcf 2303 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
d7df2c79
JB
2304 /* If it's a shared ref we know a cross reference exists */
2305 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2306 ret = 1;
2307 break;
2308 }
5d4f98a2 2309
44cc2e38
JB
2310 ref_owner = btrfs_delayed_ref_owner(ref);
2311 ref_offset = btrfs_delayed_ref_offset(ref);
5d4f98a2 2312
d7df2c79
JB
2313 /*
2314 * If our ref doesn't match the one we're currently looking at
2315 * then we have a cross reference.
2316 */
e094f480 2317 if (ref->ref_root != btrfs_root_id(root) ||
44cc2e38 2318 ref_owner != objectid || ref_offset != offset) {
d7df2c79
JB
2319 ret = 1;
2320 break;
2321 }
5d4f98a2 2322 }
d7df2c79 2323 spin_unlock(&head->lock);
5d4f98a2 2324 mutex_unlock(&head->mutex);
998ac6d2 2325 btrfs_put_transaction(cur_trans);
5d4f98a2
YZ
2326 return ret;
2327}
2328
e4c3b2dc 2329static noinline int check_committed_ref(struct btrfs_root *root,
5d4f98a2 2330 struct btrfs_path *path,
a84d5d42
BB
2331 u64 objectid, u64 offset, u64 bytenr,
2332 bool strict)
be20aa9d 2333{
0b246afa 2334 struct btrfs_fs_info *fs_info = root->fs_info;
29cbcf40 2335 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
f321e491 2336 struct extent_buffer *leaf;
5d4f98a2
YZ
2337 struct btrfs_extent_data_ref *ref;
2338 struct btrfs_extent_inline_ref *iref;
2339 struct btrfs_extent_item *ei;
f321e491 2340 struct btrfs_key key;
5d4f98a2 2341 u32 item_size;
d9a620f7 2342 u32 expected_size;
3de28d57 2343 int type;
be20aa9d 2344 int ret;
925baedd 2345
be20aa9d 2346 key.objectid = bytenr;
31840ae1 2347 key.offset = (u64)-1;
f321e491 2348 key.type = BTRFS_EXTENT_ITEM_KEY;
be20aa9d 2349
be20aa9d
CM
2350 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2351 if (ret < 0)
2352 goto out;
a4259b6c
DS
2353 if (ret == 0) {
2354 /*
2355 * Key with offset -1 found, there would have to exist an extent
2356 * item with such offset, but this is out of the valid range.
2357 */
2358 ret = -EUCLEAN;
2359 goto out;
2360 }
80ff3856
YZ
2361
2362 ret = -ENOENT;
2363 if (path->slots[0] == 0)
31840ae1 2364 goto out;
be20aa9d 2365
31840ae1 2366 path->slots[0]--;
f321e491 2367 leaf = path->nodes[0];
5d4f98a2 2368 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
be20aa9d 2369
5d4f98a2 2370 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
be20aa9d 2371 goto out;
f321e491 2372
5d4f98a2 2373 ret = 1;
3212fa14 2374 item_size = btrfs_item_size(leaf, path->slots[0]);
5d4f98a2 2375 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
d9a620f7
BB
2376 expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
2377
2378 /* No inline refs; we need to bail before checking for owner ref. */
2379 if (item_size == sizeof(*ei))
2380 goto out;
2381
2382 /* Check for an owner ref; skip over it to the real inline refs. */
2383 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2384 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2385 if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
2386 expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
2387 iref = (struct btrfs_extent_inline_ref *)(iref + 1);
2388 }
bd09835d 2389
a6bd9cd1 2390 /* If extent item has more than 1 inline ref then it's shared */
d9a620f7 2391 if (item_size != expected_size)
5d4f98a2 2392 goto out;
be20aa9d 2393
a84d5d42
BB
2394 /*
2395 * If extent created before last snapshot => it's shared unless the
2396 * snapshot has been deleted. Use the heuristic if strict is false.
2397 */
2398 if (!strict &&
2399 (btrfs_extent_generation(leaf, ei) <=
2400 btrfs_root_last_snapshot(&root->root_item)))
5d4f98a2
YZ
2401 goto out;
2402
a6bd9cd1 2403 /* If this extent has SHARED_DATA_REF then it's shared */
3de28d57
LB
2404 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2405 if (type != BTRFS_EXTENT_DATA_REF_KEY)
5d4f98a2
YZ
2406 goto out;
2407
2408 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2409 if (btrfs_extent_refs(leaf, ei) !=
2410 btrfs_extent_data_ref_count(leaf, ref) ||
e094f480 2411 btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
5d4f98a2
YZ
2412 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2413 btrfs_extent_data_ref_offset(leaf, ref) != offset)
2414 goto out;
2415
2416 ret = 0;
2417out:
2418 return ret;
2419}
2420
e4c3b2dc 2421int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
1a89f173 2422 u64 bytenr, bool strict, struct btrfs_path *path)
5d4f98a2 2423{
5d4f98a2 2424 int ret;
5d4f98a2 2425
5d4f98a2 2426 do {
e4c3b2dc 2427 ret = check_committed_ref(root, path, objectid,
a84d5d42 2428 offset, bytenr, strict);
5d4f98a2 2429 if (ret && ret != -ENOENT)
f321e491 2430 goto out;
80ff3856 2431
380fd066
MT
2432 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2433 } while (ret == -EAGAIN);
5d4f98a2 2434
be20aa9d 2435out:
1a89f173 2436 btrfs_release_path(path);
37f00a6d 2437 if (btrfs_is_data_reloc_root(root))
f0486c68 2438 WARN_ON(ret > 0);
f321e491 2439 return ret;
be20aa9d 2440}
c5739bba 2441
5d4f98a2 2442static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
b7a9f29f 2443 struct btrfs_root *root,
5d4f98a2 2444 struct extent_buffer *buf,
e339a6b0 2445 int full_backref, int inc)
31840ae1 2446{
0b246afa 2447 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2 2448 u64 parent;
31840ae1 2449 u64 ref_root;
31840ae1 2450 u32 nritems;
31840ae1
ZY
2451 struct btrfs_key key;
2452 struct btrfs_file_extent_item *fi;
82fa113f 2453 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
31840ae1 2454 int i;
82fa113f 2455 int action;
31840ae1
ZY
2456 int level;
2457 int ret = 0;
fccb84c9 2458
0b246afa 2459 if (btrfs_is_testing(fs_info))
faa2dbf0 2460 return 0;
fccb84c9 2461
31840ae1 2462 ref_root = btrfs_header_owner(buf);
31840ae1
ZY
2463 nritems = btrfs_header_nritems(buf);
2464 level = btrfs_header_level(buf);
2465
92a7cc42 2466 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
5d4f98a2 2467 return 0;
31840ae1 2468
5d4f98a2
YZ
2469 if (full_backref)
2470 parent = buf->start;
2471 else
2472 parent = 0;
82fa113f
QW
2473 if (inc)
2474 action = BTRFS_ADD_DELAYED_REF;
2475 else
2476 action = BTRFS_DROP_DELAYED_REF;
5d4f98a2
YZ
2477
2478 for (i = 0; i < nritems; i++) {
4d09b4e9
JB
2479 struct btrfs_ref ref = {
2480 .action = action,
2481 .parent = parent,
f2e69a77 2482 .ref_root = ref_root,
4d09b4e9
JB
2483 };
2484
31840ae1 2485 if (level == 0) {
5d4f98a2 2486 btrfs_item_key_to_cpu(buf, &key, i);
962a298f 2487 if (key.type != BTRFS_EXTENT_DATA_KEY)
31840ae1 2488 continue;
5d4f98a2 2489 fi = btrfs_item_ptr(buf, i,
31840ae1
ZY
2490 struct btrfs_file_extent_item);
2491 if (btrfs_file_extent_type(buf, fi) ==
2492 BTRFS_FILE_EXTENT_INLINE)
2493 continue;
4d09b4e9
JB
2494 ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2495 if (ref.bytenr == 0)
31840ae1 2496 continue;
5d4f98a2 2497
12390e42 2498 ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
4d09b4e9
JB
2499 ref.owning_root = ref_root;
2500
5d4f98a2 2501 key.offset -= btrfs_file_extent_offset(buf, fi);
f2e69a77 2502 btrfs_init_data_ref(&ref, key.objectid, key.offset,
e094f480 2503 btrfs_root_id(root), for_reloc);
dd28b6a5 2504 if (inc)
4d09b4e9 2505 ret = btrfs_inc_extent_ref(trans, &ref);
dd28b6a5 2506 else
4d09b4e9 2507 ret = btrfs_free_extent(trans, &ref);
31840ae1
ZY
2508 if (ret)
2509 goto fail;
2510 } else {
4d09b4e9
JB
2511 /* We don't know the owning_root, leave as 0. */
2512 ref.bytenr = btrfs_node_blockptr(buf, i);
12390e42 2513 ref.num_bytes = fs_info->nodesize;
4d09b4e9 2514
f2e69a77 2515 btrfs_init_tree_ref(&ref, level - 1,
e094f480 2516 btrfs_root_id(root), for_reloc);
dd28b6a5 2517 if (inc)
4d09b4e9 2518 ret = btrfs_inc_extent_ref(trans, &ref);
dd28b6a5 2519 else
4d09b4e9 2520 ret = btrfs_free_extent(trans, &ref);
31840ae1
ZY
2521 if (ret)
2522 goto fail;
2523 }
2524 }
2525 return 0;
2526fail:
5d4f98a2
YZ
2527 return ret;
2528}
2529
2530int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 2531 struct extent_buffer *buf, int full_backref)
5d4f98a2 2532{
e339a6b0 2533 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
5d4f98a2
YZ
2534}
2535
2536int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 2537 struct extent_buffer *buf, int full_backref)
5d4f98a2 2538{
e339a6b0 2539 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
31840ae1
ZY
2540}
2541
1b86826d 2542static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
9ed74f2d 2543{
0b246afa 2544 struct btrfs_fs_info *fs_info = root->fs_info;
b742bb82 2545 u64 flags;
53b381b3 2546 u64 ret;
9ed74f2d 2547
b742bb82
YZ
2548 if (data)
2549 flags = BTRFS_BLOCK_GROUP_DATA;
0b246afa 2550 else if (root == fs_info->chunk_root)
b742bb82 2551 flags = BTRFS_BLOCK_GROUP_SYSTEM;
9ed74f2d 2552 else
b742bb82 2553 flags = BTRFS_BLOCK_GROUP_METADATA;
9ed74f2d 2554
878d7b67 2555 ret = btrfs_get_alloc_profile(fs_info, flags);
53b381b3 2556 return ret;
6a63209f 2557}
9ed74f2d 2558
0eb997bf 2559static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
a061fc8d 2560{
08dddb29
FM
2561 struct rb_node *leftmost;
2562 u64 bytenr = 0;
a1897fdd 2563
16b0c258 2564 read_lock(&fs_info->block_group_cache_lock);
0eb997bf 2565 /* Get the block group with the lowest logical start address. */
08dddb29
FM
2566 leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2567 if (leftmost) {
2568 struct btrfs_block_group *bg;
0f9dd46c 2569
08dddb29
FM
2570 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2571 bytenr = bg->start;
2572 }
16b0c258 2573 read_unlock(&fs_info->block_group_cache_lock);
d2fb3437
YZ
2574
2575 return bytenr;
a061fc8d
CM
2576}
2577
6690d071
NB
2578static int pin_down_extent(struct btrfs_trans_handle *trans,
2579 struct btrfs_block_group *cache,
f0486c68 2580 u64 bytenr, u64 num_bytes, int reserved)
324ae4df 2581{
fdf08605
DS
2582 struct btrfs_fs_info *fs_info = cache->fs_info;
2583
11833d66
YZ
2584 spin_lock(&cache->space_info->lock);
2585 spin_lock(&cache->lock);
2586 cache->pinned += num_bytes;
bb96c4e5
JB
2587 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2588 num_bytes);
11833d66
YZ
2589 if (reserved) {
2590 cache->reserved -= num_bytes;
2591 cache->space_info->bytes_reserved -= num_bytes;
2592 }
2593 spin_unlock(&cache->lock);
2594 spin_unlock(&cache->space_info->lock);
68b38550 2595
fe1a598c 2596 set_extent_bit(&trans->transaction->pinned_extents, bytenr,
1d126800 2597 bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
f0486c68
YZ
2598 return 0;
2599}
68b38550 2600
b25c36f8 2601int btrfs_pin_extent(struct btrfs_trans_handle *trans,
f0486c68
YZ
2602 u64 bytenr, u64 num_bytes, int reserved)
2603{
32da5386 2604 struct btrfs_block_group *cache;
68b38550 2605
b25c36f8 2606 cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
79787eaa 2607 BUG_ON(!cache); /* Logic error */
f0486c68 2608
6690d071 2609 pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
f0486c68
YZ
2610
2611 btrfs_put_block_group(cache);
11833d66
YZ
2612 return 0;
2613}
2614
9fce5704 2615int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
007dec8c 2616 const struct extent_buffer *eb)
e688b725 2617{
32da5386 2618 struct btrfs_block_group *cache;
b50c6e25 2619 int ret;
e688b725 2620
007dec8c 2621 cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
b50c6e25
JB
2622 if (!cache)
2623 return -EINVAL;
e688b725
CM
2624
2625 /*
ced8ecf0
OS
2626 * Fully cache the free space first so that our pin removes the free space
2627 * from the cache.
e688b725 2628 */
ced8ecf0 2629 ret = btrfs_cache_block_group(cache, true);
9ad6d91f
FM
2630 if (ret)
2631 goto out;
e688b725 2632
007dec8c 2633 pin_down_extent(trans, cache, eb->start, eb->len, 0);
e688b725
CM
2634
2635 /* remove us from the free space cache (if we're there at all) */
007dec8c 2636 ret = btrfs_remove_free_space(cache, eb->start, eb->len);
9ad6d91f 2637out:
e688b725 2638 btrfs_put_block_group(cache);
b50c6e25 2639 return ret;
e688b725
CM
2640}
2641
2ff7e61e
JM
2642static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2643 u64 start, u64 num_bytes)
8c2a1a30
JB
2644{
2645 int ret;
32da5386 2646 struct btrfs_block_group *block_group;
8c2a1a30 2647
0b246afa 2648 block_group = btrfs_lookup_block_group(fs_info, start);
8c2a1a30
JB
2649 if (!block_group)
2650 return -EINVAL;
2651
ced8ecf0 2652 ret = btrfs_cache_block_group(block_group, true);
9ad6d91f
FM
2653 if (ret)
2654 goto out;
8c2a1a30 2655
9ad6d91f
FM
2656 ret = btrfs_remove_free_space(block_group, start, num_bytes);
2657out:
8c2a1a30
JB
2658 btrfs_put_block_group(block_group);
2659 return ret;
2660}
2661
bcdc428c 2662int btrfs_exclude_logged_extents(struct extent_buffer *eb)
8c2a1a30 2663{
bcdc428c 2664 struct btrfs_fs_info *fs_info = eb->fs_info;
8c2a1a30
JB
2665 struct btrfs_file_extent_item *item;
2666 struct btrfs_key key;
2667 int found_type;
2668 int i;
b89311ef 2669 int ret = 0;
8c2a1a30 2670
2ff7e61e 2671 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
8c2a1a30
JB
2672 return 0;
2673
2674 for (i = 0; i < btrfs_header_nritems(eb); i++) {
2675 btrfs_item_key_to_cpu(eb, &key, i);
2676 if (key.type != BTRFS_EXTENT_DATA_KEY)
2677 continue;
2678 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2679 found_type = btrfs_file_extent_type(eb, item);
2680 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2681 continue;
2682 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2683 continue;
2684 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2685 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
b89311ef
GJ
2686 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2687 if (ret)
2688 break;
8c2a1a30
JB
2689 }
2690
b89311ef 2691 return ret;
8c2a1a30
JB
2692}
2693
9cfa3e34 2694static void
32da5386 2695btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
9cfa3e34
FM
2696{
2697 atomic_inc(&bg->reservations);
2698}
2699
c759c4e1
JB
2700/*
2701 * Returns the free cluster for the given space info and sets empty_cluster to
2702 * what it should be based on the mount options.
2703 */
2704static struct btrfs_free_cluster *
2ff7e61e
JM
2705fetch_cluster_info(struct btrfs_fs_info *fs_info,
2706 struct btrfs_space_info *space_info, u64 *empty_cluster)
c759c4e1
JB
2707{
2708 struct btrfs_free_cluster *ret = NULL;
c759c4e1
JB
2709
2710 *empty_cluster = 0;
2711 if (btrfs_mixed_space_info(space_info))
2712 return ret;
2713
c759c4e1 2714 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
0b246afa 2715 ret = &fs_info->meta_alloc_cluster;
583b7231
HK
2716 if (btrfs_test_opt(fs_info, SSD))
2717 *empty_cluster = SZ_2M;
2718 else
ee22184b 2719 *empty_cluster = SZ_64K;
583b7231
HK
2720 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2721 btrfs_test_opt(fs_info, SSD_SPREAD)) {
2722 *empty_cluster = SZ_2M;
0b246afa 2723 ret = &fs_info->data_alloc_cluster;
c759c4e1
JB
2724 }
2725
2726 return ret;
2727}
2728
2ff7e61e
JM
2729static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2730 u64 start, u64 end,
678886bd 2731 const bool return_free_space)
ccd467d6 2732{
32da5386 2733 struct btrfs_block_group *cache = NULL;
7b398f8e
JB
2734 struct btrfs_space_info *space_info;
2735 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
c759c4e1 2736 struct btrfs_free_cluster *cluster = NULL;
11833d66 2737 u64 len;
c759c4e1
JB
2738 u64 total_unpinned = 0;
2739 u64 empty_cluster = 0;
7b398f8e 2740 bool readonly;
44a6c343 2741 int ret = 0;
ccd467d6 2742
11833d66 2743 while (start <= end) {
7b398f8e 2744 readonly = false;
11833d66 2745 if (!cache ||
b3470b5d 2746 start >= cache->start + cache->length) {
11833d66
YZ
2747 if (cache)
2748 btrfs_put_block_group(cache);
c759c4e1 2749 total_unpinned = 0;
11833d66 2750 cache = btrfs_lookup_block_group(fs_info, start);
44a6c343
DS
2751 if (cache == NULL) {
2752 /* Logic error, something removed the block group. */
2753 ret = -EUCLEAN;
2754 goto out;
2755 }
c759c4e1 2756
2ff7e61e 2757 cluster = fetch_cluster_info(fs_info,
c759c4e1
JB
2758 cache->space_info,
2759 &empty_cluster);
2760 empty_cluster <<= 1;
11833d66
YZ
2761 }
2762
b3470b5d 2763 len = cache->start + cache->length - start;
11833d66
YZ
2764 len = min(len, end + 1 - start);
2765
48ff7083
OS
2766 if (return_free_space)
2767 btrfs_add_free_space(cache, start, len);
11833d66 2768
f0486c68 2769 start += len;
c759c4e1 2770 total_unpinned += len;
7b398f8e 2771 space_info = cache->space_info;
f0486c68 2772
c759c4e1
JB
2773 /*
2774 * If this space cluster has been marked as fragmented and we've
2775 * unpinned enough in this block group to potentially allow a
2776 * cluster to be created inside of it go ahead and clear the
2777 * fragmented check.
2778 */
2779 if (cluster && cluster->fragmented &&
2780 total_unpinned > empty_cluster) {
2781 spin_lock(&cluster->lock);
2782 cluster->fragmented = 0;
2783 spin_unlock(&cluster->lock);
2784 }
2785
7b398f8e 2786 spin_lock(&space_info->lock);
11833d66
YZ
2787 spin_lock(&cache->lock);
2788 cache->pinned -= len;
bb96c4e5 2789 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
4f4db217 2790 space_info->max_extent_size = 0;
7b398f8e
JB
2791 if (cache->ro) {
2792 space_info->bytes_readonly += len;
2793 readonly = true;
169e0da9
NA
2794 } else if (btrfs_is_zoned(fs_info)) {
2795 /* Need reset before reusing in a zoned block group */
8cd44dd1
NA
2796 btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
2797 len);
169e0da9 2798 readonly = true;
7b398f8e 2799 }
11833d66 2800 spin_unlock(&cache->lock);
957780eb
JB
2801 if (!readonly && return_free_space &&
2802 global_rsv->space_info == space_info) {
7b398f8e
JB
2803 spin_lock(&global_rsv->lock);
2804 if (!global_rsv->full) {
c4bf1909
JC
2805 u64 to_add = min(len, global_rsv->size -
2806 global_rsv->reserved);
2807
957780eb 2808 global_rsv->reserved += to_add;
bb96c4e5
JB
2809 btrfs_space_info_update_bytes_may_use(fs_info,
2810 space_info, to_add);
7b398f8e
JB
2811 if (global_rsv->reserved >= global_rsv->size)
2812 global_rsv->full = 1;
957780eb 2813 len -= to_add;
7b398f8e
JB
2814 }
2815 spin_unlock(&global_rsv->lock);
2816 }
2732798c
JB
2817 /* Add to any tickets we may have */
2818 if (!readonly && return_free_space && len)
2819 btrfs_try_granting_tickets(fs_info, space_info);
7b398f8e 2820 spin_unlock(&space_info->lock);
ccd467d6 2821 }
11833d66
YZ
2822
2823 if (cache)
2824 btrfs_put_block_group(cache);
44a6c343
DS
2825out:
2826 return ret;
ccd467d6
CM
2827}
2828
5ead2dd0 2829int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
a28ec197 2830{
5ead2dd0 2831 struct btrfs_fs_info *fs_info = trans->fs_info;
32da5386 2832 struct btrfs_block_group *block_group, *tmp;
e33e17ee 2833 struct list_head *deleted_bgs;
11833d66 2834 struct extent_io_tree *unpin;
1a5bc167
CM
2835 u64 start;
2836 u64 end;
a28ec197 2837 int ret;
a28ec197 2838
fe119a6e 2839 unpin = &trans->transaction->pinned_extents;
11833d66 2840
bf31f87f 2841 while (!TRANS_ABORTED(trans)) {
0e6ec385
FM
2842 struct extent_state *cached_state = NULL;
2843
d4b450cd 2844 mutex_lock(&fs_info->unused_bg_unpin_mutex);
e5860f82
FM
2845 if (!find_first_extent_bit(unpin, 0, &start, &end,
2846 EXTENT_DIRTY, &cached_state)) {
d4b450cd 2847 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
a28ec197 2848 break;
d4b450cd 2849 }
1f3c79a2 2850
46b27f50 2851 if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2ff7e61e 2852 ret = btrfs_discard_extent(fs_info, start,
5378e607 2853 end + 1 - start, NULL);
1f3c79a2 2854
0e6ec385 2855 clear_extent_dirty(unpin, start, end, &cached_state);
44a6c343
DS
2856 ret = unpin_extent_range(fs_info, start, end, true);
2857 BUG_ON(ret);
d4b450cd 2858 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
0e6ec385 2859 free_extent_state(cached_state);
b9473439 2860 cond_resched();
a28ec197 2861 }
817d52f8 2862
a2309300
DZ
2863 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2864 btrfs_discard_calc_delay(&fs_info->discard_ctl);
b0643e59 2865 btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
a2309300 2866 }
b0643e59 2867
e33e17ee
JM
2868 /*
2869 * Transaction is finished. We don't need the lock anymore. We
2870 * do need to clean up the block groups in case of a transaction
2871 * abort.
2872 */
2873 deleted_bgs = &trans->transaction->deleted_bgs;
2874 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2875 u64 trimmed = 0;
2876
2877 ret = -EROFS;
bf31f87f 2878 if (!TRANS_ABORTED(trans))
2ff7e61e 2879 ret = btrfs_discard_extent(fs_info,
b3470b5d
DS
2880 block_group->start,
2881 block_group->length,
e33e17ee
JM
2882 &trimmed);
2883
2884 list_del_init(&block_group->bg_list);
6b7304af 2885 btrfs_unfreeze_block_group(block_group);
e33e17ee
JM
2886 btrfs_put_block_group(block_group);
2887
2888 if (ret) {
2889 const char *errstr = btrfs_decode_error(ret);
2890 btrfs_warn(fs_info,
913e1535 2891 "discard failed while removing blockgroup: errno=%d %s",
e33e17ee
JM
2892 ret, errstr);
2893 }
2894 }
2895
e20d96d6
CM
2896 return 0;
2897}
2898
8d299091
BB
2899/*
2900 * Parse an extent item's inline extents looking for a simple quotas owner ref.
2901 *
2902 * @fs_info: the btrfs_fs_info for this mount
2903 * @leaf: a leaf in the extent tree containing the extent item
2904 * @slot: the slot in the leaf where the extent item is found
2905 *
2906 * Returns the objectid of the root that originally allocated the extent item
2907 * if the inline owner ref is expected and present, otherwise 0.
2908 *
2909 * If an extent item has an owner ref item, it will be the first inline ref
2910 * item. Therefore the logic is to check whether there are any inline ref
2911 * items, then check the type of the first one.
2912 */
2913u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
2914 struct extent_buffer *leaf, int slot)
2915{
2916 struct btrfs_extent_item *ei;
2917 struct btrfs_extent_inline_ref *iref;
2918 struct btrfs_extent_owner_ref *oref;
2919 unsigned long ptr;
2920 unsigned long end;
2921 int type;
2922
2923 if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
2924 return 0;
2925
2926 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
2927 ptr = (unsigned long)(ei + 1);
2928 end = (unsigned long)ei + btrfs_item_size(leaf, slot);
2929
2930 /* No inline ref items of any kind, can't check type. */
2931 if (ptr == end)
2932 return 0;
2933
2934 iref = (struct btrfs_extent_inline_ref *)ptr;
2935 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
2936
2937 /* We found an owner ref, get the root out of it. */
2938 if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
2939 oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2940 return btrfs_extent_owner_ref_root_id(leaf, oref);
2941 }
2942
2943 /* We have inline refs, but not an owner ref. */
2944 return 0;
2945}
2946
8f8aa4c7 2947static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
cecbb533 2948 u64 bytenr, struct btrfs_squota_delta *delta)
8f8aa4c7
JB
2949{
2950 int ret;
cecbb533 2951 u64 num_bytes = delta->num_bytes;
8f8aa4c7 2952
cecbb533 2953 if (delta->is_data) {
8f8aa4c7
JB
2954 struct btrfs_root *csum_root;
2955
2956 csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2957 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2958 if (ret) {
2959 btrfs_abort_transaction(trans, ret);
2960 return ret;
2961 }
ca41504e
JT
2962
2963 ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
2964 if (ret) {
2965 btrfs_abort_transaction(trans, ret);
2966 return ret;
2967 }
8f8aa4c7
JB
2968 }
2969
cecbb533
BB
2970 ret = btrfs_record_squota_delta(trans->fs_info, delta);
2971 if (ret) {
2972 btrfs_abort_transaction(trans, ret);
2973 return ret;
2974 }
2975
8f8aa4c7
JB
2976 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2977 if (ret) {
2978 btrfs_abort_transaction(trans, ret);
2979 return ret;
2980 }
2981
2982 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2983 if (ret)
2984 btrfs_abort_transaction(trans, ret);
2985
2986 return ret;
2987}
2988
eee3b811
QW
2989#define abort_and_dump(trans, path, fmt, args...) \
2990({ \
2991 btrfs_abort_transaction(trans, -EUCLEAN); \
2992 btrfs_print_leaf(path->nodes[0]); \
2993 btrfs_crit(trans->fs_info, fmt, ##args); \
2994})
2995
1c2a07f5
QW
2996/*
2997 * Drop one or more refs of @node.
2998 *
2999 * 1. Locate the extent refs.
3000 * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
3001 * Locate it, then reduce the refs number or remove the ref line completely.
3002 *
3003 * 2. Update the refs count in EXTENT/METADATA_ITEM
3004 *
3005 * Inline backref case:
3006 *
3007 * in extent tree we have:
3008 *
3009 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3010 * refs 2 gen 6 flags DATA
3011 * extent data backref root FS_TREE objectid 258 offset 0 count 1
3012 * extent data backref root FS_TREE objectid 257 offset 0 count 1
3013 *
3014 * This function gets called with:
3015 *
3016 * node->bytenr = 13631488
3017 * node->num_bytes = 1048576
3018 * root_objectid = FS_TREE
3019 * owner_objectid = 257
3020 * owner_offset = 0
3021 * refs_to_drop = 1
3022 *
3023 * Then we should get some like:
3024 *
3025 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3026 * refs 1 gen 6 flags DATA
3027 * extent data backref root FS_TREE objectid 258 offset 0 count 1
3028 *
3029 * Keyed backref case:
3030 *
3031 * in extent tree we have:
3032 *
3033 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3034 * refs 754 gen 6 flags DATA
3035 * [...]
3036 * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
3037 * extent data backref root FS_TREE objectid 866 offset 0 count 1
3038 *
3039 * This function get called with:
3040 *
3041 * node->bytenr = 13631488
3042 * node->num_bytes = 1048576
3043 * root_objectid = FS_TREE
3044 * owner_objectid = 866
3045 * owner_offset = 0
3046 * refs_to_drop = 1
3047 *
3048 * Then we should get some like:
3049 *
3050 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3051 * refs 753 gen 6 flags DATA
3052 *
3053 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
3054 */
5d4f98a2 3055static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
cecbb533 3056 struct btrfs_delayed_ref_head *href,
85bb9f54 3057 struct btrfs_delayed_ref_node *node,
e72cb923 3058 struct btrfs_delayed_extent_op *extent_op)
a28ec197 3059{
e72cb923 3060 struct btrfs_fs_info *info = trans->fs_info;
e2fa7227 3061 struct btrfs_key key;
5d4f98a2 3062 struct btrfs_path *path;
29cbcf40 3063 struct btrfs_root *extent_root;
5f39d397 3064 struct extent_buffer *leaf;
5d4f98a2
YZ
3065 struct btrfs_extent_item *ei;
3066 struct btrfs_extent_inline_ref *iref;
a28ec197 3067 int ret;
5d4f98a2 3068 int is_data;
952fccac
CM
3069 int extent_slot = 0;
3070 int found_extent = 0;
3071 int num_to_del = 1;
1df6b3c0 3072 int refs_to_drop = node->ref_mod;
5d4f98a2
YZ
3073 u32 item_size;
3074 u64 refs;
c682f9b3
QW
3075 u64 bytenr = node->bytenr;
3076 u64 num_bytes = node->num_bytes;
85bb9f54
JB
3077 u64 owner_objectid = btrfs_delayed_ref_owner(node);
3078 u64 owner_offset = btrfs_delayed_ref_offset(node);
0b246afa 3079 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
cecbb533 3080 u64 delayed_ref_root = href->owning_root;
037e6390 3081
29cbcf40 3082 extent_root = btrfs_extent_root(info, bytenr);
abed4aaa 3083 ASSERT(extent_root);
29cbcf40 3084
5caf2a00 3085 path = btrfs_alloc_path();
54aa1f4d
CM
3086 if (!path)
3087 return -ENOMEM;
5f26f772 3088
5d4f98a2 3089 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
1c2a07f5
QW
3090
3091 if (!is_data && refs_to_drop != 1) {
3092 btrfs_crit(info,
3093"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3094 node->bytenr, refs_to_drop);
3095 ret = -EINVAL;
3096 btrfs_abort_transaction(trans, ret);
3097 goto out;
3098 }
5d4f98a2 3099
3173a18f 3100 if (is_data)
897ca819 3101 skinny_metadata = false;
3173a18f 3102
fbe4801b 3103 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
85bb9f54 3104 node->parent, node->ref_root, owner_objectid,
5d4f98a2 3105 owner_offset);
7bb86316 3106 if (ret == 0) {
1c2a07f5
QW
3107 /*
3108 * Either the inline backref or the SHARED_DATA_REF/
3109 * SHARED_BLOCK_REF is found
3110 *
3111 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3112 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3113 */
952fccac 3114 extent_slot = path->slots[0];
5d4f98a2
YZ
3115 while (extent_slot >= 0) {
3116 btrfs_item_key_to_cpu(path->nodes[0], &key,
952fccac 3117 extent_slot);
5d4f98a2 3118 if (key.objectid != bytenr)
952fccac 3119 break;
5d4f98a2
YZ
3120 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3121 key.offset == num_bytes) {
952fccac
CM
3122 found_extent = 1;
3123 break;
3124 }
3173a18f
JB
3125 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3126 key.offset == owner_objectid) {
3127 found_extent = 1;
3128 break;
3129 }
1c2a07f5
QW
3130
3131 /* Quick path didn't find the EXTEMT/METADATA_ITEM */
952fccac
CM
3132 if (path->slots[0] - extent_slot > 5)
3133 break;
5d4f98a2 3134 extent_slot--;
952fccac 3135 }
a79865c6 3136
31840ae1 3137 if (!found_extent) {
1c2a07f5 3138 if (iref) {
eee3b811
QW
3139 abort_and_dump(trans, path,
3140"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3141 path->slots[0]);
3142 ret = -EUCLEAN;
3143 goto out;
1c2a07f5
QW
3144 }
3145 /* Must be SHARED_* item, remove the backref first */
76d76e78 3146 ret = remove_extent_backref(trans, extent_root, path,
5b2a54bb 3147 NULL, refs_to_drop, is_data);
005d6427 3148 if (ret) {
66642832 3149 btrfs_abort_transaction(trans, ret);
005d6427
DS
3150 goto out;
3151 }
b3b4aa74 3152 btrfs_release_path(path);
5d4f98a2 3153
1c2a07f5 3154 /* Slow path to locate EXTENT/METADATA_ITEM */
5d4f98a2
YZ
3155 key.objectid = bytenr;
3156 key.type = BTRFS_EXTENT_ITEM_KEY;
3157 key.offset = num_bytes;
3158
3173a18f
JB
3159 if (!is_data && skinny_metadata) {
3160 key.type = BTRFS_METADATA_ITEM_KEY;
3161 key.offset = owner_objectid;
3162 }
3163
31840ae1
ZY
3164 ret = btrfs_search_slot(trans, extent_root,
3165 &key, path, -1, 1);
3173a18f
JB
3166 if (ret > 0 && skinny_metadata && path->slots[0]) {
3167 /*
3168 * Couldn't find our skinny metadata item,
3169 * see if we have ye olde extent item.
3170 */
3171 path->slots[0]--;
3172 btrfs_item_key_to_cpu(path->nodes[0], &key,
3173 path->slots[0]);
3174 if (key.objectid == bytenr &&
3175 key.type == BTRFS_EXTENT_ITEM_KEY &&
3176 key.offset == num_bytes)
3177 ret = 0;
3178 }
3179
3180 if (ret > 0 && skinny_metadata) {
3181 skinny_metadata = false;
9ce49a0b 3182 key.objectid = bytenr;
3173a18f
JB
3183 key.type = BTRFS_EXTENT_ITEM_KEY;
3184 key.offset = num_bytes;
3185 btrfs_release_path(path);
3186 ret = btrfs_search_slot(trans, extent_root,
3187 &key, path, -1, 1);
3188 }
3189
f3465ca4 3190 if (ret) {
b783e62d 3191 if (ret > 0)
a4f78750 3192 btrfs_print_leaf(path->nodes[0]);
eee3b811
QW
3193 btrfs_err(info,
3194 "umm, got %d back from search, was looking for %llu, slot %d",
3195 ret, bytenr, path->slots[0]);
f3465ca4 3196 }
005d6427 3197 if (ret < 0) {
66642832 3198 btrfs_abort_transaction(trans, ret);
005d6427
DS
3199 goto out;
3200 }
31840ae1
ZY
3201 extent_slot = path->slots[0];
3202 }
fae7f21c 3203 } else if (WARN_ON(ret == -ENOENT)) {
eee3b811
QW
3204 abort_and_dump(trans, path,
3205"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
85bb9f54 3206 bytenr, node->parent, node->ref_root, owner_objectid,
eee3b811 3207 owner_offset, path->slots[0]);
c4a050bb 3208 goto out;
79787eaa 3209 } else {
66642832 3210 btrfs_abort_transaction(trans, ret);
005d6427 3211 goto out;
7bb86316 3212 }
5f39d397
CM
3213
3214 leaf = path->nodes[0];
3212fa14 3215 item_size = btrfs_item_size(leaf, extent_slot);
6d8ff4e4 3216 if (unlikely(item_size < sizeof(*ei))) {
182741d2
QW
3217 ret = -EUCLEAN;
3218 btrfs_err(trans->fs_info,
3219 "unexpected extent item size, has %u expect >= %zu",
3220 item_size, sizeof(*ei));
ba3c2b19
NB
3221 btrfs_abort_transaction(trans, ret);
3222 goto out;
3223 }
952fccac 3224 ei = btrfs_item_ptr(leaf, extent_slot,
123abc88 3225 struct btrfs_extent_item);
3173a18f
JB
3226 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3227 key.type == BTRFS_EXTENT_ITEM_KEY) {
5d4f98a2 3228 struct btrfs_tree_block_info *bi;
eee3b811 3229
1c2a07f5 3230 if (item_size < sizeof(*ei) + sizeof(*bi)) {
eee3b811
QW
3231 abort_and_dump(trans, path,
3232"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3233 key.objectid, key.type, key.offset,
3234 path->slots[0], owner_objectid, item_size,
3235 sizeof(*ei) + sizeof(*bi));
3236 ret = -EUCLEAN;
3237 goto out;
1c2a07f5 3238 }
5d4f98a2
YZ
3239 bi = (struct btrfs_tree_block_info *)(ei + 1);
3240 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3241 }
56bec294 3242
5d4f98a2 3243 refs = btrfs_extent_refs(leaf, ei);
32b02538 3244 if (refs < refs_to_drop) {
eee3b811
QW
3245 abort_and_dump(trans, path,
3246 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3247 refs_to_drop, refs, bytenr, path->slots[0]);
3248 ret = -EUCLEAN;
3249 goto out;
32b02538 3250 }
56bec294 3251 refs -= refs_to_drop;
5f39d397 3252
5d4f98a2
YZ
3253 if (refs > 0) {
3254 if (extent_op)
3255 __run_delayed_extent_op(extent_op, leaf, ei);
3256 /*
3257 * In the case of inline back ref, reference count will
3258 * be updated by remove_extent_backref
952fccac 3259 */
5d4f98a2 3260 if (iref) {
1c2a07f5 3261 if (!found_extent) {
eee3b811
QW
3262 abort_and_dump(trans, path,
3263"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3264 path->slots[0]);
3265 ret = -EUCLEAN;
3266 goto out;
1c2a07f5 3267 }
5d4f98a2
YZ
3268 } else {
3269 btrfs_set_extent_refs(leaf, ei, refs);
50564b65 3270 btrfs_mark_buffer_dirty(trans, leaf);
5d4f98a2
YZ
3271 }
3272 if (found_extent) {
76d76e78 3273 ret = remove_extent_backref(trans, extent_root, path,
5b2a54bb 3274 iref, refs_to_drop, is_data);
005d6427 3275 if (ret) {
66642832 3276 btrfs_abort_transaction(trans, ret);
005d6427
DS
3277 goto out;
3278 }
952fccac 3279 }
5d4f98a2 3280 } else {
cecbb533
BB
3281 struct btrfs_squota_delta delta = {
3282 .root = delayed_ref_root,
3283 .num_bytes = num_bytes,
cecbb533
BB
3284 .is_data = is_data,
3285 .is_inc = false,
bd7c1ea3 3286 .generation = btrfs_extent_generation(leaf, ei),
cecbb533
BB
3287 };
3288
1c2a07f5 3289 /* In this branch refs == 1 */
5d4f98a2 3290 if (found_extent) {
1c2a07f5
QW
3291 if (is_data && refs_to_drop !=
3292 extent_data_ref_count(path, iref)) {
eee3b811
QW
3293 abort_and_dump(trans, path,
3294 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3295 extent_data_ref_count(path, iref),
3296 refs_to_drop, path->slots[0]);
3297 ret = -EUCLEAN;
3298 goto out;
1c2a07f5 3299 }
5d4f98a2 3300 if (iref) {
1c2a07f5 3301 if (path->slots[0] != extent_slot) {
eee3b811
QW
3302 abort_and_dump(trans, path,
3303"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3304 key.objectid, key.type,
3305 key.offset, path->slots[0]);
3306 ret = -EUCLEAN;
3307 goto out;
1c2a07f5 3308 }
5d4f98a2 3309 } else {
1c2a07f5
QW
3310 /*
3311 * No inline ref, we must be at SHARED_* item,
3312 * And it's single ref, it must be:
3313 * | extent_slot ||extent_slot + 1|
3314 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3315 */
3316 if (path->slots[0] != extent_slot + 1) {
eee3b811
QW
3317 abort_and_dump(trans, path,
3318 "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3319 path->slots[0]);
3320 ret = -EUCLEAN;
3321 goto out;
1c2a07f5 3322 }
5d4f98a2
YZ
3323 path->slots[0] = extent_slot;
3324 num_to_del = 2;
3325 }
78fae27e 3326 }
cecbb533
BB
3327 /*
3328 * We can't infer the data owner from the delayed ref, so we need
3329 * to try to get it from the owning ref item.
3330 *
3331 * If it is not present, then that extent was not written under
3332 * simple quotas mode, so we don't need to account for its deletion.
3333 */
3334 if (is_data)
3335 delta.root = btrfs_get_extent_owner_root(trans->fs_info,
3336 leaf, extent_slot);
b9473439 3337
952fccac
CM
3338 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3339 num_to_del);
005d6427 3340 if (ret) {
66642832 3341 btrfs_abort_transaction(trans, ret);
005d6427
DS
3342 goto out;
3343 }
b3b4aa74 3344 btrfs_release_path(path);
21af804c 3345
cecbb533 3346 ret = do_free_extent_accounting(trans, bytenr, &delta);
a28ec197 3347 }
fcebe456
JB
3348 btrfs_release_path(path);
3349
79787eaa 3350out:
5caf2a00 3351 btrfs_free_path(path);
a28ec197
CM
3352 return ret;
3353}
3354
1887be66 3355/*
f0486c68 3356 * when we free an block, it is possible (and likely) that we free the last
1887be66
CM
3357 * delayed ref for that extent as well. This searches the delayed ref tree for
3358 * a given extent, and if there are no other delayed refs to be processed, it
3359 * removes it from the tree.
3360 */
3361static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2ff7e61e 3362 u64 bytenr)
1887be66
CM
3363{
3364 struct btrfs_delayed_ref_head *head;
3365 struct btrfs_delayed_ref_root *delayed_refs;
f0486c68 3366 int ret = 0;
1887be66
CM
3367
3368 delayed_refs = &trans->transaction->delayed_refs;
3369 spin_lock(&delayed_refs->lock);
f72ad18e 3370 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
1887be66 3371 if (!head)
cf93da7b 3372 goto out_delayed_unlock;
1887be66 3373
d7df2c79 3374 spin_lock(&head->lock);
e3d03965 3375 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1887be66
CM
3376 goto out;
3377
bedc6617
JB
3378 if (cleanup_extent_op(head) != NULL)
3379 goto out;
5d4f98a2 3380
1887be66
CM
3381 /*
3382 * waiting for the lock here would deadlock. If someone else has it
3383 * locked they are already in the process of dropping it anyway
3384 */
3385 if (!mutex_trylock(&head->mutex))
3386 goto out;
3387
d7baffda 3388 btrfs_delete_ref_head(delayed_refs, head);
61c681fe 3389 head->processing = false;
d7baffda 3390
d7df2c79 3391 spin_unlock(&head->lock);
1887be66
CM
3392 spin_unlock(&delayed_refs->lock);
3393
f0486c68
YZ
3394 BUG_ON(head->extent_op);
3395 if (head->must_insert_reserved)
3396 ret = 1;
3397
31890da0 3398 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
f0486c68 3399 mutex_unlock(&head->mutex);
d278850e 3400 btrfs_put_delayed_ref_head(head);
f0486c68 3401 return ret;
1887be66 3402out:
d7df2c79 3403 spin_unlock(&head->lock);
cf93da7b
CM
3404
3405out_delayed_unlock:
1887be66
CM
3406 spin_unlock(&delayed_refs->lock);
3407 return 0;
3408}
3409
bb386803
FM
3410int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3411 u64 root_id,
3412 struct extent_buffer *buf,
3413 u64 parent, int last_ref)
f0486c68 3414{
7a163608 3415 struct btrfs_fs_info *fs_info = trans->fs_info;
3ba2d364 3416 struct btrfs_block_group *bg;
f0486c68
YZ
3417 int ret;
3418
7a163608 3419 if (root_id != BTRFS_TREE_LOG_OBJECTID) {
4d09b4e9
JB
3420 struct btrfs_ref generic_ref = {
3421 .action = BTRFS_DROP_DELAYED_REF,
3422 .bytenr = buf->start,
12390e42 3423 .num_bytes = buf->len,
4d09b4e9
JB
3424 .parent = parent,
3425 .owning_root = btrfs_header_owner(buf),
f2e69a77 3426 .ref_root = root_id,
4d09b4e9 3427 };
74cd8cac 3428
073bda7a
NA
3429 /*
3430 * Assert that the extent buffer is not cleared due to
3431 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
3432 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
3433 * detail.
3434 */
3435 ASSERT(btrfs_header_bytenr(buf) != 0);
3436
f2e69a77 3437 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
8a5040f7 3438 btrfs_ref_tree_mod(fs_info, &generic_ref);
2187374f 3439 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
bb386803
FM
3440 if (ret < 0)
3441 return ret;
f0486c68
YZ
3442 }
3443
3ba2d364 3444 if (!last_ref)
bb386803 3445 return 0;
f0486c68 3446
3ba2d364
JT
3447 if (btrfs_header_generation(buf) != trans->transid)
3448 goto out;
6219872d 3449
3ba2d364
JT
3450 if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3451 ret = check_ref_cleanup(trans, buf->start);
3452 if (!ret)
37be25bc 3453 goto out;
3ba2d364 3454 }
f0486c68 3455
3ba2d364 3456 bg = btrfs_lookup_block_group(fs_info, buf->start);
485df755 3457
3ba2d364
JT
3458 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3459 pin_down_extent(trans, bg, buf->start, buf->len, 1);
3460 btrfs_put_block_group(bg);
3461 goto out;
3462 }
d3575156 3463
3ba2d364
JT
3464 /*
3465 * If there are tree mod log users we may have recorded mod log
3466 * operations for this node. If we re-allocate this node we
3467 * could replay operations on this node that happened when it
3468 * existed in a completely different root. For example if it
3469 * was part of root A, then was reallocated to root B, and we
3470 * are doing a btrfs_old_search_slot(root b), we could replay
3471 * operations that happened when the block was part of root A,
3472 * giving us an inconsistent view of the btree.
3473 *
3474 * We are safe from races here because at this point no other
3475 * node or root points to this extent buffer, so if after this
3476 * check a new tree mod log user joins we will not have an
3477 * existing log of operations on this node that we have to
3478 * contend with.
3479 */
f0486c68 3480
3ba2d364
JT
3481 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
3482 || btrfs_is_zoned(fs_info)) {
3483 pin_down_extent(trans, bg, buf->start, buf->len, 1);
3484 btrfs_put_block_group(bg);
3485 goto out;
f0486c68 3486 }
3ba2d364
JT
3487
3488 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3489
3490 btrfs_add_free_space(bg, buf->start, buf->len);
3491 btrfs_free_reserved_bytes(bg, buf->len, 0);
3492 btrfs_put_block_group(bg);
3493 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3494
f0486c68 3495out:
3ba2d364
JT
3496
3497 /*
3498 * Deleting the buffer, clear the corrupt flag since it doesn't
3499 * matter anymore.
3500 */
3501 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
bb386803 3502 return 0;
f0486c68
YZ
3503}
3504
79787eaa 3505/* Can return -ENOMEM */
ffd4bb2a 3506int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
925baedd 3507{
ffd4bb2a 3508 struct btrfs_fs_info *fs_info = trans->fs_info;
925baedd
CM
3509 int ret;
3510
f5ee5c9a 3511 if (btrfs_is_testing(fs_info))
faa2dbf0 3512 return 0;
fccb84c9 3513
56bec294
CM
3514 /*
3515 * tree log blocks never actually go into the extent allocation
3516 * tree, just update pinning info and exit early.
56bec294 3517 */
f2e69a77 3518 if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
12390e42 3519 btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
56bec294 3520 ret = 0;
ffd4bb2a 3521 } else if (ref->type == BTRFS_REF_METADATA) {
2187374f 3522 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
5d4f98a2 3523 } else {
2187374f 3524 ret = btrfs_add_delayed_data_ref(trans, ref, 0);
56bec294 3525 }
d7eae340 3526
f2e69a77 3527 if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
ffd4bb2a 3528 btrfs_ref_tree_mod(fs_info, ref);
8a5040f7 3529
925baedd
CM
3530 return ret;
3531}
3532
817d52f8 3533enum btrfs_loop_type {
b9d97cff
JB
3534 /*
3535 * Start caching block groups but do not wait for progress or for them
3536 * to be done.
3537 */
f262fa8d 3538 LOOP_CACHING_NOWAIT,
b9d97cff
JB
3539
3540 /*
3541 * Wait for the block group free_space >= the space we're waiting for if
3542 * the block group isn't cached.
3543 */
f262fa8d 3544 LOOP_CACHING_WAIT,
b9d97cff
JB
3545
3546 /*
3547 * Allow allocations to happen from block groups that do not yet have a
3548 * size classification.
3549 */
52bb7a21 3550 LOOP_UNSET_SIZE_CLASS,
b9d97cff
JB
3551
3552 /*
3553 * Allocate a chunk and then retry the allocation.
3554 */
f262fa8d 3555 LOOP_ALLOC_CHUNK,
b9d97cff
JB
3556
3557 /*
3558 * Ignore the size class restrictions for this allocation.
3559 */
52bb7a21 3560 LOOP_WRONG_SIZE_CLASS,
b9d97cff
JB
3561
3562 /*
3563 * Ignore the empty size, only try to allocate the number of bytes
3564 * needed for this allocation.
3565 */
f262fa8d 3566 LOOP_NO_EMPTY_SIZE,
817d52f8
JB
3567};
3568
e570fd27 3569static inline void
32da5386 3570btrfs_lock_block_group(struct btrfs_block_group *cache,
e570fd27
MX
3571 int delalloc)
3572{
3573 if (delalloc)
3574 down_read(&cache->data_rwsem);
3575}
3576
32da5386 3577static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
e570fd27
MX
3578 int delalloc)
3579{
3580 btrfs_get_block_group(cache);
3581 if (delalloc)
3582 down_read(&cache->data_rwsem);
3583}
3584
32da5386
DS
3585static struct btrfs_block_group *btrfs_lock_cluster(
3586 struct btrfs_block_group *block_group,
e570fd27
MX
3587 struct btrfs_free_cluster *cluster,
3588 int delalloc)
c142c6a4 3589 __acquires(&cluster->refill_lock)
e570fd27 3590{
32da5386 3591 struct btrfs_block_group *used_bg = NULL;
6719afdc 3592
e570fd27 3593 spin_lock(&cluster->refill_lock);
6719afdc
GU
3594 while (1) {
3595 used_bg = cluster->block_group;
3596 if (!used_bg)
3597 return NULL;
3598
3599 if (used_bg == block_group)
e570fd27
MX
3600 return used_bg;
3601
6719afdc 3602 btrfs_get_block_group(used_bg);
e570fd27 3603
6719afdc
GU
3604 if (!delalloc)
3605 return used_bg;
e570fd27 3606
6719afdc
GU
3607 if (down_read_trylock(&used_bg->data_rwsem))
3608 return used_bg;
e570fd27 3609
6719afdc 3610 spin_unlock(&cluster->refill_lock);
e570fd27 3611
e321f8a8
LB
3612 /* We should only have one-level nested. */
3613 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
e570fd27 3614
6719afdc
GU
3615 spin_lock(&cluster->refill_lock);
3616 if (used_bg == cluster->block_group)
3617 return used_bg;
e570fd27 3618
6719afdc
GU
3619 up_read(&used_bg->data_rwsem);
3620 btrfs_put_block_group(used_bg);
3621 }
e570fd27
MX
3622}
3623
3624static inline void
32da5386 3625btrfs_release_block_group(struct btrfs_block_group *cache,
e570fd27
MX
3626 int delalloc)
3627{
3628 if (delalloc)
3629 up_read(&cache->data_rwsem);
3630 btrfs_put_block_group(cache);
3631}
3632
d06e3bb6
QW
3633/*
3634 * Helper function for find_free_extent().
3635 *
3636 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
d06e3bb6
QW
3637 * Return >0 to inform caller that we find nothing
3638 * Return 0 means we have found a location and set ffe_ctl->found_offset.
3639 */
32da5386 3640static int find_free_extent_clustered(struct btrfs_block_group *bg,
897cae79
NA
3641 struct find_free_extent_ctl *ffe_ctl,
3642 struct btrfs_block_group **cluster_bg_ret)
d06e3bb6 3643{
32da5386 3644 struct btrfs_block_group *cluster_bg;
897cae79 3645 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
d06e3bb6
QW
3646 u64 aligned_cluster;
3647 u64 offset;
3648 int ret;
3649
3650 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3651 if (!cluster_bg)
3652 goto refill_cluster;
3653 if (cluster_bg != bg && (cluster_bg->ro ||
3654 !block_group_bits(cluster_bg, ffe_ctl->flags)))
3655 goto release_cluster;
3656
3657 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
b3470b5d 3658 ffe_ctl->num_bytes, cluster_bg->start,
d06e3bb6
QW
3659 &ffe_ctl->max_extent_size);
3660 if (offset) {
3661 /* We have a block, we're done */
3662 spin_unlock(&last_ptr->refill_lock);
cfc2de0f 3663 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
d06e3bb6
QW
3664 *cluster_bg_ret = cluster_bg;
3665 ffe_ctl->found_offset = offset;
3666 return 0;
3667 }
3668 WARN_ON(last_ptr->block_group != cluster_bg);
3669
3670release_cluster:
3671 /*
3672 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3673 * lets just skip it and let the allocator find whatever block it can
3674 * find. If we reach this point, we will have tried the cluster
3675 * allocator plenty of times and not have found anything, so we are
3676 * likely way too fragmented for the clustering stuff to find anything.
3677 *
3678 * However, if the cluster is taken from the current block group,
3679 * release the cluster first, so that we stand a better chance of
3680 * succeeding in the unclustered allocation.
3681 */
3682 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3683 spin_unlock(&last_ptr->refill_lock);
3684 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3685 return -ENOENT;
3686 }
3687
3688 /* This cluster didn't work out, free it and start over */
3689 btrfs_return_cluster_to_free_space(NULL, last_ptr);
3690
3691 if (cluster_bg != bg)
3692 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3693
3694refill_cluster:
3695 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3696 spin_unlock(&last_ptr->refill_lock);
3697 return -ENOENT;
3698 }
3699
3700 aligned_cluster = max_t(u64,
3701 ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3702 bg->full_stripe_len);
2ceeae2e
DS
3703 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3704 ffe_ctl->num_bytes, aligned_cluster);
d06e3bb6
QW
3705 if (ret == 0) {
3706 /* Now pull our allocation out of this cluster */
3707 offset = btrfs_alloc_from_cluster(bg, last_ptr,
3708 ffe_ctl->num_bytes, ffe_ctl->search_start,
3709 &ffe_ctl->max_extent_size);
3710 if (offset) {
3711 /* We found one, proceed */
3712 spin_unlock(&last_ptr->refill_lock);
d06e3bb6 3713 ffe_ctl->found_offset = offset;
cfc2de0f 3714 trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
d06e3bb6
QW
3715 return 0;
3716 }
d06e3bb6
QW
3717 }
3718 /*
3719 * At this point we either didn't find a cluster or we weren't able to
3720 * allocate a block from our cluster. Free the cluster we've been
3721 * trying to use, and go to the next block group.
3722 */
3723 btrfs_return_cluster_to_free_space(NULL, last_ptr);
3724 spin_unlock(&last_ptr->refill_lock);
3725 return 1;
3726}
3727
e1a41848
QW
3728/*
3729 * Return >0 to inform caller that we find nothing
3730 * Return 0 when we found an free extent and set ffe_ctrl->found_offset
e1a41848 3731 */
32da5386 3732static int find_free_extent_unclustered(struct btrfs_block_group *bg,
897cae79 3733 struct find_free_extent_ctl *ffe_ctl)
e1a41848 3734{
897cae79 3735 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
e1a41848
QW
3736 u64 offset;
3737
3738 /*
3739 * We are doing an unclustered allocation, set the fragmented flag so
3740 * we don't bother trying to setup a cluster again until we get more
3741 * space.
3742 */
3743 if (unlikely(last_ptr)) {
3744 spin_lock(&last_ptr->lock);
3745 last_ptr->fragmented = 1;
3746 spin_unlock(&last_ptr->lock);
3747 }
3748 if (ffe_ctl->cached) {
3749 struct btrfs_free_space_ctl *free_space_ctl;
3750
3751 free_space_ctl = bg->free_space_ctl;
3752 spin_lock(&free_space_ctl->tree_lock);
3753 if (free_space_ctl->free_space <
3754 ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3755 ffe_ctl->empty_size) {
3756 ffe_ctl->total_free_space = max_t(u64,
3757 ffe_ctl->total_free_space,
3758 free_space_ctl->free_space);
3759 spin_unlock(&free_space_ctl->tree_lock);
3760 return 1;
3761 }
3762 spin_unlock(&free_space_ctl->tree_lock);
3763 }
3764
3765 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3766 ffe_ctl->num_bytes, ffe_ctl->empty_size,
3767 &ffe_ctl->max_extent_size);
cd361199 3768 if (!offset)
e1a41848 3769 return 1;
e1a41848
QW
3770 ffe_ctl->found_offset = offset;
3771 return 0;
3772}
3773
c668690d
NA
3774static int do_allocation_clustered(struct btrfs_block_group *block_group,
3775 struct find_free_extent_ctl *ffe_ctl,
3776 struct btrfs_block_group **bg_ret)
3777{
3778 int ret;
3779
3780 /* We want to try and use the cluster allocator, so lets look there */
3781 if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
897cae79 3782 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
cd361199 3783 if (ret >= 0)
c668690d
NA
3784 return ret;
3785 /* ret == -ENOENT case falls through */
3786 }
3787
897cae79 3788 return find_free_extent_unclustered(block_group, ffe_ctl);
c668690d
NA
3789}
3790
40ab3be1
NA
3791/*
3792 * Tree-log block group locking
3793 * ============================
3794 *
3795 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3796 * indicates the starting address of a block group, which is reserved only
3797 * for tree-log metadata.
3798 *
3799 * Lock nesting
3800 * ============
3801 *
3802 * space_info::lock
3803 * block_group::lock
3804 * fs_info::treelog_bg_lock
3805 */
3806
2eda5708
NA
3807/*
3808 * Simple allocator for sequential-only block group. It only allows sequential
3809 * allocation. No need to play with trees. This function also reserves the
3810 * bytes as in btrfs_add_reserved_bytes.
3811 */
3812static int do_allocation_zoned(struct btrfs_block_group *block_group,
3813 struct find_free_extent_ctl *ffe_ctl,
3814 struct btrfs_block_group **bg_ret)
3815{
40ab3be1 3816 struct btrfs_fs_info *fs_info = block_group->fs_info;
2eda5708
NA
3817 struct btrfs_space_info *space_info = block_group->space_info;
3818 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3819 u64 start = block_group->start;
3820 u64 num_bytes = ffe_ctl->num_bytes;
3821 u64 avail;
40ab3be1
NA
3822 u64 bytenr = block_group->start;
3823 u64 log_bytenr;
c2707a25 3824 u64 data_reloc_bytenr;
2eda5708 3825 int ret = 0;
2d81eb1c 3826 bool skip = false;
2eda5708
NA
3827
3828 ASSERT(btrfs_is_zoned(block_group->fs_info));
3829
40ab3be1
NA
3830 /*
3831 * Do not allow non-tree-log blocks in the dedicated tree-log block
3832 * group, and vice versa.
3833 */
3834 spin_lock(&fs_info->treelog_bg_lock);
3835 log_bytenr = fs_info->treelog_bg;
2d81eb1c
JT
3836 if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3837 (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3838 skip = true;
40ab3be1
NA
3839 spin_unlock(&fs_info->treelog_bg_lock);
3840 if (skip)
3841 return 1;
3842
c2707a25
JT
3843 /*
3844 * Do not allow non-relocation blocks in the dedicated relocation block
3845 * group, and vice versa.
3846 */
3847 spin_lock(&fs_info->relocation_bg_lock);
3848 data_reloc_bytenr = fs_info->data_reloc_bg;
3849 if (data_reloc_bytenr &&
3850 ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3851 (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3852 skip = true;
3853 spin_unlock(&fs_info->relocation_bg_lock);
3854 if (skip)
3855 return 1;
1ada69f6 3856
2e654e4b
NA
3857 /* Check RO and no space case before trying to activate it */
3858 spin_lock(&block_group->lock);
1bfd4767 3859 if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
1ada69f6
NA
3860 ret = 1;
3861 /*
3862 * May need to clear fs_info->{treelog,data_reloc}_bg.
3863 * Return the error after taking the locks.
3864 */
2e654e4b
NA
3865 }
3866 spin_unlock(&block_group->lock);
3867
5a7d107e
NA
3868 /* Metadata block group is activated at write time. */
3869 if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3870 !btrfs_zone_activate(block_group)) {
1ada69f6
NA
3871 ret = 1;
3872 /*
3873 * May need to clear fs_info->{treelog,data_reloc}_bg.
3874 * Return the error after taking the locks.
3875 */
3876 }
2e654e4b 3877
2eda5708
NA
3878 spin_lock(&space_info->lock);
3879 spin_lock(&block_group->lock);
40ab3be1 3880 spin_lock(&fs_info->treelog_bg_lock);
c2707a25 3881 spin_lock(&fs_info->relocation_bg_lock);
40ab3be1 3882
1ada69f6
NA
3883 if (ret)
3884 goto out;
3885
40ab3be1
NA
3886 ASSERT(!ffe_ctl->for_treelog ||
3887 block_group->start == fs_info->treelog_bg ||
3888 fs_info->treelog_bg == 0);
c2707a25
JT
3889 ASSERT(!ffe_ctl->for_data_reloc ||
3890 block_group->start == fs_info->data_reloc_bg ||
3891 fs_info->data_reloc_bg == 0);
2eda5708 3892
3349b57f 3893 if (block_group->ro ||
332581bd
NA
3894 (!ffe_ctl->for_data_reloc &&
3895 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
2eda5708
NA
3896 ret = 1;
3897 goto out;
3898 }
3899
40ab3be1
NA
3900 /*
3901 * Do not allow currently using block group to be tree-log dedicated
3902 * block group.
3903 */
3904 if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3905 (block_group->used || block_group->reserved)) {
3906 ret = 1;
3907 goto out;
3908 }
3909
c2707a25
JT
3910 /*
3911 * Do not allow currently used block group to be the data relocation
3912 * dedicated block group.
3913 */
3914 if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3915 (block_group->used || block_group->reserved)) {
3916 ret = 1;
3917 goto out;
3918 }
3919
98173255
NA
3920 WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3921 avail = block_group->zone_capacity - block_group->alloc_offset;
2eda5708
NA
3922 if (avail < num_bytes) {
3923 if (ffe_ctl->max_extent_size < avail) {
3924 /*
3925 * With sequential allocator, free space is always
3926 * contiguous
3927 */
3928 ffe_ctl->max_extent_size = avail;
3929 ffe_ctl->total_free_space = avail;
3930 }
3931 ret = 1;
3932 goto out;
3933 }
3934
40ab3be1
NA
3935 if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3936 fs_info->treelog_bg = block_group->start;
3937
332581bd
NA
3938 if (ffe_ctl->for_data_reloc) {
3939 if (!fs_info->data_reloc_bg)
3940 fs_info->data_reloc_bg = block_group->start;
3941 /*
3942 * Do not allow allocations from this block group, unless it is
3943 * for data relocation. Compared to increasing the ->ro, setting
3944 * the ->zoned_data_reloc_ongoing flag still allows nocow
3945 * writers to come in. See btrfs_inc_nocow_writers().
3946 *
3947 * We need to disable an allocation to avoid an allocation of
3948 * regular (non-relocation data) extent. With mix of relocation
3949 * extents and regular extents, we can dispatch WRITE commands
3950 * (for relocation extents) and ZONE APPEND commands (for
3951 * regular extents) at the same time to the same zone, which
3952 * easily break the write pointer.
3953 *
3954 * Also, this flag avoids this block group to be zone finished.
3955 */
3956 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3957 }
c2707a25 3958
2eda5708
NA
3959 ffe_ctl->found_offset = start + block_group->alloc_offset;
3960 block_group->alloc_offset += num_bytes;
3961 spin_lock(&ctl->tree_lock);
3962 ctl->free_space -= num_bytes;
3963 spin_unlock(&ctl->tree_lock);
3964
3965 /*
3966 * We do not check if found_offset is aligned to stripesize. The
3967 * address is anyway rewritten when using zone append writing.
3968 */
3969
3970 ffe_ctl->search_start = ffe_ctl->found_offset;
3971
3972out:
40ab3be1
NA
3973 if (ret && ffe_ctl->for_treelog)
3974 fs_info->treelog_bg = 0;
332581bd 3975 if (ret && ffe_ctl->for_data_reloc)
c2707a25
JT
3976 fs_info->data_reloc_bg = 0;
3977 spin_unlock(&fs_info->relocation_bg_lock);
40ab3be1 3978 spin_unlock(&fs_info->treelog_bg_lock);
2eda5708
NA
3979 spin_unlock(&block_group->lock);
3980 spin_unlock(&space_info->lock);
3981 return ret;
3982}
3983
c668690d
NA
3984static int do_allocation(struct btrfs_block_group *block_group,
3985 struct find_free_extent_ctl *ffe_ctl,
3986 struct btrfs_block_group **bg_ret)
3987{
3988 switch (ffe_ctl->policy) {
3989 case BTRFS_EXTENT_ALLOC_CLUSTERED:
3990 return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
2eda5708
NA
3991 case BTRFS_EXTENT_ALLOC_ZONED:
3992 return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
c668690d
NA
3993 default:
3994 BUG();
3995 }
3996}
3997
baba5062
NA
3998static void release_block_group(struct btrfs_block_group *block_group,
3999 struct find_free_extent_ctl *ffe_ctl,
4000 int delalloc)
4001{
4002 switch (ffe_ctl->policy) {
4003 case BTRFS_EXTENT_ALLOC_CLUSTERED:
cd361199 4004 ffe_ctl->retry_uncached = false;
baba5062 4005 break;
2eda5708
NA
4006 case BTRFS_EXTENT_ALLOC_ZONED:
4007 /* Nothing to do */
4008 break;
baba5062
NA
4009 default:
4010 BUG();
4011 }
4012
4013 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4014 ffe_ctl->index);
4015 btrfs_release_block_group(block_group, delalloc);
4016}
4017
0ab9724b
NA
4018static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
4019 struct btrfs_key *ins)
4020{
4021 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4022
4023 if (!ffe_ctl->use_cluster && last_ptr) {
4024 spin_lock(&last_ptr->lock);
4025 last_ptr->window_start = ins->objectid;
4026 spin_unlock(&last_ptr->lock);
4027 }
4028}
4029
4030static void found_extent(struct find_free_extent_ctl *ffe_ctl,
4031 struct btrfs_key *ins)
4032{
4033 switch (ffe_ctl->policy) {
4034 case BTRFS_EXTENT_ALLOC_CLUSTERED:
4035 found_extent_clustered(ffe_ctl, ins);
4036 break;
2eda5708
NA
4037 case BTRFS_EXTENT_ALLOC_ZONED:
4038 /* Nothing to do */
4039 break;
0ab9724b
NA
4040 default:
4041 BUG();
4042 }
4043}
4044
393f646e
NA
4045static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
4046 struct find_free_extent_ctl *ffe_ctl)
4047{
5a7d107e
NA
4048 /* Block group's activeness is not a requirement for METADATA block groups. */
4049 if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
4050 return 0;
4051
393f646e
NA
4052 /* If we can activate new zone, just allocate a chunk and use it */
4053 if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
4054 return 0;
4055
4056 /*
4057 * We already reached the max active zones. Try to finish one block
4058 * group to make a room for a new block group. This is only possible
4059 * for a data block group because btrfs_zone_finish() may need to wait
4060 * for a running transaction which can cause a deadlock for metadata
4061 * allocation.
4062 */
4063 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4064 int ret = btrfs_zone_finish_one_bg(fs_info);
4065
4066 if (ret == 1)
4067 return 0;
4068 else if (ret < 0)
4069 return ret;
4070 }
4071
4072 /*
4073 * If we have enough free space left in an already active block group
4074 * and we can't activate any other zone now, do not allow allocating a
4075 * new chunk and let find_free_extent() retry with a smaller size.
4076 */
4077 if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4078 return -ENOSPC;
4079
898793d9
NA
4080 /*
4081 * Even min_alloc_size is not left in any block groups. Since we cannot
4082 * activate a new block group, allocating it may not help. Let's tell a
4083 * caller to try again and hope it progress something by writing some
4084 * parts of the region. That is only possible for data block groups,
4085 * where a part of the region can be written.
4086 */
4087 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4088 return -EAGAIN;
4089
393f646e
NA
4090 /*
4091 * We cannot activate a new block group and no enough space left in any
4092 * block groups. So, allocating a new block group may not help. But,
4093 * there is nothing to do anyway, so let's go with it.
4094 */
4095 return 0;
4096}
4097
bb9950d3
NA
4098static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4099 struct find_free_extent_ctl *ffe_ctl)
50475cd5
NA
4100{
4101 switch (ffe_ctl->policy) {
4102 case BTRFS_EXTENT_ALLOC_CLUSTERED:
bb9950d3 4103 return 0;
50475cd5 4104 case BTRFS_EXTENT_ALLOC_ZONED:
393f646e 4105 return can_allocate_chunk_zoned(fs_info, ffe_ctl);
50475cd5
NA
4106 default:
4107 BUG();
4108 }
4109}
4110
e72d79d6
QW
4111/*
4112 * Return >0 means caller needs to re-search for free extent
4113 * Return 0 means we have the needed free extent.
4114 * Return <0 means we failed to locate any free extent.
4115 */
4116static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
e72d79d6
QW
4117 struct btrfs_key *ins,
4118 struct find_free_extent_ctl *ffe_ctl,
15b7ee65 4119 bool full_search)
e72d79d6 4120{
8e1d0290 4121 struct btrfs_root *root = fs_info->chunk_root;
e72d79d6
QW
4122 int ret;
4123
4124 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4125 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4126 ffe_ctl->orig_have_caching_bg = true;
4127
e72d79d6 4128 if (ins->objectid) {
0ab9724b 4129 found_extent(ffe_ctl, ins);
e72d79d6
QW
4130 return 0;
4131 }
4132
a85f05e5
NA
4133 if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4134 return 1;
4135
4136 ffe_ctl->index++;
4137 if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4138 return 1;
4139
b9d97cff 4140 /* See the comments for btrfs_loop_type for an explanation of the phases. */
e72d79d6
QW
4141 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4142 ffe_ctl->index = 0;
52bb7a21
BB
4143 /*
4144 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4145 * any uncached bgs and we've already done a full search
4146 * through.
4147 */
4148 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4149 (!ffe_ctl->orig_have_caching_bg && full_search))
e72d79d6 4150 ffe_ctl->loop++;
52bb7a21 4151 ffe_ctl->loop++;
e72d79d6
QW
4152
4153 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4154 struct btrfs_trans_handle *trans;
4155 int exist = 0;
4156
52bb7a21 4157 /* Check if allocation policy allows to create a new chunk */
bb9950d3
NA
4158 ret = can_allocate_chunk(fs_info, ffe_ctl);
4159 if (ret)
4160 return ret;
50475cd5 4161
e72d79d6
QW
4162 trans = current->journal_info;
4163 if (trans)
4164 exist = 1;
4165 else
4166 trans = btrfs_join_transaction(root);
4167
4168 if (IS_ERR(trans)) {
4169 ret = PTR_ERR(trans);
4170 return ret;
4171 }
4172
fc471cb0 4173 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
760e69c4 4174 CHUNK_ALLOC_FORCE_FOR_EXTENT);
e72d79d6 4175
e72d79d6 4176 /* Do not bail out on ENOSPC since we can do more. */
52bb7a21
BB
4177 if (ret == -ENOSPC) {
4178 ret = 0;
4179 ffe_ctl->loop++;
4180 }
c70e2139 4181 else if (ret < 0)
e72d79d6
QW
4182 btrfs_abort_transaction(trans, ret);
4183 else
4184 ret = 0;
4185 if (!exist)
4186 btrfs_end_transaction(trans);
4187 if (ret)
4188 return ret;
4189 }
4190
4191 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
45d8e033
NA
4192 if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4193 return -ENOSPC;
4194
e72d79d6
QW
4195 /*
4196 * Don't loop again if we already have no empty_size and
4197 * no empty_cluster.
4198 */
4199 if (ffe_ctl->empty_size == 0 &&
4200 ffe_ctl->empty_cluster == 0)
4201 return -ENOSPC;
4202 ffe_ctl->empty_size = 0;
4203 ffe_ctl->empty_cluster = 0;
4204 }
4205 return 1;
4206 }
4207 return -ENOSPC;
4208}
4209
52bb7a21
BB
4210static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4211 struct btrfs_block_group *bg)
4212{
4213 if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4214 return true;
cb0922f2 4215 if (!btrfs_block_group_should_use_size_class(bg))
52bb7a21
BB
4216 return true;
4217 if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4218 return true;
4219 if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4220 bg->size_class == BTRFS_BG_SZ_NONE)
4221 return true;
4222 return ffe_ctl->size_class == bg->size_class;
4223}
4224
7e895409
NA
4225static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4226 struct find_free_extent_ctl *ffe_ctl,
4227 struct btrfs_space_info *space_info,
4228 struct btrfs_key *ins)
4229{
4230 /*
4231 * If our free space is heavily fragmented we may not be able to make
4232 * big contiguous allocations, so instead of doing the expensive search
4233 * for free space, simply return ENOSPC with our max_extent_size so we
4234 * can go ahead and search for a more manageable chunk.
4235 *
4236 * If our max_extent_size is large enough for our allocation simply
4237 * disable clustering since we will likely not be able to find enough
4238 * space to create a cluster and induce latency trying.
4239 */
4240 if (space_info->max_extent_size) {
4241 spin_lock(&space_info->lock);
4242 if (space_info->max_extent_size &&
4243 ffe_ctl->num_bytes > space_info->max_extent_size) {
4244 ins->offset = space_info->max_extent_size;
4245 spin_unlock(&space_info->lock);
4246 return -ENOSPC;
4247 } else if (space_info->max_extent_size) {
4248 ffe_ctl->use_cluster = false;
4249 }
4250 spin_unlock(&space_info->lock);
4251 }
4252
4253 ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4254 &ffe_ctl->empty_cluster);
4255 if (ffe_ctl->last_ptr) {
4256 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4257
4258 spin_lock(&last_ptr->lock);
4259 if (last_ptr->block_group)
4260 ffe_ctl->hint_byte = last_ptr->window_start;
4261 if (last_ptr->fragmented) {
4262 /*
4263 * We still set window_start so we can keep track of the
4264 * last place we found an allocation to try and save
4265 * some time.
4266 */
4267 ffe_ctl->hint_byte = last_ptr->window_start;
4268 ffe_ctl->use_cluster = false;
4269 }
4270 spin_unlock(&last_ptr->lock);
4271 }
4272
4273 return 0;
4274}
4275
b271fee9
NA
4276static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4277 struct find_free_extent_ctl *ffe_ctl)
4278{
4279 if (ffe_ctl->for_treelog) {
4280 spin_lock(&fs_info->treelog_bg_lock);
4281 if (fs_info->treelog_bg)
4282 ffe_ctl->hint_byte = fs_info->treelog_bg;
4283 spin_unlock(&fs_info->treelog_bg_lock);
4284 } else if (ffe_ctl->for_data_reloc) {
4285 spin_lock(&fs_info->relocation_bg_lock);
4286 if (fs_info->data_reloc_bg)
4287 ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4288 spin_unlock(&fs_info->relocation_bg_lock);
02444f2a
NA
4289 } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4290 struct btrfs_block_group *block_group;
4291
4292 spin_lock(&fs_info->zone_active_bgs_lock);
4293 list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4294 /*
4295 * No lock is OK here because avail is monotinically
4296 * decreasing, and this is just a hint.
4297 */
4298 u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4299
4300 if (block_group_bits(block_group, ffe_ctl->flags) &&
4301 avail >= ffe_ctl->num_bytes) {
4302 ffe_ctl->hint_byte = block_group->start;
4303 break;
4304 }
4305 }
4306 spin_unlock(&fs_info->zone_active_bgs_lock);
b271fee9
NA
4307 }
4308
4309 return 0;
4310}
4311
7e895409
NA
4312static int prepare_allocation(struct btrfs_fs_info *fs_info,
4313 struct find_free_extent_ctl *ffe_ctl,
4314 struct btrfs_space_info *space_info,
4315 struct btrfs_key *ins)
4316{
4317 switch (ffe_ctl->policy) {
4318 case BTRFS_EXTENT_ALLOC_CLUSTERED:
4319 return prepare_allocation_clustered(fs_info, ffe_ctl,
4320 space_info, ins);
2eda5708 4321 case BTRFS_EXTENT_ALLOC_ZONED:
b271fee9 4322 return prepare_allocation_zoned(fs_info, ffe_ctl);
7e895409
NA
4323 default:
4324 BUG();
4325 }
4326}
4327
fec577fb
CM
4328/*
4329 * walks the btree of allocated extents and find a hole of a given size.
4330 * The key ins is changed to record the hole:
a4820398 4331 * ins->objectid == start position
62e2749e 4332 * ins->flags = BTRFS_EXTENT_ITEM_KEY
a4820398 4333 * ins->offset == the size of the hole.
fec577fb 4334 * Any available blocks before search_start are skipped.
a4820398
MX
4335 *
4336 * If there is no suitable free space, we will record the max size of
4337 * the free space extent currently.
e72d79d6
QW
4338 *
4339 * The overall logic and call chain:
4340 *
4341 * find_free_extent()
4342 * |- Iterate through all block groups
4343 * | |- Get a valid block group
4344 * | |- Try to do clustered allocation in that block group
4345 * | |- Try to do unclustered allocation in that block group
4346 * | |- Check if the result is valid
4347 * | | |- If valid, then exit
4348 * | |- Jump to next block group
4349 * |
4350 * |- Push harder to find free extents
4351 * |- If not found, re-iterate all block groups
fec577fb 4352 */
437490fe 4353static noinline int find_free_extent(struct btrfs_root *root,
a12b0dc0
NA
4354 struct btrfs_key *ins,
4355 struct find_free_extent_ctl *ffe_ctl)
fec577fb 4356{
437490fe 4357 struct btrfs_fs_info *fs_info = root->fs_info;
80eb234a 4358 int ret = 0;
db8fe64f 4359 int cache_block_group_error = 0;
32da5386 4360 struct btrfs_block_group *block_group = NULL;
80eb234a 4361 struct btrfs_space_info *space_info;
a5e681d9 4362 bool full_search = false;
fec577fb 4363
a12b0dc0 4364 WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
b4bd745d 4365
a12b0dc0
NA
4366 ffe_ctl->search_start = 0;
4367 /* For clustered allocation */
4368 ffe_ctl->empty_cluster = 0;
4369 ffe_ctl->last_ptr = NULL;
4370 ffe_ctl->use_cluster = true;
4371 ffe_ctl->have_caching_bg = false;
4372 ffe_ctl->orig_have_caching_bg = false;
4373 ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4374 ffe_ctl->loop = 0;
cd361199 4375 ffe_ctl->retry_uncached = false;
a12b0dc0
NA
4376 ffe_ctl->cached = 0;
4377 ffe_ctl->max_extent_size = 0;
4378 ffe_ctl->total_free_space = 0;
4379 ffe_ctl->found_offset = 0;
4380 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
52bb7a21 4381 ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
c10859be 4382
2eda5708 4383 if (btrfs_is_zoned(fs_info))
a12b0dc0 4384 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
2eda5708 4385
962a298f 4386 ins->type = BTRFS_EXTENT_ITEM_KEY;
80eb234a
JB
4387 ins->objectid = 0;
4388 ins->offset = 0;
b1a4d965 4389
cfc2de0f 4390 trace_find_free_extent(root, ffe_ctl);
3f7de037 4391
a12b0dc0 4392 space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
1b1d1f66 4393 if (!space_info) {
a12b0dc0 4394 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
1b1d1f66
JB
4395 return -ENOSPC;
4396 }
2552d17e 4397
a12b0dc0 4398 ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
7e895409
NA
4399 if (ret < 0)
4400 return ret;
fa9c0d79 4401
a12b0dc0 4402 ffe_ctl->search_start = max(ffe_ctl->search_start,
0eb997bf 4403 first_logical_byte(fs_info));
a12b0dc0
NA
4404 ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4405 if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
b4bd745d 4406 block_group = btrfs_lookup_block_group(fs_info,
a12b0dc0 4407 ffe_ctl->search_start);
817d52f8
JB
4408 /*
4409 * we don't want to use the block group if it doesn't match our
4410 * allocation bits, or if its not cached.
ccf0e725
JB
4411 *
4412 * However if we are re-searching with an ideal block group
4413 * picked out then we don't care that the block group is cached.
817d52f8 4414 */
a12b0dc0 4415 if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
285ff5af 4416 block_group->cached != BTRFS_CACHE_NO) {
2552d17e 4417 down_read(&space_info->groups_sem);
44fb5511
CM
4418 if (list_empty(&block_group->list) ||
4419 block_group->ro) {
4420 /*
4421 * someone is removing this block group,
4422 * we can't jump into the have_block_group
4423 * target because our list pointers are not
4424 * valid
4425 */
4426 btrfs_put_block_group(block_group);
4427 up_read(&space_info->groups_sem);
ccf0e725 4428 } else {
a12b0dc0
NA
4429 ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4430 block_group->flags);
4431 btrfs_lock_block_group(block_group,
4432 ffe_ctl->delalloc);
854c2f36 4433 ffe_ctl->hinted = true;
44fb5511 4434 goto have_block_group;
ccf0e725 4435 }
2552d17e 4436 } else if (block_group) {
fa9c0d79 4437 btrfs_put_block_group(block_group);
2552d17e 4438 }
42e70e7a 4439 }
2552d17e 4440search:
854c2f36 4441 trace_find_free_extent_search_loop(root, ffe_ctl);
a12b0dc0
NA
4442 ffe_ctl->have_caching_bg = false;
4443 if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4444 ffe_ctl->index == 0)
a5e681d9 4445 full_search = true;
80eb234a 4446 down_read(&space_info->groups_sem);
b4bd745d 4447 list_for_each_entry(block_group,
a12b0dc0 4448 &space_info->block_groups[ffe_ctl->index], list) {
c668690d
NA
4449 struct btrfs_block_group *bg_ret;
4450
854c2f36 4451 ffe_ctl->hinted = false;
14443937 4452 /* If the block group is read-only, we can skip it entirely. */
40ab3be1 4453 if (unlikely(block_group->ro)) {
a12b0dc0 4454 if (ffe_ctl->for_treelog)
40ab3be1 4455 btrfs_clear_treelog_bg(block_group);
c2707a25
JT
4456 if (ffe_ctl->for_data_reloc)
4457 btrfs_clear_data_reloc_bg(block_group);
14443937 4458 continue;
40ab3be1 4459 }
14443937 4460
a12b0dc0
NA
4461 btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4462 ffe_ctl->search_start = block_group->start;
42e70e7a 4463
83a50de9
CM
4464 /*
4465 * this can happen if we end up cycling through all the
4466 * raid types, but we want to make sure we only allocate
4467 * for the proper type.
4468 */
a12b0dc0 4469 if (!block_group_bits(block_group, ffe_ctl->flags)) {
bece2e82 4470 u64 extra = BTRFS_BLOCK_GROUP_DUP |
c7369b3f 4471 BTRFS_BLOCK_GROUP_RAID1_MASK |
a07e8a46 4472 BTRFS_BLOCK_GROUP_RAID56_MASK |
83a50de9
CM
4473 BTRFS_BLOCK_GROUP_RAID10;
4474
4475 /*
4476 * if they asked for extra copies and this block group
4477 * doesn't provide them, bail. This does allow us to
4478 * fill raid0 from raid1.
4479 */
a12b0dc0 4480 if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
83a50de9 4481 goto loop;
2a28468e
QW
4482
4483 /*
4484 * This block group has different flags than we want.
4485 * It's possible that we have MIXED_GROUP flag but no
4486 * block group is mixed. Just skip such block group.
4487 */
a12b0dc0 4488 btrfs_release_block_group(block_group, ffe_ctl->delalloc);
2a28468e 4489 continue;
83a50de9
CM
4490 }
4491
2552d17e 4492have_block_group:
854c2f36 4493 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
a12b0dc0
NA
4494 ffe_ctl->cached = btrfs_block_group_done(block_group);
4495 if (unlikely(!ffe_ctl->cached)) {
4496 ffe_ctl->have_caching_bg = true;
ced8ecf0 4497 ret = btrfs_cache_block_group(block_group, false);
db8fe64f
JB
4498
4499 /*
4500 * If we get ENOMEM here or something else we want to
4501 * try other block groups, because it may not be fatal.
4502 * However if we can't find anything else we need to
4503 * save our return here so that we return the actual
4504 * error that caused problems, not ENOSPC.
4505 */
4506 if (ret < 0) {
4507 if (!cache_block_group_error)
4508 cache_block_group_error = ret;
4509 ret = 0;
4510 goto loop;
4511 }
1d4284bd 4512 ret = 0;
817d52f8
JB
4513 }
4514
92fb94b6
JB
4515 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4516 if (!cache_block_group_error)
4517 cache_block_group_error = -EIO;
36cce922 4518 goto loop;
92fb94b6 4519 }
0f9dd46c 4520
52bb7a21
BB
4521 if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4522 goto loop;
4523
c668690d 4524 bg_ret = NULL;
a12b0dc0 4525 ret = do_allocation(block_group, ffe_ctl, &bg_ret);
cd361199 4526 if (ret > 0)
1cdda9b8 4527 goto loop;
cd361199
JB
4528
4529 if (bg_ret && bg_ret != block_group) {
4530 btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4531 block_group = bg_ret;
c668690d
NA
4532 }
4533
4534 /* Checks */
a12b0dc0
NA
4535 ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4536 fs_info->stripesize);
25179201 4537
2552d17e 4538 /* move on to the next group */
a12b0dc0 4539 if (ffe_ctl->search_start + ffe_ctl->num_bytes >
b3470b5d 4540 block_group->start + block_group->length) {
2eda5708 4541 btrfs_add_free_space_unused(block_group,
a12b0dc0
NA
4542 ffe_ctl->found_offset,
4543 ffe_ctl->num_bytes);
2552d17e 4544 goto loop;
6226cb0a 4545 }
f5a31e16 4546
a12b0dc0 4547 if (ffe_ctl->found_offset < ffe_ctl->search_start)
2eda5708 4548 btrfs_add_free_space_unused(block_group,
a12b0dc0
NA
4549 ffe_ctl->found_offset,
4550 ffe_ctl->search_start - ffe_ctl->found_offset);
2552d17e 4551
a12b0dc0
NA
4552 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4553 ffe_ctl->num_bytes,
52bb7a21
BB
4554 ffe_ctl->delalloc,
4555 ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
f0486c68 4556 if (ret == -EAGAIN) {
2eda5708 4557 btrfs_add_free_space_unused(block_group,
a12b0dc0
NA
4558 ffe_ctl->found_offset,
4559 ffe_ctl->num_bytes);
2552d17e 4560 goto loop;
0f9dd46c 4561 }
9cfa3e34 4562 btrfs_inc_block_group_reservations(block_group);
0b86a832 4563
f0486c68 4564 /* we are all good, lets return */
a12b0dc0
NA
4565 ins->objectid = ffe_ctl->search_start;
4566 ins->offset = ffe_ctl->num_bytes;
d2fb3437 4567
cfc2de0f 4568 trace_btrfs_reserve_extent(block_group, ffe_ctl);
a12b0dc0 4569 btrfs_release_block_group(block_group, ffe_ctl->delalloc);
2552d17e
JB
4570 break;
4571loop:
cd361199
JB
4572 if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4573 !ffe_ctl->retry_uncached) {
4574 ffe_ctl->retry_uncached = true;
4575 btrfs_wait_block_group_cache_progress(block_group,
4576 ffe_ctl->num_bytes +
4577 ffe_ctl->empty_cluster +
4578 ffe_ctl->empty_size);
4579 goto have_block_group;
4580 }
a12b0dc0 4581 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
14443937 4582 cond_resched();
2552d17e
JB
4583 }
4584 up_read(&space_info->groups_sem);
4585
a12b0dc0 4586 ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
e72d79d6 4587 if (ret > 0)
b742bb82
YZ
4588 goto search;
4589
db8fe64f 4590 if (ret == -ENOSPC && !cache_block_group_error) {
b4bd745d
QW
4591 /*
4592 * Use ffe_ctl->total_free_space as fallback if we can't find
4593 * any contiguous hole.
4594 */
a12b0dc0
NA
4595 if (!ffe_ctl->max_extent_size)
4596 ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4f4db217 4597 spin_lock(&space_info->lock);
a12b0dc0 4598 space_info->max_extent_size = ffe_ctl->max_extent_size;
4f4db217 4599 spin_unlock(&space_info->lock);
a12b0dc0 4600 ins->offset = ffe_ctl->max_extent_size;
db8fe64f
JB
4601 } else if (ret == -ENOSPC) {
4602 ret = cache_block_group_error;
4f4db217 4603 }
0f70abe2 4604 return ret;
fec577fb 4605}
ec44a35c 4606
6f47c706 4607/*
9580503b
DS
4608 * Entry point to the extent allocator. Tries to find a hole that is at least
4609 * as big as @num_bytes.
6f47c706
NB
4610 *
4611 * @root - The root that will contain this extent
4612 *
4613 * @ram_bytes - The amount of space in ram that @num_bytes take. This
4614 * is used for accounting purposes. This value differs
4615 * from @num_bytes only in the case of compressed extents.
4616 *
4617 * @num_bytes - Number of bytes to allocate on-disk.
4618 *
4619 * @min_alloc_size - Indicates the minimum amount of space that the
4620 * allocator should try to satisfy. In some cases
4621 * @num_bytes may be larger than what is required and if
4622 * the filesystem is fragmented then allocation fails.
4623 * However, the presence of @min_alloc_size gives a
4624 * chance to try and satisfy the smaller allocation.
4625 *
4626 * @empty_size - A hint that you plan on doing more COW. This is the
4627 * size in bytes the allocator should try to find free
4628 * next to the block it returns. This is just a hint and
4629 * may be ignored by the allocator.
4630 *
4631 * @hint_byte - Hint to the allocator to start searching above the byte
4632 * address passed. It might be ignored.
4633 *
4634 * @ins - This key is modified to record the found hole. It will
4635 * have the following values:
4636 * ins->objectid == start position
4637 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4638 * ins->offset == the size of the hole.
4639 *
4640 * @is_data - Boolean flag indicating whether an extent is
4641 * allocated for data (true) or metadata (false)
4642 *
4643 * @delalloc - Boolean flag indicating whether this allocation is for
4644 * delalloc or not. If 'true' data_rwsem of block groups
4645 * is going to be acquired.
4646 *
4647 *
4648 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4649 * case -ENOSPC is returned then @ins->offset will contain the size of the
4650 * largest available hole the allocator managed to find.
4651 */
18513091 4652int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
11833d66
YZ
4653 u64 num_bytes, u64 min_alloc_size,
4654 u64 empty_size, u64 hint_byte,
e570fd27 4655 struct btrfs_key *ins, int is_data, int delalloc)
fec577fb 4656{
ab8d0fc4 4657 struct btrfs_fs_info *fs_info = root->fs_info;
a12b0dc0 4658 struct find_free_extent_ctl ffe_ctl = {};
36af4e07 4659 bool final_tried = num_bytes == min_alloc_size;
b6919a58 4660 u64 flags;
fec577fb 4661 int ret;
e094f480 4662 bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
c2707a25 4663 bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
925baedd 4664
1b86826d 4665 flags = get_alloc_profile_by_root(root, is_data);
98d20f67 4666again:
0b246afa 4667 WARN_ON(num_bytes < fs_info->sectorsize);
a12b0dc0
NA
4668
4669 ffe_ctl.ram_bytes = ram_bytes;
4670 ffe_ctl.num_bytes = num_bytes;
a85f05e5 4671 ffe_ctl.min_alloc_size = min_alloc_size;
a12b0dc0
NA
4672 ffe_ctl.empty_size = empty_size;
4673 ffe_ctl.flags = flags;
4674 ffe_ctl.delalloc = delalloc;
4675 ffe_ctl.hint_byte = hint_byte;
4676 ffe_ctl.for_treelog = for_treelog;
c2707a25 4677 ffe_ctl.for_data_reloc = for_data_reloc;
a12b0dc0
NA
4678
4679 ret = find_free_extent(root, ins, &ffe_ctl);
9cfa3e34 4680 if (!ret && !is_data) {
ab8d0fc4 4681 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
9cfa3e34 4682 } else if (ret == -ENOSPC) {
a4820398
MX
4683 if (!final_tried && ins->offset) {
4684 num_bytes = min(num_bytes >> 1, ins->offset);
da17066c 4685 num_bytes = round_down(num_bytes,
0b246afa 4686 fs_info->sectorsize);
9e622d6b 4687 num_bytes = max(num_bytes, min_alloc_size);
18513091 4688 ram_bytes = num_bytes;
9e622d6b
MX
4689 if (num_bytes == min_alloc_size)
4690 final_tried = true;
4691 goto again;
ab8d0fc4 4692 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
9e622d6b
MX
4693 struct btrfs_space_info *sinfo;
4694
280c2908 4695 sinfo = btrfs_find_space_info(fs_info, flags);
0b246afa 4696 btrfs_err(fs_info,
c2707a25
JT
4697 "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4698 flags, num_bytes, for_treelog, for_data_reloc);
53804280 4699 if (sinfo)
5da6afeb
JB
4700 btrfs_dump_space_info(fs_info, sinfo,
4701 num_bytes, 1);
9e622d6b 4702 }
925baedd 4703 }
0f9dd46c
JB
4704
4705 return ret;
e6dcd2dc
CM
4706}
4707
a0fbf736
NB
4708int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4709 u64 start, u64 len, int delalloc)
65b51a00 4710{
32da5386 4711 struct btrfs_block_group *cache;
0f9dd46c 4712
0b246afa 4713 cache = btrfs_lookup_block_group(fs_info, start);
0f9dd46c 4714 if (!cache) {
0b246afa
JM
4715 btrfs_err(fs_info, "Unable to find block group for %llu",
4716 start);
0f9dd46c
JB
4717 return -ENOSPC;
4718 }
1f3c79a2 4719
a0fbf736
NB
4720 btrfs_add_free_space(cache, start, len);
4721 btrfs_free_reserved_bytes(cache, len, delalloc);
4722 trace_btrfs_reserved_extent_free(fs_info, start, len);
4723
fa9c0d79 4724 btrfs_put_block_group(cache);
a0fbf736 4725 return 0;
e6dcd2dc
CM
4726}
4727
f863c502
DS
4728int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4729 const struct extent_buffer *eb)
e688b725 4730{
7ef54d54 4731 struct btrfs_block_group *cache;
a0fbf736 4732 int ret = 0;
7ef54d54 4733
f863c502 4734 cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
7ef54d54 4735 if (!cache) {
7bfc1007 4736 btrfs_err(trans->fs_info, "unable to find block group for %llu",
f863c502 4737 eb->start);
7ef54d54
NB
4738 return -ENOSPC;
4739 }
4740
f863c502 4741 ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
7ef54d54 4742 btrfs_put_block_group(cache);
a0fbf736 4743 return ret;
e688b725
CM
4744}
4745
34666705
JB
4746static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4747 u64 num_bytes)
4748{
4749 struct btrfs_fs_info *fs_info = trans->fs_info;
4750 int ret;
4751
4752 ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4753 if (ret)
4754 return ret;
4755
4756 ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4757 if (ret) {
4758 ASSERT(!ret);
4759 btrfs_err(fs_info, "update block group failed for %llu %llu",
4760 bytenr, num_bytes);
4761 return ret;
4762 }
4763
4764 trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4765 return 0;
4766}
4767
5d4f98a2 4768static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
4769 u64 parent, u64 root_objectid,
4770 u64 flags, u64 owner, u64 offset,
2672a051 4771 struct btrfs_key *ins, int ref_mod, u64 oref_root)
e6dcd2dc 4772{
ef89b824 4773 struct btrfs_fs_info *fs_info = trans->fs_info;
29cbcf40 4774 struct btrfs_root *extent_root;
e6dcd2dc 4775 int ret;
e6dcd2dc 4776 struct btrfs_extent_item *extent_item;
d9a620f7 4777 struct btrfs_extent_owner_ref *oref;
5d4f98a2 4778 struct btrfs_extent_inline_ref *iref;
e6dcd2dc 4779 struct btrfs_path *path;
5d4f98a2
YZ
4780 struct extent_buffer *leaf;
4781 int type;
4782 u32 size;
d9a620f7 4783 const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
26b8003f 4784
5d4f98a2
YZ
4785 if (parent > 0)
4786 type = BTRFS_SHARED_DATA_REF_KEY;
4787 else
4788 type = BTRFS_EXTENT_DATA_REF_KEY;
58176a96 4789
d9a620f7
BB
4790 size = sizeof(*extent_item);
4791 if (simple_quota)
4792 size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
4793 size += btrfs_extent_inline_ref_size(type);
7bb86316
CM
4794
4795 path = btrfs_alloc_path();
db5b493a
TI
4796 if (!path)
4797 return -ENOMEM;
47e4bb98 4798
29cbcf40
JB
4799 extent_root = btrfs_extent_root(fs_info, ins->objectid);
4800 ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
79787eaa
JM
4801 if (ret) {
4802 btrfs_free_path(path);
4803 return ret;
4804 }
0f9dd46c 4805
5d4f98a2
YZ
4806 leaf = path->nodes[0];
4807 extent_item = btrfs_item_ptr(leaf, path->slots[0],
47e4bb98 4808 struct btrfs_extent_item);
5d4f98a2
YZ
4809 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4810 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4811 btrfs_set_extent_flags(leaf, extent_item,
4812 flags | BTRFS_EXTENT_FLAG_DATA);
4813
4814 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
d9a620f7
BB
4815 if (simple_quota) {
4816 btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
4817 oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2672a051 4818 btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
d9a620f7
BB
4819 iref = (struct btrfs_extent_inline_ref *)(oref + 1);
4820 }
5d4f98a2 4821 btrfs_set_extent_inline_ref_type(leaf, iref, type);
d9a620f7 4822
5d4f98a2
YZ
4823 if (parent > 0) {
4824 struct btrfs_shared_data_ref *ref;
4825 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4826 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4827 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4828 } else {
4829 struct btrfs_extent_data_ref *ref;
4830 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4831 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4832 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4833 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4834 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4835 }
47e4bb98 4836
50564b65 4837 btrfs_mark_buffer_dirty(trans, path->nodes[0]);
7bb86316 4838 btrfs_free_path(path);
f510cfec 4839
34666705 4840 return alloc_reserved_extent(trans, ins->objectid, ins->offset);
e6dcd2dc
CM
4841}
4842
5d4f98a2 4843static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4e6bd4e0 4844 struct btrfs_delayed_ref_node *node,
21ebfbe7 4845 struct btrfs_delayed_extent_op *extent_op)
e6dcd2dc 4846{
9dcdbe01 4847 struct btrfs_fs_info *fs_info = trans->fs_info;
29cbcf40 4848 struct btrfs_root *extent_root;
e6dcd2dc 4849 int ret;
5d4f98a2 4850 struct btrfs_extent_item *extent_item;
4e6bd4e0 4851 struct btrfs_key extent_key;
5d4f98a2
YZ
4852 struct btrfs_tree_block_info *block_info;
4853 struct btrfs_extent_inline_ref *iref;
4854 struct btrfs_path *path;
4855 struct extent_buffer *leaf;
3173a18f 4856 u32 size = sizeof(*extent_item) + sizeof(*iref);
eba1469f 4857 const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
efc7d5db
JB
4858 /* The owner of a tree block is the level. */
4859 int level = btrfs_delayed_ref_owner(node);
0b246afa 4860 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3173a18f 4861
4e6bd4e0
NB
4862 extent_key.objectid = node->bytenr;
4863 if (skinny_metadata) {
efc7d5db
JB
4864 /* The owner of a tree block is the level. */
4865 extent_key.offset = level;
4e6bd4e0 4866 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4e6bd4e0
NB
4867 } else {
4868 extent_key.offset = node->num_bytes;
4869 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
3173a18f 4870 size += sizeof(*block_info);
4e6bd4e0 4871 }
1c2308f8 4872
5d4f98a2 4873 path = btrfs_alloc_path();
80ee54bf 4874 if (!path)
d8926bb3 4875 return -ENOMEM;
56bec294 4876
29cbcf40
JB
4877 extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4878 ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4879 size);
79787eaa 4880 if (ret) {
dd825259 4881 btrfs_free_path(path);
79787eaa
JM
4882 return ret;
4883 }
5d4f98a2
YZ
4884
4885 leaf = path->nodes[0];
4886 extent_item = btrfs_item_ptr(leaf, path->slots[0],
4887 struct btrfs_extent_item);
4888 btrfs_set_extent_refs(leaf, extent_item, 1);
4889 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4890 btrfs_set_extent_flags(leaf, extent_item,
4891 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5d4f98a2 4892
3173a18f
JB
4893 if (skinny_metadata) {
4894 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4895 } else {
4896 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
21ebfbe7 4897 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
efc7d5db 4898 btrfs_set_tree_block_level(leaf, block_info, level);
3173a18f
JB
4899 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4900 }
5d4f98a2 4901
d4b20733 4902 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
5d4f98a2
YZ
4903 btrfs_set_extent_inline_ref_type(leaf, iref,
4904 BTRFS_SHARED_BLOCK_REF_KEY);
cf4f0432 4905 btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
5d4f98a2
YZ
4906 } else {
4907 btrfs_set_extent_inline_ref_type(leaf, iref,
4908 BTRFS_TREE_BLOCK_REF_KEY);
cf4f0432 4909 btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
5d4f98a2
YZ
4910 }
4911
50564b65 4912 btrfs_mark_buffer_dirty(trans, leaf);
5d4f98a2
YZ
4913 btrfs_free_path(path);
4914
34666705 4915 return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
5d4f98a2
YZ
4916}
4917
4918int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
84f7d8e6 4919 struct btrfs_root *root, u64 owner,
5846a3c2
QW
4920 u64 offset, u64 ram_bytes,
4921 struct btrfs_key *ins)
5d4f98a2 4922{
4d09b4e9
JB
4923 struct btrfs_ref generic_ref = {
4924 .action = BTRFS_ADD_DELAYED_EXTENT,
4925 .bytenr = ins->objectid,
12390e42 4926 .num_bytes = ins->offset,
e094f480
JB
4927 .owning_root = btrfs_root_id(root),
4928 .ref_root = btrfs_root_id(root),
4d09b4e9 4929 };
5d4f98a2 4930
f2e69a77 4931 ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 4932
2672a051 4933 if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
4d09b4e9 4934 generic_ref.owning_root = root->relocation_src_root;
2672a051 4935
f2e69a77 4936 btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
8a5040f7 4937 btrfs_ref_tree_mod(root->fs_info, &generic_ref);
2187374f
JB
4938
4939 return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
e6dcd2dc 4940}
e02119d5
CM
4941
4942/*
4943 * this is used by the tree logging recovery code. It records that
4944 * an extent has been allocated and makes sure to clear the free
4945 * space cache bits as well
4946 */
5d4f98a2 4947int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
4948 u64 root_objectid, u64 owner, u64 offset,
4949 struct btrfs_key *ins)
e02119d5 4950{
61da2abf 4951 struct btrfs_fs_info *fs_info = trans->fs_info;
e02119d5 4952 int ret;
32da5386 4953 struct btrfs_block_group *block_group;
ed7a6948 4954 struct btrfs_space_info *space_info;
cecbb533
BB
4955 struct btrfs_squota_delta delta = {
4956 .root = root_objectid,
4957 .num_bytes = ins->offset,
bd7c1ea3 4958 .generation = trans->transid,
cecbb533
BB
4959 .is_data = true,
4960 .is_inc = true,
4961 };
11833d66 4962
8c2a1a30
JB
4963 /*
4964 * Mixed block groups will exclude before processing the log so we only
01327610 4965 * need to do the exclude dance if this fs isn't mixed.
8c2a1a30 4966 */
0b246afa 4967 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
2ff7e61e
JM
4968 ret = __exclude_logged_extent(fs_info, ins->objectid,
4969 ins->offset);
b50c6e25 4970 if (ret)
8c2a1a30 4971 return ret;
11833d66
YZ
4972 }
4973
0b246afa 4974 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8c2a1a30
JB
4975 if (!block_group)
4976 return -EINVAL;
4977
ed7a6948
WX
4978 space_info = block_group->space_info;
4979 spin_lock(&space_info->lock);
4980 spin_lock(&block_group->lock);
4981 space_info->bytes_reserved += ins->offset;
4982 block_group->reserved += ins->offset;
4983 spin_unlock(&block_group->lock);
4984 spin_unlock(&space_info->lock);
4985
ef89b824 4986 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
2672a051 4987 offset, ins, 1, root_objectid);
bd727173 4988 if (ret)
ab9b2c7b 4989 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
cecbb533 4990 ret = btrfs_record_squota_delta(fs_info, &delta);
b50c6e25 4991 btrfs_put_block_group(block_group);
e02119d5
CM
4992 return ret;
4993}
4994
150cce2d
DS
4995#ifdef CONFIG_BTRFS_DEBUG
4996/*
4997 * Extra safety check in case the extent tree is corrupted and extent allocator
4998 * chooses to use a tree block which is already used and locked.
4999 */
5000static bool check_eb_lock_owner(const struct extent_buffer *eb)
5001{
5002 if (eb->lock_owner == current->pid) {
5003 btrfs_err_rl(eb->fs_info,
5004"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
5005 eb->start, btrfs_header_owner(eb), current->pid);
5006 return true;
5007 }
5008 return false;
5009}
5010#else
5011static bool check_eb_lock_owner(struct extent_buffer *eb)
5012{
5013 return false;
5014}
5015#endif
5016
48a3b636
ES
5017static struct extent_buffer *
5018btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
9631e4cc
JB
5019 u64 bytenr, int level, u64 owner,
5020 enum btrfs_lock_nesting nest)
65b51a00 5021{
0b246afa 5022 struct btrfs_fs_info *fs_info = root->fs_info;
65b51a00 5023 struct extent_buffer *buf;
b40130b2 5024 u64 lockdep_owner = owner;
65b51a00 5025
3fbaf258 5026 buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
c871b0f2
LB
5027 if (IS_ERR(buf))
5028 return buf;
5029
150cce2d 5030 if (check_eb_lock_owner(buf)) {
b72c3aba
QW
5031 free_extent_buffer(buf);
5032 return ERR_PTR(-EUCLEAN);
5033 }
5034
b40130b2
JB
5035 /*
5036 * The reloc trees are just snapshots, so we need them to appear to be
5037 * just like any other fs tree WRT lockdep.
5038 *
5039 * The exception however is in replace_path() in relocation, where we
5040 * hold the lock on the original fs root and then search for the reloc
5041 * root. At that point we need to make sure any reloc root buffers are
5042 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
5043 * lockdep happy.
5044 */
5045 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
5046 !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
5047 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
5048
618d1d7d 5049 /* btrfs_clear_buffer_dirty() accesses generation field. */
cbddcc4f
TH
5050 btrfs_set_header_generation(buf, trans->transid);
5051
e114c545
JB
5052 /*
5053 * This needs to stay, because we could allocate a freed block from an
5054 * old tree into a new tree, so we need to make sure this new block is
5055 * set to the appropriate level and owner.
5056 */
b40130b2
JB
5057 btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
5058
2066bbfc 5059 btrfs_tree_lock_nested(buf, nest);
190a8339 5060 btrfs_clear_buffer_dirty(trans, buf);
3083ee2e 5061 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
cbf44cd9 5062 clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
b4ce94de 5063
4db8c528 5064 set_extent_buffer_uptodate(buf);
b4ce94de 5065
bc877d28
NB
5066 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
5067 btrfs_set_header_level(buf, level);
5068 btrfs_set_header_bytenr(buf, buf->start);
5069 btrfs_set_header_generation(buf, trans->transid);
5070 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
5071 btrfs_set_header_owner(buf, owner);
de37aa51 5072 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
bc877d28 5073 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
e094f480 5074 if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
656f30db 5075 buf->log_index = root->log_transid % 2;
8cef4e16
YZ
5076 /*
5077 * we allow two log transactions at a time, use different
52042d8e 5078 * EXTENT bit to differentiate dirty pages.
8cef4e16 5079 */
656f30db 5080 if (buf->log_index == 0)
fe1a598c
DS
5081 set_extent_bit(&root->dirty_log_pages, buf->start,
5082 buf->start + buf->len - 1,
1d126800 5083 EXTENT_DIRTY, NULL);
8cef4e16 5084 else
eea8686e
DS
5085 set_extent_bit(&root->dirty_log_pages, buf->start,
5086 buf->start + buf->len - 1,
1d126800 5087 EXTENT_NEW, NULL);
d0c803c4 5088 } else {
656f30db 5089 buf->log_index = -1;
fe1a598c 5090 set_extent_bit(&trans->transaction->dirty_pages, buf->start,
1d126800 5091 buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
d0c803c4 5092 }
b4ce94de 5093 /* this returns a buffer locked for blocking */
65b51a00
CM
5094 return buf;
5095}
5096
fec577fb 5097/*
f0486c68 5098 * finds a free extent and does all the dirty work required for allocation
67b7859e 5099 * returns the tree buffer or an ERR_PTR on error.
fec577fb 5100 */
4d75f8a9 5101struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
310712b2
OS
5102 struct btrfs_root *root,
5103 u64 parent, u64 root_objectid,
5104 const struct btrfs_disk_key *key,
5105 int level, u64 hint,
9631e4cc 5106 u64 empty_size,
60ea105a 5107 u64 reloc_src_root,
9631e4cc 5108 enum btrfs_lock_nesting nest)
fec577fb 5109{
0b246afa 5110 struct btrfs_fs_info *fs_info = root->fs_info;
e2fa7227 5111 struct btrfs_key ins;
f0486c68 5112 struct btrfs_block_rsv *block_rsv;
5f39d397 5113 struct extent_buffer *buf;
f0486c68
YZ
5114 u64 flags = 0;
5115 int ret;
0b246afa
JM
5116 u32 blocksize = fs_info->nodesize;
5117 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
60ea105a 5118 u64 owning_root;
fec577fb 5119
05653ef3 5120#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0b246afa 5121 if (btrfs_is_testing(fs_info)) {
faa2dbf0 5122 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
9631e4cc 5123 level, root_objectid, nest);
faa2dbf0
JB
5124 if (!IS_ERR(buf))
5125 root->alloc_bytenr += blocksize;
5126 return buf;
5127 }
05653ef3 5128#endif
fccb84c9 5129
67f9c220 5130 block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
f0486c68
YZ
5131 if (IS_ERR(block_rsv))
5132 return ERR_CAST(block_rsv);
5133
18513091 5134 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
e570fd27 5135 empty_size, hint, &ins, 0, 0);
67b7859e
OS
5136 if (ret)
5137 goto out_unuse;
55c69072 5138
bc877d28 5139 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
9631e4cc 5140 root_objectid, nest);
67b7859e
OS
5141 if (IS_ERR(buf)) {
5142 ret = PTR_ERR(buf);
5143 goto out_free_reserved;
5144 }
60ea105a 5145 owning_root = btrfs_header_owner(buf);
f0486c68
YZ
5146
5147 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5148 if (parent == 0)
5149 parent = ins.objectid;
5150 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
60ea105a 5151 owning_root = reloc_src_root;
f0486c68
YZ
5152 } else
5153 BUG_ON(parent > 0);
5154
5155 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
eba1469f 5156 struct btrfs_delayed_extent_op *extent_op;
4d09b4e9
JB
5157 struct btrfs_ref generic_ref = {
5158 .action = BTRFS_ADD_DELAYED_EXTENT,
5159 .bytenr = ins.objectid,
12390e42 5160 .num_bytes = ins.offset,
4d09b4e9
JB
5161 .parent = parent,
5162 .owning_root = owning_root,
f2e69a77 5163 .ref_root = root_objectid,
4d09b4e9 5164 };
eba1469f
FM
5165
5166 if (!skinny_metadata || flags != 0) {
5167 extent_op = btrfs_alloc_delayed_extent_op();
5168 if (!extent_op) {
5169 ret = -ENOMEM;
5170 goto out_free_buf;
5171 }
5172 if (key)
5173 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5174 else
5175 memset(&extent_op->key, 0, sizeof(extent_op->key));
5176 extent_op->flags_to_set = flags;
5177 extent_op->update_key = (skinny_metadata ? false : true);
5178 extent_op->update_flags = (flags != 0);
5179 } else {
5180 extent_op = NULL;
67b7859e 5181 }
f0486c68 5182
e094f480 5183 btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
8a5040f7 5184 btrfs_ref_tree_mod(fs_info, &generic_ref);
2187374f 5185 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
eba1469f
FM
5186 if (ret) {
5187 btrfs_free_delayed_extent_op(extent_op);
5188 goto out_free_buf;
5189 }
f0486c68 5190 }
fec577fb 5191 return buf;
67b7859e 5192
67b7859e 5193out_free_buf:
19ea40dd 5194 btrfs_tree_unlock(buf);
67b7859e
OS
5195 free_extent_buffer(buf);
5196out_free_reserved:
2ff7e61e 5197 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
67b7859e 5198out_unuse:
67f9c220 5199 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
67b7859e 5200 return ERR_PTR(ret);
fec577fb 5201}
a28ec197 5202
2c47e605
YZ
5203struct walk_control {
5204 u64 refs[BTRFS_MAX_LEVEL];
5205 u64 flags[BTRFS_MAX_LEVEL];
5206 struct btrfs_key update_progress;
aea6f028
JB
5207 struct btrfs_key drop_progress;
5208 int drop_level;
2c47e605
YZ
5209 int stage;
5210 int level;
5211 int shared_level;
5212 int update_ref;
5213 int keep_locks;
1c4850e2
YZ
5214 int reada_slot;
5215 int reada_count;
78c52d9e 5216 int restarted;
7fcee18d
JB
5217 /* Indicate that extent info needs to be looked up when walking the tree. */
5218 int lookup_info;
2c47e605
YZ
5219};
5220
33b804fa
JB
5221/*
5222 * This is our normal stage. We are traversing blocks the current snapshot owns
5223 * and we are dropping any of our references to any children we are able to, and
5224 * then freeing the block once we've processed all of the children.
5225 */
2c47e605 5226#define DROP_REFERENCE 1
33b804fa
JB
5227
5228/*
5229 * We enter this stage when we have to walk into a child block (meaning we can't
5230 * simply drop our reference to it from our current parent node) and there are
5231 * more than one reference on it. If we are the owner of any of the children
5232 * blocks from the current parent node then we have to do the FULL_BACKREF dance
5233 * on them in order to drop our normal ref and add the shared ref.
5234 */
2c47e605
YZ
5235#define UPDATE_BACKREF 2
5236
2b73c7e7
JB
5237/*
5238 * Decide if we need to walk down into this node to adjust the references.
5239 *
5240 * @root: the root we are currently deleting
5241 * @wc: the walk control for this deletion
5242 * @eb: the parent eb that we're currently visiting
5243 * @refs: the number of refs for wc->level - 1
5244 * @flags: the flags for wc->level - 1
5245 * @slot: the slot in the eb that we're currently checking
5246 *
5247 * This is meant to be called when we're evaluating if a node we point to at
5248 * wc->level should be read and walked into, or if we can simply delete our
5249 * reference to it. We return true if we should walk into the node, false if we
5250 * can skip it.
5251 *
5252 * We have assertions in here to make sure this is called correctly. We assume
5253 * that sanity checking on the blocks read to this point has been done, so any
5254 * corrupted file systems must have been caught before calling this function.
5255 */
5256static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
5257 struct extent_buffer *eb, u64 refs, u64 flags, int slot)
5258{
5259 struct btrfs_key key;
5260 u64 generation;
5261 int level = wc->level;
5262
5263 ASSERT(level > 0);
5264 ASSERT(wc->refs[level - 1] > 0);
5265
5266 /*
5267 * The update backref stage we only want to skip if we already have
5268 * FULL_BACKREF set, otherwise we need to read.
5269 */
5270 if (wc->stage == UPDATE_BACKREF) {
5271 if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5272 return false;
5273 return true;
5274 }
5275
5276 /*
5277 * We're the last ref on this block, we must walk into it and process
5278 * any refs it's pointing at.
5279 */
5280 if (wc->refs[level - 1] == 1)
5281 return true;
5282
5283 /*
5284 * If we're already FULL_BACKREF then we know we can just drop our
5285 * current reference.
5286 */
5287 if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5288 return false;
5289
5290 /*
5291 * This block is older than our creation generation, we can drop our
5292 * reference to it.
5293 */
5294 generation = btrfs_node_ptr_generation(eb, slot);
5295 if (!wc->update_ref || generation <= root->root_key.offset)
5296 return false;
5297
5298 /*
5299 * This block was processed from a previous snapshot deletion run, we
5300 * can skip it.
5301 */
5302 btrfs_node_key_to_cpu(eb, &key, slot);
5303 if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
5304 return false;
5305
5306 /* All other cases we need to wander into the node. */
5307 return true;
5308}
5309
1c4850e2
YZ
5310static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5311 struct btrfs_root *root,
5312 struct walk_control *wc,
5313 struct btrfs_path *path)
6407bf6d 5314{
0b246afa 5315 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
5316 u64 bytenr;
5317 u64 generation;
5318 u64 refs;
94fcca9f 5319 u64 flags;
5d4f98a2 5320 u32 nritems;
1c4850e2 5321 struct extent_buffer *eb;
6407bf6d 5322 int ret;
1c4850e2
YZ
5323 int slot;
5324 int nread = 0;
6407bf6d 5325
1c4850e2
YZ
5326 if (path->slots[wc->level] < wc->reada_slot) {
5327 wc->reada_count = wc->reada_count * 2 / 3;
5328 wc->reada_count = max(wc->reada_count, 2);
5329 } else {
5330 wc->reada_count = wc->reada_count * 3 / 2;
5331 wc->reada_count = min_t(int, wc->reada_count,
0b246afa 5332 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1c4850e2 5333 }
7bb86316 5334
1c4850e2
YZ
5335 eb = path->nodes[wc->level];
5336 nritems = btrfs_header_nritems(eb);
bd56b302 5337
1c4850e2
YZ
5338 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5339 if (nread >= wc->reada_count)
5340 break;
bd56b302 5341
2dd3e67b 5342 cond_resched();
1c4850e2
YZ
5343 bytenr = btrfs_node_blockptr(eb, slot);
5344 generation = btrfs_node_ptr_generation(eb, slot);
2dd3e67b 5345
1c4850e2
YZ
5346 if (slot == path->slots[wc->level])
5347 goto reada;
5d4f98a2 5348
1c4850e2
YZ
5349 if (wc->stage == UPDATE_BACKREF &&
5350 generation <= root->root_key.offset)
bd56b302
CM
5351 continue;
5352
94fcca9f 5353 /* We don't lock the tree block, it's OK to be racy here */
2ff7e61e 5354 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
3173a18f 5355 wc->level - 1, 1, &refs,
d8ba2a91 5356 &flags, NULL);
79787eaa
JM
5357 /* We don't care about errors in readahead. */
5358 if (ret < 0)
5359 continue;
b8ccef04
JB
5360
5361 /*
5362 * This could be racey, it's conceivable that we raced and end
5363 * up with a bogus refs count, if that's the case just skip, if
5364 * we are actually corrupt we will notice when we look up
5365 * everything again with our locks.
5366 */
5367 if (refs == 0)
5368 continue;
94fcca9f 5369
2b73c7e7
JB
5370 /* If we don't need to visit this node don't reada. */
5371 if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
5372 continue;
1c4850e2 5373reada:
bfb484d9 5374 btrfs_readahead_node_child(eb, slot);
1c4850e2 5375 nread++;
20524f02 5376 }
1c4850e2 5377 wc->reada_slot = slot;
20524f02 5378}
2c47e605 5379
f82d02d9 5380/*
2c016dc2 5381 * helper to process tree block while walking down the tree.
2c47e605 5382 *
2c47e605
YZ
5383 * when wc->stage == UPDATE_BACKREF, this function updates
5384 * back refs for pointers in the block.
5385 *
5386 * NOTE: return value 1 means we should stop walking down.
f82d02d9 5387 */
2c47e605 5388static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 5389 struct btrfs_root *root,
2c47e605 5390 struct btrfs_path *path,
7fcee18d 5391 struct walk_control *wc)
f82d02d9 5392{
2ff7e61e 5393 struct btrfs_fs_info *fs_info = root->fs_info;
2c47e605
YZ
5394 int level = wc->level;
5395 struct extent_buffer *eb = path->nodes[level];
2c47e605 5396 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
5397 int ret;
5398
e094f480 5399 if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
2c47e605 5400 return 1;
f82d02d9 5401
2c47e605
YZ
5402 /*
5403 * when reference count of tree block is 1, it won't increase
5404 * again. once full backref flag is set, we never clear it.
5405 */
7fcee18d 5406 if (wc->lookup_info &&
94fcca9f
YZ
5407 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5408 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
1f9d44c0 5409 ASSERT(path->locks[level]);
2ff7e61e 5410 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 5411 eb->start, level, 1,
2c47e605 5412 &wc->refs[level],
d8ba2a91
JB
5413 &wc->flags[level],
5414 NULL);
79787eaa
JM
5415 if (ret)
5416 return ret;
b8ccef04
JB
5417 if (unlikely(wc->refs[level] == 0)) {
5418 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5419 eb->start);
5420 return -EUCLEAN;
5421 }
2c47e605 5422 }
5d4f98a2 5423
2c47e605
YZ
5424 if (wc->stage == DROP_REFERENCE) {
5425 if (wc->refs[level] > 1)
5426 return 1;
f82d02d9 5427
2c47e605 5428 if (path->locks[level] && !wc->keep_locks) {
bd681513 5429 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
5430 path->locks[level] = 0;
5431 }
5432 return 0;
5433 }
f82d02d9 5434
2c47e605
YZ
5435 /* wc->stage == UPDATE_BACKREF */
5436 if (!(wc->flags[level] & flag)) {
1f9d44c0 5437 ASSERT(path->locks[level]);
e339a6b0 5438 ret = btrfs_inc_ref(trans, root, eb, 1);
b4236703
JB
5439 if (ret) {
5440 btrfs_abort_transaction(trans, ret);
5441 return ret;
5442 }
e339a6b0 5443 ret = btrfs_dec_ref(trans, root, eb, 0);
b4236703
JB
5444 if (ret) {
5445 btrfs_abort_transaction(trans, ret);
5446 return ret;
5447 }
4aec05fa 5448 ret = btrfs_set_disk_extent_flags(trans, eb, flag);
b4236703
JB
5449 if (ret) {
5450 btrfs_abort_transaction(trans, ret);
5451 return ret;
5452 }
2c47e605
YZ
5453 wc->flags[level] |= flag;
5454 }
5455
5456 /*
5457 * the block is shared by multiple trees, so it's not good to
5458 * keep the tree lock
5459 */
5460 if (path->locks[level] && level > 0) {
bd681513 5461 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
5462 path->locks[level] = 0;
5463 }
5464 return 0;
5465}
5466
78c52d9e
JB
5467/*
5468 * This is used to verify a ref exists for this root to deal with a bug where we
5469 * would have a drop_progress key that hadn't been updated properly.
5470 */
5471static int check_ref_exists(struct btrfs_trans_handle *trans,
5472 struct btrfs_root *root, u64 bytenr, u64 parent,
5473 int level)
5474{
42fac187
JB
5475 struct btrfs_delayed_ref_root *delayed_refs;
5476 struct btrfs_delayed_ref_head *head;
78c52d9e
JB
5477 struct btrfs_path *path;
5478 struct btrfs_extent_inline_ref *iref;
5479 int ret;
42fac187 5480 bool exists = false;
78c52d9e
JB
5481
5482 path = btrfs_alloc_path();
5483 if (!path)
5484 return -ENOMEM;
42fac187 5485again:
78c52d9e
JB
5486 ret = lookup_extent_backref(trans, path, &iref, bytenr,
5487 root->fs_info->nodesize, parent,
e094f480 5488 btrfs_root_id(root), level, 0);
42fac187
JB
5489 if (ret != -ENOENT) {
5490 /*
5491 * If we get 0 then we found our reference, return 1, else
5492 * return the error if it's not -ENOENT;
5493 */
5494 btrfs_free_path(path);
5495 return (ret < 0 ) ? ret : 1;
5496 }
5497
5498 /*
5499 * We could have a delayed ref with this reference, so look it up while
5500 * we're holding the path open to make sure we don't race with the
5501 * delayed ref running.
5502 */
5503 delayed_refs = &trans->transaction->delayed_refs;
5504 spin_lock(&delayed_refs->lock);
5505 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
5506 if (!head)
5507 goto out;
5508 if (!mutex_trylock(&head->mutex)) {
5509 /*
5510 * We're contended, means that the delayed ref is running, get a
5511 * reference and wait for the ref head to be complete and then
5512 * try again.
5513 */
5514 refcount_inc(&head->refs);
5515 spin_unlock(&delayed_refs->lock);
5516
5517 btrfs_release_path(path);
5518
5519 mutex_lock(&head->mutex);
5520 mutex_unlock(&head->mutex);
5521 btrfs_put_delayed_ref_head(head);
5522 goto again;
5523 }
5524
5525 exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
5526 mutex_unlock(&head->mutex);
5527out:
5528 spin_unlock(&delayed_refs->lock);
78c52d9e 5529 btrfs_free_path(path);
42fac187 5530 return exists ? 1 : 0;
78c52d9e
JB
5531}
5532
562d4254
JB
5533/*
5534 * We may not have an uptodate block, so if we are going to walk down into this
5535 * block we need to drop the lock, read it off of the disk, re-lock it and
5536 * return to continue dropping the snapshot.
5537 */
5538static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
5539 struct btrfs_root *root,
5540 struct btrfs_path *path,
5541 struct walk_control *wc,
5542 struct extent_buffer *next)
5543{
5544 struct btrfs_tree_parent_check check = { 0 };
5545 u64 generation;
5546 int level = wc->level;
5547 int ret;
5548
5549 btrfs_assert_tree_write_locked(next);
5550
5551 generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
5552
5553 if (btrfs_buffer_uptodate(next, generation, 0))
5554 return 0;
5555
5556 check.level = level - 1;
5557 check.transid = generation;
5558 check.owner_root = btrfs_root_id(root);
5559 check.has_first_key = true;
5560 btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
5561
5562 btrfs_tree_unlock(next);
5563 if (level == 1)
5564 reada_walk_down(trans, root, wc, path);
5565 ret = btrfs_read_extent_buffer(next, &check);
5566 if (ret) {
5567 free_extent_buffer(next);
5568 return ret;
5569 }
5570 btrfs_tree_lock(next);
5571 wc->lookup_info = 1;
5572 return 0;
5573}
5574
acb9b476
JB
5575/*
5576 * If we determine that we don't have to visit wc->level - 1 then we need to
5577 * determine if we can drop our reference.
5578 *
5579 * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
5580 *
5581 * If we are DROP_REFERENCE this will figure out if we need to drop our current
5582 * reference, skipping it if we dropped it from a previous incompleted drop, or
5583 * dropping it if we still have a reference to it.
5584 */
5585static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5586 struct btrfs_path *path, struct walk_control *wc,
5587 struct extent_buffer *next, u64 owner_root)
5588{
5589 struct btrfs_ref ref = {
5590 .action = BTRFS_DROP_DELAYED_REF,
5591 .bytenr = next->start,
5592 .num_bytes = root->fs_info->nodesize,
5593 .owning_root = owner_root,
5594 .ref_root = btrfs_root_id(root),
5595 };
5596 int level = wc->level;
5597 int ret;
5598
5599 /* We are UPDATE_BACKREF, we're not dropping anything. */
5600 if (wc->stage == UPDATE_BACKREF)
5601 return 0;
5602
5603 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5604 ref.parent = path->nodes[level]->start;
5605 } else {
5606 ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
5607 if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
5608 btrfs_err(root->fs_info, "mismatched block owner");
5609 return -EIO;
5610 }
5611 }
5612
5613 /*
5614 * If we had a drop_progress we need to verify the refs are set as
5615 * expected. If we find our ref then we know that from here on out
5616 * everything should be correct, and we can clear the
5617 * ->restarted flag.
5618 */
5619 if (wc->restarted) {
5620 ret = check_ref_exists(trans, root, next->start, ref.parent,
5621 level - 1);
5622 if (ret <= 0)
5623 return ret;
5624 ret = 0;
5625 wc->restarted = 0;
5626 }
5627
5628 /*
5629 * Reloc tree doesn't contribute to qgroup numbers, and we have already
5630 * accounted them at merge time (replace_path), thus we could skip
5631 * expensive subtree trace here.
5632 */
5633 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
5634 wc->refs[level - 1] > 1) {
5635 u64 generation = btrfs_node_ptr_generation(path->nodes[level],
5636 path->slots[level]);
5637
5638 ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
5639 if (ret) {
5640 btrfs_err_rl(root->fs_info,
5641"error %d accounting shared subtree, quota is out of sync, rescan required",
5642 ret);
5643 }
5644 }
5645
5646 /*
5647 * We need to update the next key in our walk control so we can update
5648 * the drop_progress key accordingly. We don't care if find_next_key
5649 * doesn't find a key because that means we're at the end and are going
5650 * to clean up now.
5651 */
5652 wc->drop_level = level;
5653 find_next_key(path, level, &wc->drop_progress);
5654
5655 btrfs_init_tree_ref(&ref, level - 1, 0, false);
5656 return btrfs_free_extent(trans, &ref);
5657}
5658
1c4850e2 5659/*
2c016dc2 5660 * helper to process tree block pointer.
1c4850e2
YZ
5661 *
5662 * when wc->stage == DROP_REFERENCE, this function checks
5663 * reference count of the block pointed to. if the block
5664 * is shared and we need update back refs for the subtree
5665 * rooted at the block, this function changes wc->stage to
5666 * UPDATE_BACKREF. if the block is shared and there is no
5667 * need to update back, this function drops the reference
5668 * to the block.
5669 *
5670 * NOTE: return value 1 means we should stop walking down.
5671 */
5672static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5673 struct btrfs_root *root,
5674 struct btrfs_path *path,
7fcee18d 5675 struct walk_control *wc)
1c4850e2 5676{
0b246afa 5677 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
5678 u64 bytenr;
5679 u64 generation;
d8ba2a91 5680 u64 owner_root = 0;
1c4850e2
YZ
5681 struct extent_buffer *next;
5682 int level = wc->level;
1c4850e2
YZ
5683 int ret = 0;
5684
5685 generation = btrfs_node_ptr_generation(path->nodes[level],
5686 path->slots[level]);
5687 /*
5688 * if the lower level block was created before the snapshot
5689 * was created, we know there is no need to update back refs
5690 * for the subtree
5691 */
5692 if (wc->stage == UPDATE_BACKREF &&
94fcca9f 5693 generation <= root->root_key.offset) {
7fcee18d 5694 wc->lookup_info = 1;
1c4850e2 5695 return 1;
94fcca9f 5696 }
1c4850e2
YZ
5697
5698 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
789d6a3a 5699
58147d5a
JB
5700 next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
5701 level - 1);
5702 if (IS_ERR(next))
5703 return PTR_ERR(next);
5704
1c4850e2 5705 btrfs_tree_lock(next);
1c4850e2 5706
2ff7e61e 5707 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
94fcca9f 5708 &wc->refs[level - 1],
d8ba2a91
JB
5709 &wc->flags[level - 1],
5710 &owner_root);
4867268c
JB
5711 if (ret < 0)
5712 goto out_unlock;
79787eaa 5713
c2cf52eb 5714 if (unlikely(wc->refs[level - 1] == 0)) {
b8ccef04
JB
5715 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5716 bytenr);
5717 ret = -EUCLEAN;
4867268c 5718 goto out_unlock;
c2cf52eb 5719 }
7fcee18d 5720 wc->lookup_info = 0;
1c4850e2 5721
2b73c7e7
JB
5722 /* If we don't have to walk into this node skip it. */
5723 if (!visit_node_for_delete(root, wc, path->nodes[level],
5724 wc->refs[level - 1], wc->flags[level - 1],
5725 path->slots[level]))
5726 goto skip;
5727
5728 /*
5729 * We have to walk down into this node, and if we're currently at the
5730 * DROP_REFERNCE stage and this block is shared then we need to switch
5731 * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
5732 */
5733 if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
5734 wc->stage = UPDATE_BACKREF;
5735 wc->shared_level = level - 1;
1c4850e2
YZ
5736 }
5737
562d4254
JB
5738 ret = check_next_block_uptodate(trans, root, path, wc, next);
5739 if (ret)
5740 return ret;
1c4850e2
YZ
5741
5742 level--;
4867268c
JB
5743 ASSERT(level == btrfs_header_level(next));
5744 if (level != btrfs_header_level(next)) {
5745 btrfs_err(root->fs_info, "mismatched level");
5746 ret = -EIO;
5747 goto out_unlock;
5748 }
1c4850e2
YZ
5749 path->nodes[level] = next;
5750 path->slots[level] = 0;
ac5887c8 5751 path->locks[level] = BTRFS_WRITE_LOCK;
1c4850e2
YZ
5752 wc->level = level;
5753 if (wc->level == 1)
5754 wc->reada_slot = 0;
5755 return 0;
5756skip:
acb9b476
JB
5757 ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
5758 if (ret)
5759 goto out_unlock;
4c4686d1
JB
5760 wc->refs[level - 1] = 0;
5761 wc->flags[level - 1] = 0;
7fcee18d 5762 wc->lookup_info = 1;
4867268c
JB
5763 ret = 1;
5764
5765out_unlock:
1c4850e2
YZ
5766 btrfs_tree_unlock(next);
5767 free_extent_buffer(next);
4867268c
JB
5768
5769 return ret;
1c4850e2
YZ
5770}
5771
2c47e605 5772/*
2c016dc2 5773 * helper to process tree block while walking up the tree.
2c47e605
YZ
5774 *
5775 * when wc->stage == DROP_REFERENCE, this function drops
5776 * reference count on the block.
5777 *
5778 * when wc->stage == UPDATE_BACKREF, this function changes
5779 * wc->stage back to DROP_REFERENCE if we changed wc->stage
5780 * to UPDATE_BACKREF previously while processing the block.
5781 *
5782 * NOTE: return value 1 means we should stop walking up.
5783 */
5784static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5785 struct btrfs_root *root,
5786 struct btrfs_path *path,
5787 struct walk_control *wc)
5788{
0b246afa 5789 struct btrfs_fs_info *fs_info = root->fs_info;
bb386803 5790 int ret = 0;
2c47e605
YZ
5791 int level = wc->level;
5792 struct extent_buffer *eb = path->nodes[level];
5793 u64 parent = 0;
5794
5795 if (wc->stage == UPDATE_BACKREF) {
f9c5b70c 5796 ASSERT(wc->shared_level >= level);
2c47e605
YZ
5797 if (level < wc->shared_level)
5798 goto out;
5799
2c47e605
YZ
5800 ret = find_next_key(path, level + 1, &wc->update_progress);
5801 if (ret > 0)
5802 wc->update_ref = 0;
5803
5804 wc->stage = DROP_REFERENCE;
5805 wc->shared_level = -1;
5806 path->slots[level] = 0;
5807
5808 /*
5809 * check reference count again if the block isn't locked.
5810 * we should start walking down the tree again if reference
5811 * count is one.
5812 */
5813 if (!path->locks[level]) {
f9c5b70c 5814 ASSERT(level > 0);
2c47e605 5815 btrfs_tree_lock(eb);
ac5887c8 5816 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605 5817
2ff7e61e 5818 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 5819 eb->start, level, 1,
2c47e605 5820 &wc->refs[level],
d8ba2a91
JB
5821 &wc->flags[level],
5822 NULL);
79787eaa
JM
5823 if (ret < 0) {
5824 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 5825 path->locks[level] = 0;
79787eaa
JM
5826 return ret;
5827 }
b8ccef04
JB
5828 if (unlikely(wc->refs[level] == 0)) {
5829 btrfs_tree_unlock_rw(eb, path->locks[level]);
5830 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5831 eb->start);
5832 return -EUCLEAN;
5833 }
2c47e605 5834 if (wc->refs[level] == 1) {
bd681513 5835 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 5836 path->locks[level] = 0;
2c47e605
YZ
5837 return 1;
5838 }
f82d02d9 5839 }
2c47e605 5840 }
f82d02d9 5841
2c47e605 5842 /* wc->stage == DROP_REFERENCE */
f9c5b70c 5843 ASSERT(path->locks[level] || wc->refs[level] == 1);
5d4f98a2 5844
2c47e605
YZ
5845 if (wc->refs[level] == 1) {
5846 if (level == 0) {
5847 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
e339a6b0 5848 ret = btrfs_dec_ref(trans, root, eb, 1);
2c47e605 5849 else
e339a6b0 5850 ret = btrfs_dec_ref(trans, root, eb, 0);
5eb178f3
JB
5851 if (ret) {
5852 btrfs_abort_transaction(trans, ret);
5853 return ret;
5854 }
e094f480 5855 if (is_fstree(btrfs_root_id(root))) {
c4140cbf
QW
5856 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5857 if (ret) {
5858 btrfs_err_rl(fs_info,
5859 "error %d accounting leaf items, quota is out of sync, rescan required",
5d163e0e 5860 ret);
c4140cbf 5861 }
1152651a 5862 }
2c47e605 5863 }
190a8339 5864 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
d3fb6615 5865 if (!path->locks[level]) {
2c47e605 5866 btrfs_tree_lock(eb);
ac5887c8 5867 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605 5868 }
190a8339 5869 btrfs_clear_buffer_dirty(trans, eb);
2c47e605
YZ
5870 }
5871
5872 if (eb == root->node) {
5873 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5874 parent = eb->start;
e094f480 5875 else if (btrfs_root_id(root) != btrfs_header_owner(eb))
65c6e82b 5876 goto owner_mismatch;
2c47e605
YZ
5877 } else {
5878 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5879 parent = path->nodes[level + 1]->start;
e094f480 5880 else if (btrfs_root_id(root) !=
65c6e82b
QW
5881 btrfs_header_owner(path->nodes[level + 1]))
5882 goto owner_mismatch;
f82d02d9 5883 }
f82d02d9 5884
bb386803
FM
5885 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5886 wc->refs[level] == 1);
5887 if (ret < 0)
5888 btrfs_abort_transaction(trans, ret);
2c47e605
YZ
5889out:
5890 wc->refs[level] = 0;
5891 wc->flags[level] = 0;
bb386803 5892 return ret;
65c6e82b
QW
5893
5894owner_mismatch:
5895 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
e094f480 5896 btrfs_header_owner(eb), btrfs_root_id(root));
65c6e82b 5897 return -EUCLEAN;
2c47e605
YZ
5898}
5899
33b804fa
JB
5900/*
5901 * walk_down_tree consists of two steps.
5902 *
5903 * walk_down_proc(). Look up the reference count and reference of our current
5904 * wc->level. At this point path->nodes[wc->level] should be populated and
5905 * uptodate, and in most cases should already be locked. If we are in
5906 * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
5907 * we can walk back up the tree. If we are UPDATE_BACKREF we have to set
5908 * FULL_BACKREF on this node if it's not already set, and then do the
5909 * FULL_BACKREF conversion dance, which is to drop the root reference and add
5910 * the shared reference to all of this nodes children.
5911 *
5912 * do_walk_down(). This is where we actually start iterating on the children of
5913 * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping
5914 * our reference to the children that return false from visit_node_for_delete(),
5915 * which has various conditions where we know we can just drop our reference
5916 * without visiting the node. For UPDATE_BACKREF we will skip any children that
5917 * visit_node_for_delete() returns false for, only walking down when necessary.
5918 * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
5919 * snapshot deletion.
5920 */
2c47e605
YZ
5921static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5922 struct btrfs_root *root,
5923 struct btrfs_path *path,
5924 struct walk_control *wc)
5925{
2c47e605 5926 int level = wc->level;
4e194384 5927 int ret = 0;
2c47e605 5928
7fcee18d 5929 wc->lookup_info = 1;
2c47e605 5930 while (level >= 0) {
7fcee18d 5931 ret = walk_down_proc(trans, root, path, wc);
4e194384 5932 if (ret)
2c47e605
YZ
5933 break;
5934
5935 if (level == 0)
5936 break;
5937
7a7965f8
YZ
5938 if (path->slots[level] >=
5939 btrfs_header_nritems(path->nodes[level]))
5940 break;
5941
7fcee18d 5942 ret = do_walk_down(trans, root, path, wc);
1c4850e2
YZ
5943 if (ret > 0) {
5944 path->slots[level]++;
5945 continue;
90d2c51d 5946 } else if (ret < 0)
4e194384 5947 break;
1c4850e2 5948 level = wc->level;
f82d02d9 5949 }
4e194384 5950 return (ret == 1) ? 0 : ret;
f82d02d9
YZ
5951}
5952
33b804fa
JB
5953/*
5954 * walk_up_tree() is responsible for making sure we visit every slot on our
5955 * current node, and if we're at the end of that node then we call
5956 * walk_up_proc() on our current node which will do one of a few things based on
5957 * our stage.
5958 *
5959 * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level
5960 * then we need to walk back up the tree, and then going back down into the
5961 * other slots via walk_down_tree to update any other children from our original
5962 * wc->shared_level. Once we're at or above our wc->shared_level we can switch
5963 * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
5964 *
5965 * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
5966 * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
5967 * in our current leaf. After that we call btrfs_free_tree_block() on the
5968 * current node and walk up to the next node to walk down the next slot.
5969 */
d397712b 5970static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 5971 struct btrfs_root *root,
f82d02d9 5972 struct btrfs_path *path,
2c47e605 5973 struct walk_control *wc, int max_level)
20524f02 5974{
2c47e605 5975 int level = wc->level;
20524f02 5976 int ret;
9f3a7427 5977
2c47e605
YZ
5978 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5979 while (level < max_level && path->nodes[level]) {
5980 wc->level = level;
5981 if (path->slots[level] + 1 <
5982 btrfs_header_nritems(path->nodes[level])) {
5983 path->slots[level]++;
20524f02
CM
5984 return 0;
5985 } else {
2c47e605
YZ
5986 ret = walk_up_proc(trans, root, path, wc);
5987 if (ret > 0)
5988 return 0;
65c6e82b
QW
5989 if (ret < 0)
5990 return ret;
bd56b302 5991
2c47e605 5992 if (path->locks[level]) {
bd681513
CM
5993 btrfs_tree_unlock_rw(path->nodes[level],
5994 path->locks[level]);
2c47e605 5995 path->locks[level] = 0;
f82d02d9 5996 }
2c47e605
YZ
5997 free_extent_buffer(path->nodes[level]);
5998 path->nodes[level] = NULL;
5999 level++;
20524f02
CM
6000 }
6001 }
6002 return 1;
6003}
6004
9aca1d51 6005/*
2c47e605
YZ
6006 * drop a subvolume tree.
6007 *
6008 * this function traverses the tree freeing any blocks that only
6009 * referenced by the tree.
6010 *
6011 * when a shared tree block is found. this function decreases its
6012 * reference count by one. if update_ref is true, this function
6013 * also make sure backrefs for the shared block and all lower level
6014 * blocks are properly updated.
9d1a2a3a
DS
6015 *
6016 * If called with for_reloc == 0, may exit early with -EAGAIN
9aca1d51 6017 */
0078a9f9 6018int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
20524f02 6019{
e094f480 6020 const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
ab8d0fc4 6021 struct btrfs_fs_info *fs_info = root->fs_info;
5caf2a00 6022 struct btrfs_path *path;
2c47e605 6023 struct btrfs_trans_handle *trans;
ab8d0fc4 6024 struct btrfs_root *tree_root = fs_info->tree_root;
9f3a7427 6025 struct btrfs_root_item *root_item = &root->root_item;
2c47e605
YZ
6026 struct walk_control *wc;
6027 struct btrfs_key key;
839d6ea4 6028 const u64 rootid = btrfs_root_id(root);
ca8ba2cc 6029 int ret = 0;
2c47e605 6030 int level;
d29a9f62 6031 bool root_dropped = false;
b4be6aef 6032 bool unfinished_drop = false;
20524f02 6033
e094f480 6034 btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
1152651a 6035
5caf2a00 6036 path = btrfs_alloc_path();
cb1b69f4 6037 if (!path) {
ca8ba2cc 6038 ret = -ENOMEM;
cb1b69f4
TI
6039 goto out;
6040 }
20524f02 6041
2c47e605 6042 wc = kzalloc(sizeof(*wc), GFP_NOFS);
38a1a919
MF
6043 if (!wc) {
6044 btrfs_free_path(path);
ca8ba2cc 6045 ret = -ENOMEM;
cb1b69f4 6046 goto out;
38a1a919 6047 }
2c47e605 6048
f3e3d9cc
QW
6049 /*
6050 * Use join to avoid potential EINTR from transaction start. See
6051 * wait_reserve_ticket and the whole reservation callchain.
6052 */
6053 if (for_reloc)
6054 trans = btrfs_join_transaction(tree_root);
6055 else
6056 trans = btrfs_start_transaction(tree_root, 0);
79787eaa 6057 if (IS_ERR(trans)) {
ca8ba2cc 6058 ret = PTR_ERR(trans);
79787eaa
JM
6059 goto out_free;
6060 }
98d5dc13 6061
ca8ba2cc
AJ
6062 ret = btrfs_run_delayed_items(trans);
6063 if (ret)
0568e82d
JB
6064 goto out_end_trans;
6065
83354f07
JB
6066 /*
6067 * This will help us catch people modifying the fs tree while we're
6068 * dropping it. It is unsafe to mess with the fs tree while it's being
6069 * dropped as we unlock the root node and parent nodes as we walk down
6070 * the tree, assuming nothing will change. If something does change
6071 * then we'll have stale information and drop references to blocks we've
6072 * already dropped.
6073 */
6074 set_bit(BTRFS_ROOT_DELETING, &root->state);
b4be6aef
JB
6075 unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
6076
9f3a7427 6077 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2c47e605 6078 level = btrfs_header_level(root->node);
5d4f98a2 6079 path->nodes[level] = btrfs_lock_root_node(root);
9f3a7427 6080 path->slots[level] = 0;
ac5887c8 6081 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605
YZ
6082 memset(&wc->update_progress, 0,
6083 sizeof(wc->update_progress));
9f3a7427 6084 } else {
9f3a7427 6085 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2c47e605
YZ
6086 memcpy(&wc->update_progress, &key,
6087 sizeof(wc->update_progress));
6088
c8422684 6089 level = btrfs_root_drop_level(root_item);
2c47e605 6090 BUG_ON(level == 0);
6702ed49 6091 path->lowest_level = level;
2c47e605
YZ
6092 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6093 path->lowest_level = 0;
ca8ba2cc 6094 if (ret < 0)
79787eaa 6095 goto out_end_trans;
ca8ba2cc 6096
1c4850e2 6097 WARN_ON(ret > 0);
ca8ba2cc 6098 ret = 0;
2c47e605 6099
7d9eb12c
CM
6100 /*
6101 * unlock our path, this is safe because only this
6102 * function is allowed to delete this snapshot
6103 */
5d4f98a2 6104 btrfs_unlock_up_safe(path, 0);
2c47e605
YZ
6105
6106 level = btrfs_header_level(root->node);
6107 while (1) {
6108 btrfs_tree_lock(path->nodes[level]);
ac5887c8 6109 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605 6110
ca8ba2cc
AJ
6111 /*
6112 * btrfs_lookup_extent_info() returns 0 for success,
6113 * or < 0 for error.
6114 */
2ff7e61e 6115 ret = btrfs_lookup_extent_info(trans, fs_info,
2c47e605 6116 path->nodes[level]->start,
3173a18f 6117 level, 1, &wc->refs[level],
d8ba2a91 6118 &wc->flags[level], NULL);
ca8ba2cc 6119 if (ret < 0)
79787eaa 6120 goto out_end_trans;
ca8ba2cc 6121
2c47e605
YZ
6122 BUG_ON(wc->refs[level] == 0);
6123
c8422684 6124 if (level == btrfs_root_drop_level(root_item))
2c47e605
YZ
6125 break;
6126
6127 btrfs_tree_unlock(path->nodes[level]);
fec386ac 6128 path->locks[level] = 0;
2c47e605
YZ
6129 WARN_ON(wc->refs[level] != 1);
6130 level--;
6131 }
9f3a7427 6132 }
2c47e605 6133
78c52d9e 6134 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
2c47e605
YZ
6135 wc->level = level;
6136 wc->shared_level = -1;
6137 wc->stage = DROP_REFERENCE;
6138 wc->update_ref = update_ref;
6139 wc->keep_locks = 0;
0b246afa 6140 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
2c47e605 6141
d397712b 6142 while (1) {
9d1a2a3a 6143
2c47e605
YZ
6144 ret = walk_down_tree(trans, root, path, wc);
6145 if (ret < 0) {
9a93b5a3 6146 btrfs_abort_transaction(trans, ret);
20524f02 6147 break;
2c47e605 6148 }
9aca1d51 6149
2c47e605
YZ
6150 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6151 if (ret < 0) {
9a93b5a3 6152 btrfs_abort_transaction(trans, ret);
20524f02 6153 break;
2c47e605
YZ
6154 }
6155
6156 if (ret > 0) {
6157 BUG_ON(wc->stage != DROP_REFERENCE);
ca8ba2cc 6158 ret = 0;
e7a84565
CM
6159 break;
6160 }
2c47e605
YZ
6161
6162 if (wc->stage == DROP_REFERENCE) {
aea6f028
JB
6163 wc->drop_level = wc->level;
6164 btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
6165 &wc->drop_progress,
6166 path->slots[wc->drop_level]);
6167 }
6168 btrfs_cpu_key_to_disk(&root_item->drop_progress,
6169 &wc->drop_progress);
c8422684 6170 btrfs_set_root_drop_level(root_item, wc->drop_level);
2c47e605
YZ
6171
6172 BUG_ON(wc->level == 0);
3a45bb20 6173 if (btrfs_should_end_transaction(trans) ||
2ff7e61e 6174 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
2c47e605
YZ
6175 ret = btrfs_update_root(trans, tree_root,
6176 &root->root_key,
6177 root_item);
79787eaa 6178 if (ret) {
66642832 6179 btrfs_abort_transaction(trans, ret);
79787eaa
JM
6180 goto out_end_trans;
6181 }
2c47e605 6182
12a824dc
FM
6183 if (!is_reloc_root)
6184 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6185
3a45bb20 6186 btrfs_end_transaction_throttle(trans);
2ff7e61e 6187 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
ab8d0fc4
JM
6188 btrfs_debug(fs_info,
6189 "drop snapshot early exit");
ca8ba2cc 6190 ret = -EAGAIN;
3c8f2422
JB
6191 goto out_free;
6192 }
6193
18d3bff4
JB
6194 /*
6195 * Use join to avoid potential EINTR from transaction
6196 * start. See wait_reserve_ticket and the whole
6197 * reservation callchain.
6198 */
6199 if (for_reloc)
6200 trans = btrfs_join_transaction(tree_root);
6201 else
6202 trans = btrfs_start_transaction(tree_root, 0);
79787eaa 6203 if (IS_ERR(trans)) {
ca8ba2cc 6204 ret = PTR_ERR(trans);
79787eaa
JM
6205 goto out_free;
6206 }
c3e69d58 6207 }
20524f02 6208 }
b3b4aa74 6209 btrfs_release_path(path);
ca8ba2cc 6210 if (ret)
79787eaa 6211 goto out_end_trans;
2c47e605 6212
ab9ce7d4 6213 ret = btrfs_del_root(trans, &root->root_key);
79787eaa 6214 if (ret) {
66642832 6215 btrfs_abort_transaction(trans, ret);
79787eaa
JM
6216 goto out_end_trans;
6217 }
2c47e605 6218
12a824dc 6219 if (!is_reloc_root) {
cb517eab
MX
6220 ret = btrfs_find_root(tree_root, &root->root_key, path,
6221 NULL, NULL);
79787eaa 6222 if (ret < 0) {
66642832 6223 btrfs_abort_transaction(trans, ret);
79787eaa
JM
6224 goto out_end_trans;
6225 } else if (ret > 0) {
ca8ba2cc
AJ
6226 ret = 0;
6227 /*
6228 * If we fail to delete the orphan item this time
84cd948c
JB
6229 * around, it'll get picked up the next time.
6230 *
6231 * The most common failure here is just -ENOENT.
6232 */
e094f480 6233 btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
76dda93c
YZ
6234 }
6235 }
6236
a3cf0e43
QW
6237 /*
6238 * This subvolume is going to be completely dropped, and won't be
6239 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
6240 * commit transaction time. So free it here manually.
6241 */
6242 btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
6243 btrfs_qgroup_free_meta_all_pertrans(root);
6244
fc7cbcd4 6245 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
2b9dbef2 6246 btrfs_add_dropped_root(trans, root);
8c38938c 6247 else
00246528 6248 btrfs_put_root(root);
d29a9f62 6249 root_dropped = true;
79787eaa 6250out_end_trans:
12a824dc
FM
6251 if (!is_reloc_root)
6252 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6253
3a45bb20 6254 btrfs_end_transaction_throttle(trans);
79787eaa 6255out_free:
2c47e605 6256 kfree(wc);
5caf2a00 6257 btrfs_free_path(path);
cb1b69f4 6258out:
ca8ba2cc 6259 if (!ret && root_dropped) {
839d6ea4
QW
6260 ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
6261 if (ret < 0)
6262 btrfs_warn_rl(fs_info,
6263 "failed to cleanup qgroup 0/%llu: %d",
6264 rootid, ret);
ca8ba2cc 6265 ret = 0;
839d6ea4 6266 }
b4be6aef
JB
6267 /*
6268 * We were an unfinished drop root, check to see if there are any
6269 * pending, and if not clear and wake up any waiters.
6270 */
ca8ba2cc 6271 if (!ret && unfinished_drop)
b4be6aef
JB
6272 btrfs_maybe_wake_unfinished_drop(fs_info);
6273
d29a9f62
JB
6274 /*
6275 * So if we need to stop dropping the snapshot for whatever reason we
6276 * need to make sure to add it back to the dead root list so that we
6277 * keep trying to do the work later. This also cleans up roots if we
6278 * don't have it in the radix (like when we recover after a power fail
6279 * or unmount) so we don't leak memory.
6280 */
897ca819 6281 if (!for_reloc && !root_dropped)
d29a9f62 6282 btrfs_add_dead_root(root);
ca8ba2cc 6283 return ret;
20524f02 6284}
9078a3e1 6285
2c47e605
YZ
6286/*
6287 * drop subtree rooted at tree block 'node'.
6288 *
6289 * NOTE: this function will unlock and release tree block 'node'
66d7e7f0 6290 * only used by relocation code
2c47e605 6291 */
f82d02d9
YZ
6292int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6293 struct btrfs_root *root,
6294 struct extent_buffer *node,
6295 struct extent_buffer *parent)
6296{
0b246afa 6297 struct btrfs_fs_info *fs_info = root->fs_info;
f82d02d9 6298 struct btrfs_path *path;
2c47e605 6299 struct walk_control *wc;
f82d02d9
YZ
6300 int level;
6301 int parent_level;
6302 int ret = 0;
f82d02d9 6303
e094f480 6304 BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
2c47e605 6305
f82d02d9 6306 path = btrfs_alloc_path();
db5b493a
TI
6307 if (!path)
6308 return -ENOMEM;
f82d02d9 6309
2c47e605 6310 wc = kzalloc(sizeof(*wc), GFP_NOFS);
db5b493a
TI
6311 if (!wc) {
6312 btrfs_free_path(path);
6313 return -ENOMEM;
6314 }
2c47e605 6315
49d0c642 6316 btrfs_assert_tree_write_locked(parent);
f82d02d9 6317 parent_level = btrfs_header_level(parent);
67439dad 6318 atomic_inc(&parent->refs);
f82d02d9
YZ
6319 path->nodes[parent_level] = parent;
6320 path->slots[parent_level] = btrfs_header_nritems(parent);
6321
49d0c642 6322 btrfs_assert_tree_write_locked(node);
f82d02d9 6323 level = btrfs_header_level(node);
f82d02d9
YZ
6324 path->nodes[level] = node;
6325 path->slots[level] = 0;
ac5887c8 6326 path->locks[level] = BTRFS_WRITE_LOCK;
2c47e605
YZ
6327
6328 wc->refs[parent_level] = 1;
6329 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6330 wc->level = level;
6331 wc->shared_level = -1;
6332 wc->stage = DROP_REFERENCE;
6333 wc->update_ref = 0;
6334 wc->keep_locks = 1;
0b246afa 6335 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
f82d02d9
YZ
6336
6337 while (1) {
33a44f37
AJ
6338 ret = walk_down_tree(trans, root, path, wc);
6339 if (ret < 0)
f82d02d9
YZ
6340 break;
6341
33a44f37
AJ
6342 ret = walk_up_tree(trans, root, path, wc, parent_level);
6343 if (ret) {
6344 if (ret > 0)
6345 ret = 0;
f82d02d9 6346 break;
33a44f37 6347 }
f82d02d9
YZ
6348 }
6349
2c47e605 6350 kfree(wc);
f82d02d9
YZ
6351 btrfs_free_path(path);
6352 return ret;
6353}
6354
91701bdf
DS
6355/*
6356 * Unpin the extent range in an error context and don't add the space back.
6357 * Errors are not propagated further.
6358 */
6359void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
acce952b 6360{
91701bdf 6361 unpin_extent_range(fs_info, start, end, false);
acce952b 6362}
6363
499f377f
JM
6364/*
6365 * It used to be that old block groups would be left around forever.
6366 * Iterating over them would be enough to trim unused space. Since we
6367 * now automatically remove them, we also need to iterate over unallocated
6368 * space.
6369 *
6370 * We don't want a transaction for this since the discard may take a
6371 * substantial amount of time. We don't require that a transaction be
6372 * running, but we do need to take a running transaction into account
fee7acc3
JM
6373 * to ensure that we're not discarding chunks that were released or
6374 * allocated in the current transaction.
499f377f
JM
6375 *
6376 * Holding the chunks lock will prevent other threads from allocating
6377 * or releasing chunks, but it won't prevent a running transaction
6378 * from committing and releasing the memory that the pending chunks
6379 * list head uses. For that, we need to take a reference to the
fee7acc3
JM
6380 * transaction and hold the commit root sem. We only need to hold
6381 * it while performing the free space search since we have already
6382 * held back allocations.
499f377f 6383 */
8103d10b 6384static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
499f377f 6385{
37f85ec3 6386 u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
499f377f
JM
6387 int ret;
6388
6389 *trimmed = 0;
6390
0be88e36 6391 /* Discard not supported = nothing to do. */
70200574 6392 if (!bdev_max_discard_sectors(device->bdev))
0be88e36
JM
6393 return 0;
6394
52042d8e 6395 /* Not writable = nothing to do. */
ebbede42 6396 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
499f377f
JM
6397 return 0;
6398
6399 /* No free space = nothing to do. */
6400 if (device->total_bytes <= device->bytes_used)
6401 return 0;
6402
6403 ret = 0;
6404
6405 while (1) {
fb456252 6406 struct btrfs_fs_info *fs_info = device->fs_info;
499f377f
JM
6407 u64 bytes;
6408
6409 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6410 if (ret)
fee7acc3 6411 break;
499f377f 6412
929be17a
NB
6413 find_first_clear_extent_bit(&device->alloc_state, start,
6414 &start, &end,
6415 CHUNK_TRIMMED | CHUNK_ALLOCATED);
53460a45 6416
c57dd1f2
QW
6417 /* Check if there are any CHUNK_* bits left */
6418 if (start > device->total_bytes) {
6419 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6420 btrfs_warn_in_rcu(fs_info,
6421"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6422 start, end - start + 1,
cb3e217b 6423 btrfs_dev_name(device),
c57dd1f2
QW
6424 device->total_bytes);
6425 mutex_unlock(&fs_info->chunk_mutex);
6426 ret = 0;
6427 break;
6428 }
6429
37f85ec3
QW
6430 /* Ensure we skip the reserved space on each device. */
6431 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
53460a45 6432
929be17a
NB
6433 /*
6434 * If find_first_clear_extent_bit find a range that spans the
6435 * end of the device it will set end to -1, in this case it's up
6436 * to the caller to trim the value to the size of the device.
6437 */
6438 end = min(end, device->total_bytes - 1);
53460a45 6439
929be17a 6440 len = end - start + 1;
499f377f 6441
929be17a
NB
6442 /* We didn't find any extents */
6443 if (!len) {
499f377f 6444 mutex_unlock(&fs_info->chunk_mutex);
929be17a 6445 ret = 0;
499f377f
JM
6446 break;
6447 }
6448
929be17a
NB
6449 ret = btrfs_issue_discard(device->bdev, start, len,
6450 &bytes);
6451 if (!ret)
0acd32c2 6452 set_extent_bit(&device->alloc_state, start,
1d126800 6453 start + bytes - 1, CHUNK_TRIMMED, NULL);
499f377f
JM
6454 mutex_unlock(&fs_info->chunk_mutex);
6455
6456 if (ret)
6457 break;
6458
6459 start += len;
6460 *trimmed += bytes;
6461
6462 if (fatal_signal_pending(current)) {
6463 ret = -ERESTARTSYS;
6464 break;
6465 }
6466
6467 cond_resched();
6468 }
6469
6470 return ret;
6471}
6472
93bba24d
QW
6473/*
6474 * Trim the whole filesystem by:
6475 * 1) trimming the free space in each block group
6476 * 2) trimming the unallocated space on each device
6477 *
6478 * This will also continue trimming even if a block group or device encounters
6479 * an error. The return value will be the last error, or 0 if nothing bad
6480 * happens.
6481 */
2ff7e61e 6482int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
f7039b1d 6483{
23608d51 6484 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
32da5386 6485 struct btrfs_block_group *cache = NULL;
499f377f 6486 struct btrfs_device *device;
f7039b1d 6487 u64 group_trimmed;
07301df7 6488 u64 range_end = U64_MAX;
f7039b1d
LD
6489 u64 start;
6490 u64 end;
6491 u64 trimmed = 0;
93bba24d
QW
6492 u64 bg_failed = 0;
6493 u64 dev_failed = 0;
6494 int bg_ret = 0;
6495 int dev_ret = 0;
f7039b1d
LD
6496 int ret = 0;
6497
f981fec1
JB
6498 if (range->start == U64_MAX)
6499 return -EINVAL;
6500
07301df7
QW
6501 /*
6502 * Check range overflow if range->len is set.
6503 * The default range->len is U64_MAX.
6504 */
6505 if (range->len != U64_MAX &&
6506 check_add_overflow(range->start, range->len, &range_end))
6507 return -EINVAL;
6508
6ba9fc8e 6509 cache = btrfs_lookup_first_block_group(fs_info, range->start);
2e405ad8 6510 for (; cache; cache = btrfs_next_block_group(cache)) {
b3470b5d 6511 if (cache->start >= range_end) {
f7039b1d
LD
6512 btrfs_put_block_group(cache);
6513 break;
6514 }
6515
b3470b5d
DS
6516 start = max(range->start, cache->start);
6517 end = min(range_end, cache->start + cache->length);
f7039b1d
LD
6518
6519 if (end - start >= range->minlen) {
32da5386 6520 if (!btrfs_block_group_done(cache)) {
ced8ecf0 6521 ret = btrfs_cache_block_group(cache, true);
1be41b78 6522 if (ret) {
93bba24d
QW
6523 bg_failed++;
6524 bg_ret = ret;
6525 continue;
1be41b78 6526 }
f7039b1d
LD
6527 }
6528 ret = btrfs_trim_block_group(cache,
6529 &group_trimmed,
6530 start,
6531 end,
6532 range->minlen);
6533
6534 trimmed += group_trimmed;
6535 if (ret) {
93bba24d
QW
6536 bg_failed++;
6537 bg_ret = ret;
6538 continue;
f7039b1d
LD
6539 }
6540 }
f7039b1d
LD
6541 }
6542
93bba24d
QW
6543 if (bg_failed)
6544 btrfs_warn(fs_info,
6545 "failed to trim %llu block group(s), last error %d",
6546 bg_failed, bg_ret);
23608d51
AJ
6547
6548 mutex_lock(&fs_devices->device_list_mutex);
6549 list_for_each_entry(device, &fs_devices->devices, dev_list) {
16a200f6
AJ
6550 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6551 continue;
6552
8103d10b 6553 ret = btrfs_trim_free_extents(device, &group_trimmed);
33685972
LS
6554
6555 trimmed += group_trimmed;
93bba24d
QW
6556 if (ret) {
6557 dev_failed++;
6558 dev_ret = ret;
499f377f 6559 break;
93bba24d 6560 }
499f377f 6561 }
23608d51 6562 mutex_unlock(&fs_devices->device_list_mutex);
499f377f 6563
93bba24d
QW
6564 if (dev_failed)
6565 btrfs_warn(fs_info,
6566 "failed to trim %llu device(s), last error %d",
6567 dev_failed, dev_ret);
f7039b1d 6568 range->len = trimmed;
93bba24d
QW
6569 if (bg_ret)
6570 return bg_ret;
6571 return dev_ret;
f7039b1d 6572}