Merge tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux
[linux-block.git] / fs / btrfs / extent-tree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "ctree.h"
20 #include "extent-tree.h"
21 #include "tree-log.h"
22 #include "disk-io.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "locking.h"
27 #include "free-space-cache.h"
28 #include "free-space-tree.h"
29 #include "sysfs.h"
30 #include "qgroup.h"
31 #include "ref-verify.h"
32 #include "space-info.h"
33 #include "block-rsv.h"
34 #include "delalloc-space.h"
35 #include "discard.h"
36 #include "rcu-string.h"
37 #include "zoned.h"
38 #include "dev-replace.h"
39 #include "fs.h"
40 #include "accessors.h"
41 #include "root-tree.h"
42 #include "file-item.h"
43 #include "orphan.h"
44 #include "tree-checker.h"
45
46 #undef SCRAMBLE_DELAYED_REFS
47
48
49 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
50                                struct btrfs_delayed_ref_node *node, u64 parent,
51                                u64 root_objectid, u64 owner_objectid,
52                                u64 owner_offset, int refs_to_drop,
53                                struct btrfs_delayed_extent_op *extra_op);
54 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
55                                     struct extent_buffer *leaf,
56                                     struct btrfs_extent_item *ei);
57 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
58                                       u64 parent, u64 root_objectid,
59                                       u64 flags, u64 owner, u64 offset,
60                                       struct btrfs_key *ins, int ref_mod);
61 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
62                                      struct btrfs_delayed_ref_node *node,
63                                      struct btrfs_delayed_extent_op *extent_op);
64 static int find_next_key(struct btrfs_path *path, int level,
65                          struct btrfs_key *key);
66
67 static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
68 {
69         return (cache->flags & bits) == bits;
70 }
71
72 int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
73                               u64 start, u64 num_bytes)
74 {
75         u64 end = start + num_bytes - 1;
76         set_extent_bit(&fs_info->excluded_extents, start, end,
77                        EXTENT_UPTODATE, NULL);
78         return 0;
79 }
80
81 void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
82 {
83         struct btrfs_fs_info *fs_info = cache->fs_info;
84         u64 start, end;
85
86         start = cache->start;
87         end = start + cache->length - 1;
88
89         clear_extent_bits(&fs_info->excluded_extents, start, end,
90                           EXTENT_UPTODATE);
91 }
92
93 /* simple helper to search for an existing data extent at a given offset */
94 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
95 {
96         struct btrfs_root *root = btrfs_extent_root(fs_info, start);
97         int ret;
98         struct btrfs_key key;
99         struct btrfs_path *path;
100
101         path = btrfs_alloc_path();
102         if (!path)
103                 return -ENOMEM;
104
105         key.objectid = start;
106         key.offset = len;
107         key.type = BTRFS_EXTENT_ITEM_KEY;
108         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
109         btrfs_free_path(path);
110         return ret;
111 }
112
113 /*
114  * helper function to lookup reference count and flags of a tree block.
115  *
116  * the head node for delayed ref is used to store the sum of all the
117  * reference count modifications queued up in the rbtree. the head
118  * node may also store the extent flags to set. This way you can check
119  * to see what the reference count and extent flags would be if all of
120  * the delayed refs are not processed.
121  */
122 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
123                              struct btrfs_fs_info *fs_info, u64 bytenr,
124                              u64 offset, int metadata, u64 *refs, u64 *flags)
125 {
126         struct btrfs_root *extent_root;
127         struct btrfs_delayed_ref_head *head;
128         struct btrfs_delayed_ref_root *delayed_refs;
129         struct btrfs_path *path;
130         struct btrfs_extent_item *ei;
131         struct extent_buffer *leaf;
132         struct btrfs_key key;
133         u32 item_size;
134         u64 num_refs;
135         u64 extent_flags;
136         int ret;
137
138         /*
139          * If we don't have skinny metadata, don't bother doing anything
140          * different
141          */
142         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
143                 offset = fs_info->nodesize;
144                 metadata = 0;
145         }
146
147         path = btrfs_alloc_path();
148         if (!path)
149                 return -ENOMEM;
150
151         if (!trans) {
152                 path->skip_locking = 1;
153                 path->search_commit_root = 1;
154         }
155
156 search_again:
157         key.objectid = bytenr;
158         key.offset = offset;
159         if (metadata)
160                 key.type = BTRFS_METADATA_ITEM_KEY;
161         else
162                 key.type = BTRFS_EXTENT_ITEM_KEY;
163
164         extent_root = btrfs_extent_root(fs_info, bytenr);
165         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
166         if (ret < 0)
167                 goto out_free;
168
169         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
170                 if (path->slots[0]) {
171                         path->slots[0]--;
172                         btrfs_item_key_to_cpu(path->nodes[0], &key,
173                                               path->slots[0]);
174                         if (key.objectid == bytenr &&
175                             key.type == BTRFS_EXTENT_ITEM_KEY &&
176                             key.offset == fs_info->nodesize)
177                                 ret = 0;
178                 }
179         }
180
181         if (ret == 0) {
182                 leaf = path->nodes[0];
183                 item_size = btrfs_item_size(leaf, path->slots[0]);
184                 if (item_size >= sizeof(*ei)) {
185                         ei = btrfs_item_ptr(leaf, path->slots[0],
186                                             struct btrfs_extent_item);
187                         num_refs = btrfs_extent_refs(leaf, ei);
188                         extent_flags = btrfs_extent_flags(leaf, ei);
189                 } else {
190                         ret = -EINVAL;
191                         btrfs_print_v0_err(fs_info);
192                         if (trans)
193                                 btrfs_abort_transaction(trans, ret);
194                         else
195                                 btrfs_handle_fs_error(fs_info, ret, NULL);
196
197                         goto out_free;
198                 }
199
200                 BUG_ON(num_refs == 0);
201         } else {
202                 num_refs = 0;
203                 extent_flags = 0;
204                 ret = 0;
205         }
206
207         if (!trans)
208                 goto out;
209
210         delayed_refs = &trans->transaction->delayed_refs;
211         spin_lock(&delayed_refs->lock);
212         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
213         if (head) {
214                 if (!mutex_trylock(&head->mutex)) {
215                         refcount_inc(&head->refs);
216                         spin_unlock(&delayed_refs->lock);
217
218                         btrfs_release_path(path);
219
220                         /*
221                          * Mutex was contended, block until it's released and try
222                          * again
223                          */
224                         mutex_lock(&head->mutex);
225                         mutex_unlock(&head->mutex);
226                         btrfs_put_delayed_ref_head(head);
227                         goto search_again;
228                 }
229                 spin_lock(&head->lock);
230                 if (head->extent_op && head->extent_op->update_flags)
231                         extent_flags |= head->extent_op->flags_to_set;
232                 else
233                         BUG_ON(num_refs == 0);
234
235                 num_refs += head->ref_mod;
236                 spin_unlock(&head->lock);
237                 mutex_unlock(&head->mutex);
238         }
239         spin_unlock(&delayed_refs->lock);
240 out:
241         WARN_ON(num_refs == 0);
242         if (refs)
243                 *refs = num_refs;
244         if (flags)
245                 *flags = extent_flags;
246 out_free:
247         btrfs_free_path(path);
248         return ret;
249 }
250
251 /*
252  * Back reference rules.  Back refs have three main goals:
253  *
254  * 1) differentiate between all holders of references to an extent so that
255  *    when a reference is dropped we can make sure it was a valid reference
256  *    before freeing the extent.
257  *
258  * 2) Provide enough information to quickly find the holders of an extent
259  *    if we notice a given block is corrupted or bad.
260  *
261  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
262  *    maintenance.  This is actually the same as #2, but with a slightly
263  *    different use case.
264  *
265  * There are two kinds of back refs. The implicit back refs is optimized
266  * for pointers in non-shared tree blocks. For a given pointer in a block,
267  * back refs of this kind provide information about the block's owner tree
268  * and the pointer's key. These information allow us to find the block by
269  * b-tree searching. The full back refs is for pointers in tree blocks not
270  * referenced by their owner trees. The location of tree block is recorded
271  * in the back refs. Actually the full back refs is generic, and can be
272  * used in all cases the implicit back refs is used. The major shortcoming
273  * of the full back refs is its overhead. Every time a tree block gets
274  * COWed, we have to update back refs entry for all pointers in it.
275  *
276  * For a newly allocated tree block, we use implicit back refs for
277  * pointers in it. This means most tree related operations only involve
278  * implicit back refs. For a tree block created in old transaction, the
279  * only way to drop a reference to it is COW it. So we can detect the
280  * event that tree block loses its owner tree's reference and do the
281  * back refs conversion.
282  *
283  * When a tree block is COWed through a tree, there are four cases:
284  *
285  * The reference count of the block is one and the tree is the block's
286  * owner tree. Nothing to do in this case.
287  *
288  * The reference count of the block is one and the tree is not the
289  * block's owner tree. In this case, full back refs is used for pointers
290  * in the block. Remove these full back refs, add implicit back refs for
291  * every pointers in the new block.
292  *
293  * The reference count of the block is greater than one and the tree is
294  * the block's owner tree. In this case, implicit back refs is used for
295  * pointers in the block. Add full back refs for every pointers in the
296  * block, increase lower level extents' reference counts. The original
297  * implicit back refs are entailed to the new block.
298  *
299  * The reference count of the block is greater than one and the tree is
300  * not the block's owner tree. Add implicit back refs for every pointer in
301  * the new block, increase lower level extents' reference count.
302  *
303  * Back Reference Key composing:
304  *
305  * The key objectid corresponds to the first byte in the extent,
306  * The key type is used to differentiate between types of back refs.
307  * There are different meanings of the key offset for different types
308  * of back refs.
309  *
310  * File extents can be referenced by:
311  *
312  * - multiple snapshots, subvolumes, or different generations in one subvol
313  * - different files inside a single subvolume
314  * - different offsets inside a file (bookend extents in file.c)
315  *
316  * The extent ref structure for the implicit back refs has fields for:
317  *
318  * - Objectid of the subvolume root
319  * - objectid of the file holding the reference
320  * - original offset in the file
321  * - how many bookend extents
322  *
323  * The key offset for the implicit back refs is hash of the first
324  * three fields.
325  *
326  * The extent ref structure for the full back refs has field for:
327  *
328  * - number of pointers in the tree leaf
329  *
330  * The key offset for the implicit back refs is the first byte of
331  * the tree leaf
332  *
333  * When a file extent is allocated, The implicit back refs is used.
334  * the fields are filled in:
335  *
336  *     (root_key.objectid, inode objectid, offset in file, 1)
337  *
338  * When a file extent is removed file truncation, we find the
339  * corresponding implicit back refs and check the following fields:
340  *
341  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
342  *
343  * Btree extents can be referenced by:
344  *
345  * - Different subvolumes
346  *
347  * Both the implicit back refs and the full back refs for tree blocks
348  * only consist of key. The key offset for the implicit back refs is
349  * objectid of block's owner tree. The key offset for the full back refs
350  * is the first byte of parent block.
351  *
352  * When implicit back refs is used, information about the lowest key and
353  * level of the tree block are required. These information are stored in
354  * tree block info structure.
355  */
356
357 /*
358  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
359  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
360  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
361  */
362 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
363                                      struct btrfs_extent_inline_ref *iref,
364                                      enum btrfs_inline_ref_type is_data)
365 {
366         int type = btrfs_extent_inline_ref_type(eb, iref);
367         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
368
369         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
370             type == BTRFS_SHARED_BLOCK_REF_KEY ||
371             type == BTRFS_SHARED_DATA_REF_KEY ||
372             type == BTRFS_EXTENT_DATA_REF_KEY) {
373                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
374                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
375                                 return type;
376                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
377                                 ASSERT(eb->fs_info);
378                                 /*
379                                  * Every shared one has parent tree block,
380                                  * which must be aligned to sector size.
381                                  */
382                                 if (offset &&
383                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
384                                         return type;
385                         }
386                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
387                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
388                                 return type;
389                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
390                                 ASSERT(eb->fs_info);
391                                 /*
392                                  * Every shared one has parent tree block,
393                                  * which must be aligned to sector size.
394                                  */
395                                 if (offset &&
396                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
397                                         return type;
398                         }
399                 } else {
400                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
401                         return type;
402                 }
403         }
404
405         btrfs_print_leaf(eb);
406         btrfs_err(eb->fs_info,
407                   "eb %llu iref 0x%lx invalid extent inline ref type %d",
408                   eb->start, (unsigned long)iref, type);
409         WARN_ON(1);
410
411         return BTRFS_REF_TYPE_INVALID;
412 }
413
414 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
415 {
416         u32 high_crc = ~(u32)0;
417         u32 low_crc = ~(u32)0;
418         __le64 lenum;
419
420         lenum = cpu_to_le64(root_objectid);
421         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
422         lenum = cpu_to_le64(owner);
423         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
424         lenum = cpu_to_le64(offset);
425         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
426
427         return ((u64)high_crc << 31) ^ (u64)low_crc;
428 }
429
430 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
431                                      struct btrfs_extent_data_ref *ref)
432 {
433         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
434                                     btrfs_extent_data_ref_objectid(leaf, ref),
435                                     btrfs_extent_data_ref_offset(leaf, ref));
436 }
437
438 static int match_extent_data_ref(struct extent_buffer *leaf,
439                                  struct btrfs_extent_data_ref *ref,
440                                  u64 root_objectid, u64 owner, u64 offset)
441 {
442         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
443             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
444             btrfs_extent_data_ref_offset(leaf, ref) != offset)
445                 return 0;
446         return 1;
447 }
448
449 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
450                                            struct btrfs_path *path,
451                                            u64 bytenr, u64 parent,
452                                            u64 root_objectid,
453                                            u64 owner, u64 offset)
454 {
455         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
456         struct btrfs_key key;
457         struct btrfs_extent_data_ref *ref;
458         struct extent_buffer *leaf;
459         u32 nritems;
460         int ret;
461         int recow;
462         int err = -ENOENT;
463
464         key.objectid = bytenr;
465         if (parent) {
466                 key.type = BTRFS_SHARED_DATA_REF_KEY;
467                 key.offset = parent;
468         } else {
469                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
470                 key.offset = hash_extent_data_ref(root_objectid,
471                                                   owner, offset);
472         }
473 again:
474         recow = 0;
475         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
476         if (ret < 0) {
477                 err = ret;
478                 goto fail;
479         }
480
481         if (parent) {
482                 if (!ret)
483                         return 0;
484                 goto fail;
485         }
486
487         leaf = path->nodes[0];
488         nritems = btrfs_header_nritems(leaf);
489         while (1) {
490                 if (path->slots[0] >= nritems) {
491                         ret = btrfs_next_leaf(root, path);
492                         if (ret < 0)
493                                 err = ret;
494                         if (ret)
495                                 goto fail;
496
497                         leaf = path->nodes[0];
498                         nritems = btrfs_header_nritems(leaf);
499                         recow = 1;
500                 }
501
502                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
503                 if (key.objectid != bytenr ||
504                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
505                         goto fail;
506
507                 ref = btrfs_item_ptr(leaf, path->slots[0],
508                                      struct btrfs_extent_data_ref);
509
510                 if (match_extent_data_ref(leaf, ref, root_objectid,
511                                           owner, offset)) {
512                         if (recow) {
513                                 btrfs_release_path(path);
514                                 goto again;
515                         }
516                         err = 0;
517                         break;
518                 }
519                 path->slots[0]++;
520         }
521 fail:
522         return err;
523 }
524
525 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
526                                            struct btrfs_path *path,
527                                            u64 bytenr, u64 parent,
528                                            u64 root_objectid, u64 owner,
529                                            u64 offset, int refs_to_add)
530 {
531         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
532         struct btrfs_key key;
533         struct extent_buffer *leaf;
534         u32 size;
535         u32 num_refs;
536         int ret;
537
538         key.objectid = bytenr;
539         if (parent) {
540                 key.type = BTRFS_SHARED_DATA_REF_KEY;
541                 key.offset = parent;
542                 size = sizeof(struct btrfs_shared_data_ref);
543         } else {
544                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
545                 key.offset = hash_extent_data_ref(root_objectid,
546                                                   owner, offset);
547                 size = sizeof(struct btrfs_extent_data_ref);
548         }
549
550         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
551         if (ret && ret != -EEXIST)
552                 goto fail;
553
554         leaf = path->nodes[0];
555         if (parent) {
556                 struct btrfs_shared_data_ref *ref;
557                 ref = btrfs_item_ptr(leaf, path->slots[0],
558                                      struct btrfs_shared_data_ref);
559                 if (ret == 0) {
560                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
561                 } else {
562                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
563                         num_refs += refs_to_add;
564                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
565                 }
566         } else {
567                 struct btrfs_extent_data_ref *ref;
568                 while (ret == -EEXIST) {
569                         ref = btrfs_item_ptr(leaf, path->slots[0],
570                                              struct btrfs_extent_data_ref);
571                         if (match_extent_data_ref(leaf, ref, root_objectid,
572                                                   owner, offset))
573                                 break;
574                         btrfs_release_path(path);
575                         key.offset++;
576                         ret = btrfs_insert_empty_item(trans, root, path, &key,
577                                                       size);
578                         if (ret && ret != -EEXIST)
579                                 goto fail;
580
581                         leaf = path->nodes[0];
582                 }
583                 ref = btrfs_item_ptr(leaf, path->slots[0],
584                                      struct btrfs_extent_data_ref);
585                 if (ret == 0) {
586                         btrfs_set_extent_data_ref_root(leaf, ref,
587                                                        root_objectid);
588                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
589                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
590                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
591                 } else {
592                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
593                         num_refs += refs_to_add;
594                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
595                 }
596         }
597         btrfs_mark_buffer_dirty(leaf);
598         ret = 0;
599 fail:
600         btrfs_release_path(path);
601         return ret;
602 }
603
604 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
605                                            struct btrfs_root *root,
606                                            struct btrfs_path *path,
607                                            int refs_to_drop)
608 {
609         struct btrfs_key key;
610         struct btrfs_extent_data_ref *ref1 = NULL;
611         struct btrfs_shared_data_ref *ref2 = NULL;
612         struct extent_buffer *leaf;
613         u32 num_refs = 0;
614         int ret = 0;
615
616         leaf = path->nodes[0];
617         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
618
619         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
620                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
621                                       struct btrfs_extent_data_ref);
622                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
623         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
624                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
625                                       struct btrfs_shared_data_ref);
626                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
627         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
628                 btrfs_print_v0_err(trans->fs_info);
629                 btrfs_abort_transaction(trans, -EINVAL);
630                 return -EINVAL;
631         } else {
632                 BUG();
633         }
634
635         BUG_ON(num_refs < refs_to_drop);
636         num_refs -= refs_to_drop;
637
638         if (num_refs == 0) {
639                 ret = btrfs_del_item(trans, root, path);
640         } else {
641                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
642                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
643                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
644                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
645                 btrfs_mark_buffer_dirty(leaf);
646         }
647         return ret;
648 }
649
650 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
651                                           struct btrfs_extent_inline_ref *iref)
652 {
653         struct btrfs_key key;
654         struct extent_buffer *leaf;
655         struct btrfs_extent_data_ref *ref1;
656         struct btrfs_shared_data_ref *ref2;
657         u32 num_refs = 0;
658         int type;
659
660         leaf = path->nodes[0];
661         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
662
663         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
664         if (iref) {
665                 /*
666                  * If type is invalid, we should have bailed out earlier than
667                  * this call.
668                  */
669                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
670                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
671                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
672                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
673                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
674                 } else {
675                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
676                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
677                 }
678         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
679                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
680                                       struct btrfs_extent_data_ref);
681                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
682         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
683                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
684                                       struct btrfs_shared_data_ref);
685                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
686         } else {
687                 WARN_ON(1);
688         }
689         return num_refs;
690 }
691
692 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
693                                           struct btrfs_path *path,
694                                           u64 bytenr, u64 parent,
695                                           u64 root_objectid)
696 {
697         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
698         struct btrfs_key key;
699         int ret;
700
701         key.objectid = bytenr;
702         if (parent) {
703                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
704                 key.offset = parent;
705         } else {
706                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
707                 key.offset = root_objectid;
708         }
709
710         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
711         if (ret > 0)
712                 ret = -ENOENT;
713         return ret;
714 }
715
716 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
717                                           struct btrfs_path *path,
718                                           u64 bytenr, u64 parent,
719                                           u64 root_objectid)
720 {
721         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
722         struct btrfs_key key;
723         int ret;
724
725         key.objectid = bytenr;
726         if (parent) {
727                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
728                 key.offset = parent;
729         } else {
730                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
731                 key.offset = root_objectid;
732         }
733
734         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
735         btrfs_release_path(path);
736         return ret;
737 }
738
739 static inline int extent_ref_type(u64 parent, u64 owner)
740 {
741         int type;
742         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
743                 if (parent > 0)
744                         type = BTRFS_SHARED_BLOCK_REF_KEY;
745                 else
746                         type = BTRFS_TREE_BLOCK_REF_KEY;
747         } else {
748                 if (parent > 0)
749                         type = BTRFS_SHARED_DATA_REF_KEY;
750                 else
751                         type = BTRFS_EXTENT_DATA_REF_KEY;
752         }
753         return type;
754 }
755
756 static int find_next_key(struct btrfs_path *path, int level,
757                          struct btrfs_key *key)
758
759 {
760         for (; level < BTRFS_MAX_LEVEL; level++) {
761                 if (!path->nodes[level])
762                         break;
763                 if (path->slots[level] + 1 >=
764                     btrfs_header_nritems(path->nodes[level]))
765                         continue;
766                 if (level == 0)
767                         btrfs_item_key_to_cpu(path->nodes[level], key,
768                                               path->slots[level] + 1);
769                 else
770                         btrfs_node_key_to_cpu(path->nodes[level], key,
771                                               path->slots[level] + 1);
772                 return 0;
773         }
774         return 1;
775 }
776
777 /*
778  * look for inline back ref. if back ref is found, *ref_ret is set
779  * to the address of inline back ref, and 0 is returned.
780  *
781  * if back ref isn't found, *ref_ret is set to the address where it
782  * should be inserted, and -ENOENT is returned.
783  *
784  * if insert is true and there are too many inline back refs, the path
785  * points to the extent item, and -EAGAIN is returned.
786  *
787  * NOTE: inline back refs are ordered in the same way that back ref
788  *       items in the tree are ordered.
789  */
790 static noinline_for_stack
791 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
792                                  struct btrfs_path *path,
793                                  struct btrfs_extent_inline_ref **ref_ret,
794                                  u64 bytenr, u64 num_bytes,
795                                  u64 parent, u64 root_objectid,
796                                  u64 owner, u64 offset, int insert)
797 {
798         struct btrfs_fs_info *fs_info = trans->fs_info;
799         struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
800         struct btrfs_key key;
801         struct extent_buffer *leaf;
802         struct btrfs_extent_item *ei;
803         struct btrfs_extent_inline_ref *iref;
804         u64 flags;
805         u64 item_size;
806         unsigned long ptr;
807         unsigned long end;
808         int extra_size;
809         int type;
810         int want;
811         int ret;
812         int err = 0;
813         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
814         int needed;
815
816         key.objectid = bytenr;
817         key.type = BTRFS_EXTENT_ITEM_KEY;
818         key.offset = num_bytes;
819
820         want = extent_ref_type(parent, owner);
821         if (insert) {
822                 extra_size = btrfs_extent_inline_ref_size(want);
823                 path->search_for_extension = 1;
824                 path->keep_locks = 1;
825         } else
826                 extra_size = -1;
827
828         /*
829          * Owner is our level, so we can just add one to get the level for the
830          * block we are interested in.
831          */
832         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
833                 key.type = BTRFS_METADATA_ITEM_KEY;
834                 key.offset = owner;
835         }
836
837 again:
838         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
839         if (ret < 0) {
840                 err = ret;
841                 goto out;
842         }
843
844         /*
845          * We may be a newly converted file system which still has the old fat
846          * extent entries for metadata, so try and see if we have one of those.
847          */
848         if (ret > 0 && skinny_metadata) {
849                 skinny_metadata = false;
850                 if (path->slots[0]) {
851                         path->slots[0]--;
852                         btrfs_item_key_to_cpu(path->nodes[0], &key,
853                                               path->slots[0]);
854                         if (key.objectid == bytenr &&
855                             key.type == BTRFS_EXTENT_ITEM_KEY &&
856                             key.offset == num_bytes)
857                                 ret = 0;
858                 }
859                 if (ret) {
860                         key.objectid = bytenr;
861                         key.type = BTRFS_EXTENT_ITEM_KEY;
862                         key.offset = num_bytes;
863                         btrfs_release_path(path);
864                         goto again;
865                 }
866         }
867
868         if (ret && !insert) {
869                 err = -ENOENT;
870                 goto out;
871         } else if (WARN_ON(ret)) {
872                 err = -EIO;
873                 goto out;
874         }
875
876         leaf = path->nodes[0];
877         item_size = btrfs_item_size(leaf, path->slots[0]);
878         if (unlikely(item_size < sizeof(*ei))) {
879                 err = -EINVAL;
880                 btrfs_print_v0_err(fs_info);
881                 btrfs_abort_transaction(trans, err);
882                 goto out;
883         }
884
885         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
886         flags = btrfs_extent_flags(leaf, ei);
887
888         ptr = (unsigned long)(ei + 1);
889         end = (unsigned long)ei + item_size;
890
891         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
892                 ptr += sizeof(struct btrfs_tree_block_info);
893                 BUG_ON(ptr > end);
894         }
895
896         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
897                 needed = BTRFS_REF_TYPE_DATA;
898         else
899                 needed = BTRFS_REF_TYPE_BLOCK;
900
901         err = -ENOENT;
902         while (1) {
903                 if (ptr >= end) {
904                         if (ptr > end) {
905                                 err = -EUCLEAN;
906                                 btrfs_print_leaf(path->nodes[0]);
907                                 btrfs_crit(fs_info,
908 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
909                                         path->slots[0], root_objectid, owner, offset, parent);
910                         }
911                         break;
912                 }
913                 iref = (struct btrfs_extent_inline_ref *)ptr;
914                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
915                 if (type == BTRFS_REF_TYPE_INVALID) {
916                         err = -EUCLEAN;
917                         goto out;
918                 }
919
920                 if (want < type)
921                         break;
922                 if (want > type) {
923                         ptr += btrfs_extent_inline_ref_size(type);
924                         continue;
925                 }
926
927                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
928                         struct btrfs_extent_data_ref *dref;
929                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
930                         if (match_extent_data_ref(leaf, dref, root_objectid,
931                                                   owner, offset)) {
932                                 err = 0;
933                                 break;
934                         }
935                         if (hash_extent_data_ref_item(leaf, dref) <
936                             hash_extent_data_ref(root_objectid, owner, offset))
937                                 break;
938                 } else {
939                         u64 ref_offset;
940                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
941                         if (parent > 0) {
942                                 if (parent == ref_offset) {
943                                         err = 0;
944                                         break;
945                                 }
946                                 if (ref_offset < parent)
947                                         break;
948                         } else {
949                                 if (root_objectid == ref_offset) {
950                                         err = 0;
951                                         break;
952                                 }
953                                 if (ref_offset < root_objectid)
954                                         break;
955                         }
956                 }
957                 ptr += btrfs_extent_inline_ref_size(type);
958         }
959         if (err == -ENOENT && insert) {
960                 if (item_size + extra_size >=
961                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
962                         err = -EAGAIN;
963                         goto out;
964                 }
965                 /*
966                  * To add new inline back ref, we have to make sure
967                  * there is no corresponding back ref item.
968                  * For simplicity, we just do not add new inline back
969                  * ref if there is any kind of item for this block
970                  */
971                 if (find_next_key(path, 0, &key) == 0 &&
972                     key.objectid == bytenr &&
973                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
974                         err = -EAGAIN;
975                         goto out;
976                 }
977         }
978         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
979 out:
980         if (insert) {
981                 path->keep_locks = 0;
982                 path->search_for_extension = 0;
983                 btrfs_unlock_up_safe(path, 1);
984         }
985         return err;
986 }
987
988 /*
989  * helper to add new inline back ref
990  */
991 static noinline_for_stack
992 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
993                                  struct btrfs_path *path,
994                                  struct btrfs_extent_inline_ref *iref,
995                                  u64 parent, u64 root_objectid,
996                                  u64 owner, u64 offset, int refs_to_add,
997                                  struct btrfs_delayed_extent_op *extent_op)
998 {
999         struct extent_buffer *leaf;
1000         struct btrfs_extent_item *ei;
1001         unsigned long ptr;
1002         unsigned long end;
1003         unsigned long item_offset;
1004         u64 refs;
1005         int size;
1006         int type;
1007
1008         leaf = path->nodes[0];
1009         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1010         item_offset = (unsigned long)iref - (unsigned long)ei;
1011
1012         type = extent_ref_type(parent, owner);
1013         size = btrfs_extent_inline_ref_size(type);
1014
1015         btrfs_extend_item(path, size);
1016
1017         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1018         refs = btrfs_extent_refs(leaf, ei);
1019         refs += refs_to_add;
1020         btrfs_set_extent_refs(leaf, ei, refs);
1021         if (extent_op)
1022                 __run_delayed_extent_op(extent_op, leaf, ei);
1023
1024         ptr = (unsigned long)ei + item_offset;
1025         end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1026         if (ptr < end - size)
1027                 memmove_extent_buffer(leaf, ptr + size, ptr,
1028                                       end - size - ptr);
1029
1030         iref = (struct btrfs_extent_inline_ref *)ptr;
1031         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1032         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1033                 struct btrfs_extent_data_ref *dref;
1034                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1035                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1036                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1037                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1038                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1039         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1040                 struct btrfs_shared_data_ref *sref;
1041                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1042                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1043                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1044         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1045                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1046         } else {
1047                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1048         }
1049         btrfs_mark_buffer_dirty(leaf);
1050 }
1051
1052 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1053                                  struct btrfs_path *path,
1054                                  struct btrfs_extent_inline_ref **ref_ret,
1055                                  u64 bytenr, u64 num_bytes, u64 parent,
1056                                  u64 root_objectid, u64 owner, u64 offset)
1057 {
1058         int ret;
1059
1060         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1061                                            num_bytes, parent, root_objectid,
1062                                            owner, offset, 0);
1063         if (ret != -ENOENT)
1064                 return ret;
1065
1066         btrfs_release_path(path);
1067         *ref_ret = NULL;
1068
1069         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1070                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1071                                             root_objectid);
1072         } else {
1073                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1074                                              root_objectid, owner, offset);
1075         }
1076         return ret;
1077 }
1078
1079 /*
1080  * helper to update/remove inline back ref
1081  */
1082 static noinline_for_stack
1083 void update_inline_extent_backref(struct btrfs_path *path,
1084                                   struct btrfs_extent_inline_ref *iref,
1085                                   int refs_to_mod,
1086                                   struct btrfs_delayed_extent_op *extent_op)
1087 {
1088         struct extent_buffer *leaf = path->nodes[0];
1089         struct btrfs_extent_item *ei;
1090         struct btrfs_extent_data_ref *dref = NULL;
1091         struct btrfs_shared_data_ref *sref = NULL;
1092         unsigned long ptr;
1093         unsigned long end;
1094         u32 item_size;
1095         int size;
1096         int type;
1097         u64 refs;
1098
1099         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1100         refs = btrfs_extent_refs(leaf, ei);
1101         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1102         refs += refs_to_mod;
1103         btrfs_set_extent_refs(leaf, ei, refs);
1104         if (extent_op)
1105                 __run_delayed_extent_op(extent_op, leaf, ei);
1106
1107         /*
1108          * If type is invalid, we should have bailed out after
1109          * lookup_inline_extent_backref().
1110          */
1111         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1112         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1113
1114         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1115                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1116                 refs = btrfs_extent_data_ref_count(leaf, dref);
1117         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1118                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1119                 refs = btrfs_shared_data_ref_count(leaf, sref);
1120         } else {
1121                 refs = 1;
1122                 BUG_ON(refs_to_mod != -1);
1123         }
1124
1125         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1126         refs += refs_to_mod;
1127
1128         if (refs > 0) {
1129                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1130                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1131                 else
1132                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1133         } else {
1134                 size =  btrfs_extent_inline_ref_size(type);
1135                 item_size = btrfs_item_size(leaf, path->slots[0]);
1136                 ptr = (unsigned long)iref;
1137                 end = (unsigned long)ei + item_size;
1138                 if (ptr + size < end)
1139                         memmove_extent_buffer(leaf, ptr, ptr + size,
1140                                               end - ptr - size);
1141                 item_size -= size;
1142                 btrfs_truncate_item(path, item_size, 1);
1143         }
1144         btrfs_mark_buffer_dirty(leaf);
1145 }
1146
1147 static noinline_for_stack
1148 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1149                                  struct btrfs_path *path,
1150                                  u64 bytenr, u64 num_bytes, u64 parent,
1151                                  u64 root_objectid, u64 owner,
1152                                  u64 offset, int refs_to_add,
1153                                  struct btrfs_delayed_extent_op *extent_op)
1154 {
1155         struct btrfs_extent_inline_ref *iref;
1156         int ret;
1157
1158         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1159                                            num_bytes, parent, root_objectid,
1160                                            owner, offset, 1);
1161         if (ret == 0) {
1162                 /*
1163                  * We're adding refs to a tree block we already own, this
1164                  * should not happen at all.
1165                  */
1166                 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1167                         btrfs_print_leaf(path->nodes[0]);
1168                         btrfs_crit(trans->fs_info,
1169 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1170                                    bytenr, num_bytes, root_objectid, path->slots[0]);
1171                         return -EUCLEAN;
1172                 }
1173                 update_inline_extent_backref(path, iref, refs_to_add, extent_op);
1174         } else if (ret == -ENOENT) {
1175                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1176                                             root_objectid, owner, offset,
1177                                             refs_to_add, extent_op);
1178                 ret = 0;
1179         }
1180         return ret;
1181 }
1182
1183 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1184                                  struct btrfs_root *root,
1185                                  struct btrfs_path *path,
1186                                  struct btrfs_extent_inline_ref *iref,
1187                                  int refs_to_drop, int is_data)
1188 {
1189         int ret = 0;
1190
1191         BUG_ON(!is_data && refs_to_drop != 1);
1192         if (iref)
1193                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
1194         else if (is_data)
1195                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1196         else
1197                 ret = btrfs_del_item(trans, root, path);
1198         return ret;
1199 }
1200
1201 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1202                                u64 *discarded_bytes)
1203 {
1204         int j, ret = 0;
1205         u64 bytes_left, end;
1206         u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1207
1208         if (WARN_ON(start != aligned_start)) {
1209                 len -= aligned_start - start;
1210                 len = round_down(len, 1 << SECTOR_SHIFT);
1211                 start = aligned_start;
1212         }
1213
1214         *discarded_bytes = 0;
1215
1216         if (!len)
1217                 return 0;
1218
1219         end = start + len;
1220         bytes_left = len;
1221
1222         /* Skip any superblocks on this device. */
1223         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1224                 u64 sb_start = btrfs_sb_offset(j);
1225                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1226                 u64 size = sb_start - start;
1227
1228                 if (!in_range(sb_start, start, bytes_left) &&
1229                     !in_range(sb_end, start, bytes_left) &&
1230                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1231                         continue;
1232
1233                 /*
1234                  * Superblock spans beginning of range.  Adjust start and
1235                  * try again.
1236                  */
1237                 if (sb_start <= start) {
1238                         start += sb_end - start;
1239                         if (start > end) {
1240                                 bytes_left = 0;
1241                                 break;
1242                         }
1243                         bytes_left = end - start;
1244                         continue;
1245                 }
1246
1247                 if (size) {
1248                         ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1249                                                    size >> SECTOR_SHIFT,
1250                                                    GFP_NOFS);
1251                         if (!ret)
1252                                 *discarded_bytes += size;
1253                         else if (ret != -EOPNOTSUPP)
1254                                 return ret;
1255                 }
1256
1257                 start = sb_end;
1258                 if (start > end) {
1259                         bytes_left = 0;
1260                         break;
1261                 }
1262                 bytes_left = end - start;
1263         }
1264
1265         if (bytes_left) {
1266                 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1267                                            bytes_left >> SECTOR_SHIFT,
1268                                            GFP_NOFS);
1269                 if (!ret)
1270                         *discarded_bytes += bytes_left;
1271         }
1272         return ret;
1273 }
1274
1275 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1276 {
1277         struct btrfs_device *dev = stripe->dev;
1278         struct btrfs_fs_info *fs_info = dev->fs_info;
1279         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1280         u64 phys = stripe->physical;
1281         u64 len = stripe->length;
1282         u64 discarded = 0;
1283         int ret = 0;
1284
1285         /* Zone reset on a zoned filesystem */
1286         if (btrfs_can_zone_reset(dev, phys, len)) {
1287                 u64 src_disc;
1288
1289                 ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1290                 if (ret)
1291                         goto out;
1292
1293                 if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1294                     dev != dev_replace->srcdev)
1295                         goto out;
1296
1297                 src_disc = discarded;
1298
1299                 /* Send to replace target as well */
1300                 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1301                                               &discarded);
1302                 discarded += src_disc;
1303         } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1304                 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1305         } else {
1306                 ret = 0;
1307                 *bytes = 0;
1308         }
1309
1310 out:
1311         *bytes = discarded;
1312         return ret;
1313 }
1314
1315 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1316                          u64 num_bytes, u64 *actual_bytes)
1317 {
1318         int ret = 0;
1319         u64 discarded_bytes = 0;
1320         u64 end = bytenr + num_bytes;
1321         u64 cur = bytenr;
1322
1323         /*
1324          * Avoid races with device replace and make sure the devices in the
1325          * stripes don't go away while we are discarding.
1326          */
1327         btrfs_bio_counter_inc_blocked(fs_info);
1328         while (cur < end) {
1329                 struct btrfs_discard_stripe *stripes;
1330                 unsigned int num_stripes;
1331                 int i;
1332
1333                 num_bytes = end - cur;
1334                 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1335                 if (IS_ERR(stripes)) {
1336                         ret = PTR_ERR(stripes);
1337                         if (ret == -EOPNOTSUPP)
1338                                 ret = 0;
1339                         break;
1340                 }
1341
1342                 for (i = 0; i < num_stripes; i++) {
1343                         struct btrfs_discard_stripe *stripe = stripes + i;
1344                         u64 bytes;
1345
1346                         if (!stripe->dev->bdev) {
1347                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1348                                 continue;
1349                         }
1350
1351                         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1352                                         &stripe->dev->dev_state))
1353                                 continue;
1354
1355                         ret = do_discard_extent(stripe, &bytes);
1356                         if (ret) {
1357                                 /*
1358                                  * Keep going if discard is not supported by the
1359                                  * device.
1360                                  */
1361                                 if (ret != -EOPNOTSUPP)
1362                                         break;
1363                                 ret = 0;
1364                         } else {
1365                                 discarded_bytes += bytes;
1366                         }
1367                 }
1368                 kfree(stripes);
1369                 if (ret)
1370                         break;
1371                 cur += num_bytes;
1372         }
1373         btrfs_bio_counter_dec(fs_info);
1374         if (actual_bytes)
1375                 *actual_bytes = discarded_bytes;
1376         return ret;
1377 }
1378
1379 /* Can return -ENOMEM */
1380 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1381                          struct btrfs_ref *generic_ref)
1382 {
1383         struct btrfs_fs_info *fs_info = trans->fs_info;
1384         int ret;
1385
1386         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1387                generic_ref->action);
1388         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1389                generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
1390
1391         if (generic_ref->type == BTRFS_REF_METADATA)
1392                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1393         else
1394                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1395
1396         btrfs_ref_tree_mod(fs_info, generic_ref);
1397
1398         return ret;
1399 }
1400
1401 /*
1402  * __btrfs_inc_extent_ref - insert backreference for a given extent
1403  *
1404  * The counterpart is in __btrfs_free_extent(), with examples and more details
1405  * how it works.
1406  *
1407  * @trans:          Handle of transaction
1408  *
1409  * @node:           The delayed ref node used to get the bytenr/length for
1410  *                  extent whose references are incremented.
1411  *
1412  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1413  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1414  *                  bytenr of the parent block. Since new extents are always
1415  *                  created with indirect references, this will only be the case
1416  *                  when relocating a shared extent. In that case, root_objectid
1417  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1418  *                  be 0
1419  *
1420  * @root_objectid:  The id of the root where this modification has originated,
1421  *                  this can be either one of the well-known metadata trees or
1422  *                  the subvolume id which references this extent.
1423  *
1424  * @owner:          For data extents it is the inode number of the owning file.
1425  *                  For metadata extents this parameter holds the level in the
1426  *                  tree of the extent.
1427  *
1428  * @offset:         For metadata extents the offset is ignored and is currently
1429  *                  always passed as 0. For data extents it is the fileoffset
1430  *                  this extent belongs to.
1431  *
1432  * @refs_to_add     Number of references to add
1433  *
1434  * @extent_op       Pointer to a structure, holding information necessary when
1435  *                  updating a tree block's flags
1436  *
1437  */
1438 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1439                                   struct btrfs_delayed_ref_node *node,
1440                                   u64 parent, u64 root_objectid,
1441                                   u64 owner, u64 offset, int refs_to_add,
1442                                   struct btrfs_delayed_extent_op *extent_op)
1443 {
1444         struct btrfs_path *path;
1445         struct extent_buffer *leaf;
1446         struct btrfs_extent_item *item;
1447         struct btrfs_key key;
1448         u64 bytenr = node->bytenr;
1449         u64 num_bytes = node->num_bytes;
1450         u64 refs;
1451         int ret;
1452
1453         path = btrfs_alloc_path();
1454         if (!path)
1455                 return -ENOMEM;
1456
1457         /* this will setup the path even if it fails to insert the back ref */
1458         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1459                                            parent, root_objectid, owner,
1460                                            offset, refs_to_add, extent_op);
1461         if ((ret < 0 && ret != -EAGAIN) || !ret)
1462                 goto out;
1463
1464         /*
1465          * Ok we had -EAGAIN which means we didn't have space to insert and
1466          * inline extent ref, so just update the reference count and add a
1467          * normal backref.
1468          */
1469         leaf = path->nodes[0];
1470         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1471         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1472         refs = btrfs_extent_refs(leaf, item);
1473         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1474         if (extent_op)
1475                 __run_delayed_extent_op(extent_op, leaf, item);
1476
1477         btrfs_mark_buffer_dirty(leaf);
1478         btrfs_release_path(path);
1479
1480         /* now insert the actual backref */
1481         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1482                 BUG_ON(refs_to_add != 1);
1483                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1484                                             root_objectid);
1485         } else {
1486                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1487                                              root_objectid, owner, offset,
1488                                              refs_to_add);
1489         }
1490         if (ret)
1491                 btrfs_abort_transaction(trans, ret);
1492 out:
1493         btrfs_free_path(path);
1494         return ret;
1495 }
1496
1497 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1498                                 struct btrfs_delayed_ref_node *node,
1499                                 struct btrfs_delayed_extent_op *extent_op,
1500                                 bool insert_reserved)
1501 {
1502         int ret = 0;
1503         struct btrfs_delayed_data_ref *ref;
1504         struct btrfs_key ins;
1505         u64 parent = 0;
1506         u64 ref_root = 0;
1507         u64 flags = 0;
1508
1509         ins.objectid = node->bytenr;
1510         ins.offset = node->num_bytes;
1511         ins.type = BTRFS_EXTENT_ITEM_KEY;
1512
1513         ref = btrfs_delayed_node_to_data_ref(node);
1514         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1515
1516         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1517                 parent = ref->parent;
1518         ref_root = ref->root;
1519
1520         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1521                 if (extent_op)
1522                         flags |= extent_op->flags_to_set;
1523                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
1524                                                  flags, ref->objectid,
1525                                                  ref->offset, &ins,
1526                                                  node->ref_mod);
1527         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1528                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1529                                              ref->objectid, ref->offset,
1530                                              node->ref_mod, extent_op);
1531         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1532                 ret = __btrfs_free_extent(trans, node, parent,
1533                                           ref_root, ref->objectid,
1534                                           ref->offset, node->ref_mod,
1535                                           extent_op);
1536         } else {
1537                 BUG();
1538         }
1539         return ret;
1540 }
1541
1542 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1543                                     struct extent_buffer *leaf,
1544                                     struct btrfs_extent_item *ei)
1545 {
1546         u64 flags = btrfs_extent_flags(leaf, ei);
1547         if (extent_op->update_flags) {
1548                 flags |= extent_op->flags_to_set;
1549                 btrfs_set_extent_flags(leaf, ei, flags);
1550         }
1551
1552         if (extent_op->update_key) {
1553                 struct btrfs_tree_block_info *bi;
1554                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1555                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1556                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1557         }
1558 }
1559
1560 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1561                                  struct btrfs_delayed_ref_head *head,
1562                                  struct btrfs_delayed_extent_op *extent_op)
1563 {
1564         struct btrfs_fs_info *fs_info = trans->fs_info;
1565         struct btrfs_root *root;
1566         struct btrfs_key key;
1567         struct btrfs_path *path;
1568         struct btrfs_extent_item *ei;
1569         struct extent_buffer *leaf;
1570         u32 item_size;
1571         int ret;
1572         int err = 0;
1573         int metadata = 1;
1574
1575         if (TRANS_ABORTED(trans))
1576                 return 0;
1577
1578         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1579                 metadata = 0;
1580
1581         path = btrfs_alloc_path();
1582         if (!path)
1583                 return -ENOMEM;
1584
1585         key.objectid = head->bytenr;
1586
1587         if (metadata) {
1588                 key.type = BTRFS_METADATA_ITEM_KEY;
1589                 key.offset = extent_op->level;
1590         } else {
1591                 key.type = BTRFS_EXTENT_ITEM_KEY;
1592                 key.offset = head->num_bytes;
1593         }
1594
1595         root = btrfs_extent_root(fs_info, key.objectid);
1596 again:
1597         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1598         if (ret < 0) {
1599                 err = ret;
1600                 goto out;
1601         }
1602         if (ret > 0) {
1603                 if (metadata) {
1604                         if (path->slots[0] > 0) {
1605                                 path->slots[0]--;
1606                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1607                                                       path->slots[0]);
1608                                 if (key.objectid == head->bytenr &&
1609                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
1610                                     key.offset == head->num_bytes)
1611                                         ret = 0;
1612                         }
1613                         if (ret > 0) {
1614                                 btrfs_release_path(path);
1615                                 metadata = 0;
1616
1617                                 key.objectid = head->bytenr;
1618                                 key.offset = head->num_bytes;
1619                                 key.type = BTRFS_EXTENT_ITEM_KEY;
1620                                 goto again;
1621                         }
1622                 } else {
1623                         err = -EIO;
1624                         goto out;
1625                 }
1626         }
1627
1628         leaf = path->nodes[0];
1629         item_size = btrfs_item_size(leaf, path->slots[0]);
1630
1631         if (unlikely(item_size < sizeof(*ei))) {
1632                 err = -EINVAL;
1633                 btrfs_print_v0_err(fs_info);
1634                 btrfs_abort_transaction(trans, err);
1635                 goto out;
1636         }
1637
1638         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1639         __run_delayed_extent_op(extent_op, leaf, ei);
1640
1641         btrfs_mark_buffer_dirty(leaf);
1642 out:
1643         btrfs_free_path(path);
1644         return err;
1645 }
1646
1647 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1648                                 struct btrfs_delayed_ref_node *node,
1649                                 struct btrfs_delayed_extent_op *extent_op,
1650                                 bool insert_reserved)
1651 {
1652         int ret = 0;
1653         struct btrfs_delayed_tree_ref *ref;
1654         u64 parent = 0;
1655         u64 ref_root = 0;
1656
1657         ref = btrfs_delayed_node_to_tree_ref(node);
1658         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1659
1660         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1661                 parent = ref->parent;
1662         ref_root = ref->root;
1663
1664         if (node->ref_mod != 1) {
1665                 btrfs_err(trans->fs_info,
1666         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
1667                           node->bytenr, node->ref_mod, node->action, ref_root,
1668                           parent);
1669                 return -EIO;
1670         }
1671         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1672                 BUG_ON(!extent_op || !extent_op->update_flags);
1673                 ret = alloc_reserved_tree_block(trans, node, extent_op);
1674         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1675                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1676                                              ref->level, 0, 1, extent_op);
1677         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1678                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
1679                                           ref->level, 0, 1, extent_op);
1680         } else {
1681                 BUG();
1682         }
1683         return ret;
1684 }
1685
1686 /* helper function to actually process a single delayed ref entry */
1687 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1688                                struct btrfs_delayed_ref_node *node,
1689                                struct btrfs_delayed_extent_op *extent_op,
1690                                bool insert_reserved)
1691 {
1692         int ret = 0;
1693
1694         if (TRANS_ABORTED(trans)) {
1695                 if (insert_reserved)
1696                         btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1697                 return 0;
1698         }
1699
1700         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1701             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1702                 ret = run_delayed_tree_ref(trans, node, extent_op,
1703                                            insert_reserved);
1704         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1705                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1706                 ret = run_delayed_data_ref(trans, node, extent_op,
1707                                            insert_reserved);
1708         else
1709                 BUG();
1710         if (ret && insert_reserved)
1711                 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1712         if (ret < 0)
1713                 btrfs_err(trans->fs_info,
1714 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1715                           node->bytenr, node->num_bytes, node->type,
1716                           node->action, node->ref_mod, ret);
1717         return ret;
1718 }
1719
1720 static inline struct btrfs_delayed_ref_node *
1721 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1722 {
1723         struct btrfs_delayed_ref_node *ref;
1724
1725         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1726                 return NULL;
1727
1728         /*
1729          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1730          * This is to prevent a ref count from going down to zero, which deletes
1731          * the extent item from the extent tree, when there still are references
1732          * to add, which would fail because they would not find the extent item.
1733          */
1734         if (!list_empty(&head->ref_add_list))
1735                 return list_first_entry(&head->ref_add_list,
1736                                 struct btrfs_delayed_ref_node, add_list);
1737
1738         ref = rb_entry(rb_first_cached(&head->ref_tree),
1739                        struct btrfs_delayed_ref_node, ref_node);
1740         ASSERT(list_empty(&ref->add_list));
1741         return ref;
1742 }
1743
1744 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1745                                       struct btrfs_delayed_ref_head *head)
1746 {
1747         spin_lock(&delayed_refs->lock);
1748         head->processing = false;
1749         delayed_refs->num_heads_ready++;
1750         spin_unlock(&delayed_refs->lock);
1751         btrfs_delayed_ref_unlock(head);
1752 }
1753
1754 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1755                                 struct btrfs_delayed_ref_head *head)
1756 {
1757         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1758
1759         if (!extent_op)
1760                 return NULL;
1761
1762         if (head->must_insert_reserved) {
1763                 head->extent_op = NULL;
1764                 btrfs_free_delayed_extent_op(extent_op);
1765                 return NULL;
1766         }
1767         return extent_op;
1768 }
1769
1770 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1771                                      struct btrfs_delayed_ref_head *head)
1772 {
1773         struct btrfs_delayed_extent_op *extent_op;
1774         int ret;
1775
1776         extent_op = cleanup_extent_op(head);
1777         if (!extent_op)
1778                 return 0;
1779         head->extent_op = NULL;
1780         spin_unlock(&head->lock);
1781         ret = run_delayed_extent_op(trans, head, extent_op);
1782         btrfs_free_delayed_extent_op(extent_op);
1783         return ret ? ret : 1;
1784 }
1785
1786 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1787                                   struct btrfs_delayed_ref_root *delayed_refs,
1788                                   struct btrfs_delayed_ref_head *head)
1789 {
1790         int nr_items = 1;       /* Dropping this ref head update. */
1791
1792         /*
1793          * We had csum deletions accounted for in our delayed refs rsv, we need
1794          * to drop the csum leaves for this update from our delayed_refs_rsv.
1795          */
1796         if (head->total_ref_mod < 0 && head->is_data) {
1797                 spin_lock(&delayed_refs->lock);
1798                 delayed_refs->pending_csums -= head->num_bytes;
1799                 spin_unlock(&delayed_refs->lock);
1800                 nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1801         }
1802
1803         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1804 }
1805
1806 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1807                             struct btrfs_delayed_ref_head *head)
1808 {
1809
1810         struct btrfs_fs_info *fs_info = trans->fs_info;
1811         struct btrfs_delayed_ref_root *delayed_refs;
1812         int ret;
1813
1814         delayed_refs = &trans->transaction->delayed_refs;
1815
1816         ret = run_and_cleanup_extent_op(trans, head);
1817         if (ret < 0) {
1818                 unselect_delayed_ref_head(delayed_refs, head);
1819                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1820                 return ret;
1821         } else if (ret) {
1822                 return ret;
1823         }
1824
1825         /*
1826          * Need to drop our head ref lock and re-acquire the delayed ref lock
1827          * and then re-check to make sure nobody got added.
1828          */
1829         spin_unlock(&head->lock);
1830         spin_lock(&delayed_refs->lock);
1831         spin_lock(&head->lock);
1832         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1833                 spin_unlock(&head->lock);
1834                 spin_unlock(&delayed_refs->lock);
1835                 return 1;
1836         }
1837         btrfs_delete_ref_head(delayed_refs, head);
1838         spin_unlock(&head->lock);
1839         spin_unlock(&delayed_refs->lock);
1840
1841         if (head->must_insert_reserved) {
1842                 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1843                 if (head->is_data) {
1844                         struct btrfs_root *csum_root;
1845
1846                         csum_root = btrfs_csum_root(fs_info, head->bytenr);
1847                         ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1848                                               head->num_bytes);
1849                 }
1850         }
1851
1852         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1853
1854         trace_run_delayed_ref_head(fs_info, head, 0);
1855         btrfs_delayed_ref_unlock(head);
1856         btrfs_put_delayed_ref_head(head);
1857         return ret;
1858 }
1859
1860 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1861                                         struct btrfs_trans_handle *trans)
1862 {
1863         struct btrfs_delayed_ref_root *delayed_refs =
1864                 &trans->transaction->delayed_refs;
1865         struct btrfs_delayed_ref_head *head = NULL;
1866         int ret;
1867
1868         spin_lock(&delayed_refs->lock);
1869         head = btrfs_select_ref_head(delayed_refs);
1870         if (!head) {
1871                 spin_unlock(&delayed_refs->lock);
1872                 return head;
1873         }
1874
1875         /*
1876          * Grab the lock that says we are going to process all the refs for
1877          * this head
1878          */
1879         ret = btrfs_delayed_ref_lock(delayed_refs, head);
1880         spin_unlock(&delayed_refs->lock);
1881
1882         /*
1883          * We may have dropped the spin lock to get the head mutex lock, and
1884          * that might have given someone else time to free the head.  If that's
1885          * true, it has been removed from our list and we can move on.
1886          */
1887         if (ret == -EAGAIN)
1888                 head = ERR_PTR(-EAGAIN);
1889
1890         return head;
1891 }
1892
1893 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1894                                            struct btrfs_delayed_ref_head *locked_ref)
1895 {
1896         struct btrfs_fs_info *fs_info = trans->fs_info;
1897         struct btrfs_delayed_ref_root *delayed_refs;
1898         struct btrfs_delayed_extent_op *extent_op;
1899         struct btrfs_delayed_ref_node *ref;
1900         bool must_insert_reserved;
1901         int ret;
1902
1903         delayed_refs = &trans->transaction->delayed_refs;
1904
1905         lockdep_assert_held(&locked_ref->mutex);
1906         lockdep_assert_held(&locked_ref->lock);
1907
1908         while ((ref = select_delayed_ref(locked_ref))) {
1909                 if (ref->seq &&
1910                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
1911                         spin_unlock(&locked_ref->lock);
1912                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1913                         return -EAGAIN;
1914                 }
1915
1916                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1917                 RB_CLEAR_NODE(&ref->ref_node);
1918                 if (!list_empty(&ref->add_list))
1919                         list_del(&ref->add_list);
1920                 /*
1921                  * When we play the delayed ref, also correct the ref_mod on
1922                  * head
1923                  */
1924                 switch (ref->action) {
1925                 case BTRFS_ADD_DELAYED_REF:
1926                 case BTRFS_ADD_DELAYED_EXTENT:
1927                         locked_ref->ref_mod -= ref->ref_mod;
1928                         break;
1929                 case BTRFS_DROP_DELAYED_REF:
1930                         locked_ref->ref_mod += ref->ref_mod;
1931                         break;
1932                 default:
1933                         WARN_ON(1);
1934                 }
1935                 atomic_dec(&delayed_refs->num_entries);
1936
1937                 /*
1938                  * Record the must_insert_reserved flag before we drop the
1939                  * spin lock.
1940                  */
1941                 must_insert_reserved = locked_ref->must_insert_reserved;
1942                 locked_ref->must_insert_reserved = false;
1943
1944                 extent_op = locked_ref->extent_op;
1945                 locked_ref->extent_op = NULL;
1946                 spin_unlock(&locked_ref->lock);
1947
1948                 ret = run_one_delayed_ref(trans, ref, extent_op,
1949                                           must_insert_reserved);
1950
1951                 btrfs_free_delayed_extent_op(extent_op);
1952                 if (ret) {
1953                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1954                         btrfs_put_delayed_ref(ref);
1955                         return ret;
1956                 }
1957
1958                 btrfs_put_delayed_ref(ref);
1959                 cond_resched();
1960
1961                 spin_lock(&locked_ref->lock);
1962                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
1963         }
1964
1965         return 0;
1966 }
1967
1968 /*
1969  * Returns 0 on success or if called with an already aborted transaction.
1970  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1971  */
1972 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1973                                              unsigned long nr)
1974 {
1975         struct btrfs_fs_info *fs_info = trans->fs_info;
1976         struct btrfs_delayed_ref_root *delayed_refs;
1977         struct btrfs_delayed_ref_head *locked_ref = NULL;
1978         int ret;
1979         unsigned long count = 0;
1980
1981         delayed_refs = &trans->transaction->delayed_refs;
1982         do {
1983                 if (!locked_ref) {
1984                         locked_ref = btrfs_obtain_ref_head(trans);
1985                         if (IS_ERR_OR_NULL(locked_ref)) {
1986                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
1987                                         continue;
1988                                 } else {
1989                                         break;
1990                                 }
1991                         }
1992                         count++;
1993                 }
1994                 /*
1995                  * We need to try and merge add/drops of the same ref since we
1996                  * can run into issues with relocate dropping the implicit ref
1997                  * and then it being added back again before the drop can
1998                  * finish.  If we merged anything we need to re-loop so we can
1999                  * get a good ref.
2000                  * Or we can get node references of the same type that weren't
2001                  * merged when created due to bumps in the tree mod seq, and
2002                  * we need to merge them to prevent adding an inline extent
2003                  * backref before dropping it (triggering a BUG_ON at
2004                  * insert_inline_extent_backref()).
2005                  */
2006                 spin_lock(&locked_ref->lock);
2007                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2008
2009                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
2010                 if (ret < 0 && ret != -EAGAIN) {
2011                         /*
2012                          * Error, btrfs_run_delayed_refs_for_head already
2013                          * unlocked everything so just bail out
2014                          */
2015                         return ret;
2016                 } else if (!ret) {
2017                         /*
2018                          * Success, perform the usual cleanup of a processed
2019                          * head
2020                          */
2021                         ret = cleanup_ref_head(trans, locked_ref);
2022                         if (ret > 0 ) {
2023                                 /* We dropped our lock, we need to loop. */
2024                                 ret = 0;
2025                                 continue;
2026                         } else if (ret) {
2027                                 return ret;
2028                         }
2029                 }
2030
2031                 /*
2032                  * Either success case or btrfs_run_delayed_refs_for_head
2033                  * returned -EAGAIN, meaning we need to select another head
2034                  */
2035
2036                 locked_ref = NULL;
2037                 cond_resched();
2038         } while ((nr != -1 && count < nr) || locked_ref);
2039
2040         return 0;
2041 }
2042
2043 #ifdef SCRAMBLE_DELAYED_REFS
2044 /*
2045  * Normally delayed refs get processed in ascending bytenr order. This
2046  * correlates in most cases to the order added. To expose dependencies on this
2047  * order, we start to process the tree in the middle instead of the beginning
2048  */
2049 static u64 find_middle(struct rb_root *root)
2050 {
2051         struct rb_node *n = root->rb_node;
2052         struct btrfs_delayed_ref_node *entry;
2053         int alt = 1;
2054         u64 middle;
2055         u64 first = 0, last = 0;
2056
2057         n = rb_first(root);
2058         if (n) {
2059                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2060                 first = entry->bytenr;
2061         }
2062         n = rb_last(root);
2063         if (n) {
2064                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2065                 last = entry->bytenr;
2066         }
2067         n = root->rb_node;
2068
2069         while (n) {
2070                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2071                 WARN_ON(!entry->in_tree);
2072
2073                 middle = entry->bytenr;
2074
2075                 if (alt)
2076                         n = n->rb_left;
2077                 else
2078                         n = n->rb_right;
2079
2080                 alt = 1 - alt;
2081         }
2082         return middle;
2083 }
2084 #endif
2085
2086 /*
2087  * this starts processing the delayed reference count updates and
2088  * extent insertions we have queued up so far.  count can be
2089  * 0, which means to process everything in the tree at the start
2090  * of the run (but not newly added entries), or it can be some target
2091  * number you'd like to process.
2092  *
2093  * Returns 0 on success or if called with an aborted transaction
2094  * Returns <0 on error and aborts the transaction
2095  */
2096 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2097                            unsigned long count)
2098 {
2099         struct btrfs_fs_info *fs_info = trans->fs_info;
2100         struct rb_node *node;
2101         struct btrfs_delayed_ref_root *delayed_refs;
2102         struct btrfs_delayed_ref_head *head;
2103         int ret;
2104         int run_all = count == (unsigned long)-1;
2105
2106         /* We'll clean this up in btrfs_cleanup_transaction */
2107         if (TRANS_ABORTED(trans))
2108                 return 0;
2109
2110         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2111                 return 0;
2112
2113         delayed_refs = &trans->transaction->delayed_refs;
2114         if (count == 0)
2115                 count = delayed_refs->num_heads_ready;
2116
2117 again:
2118 #ifdef SCRAMBLE_DELAYED_REFS
2119         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2120 #endif
2121         ret = __btrfs_run_delayed_refs(trans, count);
2122         if (ret < 0) {
2123                 btrfs_abort_transaction(trans, ret);
2124                 return ret;
2125         }
2126
2127         if (run_all) {
2128                 btrfs_create_pending_block_groups(trans);
2129
2130                 spin_lock(&delayed_refs->lock);
2131                 node = rb_first_cached(&delayed_refs->href_root);
2132                 if (!node) {
2133                         spin_unlock(&delayed_refs->lock);
2134                         goto out;
2135                 }
2136                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2137                                 href_node);
2138                 refcount_inc(&head->refs);
2139                 spin_unlock(&delayed_refs->lock);
2140
2141                 /* Mutex was contended, block until it's released and retry. */
2142                 mutex_lock(&head->mutex);
2143                 mutex_unlock(&head->mutex);
2144
2145                 btrfs_put_delayed_ref_head(head);
2146                 cond_resched();
2147                 goto again;
2148         }
2149 out:
2150         return 0;
2151 }
2152
2153 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2154                                 struct extent_buffer *eb, u64 flags)
2155 {
2156         struct btrfs_delayed_extent_op *extent_op;
2157         int level = btrfs_header_level(eb);
2158         int ret;
2159
2160         extent_op = btrfs_alloc_delayed_extent_op();
2161         if (!extent_op)
2162                 return -ENOMEM;
2163
2164         extent_op->flags_to_set = flags;
2165         extent_op->update_flags = true;
2166         extent_op->update_key = false;
2167         extent_op->level = level;
2168
2169         ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2170         if (ret)
2171                 btrfs_free_delayed_extent_op(extent_op);
2172         return ret;
2173 }
2174
2175 static noinline int check_delayed_ref(struct btrfs_root *root,
2176                                       struct btrfs_path *path,
2177                                       u64 objectid, u64 offset, u64 bytenr)
2178 {
2179         struct btrfs_delayed_ref_head *head;
2180         struct btrfs_delayed_ref_node *ref;
2181         struct btrfs_delayed_data_ref *data_ref;
2182         struct btrfs_delayed_ref_root *delayed_refs;
2183         struct btrfs_transaction *cur_trans;
2184         struct rb_node *node;
2185         int ret = 0;
2186
2187         spin_lock(&root->fs_info->trans_lock);
2188         cur_trans = root->fs_info->running_transaction;
2189         if (cur_trans)
2190                 refcount_inc(&cur_trans->use_count);
2191         spin_unlock(&root->fs_info->trans_lock);
2192         if (!cur_trans)
2193                 return 0;
2194
2195         delayed_refs = &cur_trans->delayed_refs;
2196         spin_lock(&delayed_refs->lock);
2197         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2198         if (!head) {
2199                 spin_unlock(&delayed_refs->lock);
2200                 btrfs_put_transaction(cur_trans);
2201                 return 0;
2202         }
2203
2204         if (!mutex_trylock(&head->mutex)) {
2205                 if (path->nowait) {
2206                         spin_unlock(&delayed_refs->lock);
2207                         btrfs_put_transaction(cur_trans);
2208                         return -EAGAIN;
2209                 }
2210
2211                 refcount_inc(&head->refs);
2212                 spin_unlock(&delayed_refs->lock);
2213
2214                 btrfs_release_path(path);
2215
2216                 /*
2217                  * Mutex was contended, block until it's released and let
2218                  * caller try again
2219                  */
2220                 mutex_lock(&head->mutex);
2221                 mutex_unlock(&head->mutex);
2222                 btrfs_put_delayed_ref_head(head);
2223                 btrfs_put_transaction(cur_trans);
2224                 return -EAGAIN;
2225         }
2226         spin_unlock(&delayed_refs->lock);
2227
2228         spin_lock(&head->lock);
2229         /*
2230          * XXX: We should replace this with a proper search function in the
2231          * future.
2232          */
2233         for (node = rb_first_cached(&head->ref_tree); node;
2234              node = rb_next(node)) {
2235                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2236                 /* If it's a shared ref we know a cross reference exists */
2237                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2238                         ret = 1;
2239                         break;
2240                 }
2241
2242                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2243
2244                 /*
2245                  * If our ref doesn't match the one we're currently looking at
2246                  * then we have a cross reference.
2247                  */
2248                 if (data_ref->root != root->root_key.objectid ||
2249                     data_ref->objectid != objectid ||
2250                     data_ref->offset != offset) {
2251                         ret = 1;
2252                         break;
2253                 }
2254         }
2255         spin_unlock(&head->lock);
2256         mutex_unlock(&head->mutex);
2257         btrfs_put_transaction(cur_trans);
2258         return ret;
2259 }
2260
2261 static noinline int check_committed_ref(struct btrfs_root *root,
2262                                         struct btrfs_path *path,
2263                                         u64 objectid, u64 offset, u64 bytenr,
2264                                         bool strict)
2265 {
2266         struct btrfs_fs_info *fs_info = root->fs_info;
2267         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2268         struct extent_buffer *leaf;
2269         struct btrfs_extent_data_ref *ref;
2270         struct btrfs_extent_inline_ref *iref;
2271         struct btrfs_extent_item *ei;
2272         struct btrfs_key key;
2273         u32 item_size;
2274         int type;
2275         int ret;
2276
2277         key.objectid = bytenr;
2278         key.offset = (u64)-1;
2279         key.type = BTRFS_EXTENT_ITEM_KEY;
2280
2281         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2282         if (ret < 0)
2283                 goto out;
2284         BUG_ON(ret == 0); /* Corruption */
2285
2286         ret = -ENOENT;
2287         if (path->slots[0] == 0)
2288                 goto out;
2289
2290         path->slots[0]--;
2291         leaf = path->nodes[0];
2292         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2293
2294         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2295                 goto out;
2296
2297         ret = 1;
2298         item_size = btrfs_item_size(leaf, path->slots[0]);
2299         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2300
2301         /* If extent item has more than 1 inline ref then it's shared */
2302         if (item_size != sizeof(*ei) +
2303             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2304                 goto out;
2305
2306         /*
2307          * If extent created before last snapshot => it's shared unless the
2308          * snapshot has been deleted. Use the heuristic if strict is false.
2309          */
2310         if (!strict &&
2311             (btrfs_extent_generation(leaf, ei) <=
2312              btrfs_root_last_snapshot(&root->root_item)))
2313                 goto out;
2314
2315         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2316
2317         /* If this extent has SHARED_DATA_REF then it's shared */
2318         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2319         if (type != BTRFS_EXTENT_DATA_REF_KEY)
2320                 goto out;
2321
2322         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2323         if (btrfs_extent_refs(leaf, ei) !=
2324             btrfs_extent_data_ref_count(leaf, ref) ||
2325             btrfs_extent_data_ref_root(leaf, ref) !=
2326             root->root_key.objectid ||
2327             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2328             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2329                 goto out;
2330
2331         ret = 0;
2332 out:
2333         return ret;
2334 }
2335
2336 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2337                           u64 bytenr, bool strict, struct btrfs_path *path)
2338 {
2339         int ret;
2340
2341         do {
2342                 ret = check_committed_ref(root, path, objectid,
2343                                           offset, bytenr, strict);
2344                 if (ret && ret != -ENOENT)
2345                         goto out;
2346
2347                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2348         } while (ret == -EAGAIN);
2349
2350 out:
2351         btrfs_release_path(path);
2352         if (btrfs_is_data_reloc_root(root))
2353                 WARN_ON(ret > 0);
2354         return ret;
2355 }
2356
2357 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2358                            struct btrfs_root *root,
2359                            struct extent_buffer *buf,
2360                            int full_backref, int inc)
2361 {
2362         struct btrfs_fs_info *fs_info = root->fs_info;
2363         u64 bytenr;
2364         u64 num_bytes;
2365         u64 parent;
2366         u64 ref_root;
2367         u32 nritems;
2368         struct btrfs_key key;
2369         struct btrfs_file_extent_item *fi;
2370         struct btrfs_ref generic_ref = { 0 };
2371         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2372         int i;
2373         int action;
2374         int level;
2375         int ret = 0;
2376
2377         if (btrfs_is_testing(fs_info))
2378                 return 0;
2379
2380         ref_root = btrfs_header_owner(buf);
2381         nritems = btrfs_header_nritems(buf);
2382         level = btrfs_header_level(buf);
2383
2384         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2385                 return 0;
2386
2387         if (full_backref)
2388                 parent = buf->start;
2389         else
2390                 parent = 0;
2391         if (inc)
2392                 action = BTRFS_ADD_DELAYED_REF;
2393         else
2394                 action = BTRFS_DROP_DELAYED_REF;
2395
2396         for (i = 0; i < nritems; i++) {
2397                 if (level == 0) {
2398                         btrfs_item_key_to_cpu(buf, &key, i);
2399                         if (key.type != BTRFS_EXTENT_DATA_KEY)
2400                                 continue;
2401                         fi = btrfs_item_ptr(buf, i,
2402                                             struct btrfs_file_extent_item);
2403                         if (btrfs_file_extent_type(buf, fi) ==
2404                             BTRFS_FILE_EXTENT_INLINE)
2405                                 continue;
2406                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2407                         if (bytenr == 0)
2408                                 continue;
2409
2410                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2411                         key.offset -= btrfs_file_extent_offset(buf, fi);
2412                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2413                                                num_bytes, parent);
2414                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2415                                             key.offset, root->root_key.objectid,
2416                                             for_reloc);
2417                         if (inc)
2418                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2419                         else
2420                                 ret = btrfs_free_extent(trans, &generic_ref);
2421                         if (ret)
2422                                 goto fail;
2423                 } else {
2424                         bytenr = btrfs_node_blockptr(buf, i);
2425                         num_bytes = fs_info->nodesize;
2426                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2427                                                num_bytes, parent);
2428                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
2429                                             root->root_key.objectid, for_reloc);
2430                         if (inc)
2431                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2432                         else
2433                                 ret = btrfs_free_extent(trans, &generic_ref);
2434                         if (ret)
2435                                 goto fail;
2436                 }
2437         }
2438         return 0;
2439 fail:
2440         return ret;
2441 }
2442
2443 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2444                   struct extent_buffer *buf, int full_backref)
2445 {
2446         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2447 }
2448
2449 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2450                   struct extent_buffer *buf, int full_backref)
2451 {
2452         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2453 }
2454
2455 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2456 {
2457         struct btrfs_fs_info *fs_info = root->fs_info;
2458         u64 flags;
2459         u64 ret;
2460
2461         if (data)
2462                 flags = BTRFS_BLOCK_GROUP_DATA;
2463         else if (root == fs_info->chunk_root)
2464                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2465         else
2466                 flags = BTRFS_BLOCK_GROUP_METADATA;
2467
2468         ret = btrfs_get_alloc_profile(fs_info, flags);
2469         return ret;
2470 }
2471
2472 static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2473 {
2474         struct rb_node *leftmost;
2475         u64 bytenr = 0;
2476
2477         read_lock(&fs_info->block_group_cache_lock);
2478         /* Get the block group with the lowest logical start address. */
2479         leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2480         if (leftmost) {
2481                 struct btrfs_block_group *bg;
2482
2483                 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2484                 bytenr = bg->start;
2485         }
2486         read_unlock(&fs_info->block_group_cache_lock);
2487
2488         return bytenr;
2489 }
2490
2491 static int pin_down_extent(struct btrfs_trans_handle *trans,
2492                            struct btrfs_block_group *cache,
2493                            u64 bytenr, u64 num_bytes, int reserved)
2494 {
2495         struct btrfs_fs_info *fs_info = cache->fs_info;
2496
2497         spin_lock(&cache->space_info->lock);
2498         spin_lock(&cache->lock);
2499         cache->pinned += num_bytes;
2500         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2501                                              num_bytes);
2502         if (reserved) {
2503                 cache->reserved -= num_bytes;
2504                 cache->space_info->bytes_reserved -= num_bytes;
2505         }
2506         spin_unlock(&cache->lock);
2507         spin_unlock(&cache->space_info->lock);
2508
2509         set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2510                        bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2511         return 0;
2512 }
2513
2514 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2515                      u64 bytenr, u64 num_bytes, int reserved)
2516 {
2517         struct btrfs_block_group *cache;
2518
2519         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2520         BUG_ON(!cache); /* Logic error */
2521
2522         pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2523
2524         btrfs_put_block_group(cache);
2525         return 0;
2526 }
2527
2528 /*
2529  * this function must be called within transaction
2530  */
2531 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2532                                     u64 bytenr, u64 num_bytes)
2533 {
2534         struct btrfs_block_group *cache;
2535         int ret;
2536
2537         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2538         if (!cache)
2539                 return -EINVAL;
2540
2541         /*
2542          * Fully cache the free space first so that our pin removes the free space
2543          * from the cache.
2544          */
2545         ret = btrfs_cache_block_group(cache, true);
2546         if (ret)
2547                 goto out;
2548
2549         pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2550
2551         /* remove us from the free space cache (if we're there at all) */
2552         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2553 out:
2554         btrfs_put_block_group(cache);
2555         return ret;
2556 }
2557
2558 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2559                                    u64 start, u64 num_bytes)
2560 {
2561         int ret;
2562         struct btrfs_block_group *block_group;
2563
2564         block_group = btrfs_lookup_block_group(fs_info, start);
2565         if (!block_group)
2566                 return -EINVAL;
2567
2568         ret = btrfs_cache_block_group(block_group, true);
2569         if (ret)
2570                 goto out;
2571
2572         ret = btrfs_remove_free_space(block_group, start, num_bytes);
2573 out:
2574         btrfs_put_block_group(block_group);
2575         return ret;
2576 }
2577
2578 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2579 {
2580         struct btrfs_fs_info *fs_info = eb->fs_info;
2581         struct btrfs_file_extent_item *item;
2582         struct btrfs_key key;
2583         int found_type;
2584         int i;
2585         int ret = 0;
2586
2587         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2588                 return 0;
2589
2590         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2591                 btrfs_item_key_to_cpu(eb, &key, i);
2592                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2593                         continue;
2594                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2595                 found_type = btrfs_file_extent_type(eb, item);
2596                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2597                         continue;
2598                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2599                         continue;
2600                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2601                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2602                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2603                 if (ret)
2604                         break;
2605         }
2606
2607         return ret;
2608 }
2609
2610 static void
2611 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2612 {
2613         atomic_inc(&bg->reservations);
2614 }
2615
2616 /*
2617  * Returns the free cluster for the given space info and sets empty_cluster to
2618  * what it should be based on the mount options.
2619  */
2620 static struct btrfs_free_cluster *
2621 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2622                    struct btrfs_space_info *space_info, u64 *empty_cluster)
2623 {
2624         struct btrfs_free_cluster *ret = NULL;
2625
2626         *empty_cluster = 0;
2627         if (btrfs_mixed_space_info(space_info))
2628                 return ret;
2629
2630         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2631                 ret = &fs_info->meta_alloc_cluster;
2632                 if (btrfs_test_opt(fs_info, SSD))
2633                         *empty_cluster = SZ_2M;
2634                 else
2635                         *empty_cluster = SZ_64K;
2636         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2637                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
2638                 *empty_cluster = SZ_2M;
2639                 ret = &fs_info->data_alloc_cluster;
2640         }
2641
2642         return ret;
2643 }
2644
2645 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2646                               u64 start, u64 end,
2647                               const bool return_free_space)
2648 {
2649         struct btrfs_block_group *cache = NULL;
2650         struct btrfs_space_info *space_info;
2651         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2652         struct btrfs_free_cluster *cluster = NULL;
2653         u64 len;
2654         u64 total_unpinned = 0;
2655         u64 empty_cluster = 0;
2656         bool readonly;
2657
2658         while (start <= end) {
2659                 readonly = false;
2660                 if (!cache ||
2661                     start >= cache->start + cache->length) {
2662                         if (cache)
2663                                 btrfs_put_block_group(cache);
2664                         total_unpinned = 0;
2665                         cache = btrfs_lookup_block_group(fs_info, start);
2666                         BUG_ON(!cache); /* Logic error */
2667
2668                         cluster = fetch_cluster_info(fs_info,
2669                                                      cache->space_info,
2670                                                      &empty_cluster);
2671                         empty_cluster <<= 1;
2672                 }
2673
2674                 len = cache->start + cache->length - start;
2675                 len = min(len, end + 1 - start);
2676
2677                 if (return_free_space)
2678                         btrfs_add_free_space(cache, start, len);
2679
2680                 start += len;
2681                 total_unpinned += len;
2682                 space_info = cache->space_info;
2683
2684                 /*
2685                  * If this space cluster has been marked as fragmented and we've
2686                  * unpinned enough in this block group to potentially allow a
2687                  * cluster to be created inside of it go ahead and clear the
2688                  * fragmented check.
2689                  */
2690                 if (cluster && cluster->fragmented &&
2691                     total_unpinned > empty_cluster) {
2692                         spin_lock(&cluster->lock);
2693                         cluster->fragmented = 0;
2694                         spin_unlock(&cluster->lock);
2695                 }
2696
2697                 spin_lock(&space_info->lock);
2698                 spin_lock(&cache->lock);
2699                 cache->pinned -= len;
2700                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2701                 space_info->max_extent_size = 0;
2702                 if (cache->ro) {
2703                         space_info->bytes_readonly += len;
2704                         readonly = true;
2705                 } else if (btrfs_is_zoned(fs_info)) {
2706                         /* Need reset before reusing in a zoned block group */
2707                         space_info->bytes_zone_unusable += len;
2708                         readonly = true;
2709                 }
2710                 spin_unlock(&cache->lock);
2711                 if (!readonly && return_free_space &&
2712                     global_rsv->space_info == space_info) {
2713                         spin_lock(&global_rsv->lock);
2714                         if (!global_rsv->full) {
2715                                 u64 to_add = min(len, global_rsv->size -
2716                                                       global_rsv->reserved);
2717
2718                                 global_rsv->reserved += to_add;
2719                                 btrfs_space_info_update_bytes_may_use(fs_info,
2720                                                 space_info, to_add);
2721                                 if (global_rsv->reserved >= global_rsv->size)
2722                                         global_rsv->full = 1;
2723                                 len -= to_add;
2724                         }
2725                         spin_unlock(&global_rsv->lock);
2726                 }
2727                 /* Add to any tickets we may have */
2728                 if (!readonly && return_free_space && len)
2729                         btrfs_try_granting_tickets(fs_info, space_info);
2730                 spin_unlock(&space_info->lock);
2731         }
2732
2733         if (cache)
2734                 btrfs_put_block_group(cache);
2735         return 0;
2736 }
2737
2738 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2739 {
2740         struct btrfs_fs_info *fs_info = trans->fs_info;
2741         struct btrfs_block_group *block_group, *tmp;
2742         struct list_head *deleted_bgs;
2743         struct extent_io_tree *unpin;
2744         u64 start;
2745         u64 end;
2746         int ret;
2747
2748         unpin = &trans->transaction->pinned_extents;
2749
2750         while (!TRANS_ABORTED(trans)) {
2751                 struct extent_state *cached_state = NULL;
2752
2753                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
2754                 ret = find_first_extent_bit(unpin, 0, &start, &end,
2755                                             EXTENT_DIRTY, &cached_state);
2756                 if (ret) {
2757                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2758                         break;
2759                 }
2760
2761                 if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2762                         ret = btrfs_discard_extent(fs_info, start,
2763                                                    end + 1 - start, NULL);
2764
2765                 clear_extent_dirty(unpin, start, end, &cached_state);
2766                 unpin_extent_range(fs_info, start, end, true);
2767                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2768                 free_extent_state(cached_state);
2769                 cond_resched();
2770         }
2771
2772         if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2773                 btrfs_discard_calc_delay(&fs_info->discard_ctl);
2774                 btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2775         }
2776
2777         /*
2778          * Transaction is finished.  We don't need the lock anymore.  We
2779          * do need to clean up the block groups in case of a transaction
2780          * abort.
2781          */
2782         deleted_bgs = &trans->transaction->deleted_bgs;
2783         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2784                 u64 trimmed = 0;
2785
2786                 ret = -EROFS;
2787                 if (!TRANS_ABORTED(trans))
2788                         ret = btrfs_discard_extent(fs_info,
2789                                                    block_group->start,
2790                                                    block_group->length,
2791                                                    &trimmed);
2792
2793                 list_del_init(&block_group->bg_list);
2794                 btrfs_unfreeze_block_group(block_group);
2795                 btrfs_put_block_group(block_group);
2796
2797                 if (ret) {
2798                         const char *errstr = btrfs_decode_error(ret);
2799                         btrfs_warn(fs_info,
2800                            "discard failed while removing blockgroup: errno=%d %s",
2801                                    ret, errstr);
2802                 }
2803         }
2804
2805         return 0;
2806 }
2807
2808 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2809                                      u64 bytenr, u64 num_bytes, bool is_data)
2810 {
2811         int ret;
2812
2813         if (is_data) {
2814                 struct btrfs_root *csum_root;
2815
2816                 csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2817                 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2818                 if (ret) {
2819                         btrfs_abort_transaction(trans, ret);
2820                         return ret;
2821                 }
2822         }
2823
2824         ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2825         if (ret) {
2826                 btrfs_abort_transaction(trans, ret);
2827                 return ret;
2828         }
2829
2830         ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2831         if (ret)
2832                 btrfs_abort_transaction(trans, ret);
2833
2834         return ret;
2835 }
2836
2837 #define abort_and_dump(trans, path, fmt, args...)       \
2838 ({                                                      \
2839         btrfs_abort_transaction(trans, -EUCLEAN);       \
2840         btrfs_print_leaf(path->nodes[0]);               \
2841         btrfs_crit(trans->fs_info, fmt, ##args);        \
2842 })
2843
2844 /*
2845  * Drop one or more refs of @node.
2846  *
2847  * 1. Locate the extent refs.
2848  *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2849  *    Locate it, then reduce the refs number or remove the ref line completely.
2850  *
2851  * 2. Update the refs count in EXTENT/METADATA_ITEM
2852  *
2853  * Inline backref case:
2854  *
2855  * in extent tree we have:
2856  *
2857  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2858  *              refs 2 gen 6 flags DATA
2859  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2860  *              extent data backref root FS_TREE objectid 257 offset 0 count 1
2861  *
2862  * This function gets called with:
2863  *
2864  *    node->bytenr = 13631488
2865  *    node->num_bytes = 1048576
2866  *    root_objectid = FS_TREE
2867  *    owner_objectid = 257
2868  *    owner_offset = 0
2869  *    refs_to_drop = 1
2870  *
2871  * Then we should get some like:
2872  *
2873  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2874  *              refs 1 gen 6 flags DATA
2875  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2876  *
2877  * Keyed backref case:
2878  *
2879  * in extent tree we have:
2880  *
2881  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2882  *              refs 754 gen 6 flags DATA
2883  *      [...]
2884  *      item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2885  *              extent data backref root FS_TREE objectid 866 offset 0 count 1
2886  *
2887  * This function get called with:
2888  *
2889  *    node->bytenr = 13631488
2890  *    node->num_bytes = 1048576
2891  *    root_objectid = FS_TREE
2892  *    owner_objectid = 866
2893  *    owner_offset = 0
2894  *    refs_to_drop = 1
2895  *
2896  * Then we should get some like:
2897  *
2898  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2899  *              refs 753 gen 6 flags DATA
2900  *
2901  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2902  */
2903 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2904                                struct btrfs_delayed_ref_node *node, u64 parent,
2905                                u64 root_objectid, u64 owner_objectid,
2906                                u64 owner_offset, int refs_to_drop,
2907                                struct btrfs_delayed_extent_op *extent_op)
2908 {
2909         struct btrfs_fs_info *info = trans->fs_info;
2910         struct btrfs_key key;
2911         struct btrfs_path *path;
2912         struct btrfs_root *extent_root;
2913         struct extent_buffer *leaf;
2914         struct btrfs_extent_item *ei;
2915         struct btrfs_extent_inline_ref *iref;
2916         int ret;
2917         int is_data;
2918         int extent_slot = 0;
2919         int found_extent = 0;
2920         int num_to_del = 1;
2921         u32 item_size;
2922         u64 refs;
2923         u64 bytenr = node->bytenr;
2924         u64 num_bytes = node->num_bytes;
2925         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2926
2927         extent_root = btrfs_extent_root(info, bytenr);
2928         ASSERT(extent_root);
2929
2930         path = btrfs_alloc_path();
2931         if (!path)
2932                 return -ENOMEM;
2933
2934         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2935
2936         if (!is_data && refs_to_drop != 1) {
2937                 btrfs_crit(info,
2938 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2939                            node->bytenr, refs_to_drop);
2940                 ret = -EINVAL;
2941                 btrfs_abort_transaction(trans, ret);
2942                 goto out;
2943         }
2944
2945         if (is_data)
2946                 skinny_metadata = false;
2947
2948         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
2949                                     parent, root_objectid, owner_objectid,
2950                                     owner_offset);
2951         if (ret == 0) {
2952                 /*
2953                  * Either the inline backref or the SHARED_DATA_REF/
2954                  * SHARED_BLOCK_REF is found
2955                  *
2956                  * Here is a quick path to locate EXTENT/METADATA_ITEM.
2957                  * It's possible the EXTENT/METADATA_ITEM is near current slot.
2958                  */
2959                 extent_slot = path->slots[0];
2960                 while (extent_slot >= 0) {
2961                         btrfs_item_key_to_cpu(path->nodes[0], &key,
2962                                               extent_slot);
2963                         if (key.objectid != bytenr)
2964                                 break;
2965                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
2966                             key.offset == num_bytes) {
2967                                 found_extent = 1;
2968                                 break;
2969                         }
2970                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
2971                             key.offset == owner_objectid) {
2972                                 found_extent = 1;
2973                                 break;
2974                         }
2975
2976                         /* Quick path didn't find the EXTEMT/METADATA_ITEM */
2977                         if (path->slots[0] - extent_slot > 5)
2978                                 break;
2979                         extent_slot--;
2980                 }
2981
2982                 if (!found_extent) {
2983                         if (iref) {
2984                                 abort_and_dump(trans, path,
2985 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
2986                                            path->slots[0]);
2987                                 ret = -EUCLEAN;
2988                                 goto out;
2989                         }
2990                         /* Must be SHARED_* item, remove the backref first */
2991                         ret = remove_extent_backref(trans, extent_root, path,
2992                                                     NULL, refs_to_drop, is_data);
2993                         if (ret) {
2994                                 btrfs_abort_transaction(trans, ret);
2995                                 goto out;
2996                         }
2997                         btrfs_release_path(path);
2998
2999                         /* Slow path to locate EXTENT/METADATA_ITEM */
3000                         key.objectid = bytenr;
3001                         key.type = BTRFS_EXTENT_ITEM_KEY;
3002                         key.offset = num_bytes;
3003
3004                         if (!is_data && skinny_metadata) {
3005                                 key.type = BTRFS_METADATA_ITEM_KEY;
3006                                 key.offset = owner_objectid;
3007                         }
3008
3009                         ret = btrfs_search_slot(trans, extent_root,
3010                                                 &key, path, -1, 1);
3011                         if (ret > 0 && skinny_metadata && path->slots[0]) {
3012                                 /*
3013                                  * Couldn't find our skinny metadata item,
3014                                  * see if we have ye olde extent item.
3015                                  */
3016                                 path->slots[0]--;
3017                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
3018                                                       path->slots[0]);
3019                                 if (key.objectid == bytenr &&
3020                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
3021                                     key.offset == num_bytes)
3022                                         ret = 0;
3023                         }
3024
3025                         if (ret > 0 && skinny_metadata) {
3026                                 skinny_metadata = false;
3027                                 key.objectid = bytenr;
3028                                 key.type = BTRFS_EXTENT_ITEM_KEY;
3029                                 key.offset = num_bytes;
3030                                 btrfs_release_path(path);
3031                                 ret = btrfs_search_slot(trans, extent_root,
3032                                                         &key, path, -1, 1);
3033                         }
3034
3035                         if (ret) {
3036                                 if (ret > 0)
3037                                         btrfs_print_leaf(path->nodes[0]);
3038                                 btrfs_err(info,
3039                         "umm, got %d back from search, was looking for %llu, slot %d",
3040                                           ret, bytenr, path->slots[0]);
3041                         }
3042                         if (ret < 0) {
3043                                 btrfs_abort_transaction(trans, ret);
3044                                 goto out;
3045                         }
3046                         extent_slot = path->slots[0];
3047                 }
3048         } else if (WARN_ON(ret == -ENOENT)) {
3049                 abort_and_dump(trans, path,
3050 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3051                                bytenr, parent, root_objectid, owner_objectid,
3052                                owner_offset, path->slots[0]);
3053                 goto out;
3054         } else {
3055                 btrfs_abort_transaction(trans, ret);
3056                 goto out;
3057         }
3058
3059         leaf = path->nodes[0];
3060         item_size = btrfs_item_size(leaf, extent_slot);
3061         if (unlikely(item_size < sizeof(*ei))) {
3062                 ret = -EINVAL;
3063                 btrfs_print_v0_err(info);
3064                 btrfs_abort_transaction(trans, ret);
3065                 goto out;
3066         }
3067         ei = btrfs_item_ptr(leaf, extent_slot,
3068                             struct btrfs_extent_item);
3069         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3070             key.type == BTRFS_EXTENT_ITEM_KEY) {
3071                 struct btrfs_tree_block_info *bi;
3072
3073                 if (item_size < sizeof(*ei) + sizeof(*bi)) {
3074                         abort_and_dump(trans, path,
3075 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3076                                        key.objectid, key.type, key.offset,
3077                                        path->slots[0], owner_objectid, item_size,
3078                                        sizeof(*ei) + sizeof(*bi));
3079                         ret = -EUCLEAN;
3080                         goto out;
3081                 }
3082                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3083                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3084         }
3085
3086         refs = btrfs_extent_refs(leaf, ei);
3087         if (refs < refs_to_drop) {
3088                 abort_and_dump(trans, path,
3089                 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3090                                refs_to_drop, refs, bytenr, path->slots[0]);
3091                 ret = -EUCLEAN;
3092                 goto out;
3093         }
3094         refs -= refs_to_drop;
3095
3096         if (refs > 0) {
3097                 if (extent_op)
3098                         __run_delayed_extent_op(extent_op, leaf, ei);
3099                 /*
3100                  * In the case of inline back ref, reference count will
3101                  * be updated by remove_extent_backref
3102                  */
3103                 if (iref) {
3104                         if (!found_extent) {
3105                                 abort_and_dump(trans, path,
3106 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3107                                                path->slots[0]);
3108                                 ret = -EUCLEAN;
3109                                 goto out;
3110                         }
3111                 } else {
3112                         btrfs_set_extent_refs(leaf, ei, refs);
3113                         btrfs_mark_buffer_dirty(leaf);
3114                 }
3115                 if (found_extent) {
3116                         ret = remove_extent_backref(trans, extent_root, path,
3117                                                     iref, refs_to_drop, is_data);
3118                         if (ret) {
3119                                 btrfs_abort_transaction(trans, ret);
3120                                 goto out;
3121                         }
3122                 }
3123         } else {
3124                 /* In this branch refs == 1 */
3125                 if (found_extent) {
3126                         if (is_data && refs_to_drop !=
3127                             extent_data_ref_count(path, iref)) {
3128                                 abort_and_dump(trans, path,
3129                 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3130                                                extent_data_ref_count(path, iref),
3131                                                refs_to_drop, path->slots[0]);
3132                                 ret = -EUCLEAN;
3133                                 goto out;
3134                         }
3135                         if (iref) {
3136                                 if (path->slots[0] != extent_slot) {
3137                                         abort_and_dump(trans, path,
3138 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3139                                                        key.objectid, key.type,
3140                                                        key.offset, path->slots[0]);
3141                                         ret = -EUCLEAN;
3142                                         goto out;
3143                                 }
3144                         } else {
3145                                 /*
3146                                  * No inline ref, we must be at SHARED_* item,
3147                                  * And it's single ref, it must be:
3148                                  * |    extent_slot       ||extent_slot + 1|
3149                                  * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3150                                  */
3151                                 if (path->slots[0] != extent_slot + 1) {
3152                                         abort_and_dump(trans, path,
3153         "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3154                                                        path->slots[0]);
3155                                         ret = -EUCLEAN;
3156                                         goto out;
3157                                 }
3158                                 path->slots[0] = extent_slot;
3159                                 num_to_del = 2;
3160                         }
3161                 }
3162
3163                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3164                                       num_to_del);
3165                 if (ret) {
3166                         btrfs_abort_transaction(trans, ret);
3167                         goto out;
3168                 }
3169                 btrfs_release_path(path);
3170
3171                 ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
3172         }
3173         btrfs_release_path(path);
3174
3175 out:
3176         btrfs_free_path(path);
3177         return ret;
3178 }
3179
3180 /*
3181  * when we free an block, it is possible (and likely) that we free the last
3182  * delayed ref for that extent as well.  This searches the delayed ref tree for
3183  * a given extent, and if there are no other delayed refs to be processed, it
3184  * removes it from the tree.
3185  */
3186 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3187                                       u64 bytenr)
3188 {
3189         struct btrfs_delayed_ref_head *head;
3190         struct btrfs_delayed_ref_root *delayed_refs;
3191         int ret = 0;
3192
3193         delayed_refs = &trans->transaction->delayed_refs;
3194         spin_lock(&delayed_refs->lock);
3195         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3196         if (!head)
3197                 goto out_delayed_unlock;
3198
3199         spin_lock(&head->lock);
3200         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3201                 goto out;
3202
3203         if (cleanup_extent_op(head) != NULL)
3204                 goto out;
3205
3206         /*
3207          * waiting for the lock here would deadlock.  If someone else has it
3208          * locked they are already in the process of dropping it anyway
3209          */
3210         if (!mutex_trylock(&head->mutex))
3211                 goto out;
3212
3213         btrfs_delete_ref_head(delayed_refs, head);
3214         head->processing = false;
3215
3216         spin_unlock(&head->lock);
3217         spin_unlock(&delayed_refs->lock);
3218
3219         BUG_ON(head->extent_op);
3220         if (head->must_insert_reserved)
3221                 ret = 1;
3222
3223         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3224         mutex_unlock(&head->mutex);
3225         btrfs_put_delayed_ref_head(head);
3226         return ret;
3227 out:
3228         spin_unlock(&head->lock);
3229
3230 out_delayed_unlock:
3231         spin_unlock(&delayed_refs->lock);
3232         return 0;
3233 }
3234
3235 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3236                            u64 root_id,
3237                            struct extent_buffer *buf,
3238                            u64 parent, int last_ref)
3239 {
3240         struct btrfs_fs_info *fs_info = trans->fs_info;
3241         struct btrfs_ref generic_ref = { 0 };
3242         int ret;
3243
3244         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3245                                buf->start, buf->len, parent);
3246         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3247                             root_id, 0, false);
3248
3249         if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3250                 btrfs_ref_tree_mod(fs_info, &generic_ref);
3251                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3252                 BUG_ON(ret); /* -ENOMEM */
3253         }
3254
3255         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3256                 struct btrfs_block_group *cache;
3257                 bool must_pin = false;
3258
3259                 if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3260                         ret = check_ref_cleanup(trans, buf->start);
3261                         if (!ret) {
3262                                 btrfs_redirty_list_add(trans->transaction, buf);
3263                                 goto out;
3264                         }
3265                 }
3266
3267                 cache = btrfs_lookup_block_group(fs_info, buf->start);
3268
3269                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3270                         pin_down_extent(trans, cache, buf->start, buf->len, 1);
3271                         btrfs_put_block_group(cache);
3272                         goto out;
3273                 }
3274
3275                 /*
3276                  * If there are tree mod log users we may have recorded mod log
3277                  * operations for this node.  If we re-allocate this node we
3278                  * could replay operations on this node that happened when it
3279                  * existed in a completely different root.  For example if it
3280                  * was part of root A, then was reallocated to root B, and we
3281                  * are doing a btrfs_old_search_slot(root b), we could replay
3282                  * operations that happened when the block was part of root A,
3283                  * giving us an inconsistent view of the btree.
3284                  *
3285                  * We are safe from races here because at this point no other
3286                  * node or root points to this extent buffer, so if after this
3287                  * check a new tree mod log user joins we will not have an
3288                  * existing log of operations on this node that we have to
3289                  * contend with.
3290                  */
3291                 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
3292                         must_pin = true;
3293
3294                 if (must_pin || btrfs_is_zoned(fs_info)) {
3295                         btrfs_redirty_list_add(trans->transaction, buf);
3296                         pin_down_extent(trans, cache, buf->start, buf->len, 1);
3297                         btrfs_put_block_group(cache);
3298                         goto out;
3299                 }
3300
3301                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3302
3303                 btrfs_add_free_space(cache, buf->start, buf->len);
3304                 btrfs_free_reserved_bytes(cache, buf->len, 0);
3305                 btrfs_put_block_group(cache);
3306                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3307         }
3308 out:
3309         if (last_ref) {
3310                 /*
3311                  * Deleting the buffer, clear the corrupt flag since it doesn't
3312                  * matter anymore.
3313                  */
3314                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3315         }
3316 }
3317
3318 /* Can return -ENOMEM */
3319 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3320 {
3321         struct btrfs_fs_info *fs_info = trans->fs_info;
3322         int ret;
3323
3324         if (btrfs_is_testing(fs_info))
3325                 return 0;
3326
3327         /*
3328          * tree log blocks never actually go into the extent allocation
3329          * tree, just update pinning info and exit early.
3330          */
3331         if ((ref->type == BTRFS_REF_METADATA &&
3332              ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3333             (ref->type == BTRFS_REF_DATA &&
3334              ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
3335                 /* unlocks the pinned mutex */
3336                 btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3337                 ret = 0;
3338         } else if (ref->type == BTRFS_REF_METADATA) {
3339                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3340         } else {
3341                 ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3342         }
3343
3344         if (!((ref->type == BTRFS_REF_METADATA &&
3345                ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3346               (ref->type == BTRFS_REF_DATA &&
3347                ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
3348                 btrfs_ref_tree_mod(fs_info, ref);
3349
3350         return ret;
3351 }
3352
3353 enum btrfs_loop_type {
3354         LOOP_CACHING_NOWAIT,
3355         LOOP_CACHING_WAIT,
3356         LOOP_UNSET_SIZE_CLASS,
3357         LOOP_ALLOC_CHUNK,
3358         LOOP_WRONG_SIZE_CLASS,
3359         LOOP_NO_EMPTY_SIZE,
3360 };
3361
3362 static inline void
3363 btrfs_lock_block_group(struct btrfs_block_group *cache,
3364                        int delalloc)
3365 {
3366         if (delalloc)
3367                 down_read(&cache->data_rwsem);
3368 }
3369
3370 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3371                        int delalloc)
3372 {
3373         btrfs_get_block_group(cache);
3374         if (delalloc)
3375                 down_read(&cache->data_rwsem);
3376 }
3377
3378 static struct btrfs_block_group *btrfs_lock_cluster(
3379                    struct btrfs_block_group *block_group,
3380                    struct btrfs_free_cluster *cluster,
3381                    int delalloc)
3382         __acquires(&cluster->refill_lock)
3383 {
3384         struct btrfs_block_group *used_bg = NULL;
3385
3386         spin_lock(&cluster->refill_lock);
3387         while (1) {
3388                 used_bg = cluster->block_group;
3389                 if (!used_bg)
3390                         return NULL;
3391
3392                 if (used_bg == block_group)
3393                         return used_bg;
3394
3395                 btrfs_get_block_group(used_bg);
3396
3397                 if (!delalloc)
3398                         return used_bg;
3399
3400                 if (down_read_trylock(&used_bg->data_rwsem))
3401                         return used_bg;
3402
3403                 spin_unlock(&cluster->refill_lock);
3404
3405                 /* We should only have one-level nested. */
3406                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3407
3408                 spin_lock(&cluster->refill_lock);
3409                 if (used_bg == cluster->block_group)
3410                         return used_bg;
3411
3412                 up_read(&used_bg->data_rwsem);
3413                 btrfs_put_block_group(used_bg);
3414         }
3415 }
3416
3417 static inline void
3418 btrfs_release_block_group(struct btrfs_block_group *cache,
3419                          int delalloc)
3420 {
3421         if (delalloc)
3422                 up_read(&cache->data_rwsem);
3423         btrfs_put_block_group(cache);
3424 }
3425
3426 /*
3427  * Helper function for find_free_extent().
3428  *
3429  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3430  * Return -EAGAIN to inform caller that we need to re-search this block group
3431  * Return >0 to inform caller that we find nothing
3432  * Return 0 means we have found a location and set ffe_ctl->found_offset.
3433  */
3434 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3435                                       struct find_free_extent_ctl *ffe_ctl,
3436                                       struct btrfs_block_group **cluster_bg_ret)
3437 {
3438         struct btrfs_block_group *cluster_bg;
3439         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3440         u64 aligned_cluster;
3441         u64 offset;
3442         int ret;
3443
3444         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3445         if (!cluster_bg)
3446                 goto refill_cluster;
3447         if (cluster_bg != bg && (cluster_bg->ro ||
3448             !block_group_bits(cluster_bg, ffe_ctl->flags)))
3449                 goto release_cluster;
3450
3451         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3452                         ffe_ctl->num_bytes, cluster_bg->start,
3453                         &ffe_ctl->max_extent_size);
3454         if (offset) {
3455                 /* We have a block, we're done */
3456                 spin_unlock(&last_ptr->refill_lock);
3457                 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3458                 *cluster_bg_ret = cluster_bg;
3459                 ffe_ctl->found_offset = offset;
3460                 return 0;
3461         }
3462         WARN_ON(last_ptr->block_group != cluster_bg);
3463
3464 release_cluster:
3465         /*
3466          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3467          * lets just skip it and let the allocator find whatever block it can
3468          * find. If we reach this point, we will have tried the cluster
3469          * allocator plenty of times and not have found anything, so we are
3470          * likely way too fragmented for the clustering stuff to find anything.
3471          *
3472          * However, if the cluster is taken from the current block group,
3473          * release the cluster first, so that we stand a better chance of
3474          * succeeding in the unclustered allocation.
3475          */
3476         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3477                 spin_unlock(&last_ptr->refill_lock);
3478                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3479                 return -ENOENT;
3480         }
3481
3482         /* This cluster didn't work out, free it and start over */
3483         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3484
3485         if (cluster_bg != bg)
3486                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3487
3488 refill_cluster:
3489         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3490                 spin_unlock(&last_ptr->refill_lock);
3491                 return -ENOENT;
3492         }
3493
3494         aligned_cluster = max_t(u64,
3495                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3496                         bg->full_stripe_len);
3497         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3498                         ffe_ctl->num_bytes, aligned_cluster);
3499         if (ret == 0) {
3500                 /* Now pull our allocation out of this cluster */
3501                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
3502                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
3503                                 &ffe_ctl->max_extent_size);
3504                 if (offset) {
3505                         /* We found one, proceed */
3506                         spin_unlock(&last_ptr->refill_lock);
3507                         ffe_ctl->found_offset = offset;
3508                         trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3509                         return 0;
3510                 }
3511         } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3512                    !ffe_ctl->retry_clustered) {
3513                 spin_unlock(&last_ptr->refill_lock);
3514
3515                 ffe_ctl->retry_clustered = true;
3516                 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3517                                 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3518                 return -EAGAIN;
3519         }
3520         /*
3521          * At this point we either didn't find a cluster or we weren't able to
3522          * allocate a block from our cluster.  Free the cluster we've been
3523          * trying to use, and go to the next block group.
3524          */
3525         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3526         spin_unlock(&last_ptr->refill_lock);
3527         return 1;
3528 }
3529
3530 /*
3531  * Return >0 to inform caller that we find nothing
3532  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
3533  * Return -EAGAIN to inform caller that we need to re-search this block group
3534  */
3535 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3536                                         struct find_free_extent_ctl *ffe_ctl)
3537 {
3538         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3539         u64 offset;
3540
3541         /*
3542          * We are doing an unclustered allocation, set the fragmented flag so
3543          * we don't bother trying to setup a cluster again until we get more
3544          * space.
3545          */
3546         if (unlikely(last_ptr)) {
3547                 spin_lock(&last_ptr->lock);
3548                 last_ptr->fragmented = 1;
3549                 spin_unlock(&last_ptr->lock);
3550         }
3551         if (ffe_ctl->cached) {
3552                 struct btrfs_free_space_ctl *free_space_ctl;
3553
3554                 free_space_ctl = bg->free_space_ctl;
3555                 spin_lock(&free_space_ctl->tree_lock);
3556                 if (free_space_ctl->free_space <
3557                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3558                     ffe_ctl->empty_size) {
3559                         ffe_ctl->total_free_space = max_t(u64,
3560                                         ffe_ctl->total_free_space,
3561                                         free_space_ctl->free_space);
3562                         spin_unlock(&free_space_ctl->tree_lock);
3563                         return 1;
3564                 }
3565                 spin_unlock(&free_space_ctl->tree_lock);
3566         }
3567
3568         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3569                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
3570                         &ffe_ctl->max_extent_size);
3571
3572         /*
3573          * If we didn't find a chunk, and we haven't failed on this block group
3574          * before, and this block group is in the middle of caching and we are
3575          * ok with waiting, then go ahead and wait for progress to be made, and
3576          * set @retry_unclustered to true.
3577          *
3578          * If @retry_unclustered is true then we've already waited on this
3579          * block group once and should move on to the next block group.
3580          */
3581         if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3582             ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3583                 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3584                                                       ffe_ctl->empty_size);
3585                 ffe_ctl->retry_unclustered = true;
3586                 return -EAGAIN;
3587         } else if (!offset) {
3588                 return 1;
3589         }
3590         ffe_ctl->found_offset = offset;
3591         return 0;
3592 }
3593
3594 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3595                                    struct find_free_extent_ctl *ffe_ctl,
3596                                    struct btrfs_block_group **bg_ret)
3597 {
3598         int ret;
3599
3600         /* We want to try and use the cluster allocator, so lets look there */
3601         if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3602                 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3603                 if (ret >= 0 || ret == -EAGAIN)
3604                         return ret;
3605                 /* ret == -ENOENT case falls through */
3606         }
3607
3608         return find_free_extent_unclustered(block_group, ffe_ctl);
3609 }
3610
3611 /*
3612  * Tree-log block group locking
3613  * ============================
3614  *
3615  * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3616  * indicates the starting address of a block group, which is reserved only
3617  * for tree-log metadata.
3618  *
3619  * Lock nesting
3620  * ============
3621  *
3622  * space_info::lock
3623  *   block_group::lock
3624  *     fs_info::treelog_bg_lock
3625  */
3626
3627 /*
3628  * Simple allocator for sequential-only block group. It only allows sequential
3629  * allocation. No need to play with trees. This function also reserves the
3630  * bytes as in btrfs_add_reserved_bytes.
3631  */
3632 static int do_allocation_zoned(struct btrfs_block_group *block_group,
3633                                struct find_free_extent_ctl *ffe_ctl,
3634                                struct btrfs_block_group **bg_ret)
3635 {
3636         struct btrfs_fs_info *fs_info = block_group->fs_info;
3637         struct btrfs_space_info *space_info = block_group->space_info;
3638         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3639         u64 start = block_group->start;
3640         u64 num_bytes = ffe_ctl->num_bytes;
3641         u64 avail;
3642         u64 bytenr = block_group->start;
3643         u64 log_bytenr;
3644         u64 data_reloc_bytenr;
3645         int ret = 0;
3646         bool skip = false;
3647
3648         ASSERT(btrfs_is_zoned(block_group->fs_info));
3649
3650         /*
3651          * Do not allow non-tree-log blocks in the dedicated tree-log block
3652          * group, and vice versa.
3653          */
3654         spin_lock(&fs_info->treelog_bg_lock);
3655         log_bytenr = fs_info->treelog_bg;
3656         if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3657                            (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3658                 skip = true;
3659         spin_unlock(&fs_info->treelog_bg_lock);
3660         if (skip)
3661                 return 1;
3662
3663         /*
3664          * Do not allow non-relocation blocks in the dedicated relocation block
3665          * group, and vice versa.
3666          */
3667         spin_lock(&fs_info->relocation_bg_lock);
3668         data_reloc_bytenr = fs_info->data_reloc_bg;
3669         if (data_reloc_bytenr &&
3670             ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3671              (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3672                 skip = true;
3673         spin_unlock(&fs_info->relocation_bg_lock);
3674         if (skip)
3675                 return 1;
3676
3677         /* Check RO and no space case before trying to activate it */
3678         spin_lock(&block_group->lock);
3679         if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3680                 ret = 1;
3681                 /*
3682                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3683                  * Return the error after taking the locks.
3684                  */
3685         }
3686         spin_unlock(&block_group->lock);
3687
3688         if (!ret && !btrfs_zone_activate(block_group)) {
3689                 ret = 1;
3690                 /*
3691                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3692                  * Return the error after taking the locks.
3693                  */
3694         }
3695
3696         spin_lock(&space_info->lock);
3697         spin_lock(&block_group->lock);
3698         spin_lock(&fs_info->treelog_bg_lock);
3699         spin_lock(&fs_info->relocation_bg_lock);
3700
3701         if (ret)
3702                 goto out;
3703
3704         ASSERT(!ffe_ctl->for_treelog ||
3705                block_group->start == fs_info->treelog_bg ||
3706                fs_info->treelog_bg == 0);
3707         ASSERT(!ffe_ctl->for_data_reloc ||
3708                block_group->start == fs_info->data_reloc_bg ||
3709                fs_info->data_reloc_bg == 0);
3710
3711         if (block_group->ro ||
3712             test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
3713                 ret = 1;
3714                 goto out;
3715         }
3716
3717         /*
3718          * Do not allow currently using block group to be tree-log dedicated
3719          * block group.
3720          */
3721         if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3722             (block_group->used || block_group->reserved)) {
3723                 ret = 1;
3724                 goto out;
3725         }
3726
3727         /*
3728          * Do not allow currently used block group to be the data relocation
3729          * dedicated block group.
3730          */
3731         if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3732             (block_group->used || block_group->reserved)) {
3733                 ret = 1;
3734                 goto out;
3735         }
3736
3737         WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3738         avail = block_group->zone_capacity - block_group->alloc_offset;
3739         if (avail < num_bytes) {
3740                 if (ffe_ctl->max_extent_size < avail) {
3741                         /*
3742                          * With sequential allocator, free space is always
3743                          * contiguous
3744                          */
3745                         ffe_ctl->max_extent_size = avail;
3746                         ffe_ctl->total_free_space = avail;
3747                 }
3748                 ret = 1;
3749                 goto out;
3750         }
3751
3752         if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3753                 fs_info->treelog_bg = block_group->start;
3754
3755         if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
3756                 fs_info->data_reloc_bg = block_group->start;
3757
3758         ffe_ctl->found_offset = start + block_group->alloc_offset;
3759         block_group->alloc_offset += num_bytes;
3760         spin_lock(&ctl->tree_lock);
3761         ctl->free_space -= num_bytes;
3762         spin_unlock(&ctl->tree_lock);
3763
3764         /*
3765          * We do not check if found_offset is aligned to stripesize. The
3766          * address is anyway rewritten when using zone append writing.
3767          */
3768
3769         ffe_ctl->search_start = ffe_ctl->found_offset;
3770
3771 out:
3772         if (ret && ffe_ctl->for_treelog)
3773                 fs_info->treelog_bg = 0;
3774         if (ret && ffe_ctl->for_data_reloc &&
3775             fs_info->data_reloc_bg == block_group->start) {
3776                 /*
3777                  * Do not allow further allocations from this block group.
3778                  * Compared to increasing the ->ro, setting the
3779                  * ->zoned_data_reloc_ongoing flag still allows nocow
3780                  *  writers to come in. See btrfs_inc_nocow_writers().
3781                  *
3782                  * We need to disable an allocation to avoid an allocation of
3783                  * regular (non-relocation data) extent. With mix of relocation
3784                  * extents and regular extents, we can dispatch WRITE commands
3785                  * (for relocation extents) and ZONE APPEND commands (for
3786                  * regular extents) at the same time to the same zone, which
3787                  * easily break the write pointer.
3788                  */
3789                 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3790                 fs_info->data_reloc_bg = 0;
3791         }
3792         spin_unlock(&fs_info->relocation_bg_lock);
3793         spin_unlock(&fs_info->treelog_bg_lock);
3794         spin_unlock(&block_group->lock);
3795         spin_unlock(&space_info->lock);
3796         return ret;
3797 }
3798
3799 static int do_allocation(struct btrfs_block_group *block_group,
3800                          struct find_free_extent_ctl *ffe_ctl,
3801                          struct btrfs_block_group **bg_ret)
3802 {
3803         switch (ffe_ctl->policy) {
3804         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3805                 return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3806         case BTRFS_EXTENT_ALLOC_ZONED:
3807                 return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
3808         default:
3809                 BUG();
3810         }
3811 }
3812
3813 static void release_block_group(struct btrfs_block_group *block_group,
3814                                 struct find_free_extent_ctl *ffe_ctl,
3815                                 int delalloc)
3816 {
3817         switch (ffe_ctl->policy) {
3818         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3819                 ffe_ctl->retry_clustered = false;
3820                 ffe_ctl->retry_unclustered = false;
3821                 break;
3822         case BTRFS_EXTENT_ALLOC_ZONED:
3823                 /* Nothing to do */
3824                 break;
3825         default:
3826                 BUG();
3827         }
3828
3829         BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3830                ffe_ctl->index);
3831         btrfs_release_block_group(block_group, delalloc);
3832 }
3833
3834 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3835                                    struct btrfs_key *ins)
3836 {
3837         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3838
3839         if (!ffe_ctl->use_cluster && last_ptr) {
3840                 spin_lock(&last_ptr->lock);
3841                 last_ptr->window_start = ins->objectid;
3842                 spin_unlock(&last_ptr->lock);
3843         }
3844 }
3845
3846 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3847                          struct btrfs_key *ins)
3848 {
3849         switch (ffe_ctl->policy) {
3850         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3851                 found_extent_clustered(ffe_ctl, ins);
3852                 break;
3853         case BTRFS_EXTENT_ALLOC_ZONED:
3854                 /* Nothing to do */
3855                 break;
3856         default:
3857                 BUG();
3858         }
3859 }
3860
3861 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
3862                                     struct find_free_extent_ctl *ffe_ctl)
3863 {
3864         /* If we can activate new zone, just allocate a chunk and use it */
3865         if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
3866                 return 0;
3867
3868         /*
3869          * We already reached the max active zones. Try to finish one block
3870          * group to make a room for a new block group. This is only possible
3871          * for a data block group because btrfs_zone_finish() may need to wait
3872          * for a running transaction which can cause a deadlock for metadata
3873          * allocation.
3874          */
3875         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
3876                 int ret = btrfs_zone_finish_one_bg(fs_info);
3877
3878                 if (ret == 1)
3879                         return 0;
3880                 else if (ret < 0)
3881                         return ret;
3882         }
3883
3884         /*
3885          * If we have enough free space left in an already active block group
3886          * and we can't activate any other zone now, do not allow allocating a
3887          * new chunk and let find_free_extent() retry with a smaller size.
3888          */
3889         if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
3890                 return -ENOSPC;
3891
3892         /*
3893          * Even min_alloc_size is not left in any block groups. Since we cannot
3894          * activate a new block group, allocating it may not help. Let's tell a
3895          * caller to try again and hope it progress something by writing some
3896          * parts of the region. That is only possible for data block groups,
3897          * where a part of the region can be written.
3898          */
3899         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
3900                 return -EAGAIN;
3901
3902         /*
3903          * We cannot activate a new block group and no enough space left in any
3904          * block groups. So, allocating a new block group may not help. But,
3905          * there is nothing to do anyway, so let's go with it.
3906          */
3907         return 0;
3908 }
3909
3910 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
3911                               struct find_free_extent_ctl *ffe_ctl)
3912 {
3913         switch (ffe_ctl->policy) {
3914         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3915                 return 0;
3916         case BTRFS_EXTENT_ALLOC_ZONED:
3917                 return can_allocate_chunk_zoned(fs_info, ffe_ctl);
3918         default:
3919                 BUG();
3920         }
3921 }
3922
3923 /*
3924  * Return >0 means caller needs to re-search for free extent
3925  * Return 0 means we have the needed free extent.
3926  * Return <0 means we failed to locate any free extent.
3927  */
3928 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3929                                         struct btrfs_key *ins,
3930                                         struct find_free_extent_ctl *ffe_ctl,
3931                                         bool full_search)
3932 {
3933         struct btrfs_root *root = fs_info->chunk_root;
3934         int ret;
3935
3936         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3937             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3938                 ffe_ctl->orig_have_caching_bg = true;
3939
3940         if (ins->objectid) {
3941                 found_extent(ffe_ctl, ins);
3942                 return 0;
3943         }
3944
3945         if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
3946                 return 1;
3947
3948         ffe_ctl->index++;
3949         if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
3950                 return 1;
3951
3952         /*
3953          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3954          *                      caching kthreads as we move along
3955          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3956          * LOOP_UNSET_SIZE_CLASS, allow unset size class
3957          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3958          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3959          *                     again
3960          */
3961         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3962                 ffe_ctl->index = 0;
3963                 /*
3964                  * We want to skip the LOOP_CACHING_WAIT step if we don't have
3965                  * any uncached bgs and we've already done a full search
3966                  * through.
3967                  */
3968                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
3969                     (!ffe_ctl->orig_have_caching_bg && full_search))
3970                         ffe_ctl->loop++;
3971                 ffe_ctl->loop++;
3972
3973                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3974                         struct btrfs_trans_handle *trans;
3975                         int exist = 0;
3976
3977                         /* Check if allocation policy allows to create a new chunk */
3978                         ret = can_allocate_chunk(fs_info, ffe_ctl);
3979                         if (ret)
3980                                 return ret;
3981
3982                         trans = current->journal_info;
3983                         if (trans)
3984                                 exist = 1;
3985                         else
3986                                 trans = btrfs_join_transaction(root);
3987
3988                         if (IS_ERR(trans)) {
3989                                 ret = PTR_ERR(trans);
3990                                 return ret;
3991                         }
3992
3993                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
3994                                                 CHUNK_ALLOC_FORCE_FOR_EXTENT);
3995
3996                         /* Do not bail out on ENOSPC since we can do more. */
3997                         if (ret == -ENOSPC) {
3998                                 ret = 0;
3999                                 ffe_ctl->loop++;
4000                         }
4001                         else if (ret < 0)
4002                                 btrfs_abort_transaction(trans, ret);
4003                         else
4004                                 ret = 0;
4005                         if (!exist)
4006                                 btrfs_end_transaction(trans);
4007                         if (ret)
4008                                 return ret;
4009                 }
4010
4011                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4012                         if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4013                                 return -ENOSPC;
4014
4015                         /*
4016                          * Don't loop again if we already have no empty_size and
4017                          * no empty_cluster.
4018                          */
4019                         if (ffe_ctl->empty_size == 0 &&
4020                             ffe_ctl->empty_cluster == 0)
4021                                 return -ENOSPC;
4022                         ffe_ctl->empty_size = 0;
4023                         ffe_ctl->empty_cluster = 0;
4024                 }
4025                 return 1;
4026         }
4027         return -ENOSPC;
4028 }
4029
4030 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4031                                               struct btrfs_block_group *bg)
4032 {
4033         if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4034                 return true;
4035         if (!btrfs_block_group_should_use_size_class(bg))
4036                 return true;
4037         if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4038                 return true;
4039         if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4040             bg->size_class == BTRFS_BG_SZ_NONE)
4041                 return true;
4042         return ffe_ctl->size_class == bg->size_class;
4043 }
4044
4045 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4046                                         struct find_free_extent_ctl *ffe_ctl,
4047                                         struct btrfs_space_info *space_info,
4048                                         struct btrfs_key *ins)
4049 {
4050         /*
4051          * If our free space is heavily fragmented we may not be able to make
4052          * big contiguous allocations, so instead of doing the expensive search
4053          * for free space, simply return ENOSPC with our max_extent_size so we
4054          * can go ahead and search for a more manageable chunk.
4055          *
4056          * If our max_extent_size is large enough for our allocation simply
4057          * disable clustering since we will likely not be able to find enough
4058          * space to create a cluster and induce latency trying.
4059          */
4060         if (space_info->max_extent_size) {
4061                 spin_lock(&space_info->lock);
4062                 if (space_info->max_extent_size &&
4063                     ffe_ctl->num_bytes > space_info->max_extent_size) {
4064                         ins->offset = space_info->max_extent_size;
4065                         spin_unlock(&space_info->lock);
4066                         return -ENOSPC;
4067                 } else if (space_info->max_extent_size) {
4068                         ffe_ctl->use_cluster = false;
4069                 }
4070                 spin_unlock(&space_info->lock);
4071         }
4072
4073         ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4074                                                &ffe_ctl->empty_cluster);
4075         if (ffe_ctl->last_ptr) {
4076                 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4077
4078                 spin_lock(&last_ptr->lock);
4079                 if (last_ptr->block_group)
4080                         ffe_ctl->hint_byte = last_ptr->window_start;
4081                 if (last_ptr->fragmented) {
4082                         /*
4083                          * We still set window_start so we can keep track of the
4084                          * last place we found an allocation to try and save
4085                          * some time.
4086                          */
4087                         ffe_ctl->hint_byte = last_ptr->window_start;
4088                         ffe_ctl->use_cluster = false;
4089                 }
4090                 spin_unlock(&last_ptr->lock);
4091         }
4092
4093         return 0;
4094 }
4095
4096 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4097                               struct find_free_extent_ctl *ffe_ctl,
4098                               struct btrfs_space_info *space_info,
4099                               struct btrfs_key *ins)
4100 {
4101         switch (ffe_ctl->policy) {
4102         case BTRFS_EXTENT_ALLOC_CLUSTERED:
4103                 return prepare_allocation_clustered(fs_info, ffe_ctl,
4104                                                     space_info, ins);
4105         case BTRFS_EXTENT_ALLOC_ZONED:
4106                 if (ffe_ctl->for_treelog) {
4107                         spin_lock(&fs_info->treelog_bg_lock);
4108                         if (fs_info->treelog_bg)
4109                                 ffe_ctl->hint_byte = fs_info->treelog_bg;
4110                         spin_unlock(&fs_info->treelog_bg_lock);
4111                 }
4112                 if (ffe_ctl->for_data_reloc) {
4113                         spin_lock(&fs_info->relocation_bg_lock);
4114                         if (fs_info->data_reloc_bg)
4115                                 ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4116                         spin_unlock(&fs_info->relocation_bg_lock);
4117                 }
4118                 return 0;
4119         default:
4120                 BUG();
4121         }
4122 }
4123
4124 /*
4125  * walks the btree of allocated extents and find a hole of a given size.
4126  * The key ins is changed to record the hole:
4127  * ins->objectid == start position
4128  * ins->flags = BTRFS_EXTENT_ITEM_KEY
4129  * ins->offset == the size of the hole.
4130  * Any available blocks before search_start are skipped.
4131  *
4132  * If there is no suitable free space, we will record the max size of
4133  * the free space extent currently.
4134  *
4135  * The overall logic and call chain:
4136  *
4137  * find_free_extent()
4138  * |- Iterate through all block groups
4139  * |  |- Get a valid block group
4140  * |  |- Try to do clustered allocation in that block group
4141  * |  |- Try to do unclustered allocation in that block group
4142  * |  |- Check if the result is valid
4143  * |  |  |- If valid, then exit
4144  * |  |- Jump to next block group
4145  * |
4146  * |- Push harder to find free extents
4147  *    |- If not found, re-iterate all block groups
4148  */
4149 static noinline int find_free_extent(struct btrfs_root *root,
4150                                      struct btrfs_key *ins,
4151                                      struct find_free_extent_ctl *ffe_ctl)
4152 {
4153         struct btrfs_fs_info *fs_info = root->fs_info;
4154         int ret = 0;
4155         int cache_block_group_error = 0;
4156         struct btrfs_block_group *block_group = NULL;
4157         struct btrfs_space_info *space_info;
4158         bool full_search = false;
4159
4160         WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4161
4162         ffe_ctl->search_start = 0;
4163         /* For clustered allocation */
4164         ffe_ctl->empty_cluster = 0;
4165         ffe_ctl->last_ptr = NULL;
4166         ffe_ctl->use_cluster = true;
4167         ffe_ctl->have_caching_bg = false;
4168         ffe_ctl->orig_have_caching_bg = false;
4169         ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4170         ffe_ctl->loop = 0;
4171         /* For clustered allocation */
4172         ffe_ctl->retry_clustered = false;
4173         ffe_ctl->retry_unclustered = false;
4174         ffe_ctl->cached = 0;
4175         ffe_ctl->max_extent_size = 0;
4176         ffe_ctl->total_free_space = 0;
4177         ffe_ctl->found_offset = 0;
4178         ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4179         ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4180
4181         if (btrfs_is_zoned(fs_info))
4182                 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4183
4184         ins->type = BTRFS_EXTENT_ITEM_KEY;
4185         ins->objectid = 0;
4186         ins->offset = 0;
4187
4188         trace_find_free_extent(root, ffe_ctl);
4189
4190         space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4191         if (!space_info) {
4192                 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4193                 return -ENOSPC;
4194         }
4195
4196         ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4197         if (ret < 0)
4198                 return ret;
4199
4200         ffe_ctl->search_start = max(ffe_ctl->search_start,
4201                                     first_logical_byte(fs_info));
4202         ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4203         if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4204                 block_group = btrfs_lookup_block_group(fs_info,
4205                                                        ffe_ctl->search_start);
4206                 /*
4207                  * we don't want to use the block group if it doesn't match our
4208                  * allocation bits, or if its not cached.
4209                  *
4210                  * However if we are re-searching with an ideal block group
4211                  * picked out then we don't care that the block group is cached.
4212                  */
4213                 if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4214                     block_group->cached != BTRFS_CACHE_NO) {
4215                         down_read(&space_info->groups_sem);
4216                         if (list_empty(&block_group->list) ||
4217                             block_group->ro) {
4218                                 /*
4219                                  * someone is removing this block group,
4220                                  * we can't jump into the have_block_group
4221                                  * target because our list pointers are not
4222                                  * valid
4223                                  */
4224                                 btrfs_put_block_group(block_group);
4225                                 up_read(&space_info->groups_sem);
4226                         } else {
4227                                 ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4228                                                         block_group->flags);
4229                                 btrfs_lock_block_group(block_group,
4230                                                        ffe_ctl->delalloc);
4231                                 ffe_ctl->hinted = true;
4232                                 goto have_block_group;
4233                         }
4234                 } else if (block_group) {
4235                         btrfs_put_block_group(block_group);
4236                 }
4237         }
4238 search:
4239         trace_find_free_extent_search_loop(root, ffe_ctl);
4240         ffe_ctl->have_caching_bg = false;
4241         if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4242             ffe_ctl->index == 0)
4243                 full_search = true;
4244         down_read(&space_info->groups_sem);
4245         list_for_each_entry(block_group,
4246                             &space_info->block_groups[ffe_ctl->index], list) {
4247                 struct btrfs_block_group *bg_ret;
4248
4249                 ffe_ctl->hinted = false;
4250                 /* If the block group is read-only, we can skip it entirely. */
4251                 if (unlikely(block_group->ro)) {
4252                         if (ffe_ctl->for_treelog)
4253                                 btrfs_clear_treelog_bg(block_group);
4254                         if (ffe_ctl->for_data_reloc)
4255                                 btrfs_clear_data_reloc_bg(block_group);
4256                         continue;
4257                 }
4258
4259                 btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4260                 ffe_ctl->search_start = block_group->start;
4261
4262                 /*
4263                  * this can happen if we end up cycling through all the
4264                  * raid types, but we want to make sure we only allocate
4265                  * for the proper type.
4266                  */
4267                 if (!block_group_bits(block_group, ffe_ctl->flags)) {
4268                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
4269                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
4270                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
4271                                 BTRFS_BLOCK_GROUP_RAID10;
4272
4273                         /*
4274                          * if they asked for extra copies and this block group
4275                          * doesn't provide them, bail.  This does allow us to
4276                          * fill raid0 from raid1.
4277                          */
4278                         if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4279                                 goto loop;
4280
4281                         /*
4282                          * This block group has different flags than we want.
4283                          * It's possible that we have MIXED_GROUP flag but no
4284                          * block group is mixed.  Just skip such block group.
4285                          */
4286                         btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4287                         continue;
4288                 }
4289
4290 have_block_group:
4291                 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4292                 ffe_ctl->cached = btrfs_block_group_done(block_group);
4293                 if (unlikely(!ffe_ctl->cached)) {
4294                         ffe_ctl->have_caching_bg = true;
4295                         ret = btrfs_cache_block_group(block_group, false);
4296
4297                         /*
4298                          * If we get ENOMEM here or something else we want to
4299                          * try other block groups, because it may not be fatal.
4300                          * However if we can't find anything else we need to
4301                          * save our return here so that we return the actual
4302                          * error that caused problems, not ENOSPC.
4303                          */
4304                         if (ret < 0) {
4305                                 if (!cache_block_group_error)
4306                                         cache_block_group_error = ret;
4307                                 ret = 0;
4308                                 goto loop;
4309                         }
4310                         ret = 0;
4311                 }
4312
4313                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
4314                         goto loop;
4315
4316                 if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4317                         goto loop;
4318
4319                 bg_ret = NULL;
4320                 ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4321                 if (ret == 0) {
4322                         if (bg_ret && bg_ret != block_group) {
4323                                 btrfs_release_block_group(block_group,
4324                                                           ffe_ctl->delalloc);
4325                                 block_group = bg_ret;
4326                         }
4327                 } else if (ret == -EAGAIN) {
4328                         goto have_block_group;
4329                 } else if (ret > 0) {
4330                         goto loop;
4331                 }
4332
4333                 /* Checks */
4334                 ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4335                                                  fs_info->stripesize);
4336
4337                 /* move on to the next group */
4338                 if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4339                     block_group->start + block_group->length) {
4340                         btrfs_add_free_space_unused(block_group,
4341                                             ffe_ctl->found_offset,
4342                                             ffe_ctl->num_bytes);
4343                         goto loop;
4344                 }
4345
4346                 if (ffe_ctl->found_offset < ffe_ctl->search_start)
4347                         btrfs_add_free_space_unused(block_group,
4348                                         ffe_ctl->found_offset,
4349                                         ffe_ctl->search_start - ffe_ctl->found_offset);
4350
4351                 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4352                                                ffe_ctl->num_bytes,
4353                                                ffe_ctl->delalloc,
4354                                                ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4355                 if (ret == -EAGAIN) {
4356                         btrfs_add_free_space_unused(block_group,
4357                                         ffe_ctl->found_offset,
4358                                         ffe_ctl->num_bytes);
4359                         goto loop;
4360                 }
4361                 btrfs_inc_block_group_reservations(block_group);
4362
4363                 /* we are all good, lets return */
4364                 ins->objectid = ffe_ctl->search_start;
4365                 ins->offset = ffe_ctl->num_bytes;
4366
4367                 trace_btrfs_reserve_extent(block_group, ffe_ctl);
4368                 btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4369                 break;
4370 loop:
4371                 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4372                 cond_resched();
4373         }
4374         up_read(&space_info->groups_sem);
4375
4376         ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4377         if (ret > 0)
4378                 goto search;
4379
4380         if (ret == -ENOSPC && !cache_block_group_error) {
4381                 /*
4382                  * Use ffe_ctl->total_free_space as fallback if we can't find
4383                  * any contiguous hole.
4384                  */
4385                 if (!ffe_ctl->max_extent_size)
4386                         ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4387                 spin_lock(&space_info->lock);
4388                 space_info->max_extent_size = ffe_ctl->max_extent_size;
4389                 spin_unlock(&space_info->lock);
4390                 ins->offset = ffe_ctl->max_extent_size;
4391         } else if (ret == -ENOSPC) {
4392                 ret = cache_block_group_error;
4393         }
4394         return ret;
4395 }
4396
4397 /*
4398  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4399  *                        hole that is at least as big as @num_bytes.
4400  *
4401  * @root           -    The root that will contain this extent
4402  *
4403  * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
4404  *                      is used for accounting purposes. This value differs
4405  *                      from @num_bytes only in the case of compressed extents.
4406  *
4407  * @num_bytes      -    Number of bytes to allocate on-disk.
4408  *
4409  * @min_alloc_size -    Indicates the minimum amount of space that the
4410  *                      allocator should try to satisfy. In some cases
4411  *                      @num_bytes may be larger than what is required and if
4412  *                      the filesystem is fragmented then allocation fails.
4413  *                      However, the presence of @min_alloc_size gives a
4414  *                      chance to try and satisfy the smaller allocation.
4415  *
4416  * @empty_size     -    A hint that you plan on doing more COW. This is the
4417  *                      size in bytes the allocator should try to find free
4418  *                      next to the block it returns.  This is just a hint and
4419  *                      may be ignored by the allocator.
4420  *
4421  * @hint_byte      -    Hint to the allocator to start searching above the byte
4422  *                      address passed. It might be ignored.
4423  *
4424  * @ins            -    This key is modified to record the found hole. It will
4425  *                      have the following values:
4426  *                      ins->objectid == start position
4427  *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
4428  *                      ins->offset == the size of the hole.
4429  *
4430  * @is_data        -    Boolean flag indicating whether an extent is
4431  *                      allocated for data (true) or metadata (false)
4432  *
4433  * @delalloc       -    Boolean flag indicating whether this allocation is for
4434  *                      delalloc or not. If 'true' data_rwsem of block groups
4435  *                      is going to be acquired.
4436  *
4437  *
4438  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4439  * case -ENOSPC is returned then @ins->offset will contain the size of the
4440  * largest available hole the allocator managed to find.
4441  */
4442 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4443                          u64 num_bytes, u64 min_alloc_size,
4444                          u64 empty_size, u64 hint_byte,
4445                          struct btrfs_key *ins, int is_data, int delalloc)
4446 {
4447         struct btrfs_fs_info *fs_info = root->fs_info;
4448         struct find_free_extent_ctl ffe_ctl = {};
4449         bool final_tried = num_bytes == min_alloc_size;
4450         u64 flags;
4451         int ret;
4452         bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4453         bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4454
4455         flags = get_alloc_profile_by_root(root, is_data);
4456 again:
4457         WARN_ON(num_bytes < fs_info->sectorsize);
4458
4459         ffe_ctl.ram_bytes = ram_bytes;
4460         ffe_ctl.num_bytes = num_bytes;
4461         ffe_ctl.min_alloc_size = min_alloc_size;
4462         ffe_ctl.empty_size = empty_size;
4463         ffe_ctl.flags = flags;
4464         ffe_ctl.delalloc = delalloc;
4465         ffe_ctl.hint_byte = hint_byte;
4466         ffe_ctl.for_treelog = for_treelog;
4467         ffe_ctl.for_data_reloc = for_data_reloc;
4468
4469         ret = find_free_extent(root, ins, &ffe_ctl);
4470         if (!ret && !is_data) {
4471                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4472         } else if (ret == -ENOSPC) {
4473                 if (!final_tried && ins->offset) {
4474                         num_bytes = min(num_bytes >> 1, ins->offset);
4475                         num_bytes = round_down(num_bytes,
4476                                                fs_info->sectorsize);
4477                         num_bytes = max(num_bytes, min_alloc_size);
4478                         ram_bytes = num_bytes;
4479                         if (num_bytes == min_alloc_size)
4480                                 final_tried = true;
4481                         goto again;
4482                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4483                         struct btrfs_space_info *sinfo;
4484
4485                         sinfo = btrfs_find_space_info(fs_info, flags);
4486                         btrfs_err(fs_info,
4487         "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4488                                   flags, num_bytes, for_treelog, for_data_reloc);
4489                         if (sinfo)
4490                                 btrfs_dump_space_info(fs_info, sinfo,
4491                                                       num_bytes, 1);
4492                 }
4493         }
4494
4495         return ret;
4496 }
4497
4498 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4499                                u64 start, u64 len, int delalloc)
4500 {
4501         struct btrfs_block_group *cache;
4502
4503         cache = btrfs_lookup_block_group(fs_info, start);
4504         if (!cache) {
4505                 btrfs_err(fs_info, "Unable to find block group for %llu",
4506                           start);
4507                 return -ENOSPC;
4508         }
4509
4510         btrfs_add_free_space(cache, start, len);
4511         btrfs_free_reserved_bytes(cache, len, delalloc);
4512         trace_btrfs_reserved_extent_free(fs_info, start, len);
4513
4514         btrfs_put_block_group(cache);
4515         return 0;
4516 }
4517
4518 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4519                               u64 len)
4520 {
4521         struct btrfs_block_group *cache;
4522         int ret = 0;
4523
4524         cache = btrfs_lookup_block_group(trans->fs_info, start);
4525         if (!cache) {
4526                 btrfs_err(trans->fs_info, "unable to find block group for %llu",
4527                           start);
4528                 return -ENOSPC;
4529         }
4530
4531         ret = pin_down_extent(trans, cache, start, len, 1);
4532         btrfs_put_block_group(cache);
4533         return ret;
4534 }
4535
4536 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4537                                  u64 num_bytes)
4538 {
4539         struct btrfs_fs_info *fs_info = trans->fs_info;
4540         int ret;
4541
4542         ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4543         if (ret)
4544                 return ret;
4545
4546         ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4547         if (ret) {
4548                 ASSERT(!ret);
4549                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4550                           bytenr, num_bytes);
4551                 return ret;
4552         }
4553
4554         trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4555         return 0;
4556 }
4557
4558 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4559                                       u64 parent, u64 root_objectid,
4560                                       u64 flags, u64 owner, u64 offset,
4561                                       struct btrfs_key *ins, int ref_mod)
4562 {
4563         struct btrfs_fs_info *fs_info = trans->fs_info;
4564         struct btrfs_root *extent_root;
4565         int ret;
4566         struct btrfs_extent_item *extent_item;
4567         struct btrfs_extent_inline_ref *iref;
4568         struct btrfs_path *path;
4569         struct extent_buffer *leaf;
4570         int type;
4571         u32 size;
4572
4573         if (parent > 0)
4574                 type = BTRFS_SHARED_DATA_REF_KEY;
4575         else
4576                 type = BTRFS_EXTENT_DATA_REF_KEY;
4577
4578         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4579
4580         path = btrfs_alloc_path();
4581         if (!path)
4582                 return -ENOMEM;
4583
4584         extent_root = btrfs_extent_root(fs_info, ins->objectid);
4585         ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4586         if (ret) {
4587                 btrfs_free_path(path);
4588                 return ret;
4589         }
4590
4591         leaf = path->nodes[0];
4592         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4593                                      struct btrfs_extent_item);
4594         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4595         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4596         btrfs_set_extent_flags(leaf, extent_item,
4597                                flags | BTRFS_EXTENT_FLAG_DATA);
4598
4599         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4600         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4601         if (parent > 0) {
4602                 struct btrfs_shared_data_ref *ref;
4603                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4604                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4605                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4606         } else {
4607                 struct btrfs_extent_data_ref *ref;
4608                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4609                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4610                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4611                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4612                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4613         }
4614
4615         btrfs_mark_buffer_dirty(path->nodes[0]);
4616         btrfs_free_path(path);
4617
4618         return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4619 }
4620
4621 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4622                                      struct btrfs_delayed_ref_node *node,
4623                                      struct btrfs_delayed_extent_op *extent_op)
4624 {
4625         struct btrfs_fs_info *fs_info = trans->fs_info;
4626         struct btrfs_root *extent_root;
4627         int ret;
4628         struct btrfs_extent_item *extent_item;
4629         struct btrfs_key extent_key;
4630         struct btrfs_tree_block_info *block_info;
4631         struct btrfs_extent_inline_ref *iref;
4632         struct btrfs_path *path;
4633         struct extent_buffer *leaf;
4634         struct btrfs_delayed_tree_ref *ref;
4635         u32 size = sizeof(*extent_item) + sizeof(*iref);
4636         u64 flags = extent_op->flags_to_set;
4637         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4638
4639         ref = btrfs_delayed_node_to_tree_ref(node);
4640
4641         extent_key.objectid = node->bytenr;
4642         if (skinny_metadata) {
4643                 extent_key.offset = ref->level;
4644                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4645         } else {
4646                 extent_key.offset = node->num_bytes;
4647                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4648                 size += sizeof(*block_info);
4649         }
4650
4651         path = btrfs_alloc_path();
4652         if (!path)
4653                 return -ENOMEM;
4654
4655         extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4656         ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4657                                       size);
4658         if (ret) {
4659                 btrfs_free_path(path);
4660                 return ret;
4661         }
4662
4663         leaf = path->nodes[0];
4664         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4665                                      struct btrfs_extent_item);
4666         btrfs_set_extent_refs(leaf, extent_item, 1);
4667         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4668         btrfs_set_extent_flags(leaf, extent_item,
4669                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4670
4671         if (skinny_metadata) {
4672                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4673         } else {
4674                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4675                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4676                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
4677                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4678         }
4679
4680         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4681                 btrfs_set_extent_inline_ref_type(leaf, iref,
4682                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4683                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4684         } else {
4685                 btrfs_set_extent_inline_ref_type(leaf, iref,
4686                                                  BTRFS_TREE_BLOCK_REF_KEY);
4687                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4688         }
4689
4690         btrfs_mark_buffer_dirty(leaf);
4691         btrfs_free_path(path);
4692
4693         return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4694 }
4695
4696 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4697                                      struct btrfs_root *root, u64 owner,
4698                                      u64 offset, u64 ram_bytes,
4699                                      struct btrfs_key *ins)
4700 {
4701         struct btrfs_ref generic_ref = { 0 };
4702
4703         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4704
4705         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4706                                ins->objectid, ins->offset, 0);
4707         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
4708                             offset, 0, false);
4709         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4710
4711         return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4712 }
4713
4714 /*
4715  * this is used by the tree logging recovery code.  It records that
4716  * an extent has been allocated and makes sure to clear the free
4717  * space cache bits as well
4718  */
4719 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4720                                    u64 root_objectid, u64 owner, u64 offset,
4721                                    struct btrfs_key *ins)
4722 {
4723         struct btrfs_fs_info *fs_info = trans->fs_info;
4724         int ret;
4725         struct btrfs_block_group *block_group;
4726         struct btrfs_space_info *space_info;
4727
4728         /*
4729          * Mixed block groups will exclude before processing the log so we only
4730          * need to do the exclude dance if this fs isn't mixed.
4731          */
4732         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4733                 ret = __exclude_logged_extent(fs_info, ins->objectid,
4734                                               ins->offset);
4735                 if (ret)
4736                         return ret;
4737         }
4738
4739         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4740         if (!block_group)
4741                 return -EINVAL;
4742
4743         space_info = block_group->space_info;
4744         spin_lock(&space_info->lock);
4745         spin_lock(&block_group->lock);
4746         space_info->bytes_reserved += ins->offset;
4747         block_group->reserved += ins->offset;
4748         spin_unlock(&block_group->lock);
4749         spin_unlock(&space_info->lock);
4750
4751         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4752                                          offset, ins, 1);
4753         if (ret)
4754                 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4755         btrfs_put_block_group(block_group);
4756         return ret;
4757 }
4758
4759 static struct extent_buffer *
4760 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4761                       u64 bytenr, int level, u64 owner,
4762                       enum btrfs_lock_nesting nest)
4763 {
4764         struct btrfs_fs_info *fs_info = root->fs_info;
4765         struct extent_buffer *buf;
4766         u64 lockdep_owner = owner;
4767
4768         buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
4769         if (IS_ERR(buf))
4770                 return buf;
4771
4772         /*
4773          * Extra safety check in case the extent tree is corrupted and extent
4774          * allocator chooses to use a tree block which is already used and
4775          * locked.
4776          */
4777         if (buf->lock_owner == current->pid) {
4778                 btrfs_err_rl(fs_info,
4779 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4780                         buf->start, btrfs_header_owner(buf), current->pid);
4781                 free_extent_buffer(buf);
4782                 return ERR_PTR(-EUCLEAN);
4783         }
4784
4785         /*
4786          * The reloc trees are just snapshots, so we need them to appear to be
4787          * just like any other fs tree WRT lockdep.
4788          *
4789          * The exception however is in replace_path() in relocation, where we
4790          * hold the lock on the original fs root and then search for the reloc
4791          * root.  At that point we need to make sure any reloc root buffers are
4792          * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
4793          * lockdep happy.
4794          */
4795         if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
4796             !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
4797                 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
4798
4799         /* btrfs_clear_buffer_dirty() accesses generation field. */
4800         btrfs_set_header_generation(buf, trans->transid);
4801
4802         /*
4803          * This needs to stay, because we could allocate a freed block from an
4804          * old tree into a new tree, so we need to make sure this new block is
4805          * set to the appropriate level and owner.
4806          */
4807         btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
4808
4809         __btrfs_tree_lock(buf, nest);
4810         btrfs_clear_buffer_dirty(trans, buf);
4811         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4812         clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
4813
4814         set_extent_buffer_uptodate(buf);
4815
4816         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4817         btrfs_set_header_level(buf, level);
4818         btrfs_set_header_bytenr(buf, buf->start);
4819         btrfs_set_header_generation(buf, trans->transid);
4820         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4821         btrfs_set_header_owner(buf, owner);
4822         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4823         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4824         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4825                 buf->log_index = root->log_transid % 2;
4826                 /*
4827                  * we allow two log transactions at a time, use different
4828                  * EXTENT bit to differentiate dirty pages.
4829                  */
4830                 if (buf->log_index == 0)
4831                         set_extent_bit(&root->dirty_log_pages, buf->start,
4832                                        buf->start + buf->len - 1,
4833                                        EXTENT_DIRTY, NULL);
4834                 else
4835                         set_extent_bit(&root->dirty_log_pages, buf->start,
4836                                        buf->start + buf->len - 1,
4837                                        EXTENT_NEW, NULL);
4838         } else {
4839                 buf->log_index = -1;
4840                 set_extent_bit(&trans->transaction->dirty_pages, buf->start,
4841                                buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
4842         }
4843         /* this returns a buffer locked for blocking */
4844         return buf;
4845 }
4846
4847 /*
4848  * finds a free extent and does all the dirty work required for allocation
4849  * returns the tree buffer or an ERR_PTR on error.
4850  */
4851 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4852                                              struct btrfs_root *root,
4853                                              u64 parent, u64 root_objectid,
4854                                              const struct btrfs_disk_key *key,
4855                                              int level, u64 hint,
4856                                              u64 empty_size,
4857                                              enum btrfs_lock_nesting nest)
4858 {
4859         struct btrfs_fs_info *fs_info = root->fs_info;
4860         struct btrfs_key ins;
4861         struct btrfs_block_rsv *block_rsv;
4862         struct extent_buffer *buf;
4863         struct btrfs_delayed_extent_op *extent_op;
4864         struct btrfs_ref generic_ref = { 0 };
4865         u64 flags = 0;
4866         int ret;
4867         u32 blocksize = fs_info->nodesize;
4868         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4869
4870 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4871         if (btrfs_is_testing(fs_info)) {
4872                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4873                                             level, root_objectid, nest);
4874                 if (!IS_ERR(buf))
4875                         root->alloc_bytenr += blocksize;
4876                 return buf;
4877         }
4878 #endif
4879
4880         block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4881         if (IS_ERR(block_rsv))
4882                 return ERR_CAST(block_rsv);
4883
4884         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4885                                    empty_size, hint, &ins, 0, 0);
4886         if (ret)
4887                 goto out_unuse;
4888
4889         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4890                                     root_objectid, nest);
4891         if (IS_ERR(buf)) {
4892                 ret = PTR_ERR(buf);
4893                 goto out_free_reserved;
4894         }
4895
4896         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4897                 if (parent == 0)
4898                         parent = ins.objectid;
4899                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4900         } else
4901                 BUG_ON(parent > 0);
4902
4903         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4904                 extent_op = btrfs_alloc_delayed_extent_op();
4905                 if (!extent_op) {
4906                         ret = -ENOMEM;
4907                         goto out_free_buf;
4908                 }
4909                 if (key)
4910                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4911                 else
4912                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4913                 extent_op->flags_to_set = flags;
4914                 extent_op->update_key = skinny_metadata ? false : true;
4915                 extent_op->update_flags = true;
4916                 extent_op->level = level;
4917
4918                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4919                                        ins.objectid, ins.offset, parent);
4920                 btrfs_init_tree_ref(&generic_ref, level, root_objectid,
4921                                     root->root_key.objectid, false);
4922                 btrfs_ref_tree_mod(fs_info, &generic_ref);
4923                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
4924                 if (ret)
4925                         goto out_free_delayed;
4926         }
4927         return buf;
4928
4929 out_free_delayed:
4930         btrfs_free_delayed_extent_op(extent_op);
4931 out_free_buf:
4932         btrfs_tree_unlock(buf);
4933         free_extent_buffer(buf);
4934 out_free_reserved:
4935         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4936 out_unuse:
4937         btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4938         return ERR_PTR(ret);
4939 }
4940
4941 struct walk_control {
4942         u64 refs[BTRFS_MAX_LEVEL];
4943         u64 flags[BTRFS_MAX_LEVEL];
4944         struct btrfs_key update_progress;
4945         struct btrfs_key drop_progress;
4946         int drop_level;
4947         int stage;
4948         int level;
4949         int shared_level;
4950         int update_ref;
4951         int keep_locks;
4952         int reada_slot;
4953         int reada_count;
4954         int restarted;
4955 };
4956
4957 #define DROP_REFERENCE  1
4958 #define UPDATE_BACKREF  2
4959
4960 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4961                                      struct btrfs_root *root,
4962                                      struct walk_control *wc,
4963                                      struct btrfs_path *path)
4964 {
4965         struct btrfs_fs_info *fs_info = root->fs_info;
4966         u64 bytenr;
4967         u64 generation;
4968         u64 refs;
4969         u64 flags;
4970         u32 nritems;
4971         struct btrfs_key key;
4972         struct extent_buffer *eb;
4973         int ret;
4974         int slot;
4975         int nread = 0;
4976
4977         if (path->slots[wc->level] < wc->reada_slot) {
4978                 wc->reada_count = wc->reada_count * 2 / 3;
4979                 wc->reada_count = max(wc->reada_count, 2);
4980         } else {
4981                 wc->reada_count = wc->reada_count * 3 / 2;
4982                 wc->reada_count = min_t(int, wc->reada_count,
4983                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
4984         }
4985
4986         eb = path->nodes[wc->level];
4987         nritems = btrfs_header_nritems(eb);
4988
4989         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
4990                 if (nread >= wc->reada_count)
4991                         break;
4992
4993                 cond_resched();
4994                 bytenr = btrfs_node_blockptr(eb, slot);
4995                 generation = btrfs_node_ptr_generation(eb, slot);
4996
4997                 if (slot == path->slots[wc->level])
4998                         goto reada;
4999
5000                 if (wc->stage == UPDATE_BACKREF &&
5001                     generation <= root->root_key.offset)
5002                         continue;
5003
5004                 /* We don't lock the tree block, it's OK to be racy here */
5005                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5006                                                wc->level - 1, 1, &refs,
5007                                                &flags);
5008                 /* We don't care about errors in readahead. */
5009                 if (ret < 0)
5010                         continue;
5011                 BUG_ON(refs == 0);
5012
5013                 if (wc->stage == DROP_REFERENCE) {
5014                         if (refs == 1)
5015                                 goto reada;
5016
5017                         if (wc->level == 1 &&
5018                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5019                                 continue;
5020                         if (!wc->update_ref ||
5021                             generation <= root->root_key.offset)
5022                                 continue;
5023                         btrfs_node_key_to_cpu(eb, &key, slot);
5024                         ret = btrfs_comp_cpu_keys(&key,
5025                                                   &wc->update_progress);
5026                         if (ret < 0)
5027                                 continue;
5028                 } else {
5029                         if (wc->level == 1 &&
5030                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5031                                 continue;
5032                 }
5033 reada:
5034                 btrfs_readahead_node_child(eb, slot);
5035                 nread++;
5036         }
5037         wc->reada_slot = slot;
5038 }
5039
5040 /*
5041  * helper to process tree block while walking down the tree.
5042  *
5043  * when wc->stage == UPDATE_BACKREF, this function updates
5044  * back refs for pointers in the block.
5045  *
5046  * NOTE: return value 1 means we should stop walking down.
5047  */
5048 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5049                                    struct btrfs_root *root,
5050                                    struct btrfs_path *path,
5051                                    struct walk_control *wc, int lookup_info)
5052 {
5053         struct btrfs_fs_info *fs_info = root->fs_info;
5054         int level = wc->level;
5055         struct extent_buffer *eb = path->nodes[level];
5056         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5057         int ret;
5058
5059         if (wc->stage == UPDATE_BACKREF &&
5060             btrfs_header_owner(eb) != root->root_key.objectid)
5061                 return 1;
5062
5063         /*
5064          * when reference count of tree block is 1, it won't increase
5065          * again. once full backref flag is set, we never clear it.
5066          */
5067         if (lookup_info &&
5068             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5069              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5070                 BUG_ON(!path->locks[level]);
5071                 ret = btrfs_lookup_extent_info(trans, fs_info,
5072                                                eb->start, level, 1,
5073                                                &wc->refs[level],
5074                                                &wc->flags[level]);
5075                 BUG_ON(ret == -ENOMEM);
5076                 if (ret)
5077                         return ret;
5078                 BUG_ON(wc->refs[level] == 0);
5079         }
5080
5081         if (wc->stage == DROP_REFERENCE) {
5082                 if (wc->refs[level] > 1)
5083                         return 1;
5084
5085                 if (path->locks[level] && !wc->keep_locks) {
5086                         btrfs_tree_unlock_rw(eb, path->locks[level]);
5087                         path->locks[level] = 0;
5088                 }
5089                 return 0;
5090         }
5091
5092         /* wc->stage == UPDATE_BACKREF */
5093         if (!(wc->flags[level] & flag)) {
5094                 BUG_ON(!path->locks[level]);
5095                 ret = btrfs_inc_ref(trans, root, eb, 1);
5096                 BUG_ON(ret); /* -ENOMEM */
5097                 ret = btrfs_dec_ref(trans, root, eb, 0);
5098                 BUG_ON(ret); /* -ENOMEM */
5099                 ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5100                 BUG_ON(ret); /* -ENOMEM */
5101                 wc->flags[level] |= flag;
5102         }
5103
5104         /*
5105          * the block is shared by multiple trees, so it's not good to
5106          * keep the tree lock
5107          */
5108         if (path->locks[level] && level > 0) {
5109                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5110                 path->locks[level] = 0;
5111         }
5112         return 0;
5113 }
5114
5115 /*
5116  * This is used to verify a ref exists for this root to deal with a bug where we
5117  * would have a drop_progress key that hadn't been updated properly.
5118  */
5119 static int check_ref_exists(struct btrfs_trans_handle *trans,
5120                             struct btrfs_root *root, u64 bytenr, u64 parent,
5121                             int level)
5122 {
5123         struct btrfs_path *path;
5124         struct btrfs_extent_inline_ref *iref;
5125         int ret;
5126
5127         path = btrfs_alloc_path();
5128         if (!path)
5129                 return -ENOMEM;
5130
5131         ret = lookup_extent_backref(trans, path, &iref, bytenr,
5132                                     root->fs_info->nodesize, parent,
5133                                     root->root_key.objectid, level, 0);
5134         btrfs_free_path(path);
5135         if (ret == -ENOENT)
5136                 return 0;
5137         if (ret < 0)
5138                 return ret;
5139         return 1;
5140 }
5141
5142 /*
5143  * helper to process tree block pointer.
5144  *
5145  * when wc->stage == DROP_REFERENCE, this function checks
5146  * reference count of the block pointed to. if the block
5147  * is shared and we need update back refs for the subtree
5148  * rooted at the block, this function changes wc->stage to
5149  * UPDATE_BACKREF. if the block is shared and there is no
5150  * need to update back, this function drops the reference
5151  * to the block.
5152  *
5153  * NOTE: return value 1 means we should stop walking down.
5154  */
5155 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5156                                  struct btrfs_root *root,
5157                                  struct btrfs_path *path,
5158                                  struct walk_control *wc, int *lookup_info)
5159 {
5160         struct btrfs_fs_info *fs_info = root->fs_info;
5161         u64 bytenr;
5162         u64 generation;
5163         u64 parent;
5164         struct btrfs_tree_parent_check check = { 0 };
5165         struct btrfs_key key;
5166         struct btrfs_ref ref = { 0 };
5167         struct extent_buffer *next;
5168         int level = wc->level;
5169         int reada = 0;
5170         int ret = 0;
5171         bool need_account = false;
5172
5173         generation = btrfs_node_ptr_generation(path->nodes[level],
5174                                                path->slots[level]);
5175         /*
5176          * if the lower level block was created before the snapshot
5177          * was created, we know there is no need to update back refs
5178          * for the subtree
5179          */
5180         if (wc->stage == UPDATE_BACKREF &&
5181             generation <= root->root_key.offset) {
5182                 *lookup_info = 1;
5183                 return 1;
5184         }
5185
5186         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5187
5188         check.level = level - 1;
5189         check.transid = generation;
5190         check.owner_root = root->root_key.objectid;
5191         check.has_first_key = true;
5192         btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
5193                               path->slots[level]);
5194
5195         next = find_extent_buffer(fs_info, bytenr);
5196         if (!next) {
5197                 next = btrfs_find_create_tree_block(fs_info, bytenr,
5198                                 root->root_key.objectid, level - 1);
5199                 if (IS_ERR(next))
5200                         return PTR_ERR(next);
5201                 reada = 1;
5202         }
5203         btrfs_tree_lock(next);
5204
5205         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5206                                        &wc->refs[level - 1],
5207                                        &wc->flags[level - 1]);
5208         if (ret < 0)
5209                 goto out_unlock;
5210
5211         if (unlikely(wc->refs[level - 1] == 0)) {
5212                 btrfs_err(fs_info, "Missing references.");
5213                 ret = -EIO;
5214                 goto out_unlock;
5215         }
5216         *lookup_info = 0;
5217
5218         if (wc->stage == DROP_REFERENCE) {
5219                 if (wc->refs[level - 1] > 1) {
5220                         need_account = true;
5221                         if (level == 1 &&
5222                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5223                                 goto skip;
5224
5225                         if (!wc->update_ref ||
5226                             generation <= root->root_key.offset)
5227                                 goto skip;
5228
5229                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5230                                               path->slots[level]);
5231                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5232                         if (ret < 0)
5233                                 goto skip;
5234
5235                         wc->stage = UPDATE_BACKREF;
5236                         wc->shared_level = level - 1;
5237                 }
5238         } else {
5239                 if (level == 1 &&
5240                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5241                         goto skip;
5242         }
5243
5244         if (!btrfs_buffer_uptodate(next, generation, 0)) {
5245                 btrfs_tree_unlock(next);
5246                 free_extent_buffer(next);
5247                 next = NULL;
5248                 *lookup_info = 1;
5249         }
5250
5251         if (!next) {
5252                 if (reada && level == 1)
5253                         reada_walk_down(trans, root, wc, path);
5254                 next = read_tree_block(fs_info, bytenr, &check);
5255                 if (IS_ERR(next)) {
5256                         return PTR_ERR(next);
5257                 } else if (!extent_buffer_uptodate(next)) {
5258                         free_extent_buffer(next);
5259                         return -EIO;
5260                 }
5261                 btrfs_tree_lock(next);
5262         }
5263
5264         level--;
5265         ASSERT(level == btrfs_header_level(next));
5266         if (level != btrfs_header_level(next)) {
5267                 btrfs_err(root->fs_info, "mismatched level");
5268                 ret = -EIO;
5269                 goto out_unlock;
5270         }
5271         path->nodes[level] = next;
5272         path->slots[level] = 0;
5273         path->locks[level] = BTRFS_WRITE_LOCK;
5274         wc->level = level;
5275         if (wc->level == 1)
5276                 wc->reada_slot = 0;
5277         return 0;
5278 skip:
5279         wc->refs[level - 1] = 0;
5280         wc->flags[level - 1] = 0;
5281         if (wc->stage == DROP_REFERENCE) {
5282                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5283                         parent = path->nodes[level]->start;
5284                 } else {
5285                         ASSERT(root->root_key.objectid ==
5286                                btrfs_header_owner(path->nodes[level]));
5287                         if (root->root_key.objectid !=
5288                             btrfs_header_owner(path->nodes[level])) {
5289                                 btrfs_err(root->fs_info,
5290                                                 "mismatched block owner");
5291                                 ret = -EIO;
5292                                 goto out_unlock;
5293                         }
5294                         parent = 0;
5295                 }
5296
5297                 /*
5298                  * If we had a drop_progress we need to verify the refs are set
5299                  * as expected.  If we find our ref then we know that from here
5300                  * on out everything should be correct, and we can clear the
5301                  * ->restarted flag.
5302                  */
5303                 if (wc->restarted) {
5304                         ret = check_ref_exists(trans, root, bytenr, parent,
5305                                                level - 1);
5306                         if (ret < 0)
5307                                 goto out_unlock;
5308                         if (ret == 0)
5309                                 goto no_delete;
5310                         ret = 0;
5311                         wc->restarted = 0;
5312                 }
5313
5314                 /*
5315                  * Reloc tree doesn't contribute to qgroup numbers, and we have
5316                  * already accounted them at merge time (replace_path),
5317                  * thus we could skip expensive subtree trace here.
5318                  */
5319                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5320                     need_account) {
5321                         ret = btrfs_qgroup_trace_subtree(trans, next,
5322                                                          generation, level - 1);
5323                         if (ret) {
5324                                 btrfs_err_rl(fs_info,
5325                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5326                                              ret);
5327                         }
5328                 }
5329
5330                 /*
5331                  * We need to update the next key in our walk control so we can
5332                  * update the drop_progress key accordingly.  We don't care if
5333                  * find_next_key doesn't find a key because that means we're at
5334                  * the end and are going to clean up now.
5335                  */
5336                 wc->drop_level = level;
5337                 find_next_key(path, level, &wc->drop_progress);
5338
5339                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5340                                        fs_info->nodesize, parent);
5341                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
5342                                     0, false);
5343                 ret = btrfs_free_extent(trans, &ref);
5344                 if (ret)
5345                         goto out_unlock;
5346         }
5347 no_delete:
5348         *lookup_info = 1;
5349         ret = 1;
5350
5351 out_unlock:
5352         btrfs_tree_unlock(next);
5353         free_extent_buffer(next);
5354
5355         return ret;
5356 }
5357
5358 /*
5359  * helper to process tree block while walking up the tree.
5360  *
5361  * when wc->stage == DROP_REFERENCE, this function drops
5362  * reference count on the block.
5363  *
5364  * when wc->stage == UPDATE_BACKREF, this function changes
5365  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5366  * to UPDATE_BACKREF previously while processing the block.
5367  *
5368  * NOTE: return value 1 means we should stop walking up.
5369  */
5370 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5371                                  struct btrfs_root *root,
5372                                  struct btrfs_path *path,
5373                                  struct walk_control *wc)
5374 {
5375         struct btrfs_fs_info *fs_info = root->fs_info;
5376         int ret;
5377         int level = wc->level;
5378         struct extent_buffer *eb = path->nodes[level];
5379         u64 parent = 0;
5380
5381         if (wc->stage == UPDATE_BACKREF) {
5382                 BUG_ON(wc->shared_level < level);
5383                 if (level < wc->shared_level)
5384                         goto out;
5385
5386                 ret = find_next_key(path, level + 1, &wc->update_progress);
5387                 if (ret > 0)
5388                         wc->update_ref = 0;
5389
5390                 wc->stage = DROP_REFERENCE;
5391                 wc->shared_level = -1;
5392                 path->slots[level] = 0;
5393
5394                 /*
5395                  * check reference count again if the block isn't locked.
5396                  * we should start walking down the tree again if reference
5397                  * count is one.
5398                  */
5399                 if (!path->locks[level]) {
5400                         BUG_ON(level == 0);
5401                         btrfs_tree_lock(eb);
5402                         path->locks[level] = BTRFS_WRITE_LOCK;
5403
5404                         ret = btrfs_lookup_extent_info(trans, fs_info,
5405                                                        eb->start, level, 1,
5406                                                        &wc->refs[level],
5407                                                        &wc->flags[level]);
5408                         if (ret < 0) {
5409                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5410                                 path->locks[level] = 0;
5411                                 return ret;
5412                         }
5413                         BUG_ON(wc->refs[level] == 0);
5414                         if (wc->refs[level] == 1) {
5415                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5416                                 path->locks[level] = 0;
5417                                 return 1;
5418                         }
5419                 }
5420         }
5421
5422         /* wc->stage == DROP_REFERENCE */
5423         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5424
5425         if (wc->refs[level] == 1) {
5426                 if (level == 0) {
5427                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5428                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5429                         else
5430                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5431                         BUG_ON(ret); /* -ENOMEM */
5432                         if (is_fstree(root->root_key.objectid)) {
5433                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5434                                 if (ret) {
5435                                         btrfs_err_rl(fs_info,
5436         "error %d accounting leaf items, quota is out of sync, rescan required",
5437                                              ret);
5438                                 }
5439                         }
5440                 }
5441                 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5442                 if (!path->locks[level]) {
5443                         btrfs_tree_lock(eb);
5444                         path->locks[level] = BTRFS_WRITE_LOCK;
5445                 }
5446                 btrfs_clear_buffer_dirty(trans, eb);
5447         }
5448
5449         if (eb == root->node) {
5450                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5451                         parent = eb->start;
5452                 else if (root->root_key.objectid != btrfs_header_owner(eb))
5453                         goto owner_mismatch;
5454         } else {
5455                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5456                         parent = path->nodes[level + 1]->start;
5457                 else if (root->root_key.objectid !=
5458                          btrfs_header_owner(path->nodes[level + 1]))
5459                         goto owner_mismatch;
5460         }
5461
5462         btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5463                               wc->refs[level] == 1);
5464 out:
5465         wc->refs[level] = 0;
5466         wc->flags[level] = 0;
5467         return 0;
5468
5469 owner_mismatch:
5470         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5471                      btrfs_header_owner(eb), root->root_key.objectid);
5472         return -EUCLEAN;
5473 }
5474
5475 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5476                                    struct btrfs_root *root,
5477                                    struct btrfs_path *path,
5478                                    struct walk_control *wc)
5479 {
5480         int level = wc->level;
5481         int lookup_info = 1;
5482         int ret = 0;
5483
5484         while (level >= 0) {
5485                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5486                 if (ret)
5487                         break;
5488
5489                 if (level == 0)
5490                         break;
5491
5492                 if (path->slots[level] >=
5493                     btrfs_header_nritems(path->nodes[level]))
5494                         break;
5495
5496                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5497                 if (ret > 0) {
5498                         path->slots[level]++;
5499                         continue;
5500                 } else if (ret < 0)
5501                         break;
5502                 level = wc->level;
5503         }
5504         return (ret == 1) ? 0 : ret;
5505 }
5506
5507 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5508                                  struct btrfs_root *root,
5509                                  struct btrfs_path *path,
5510                                  struct walk_control *wc, int max_level)
5511 {
5512         int level = wc->level;
5513         int ret;
5514
5515         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5516         while (level < max_level && path->nodes[level]) {
5517                 wc->level = level;
5518                 if (path->slots[level] + 1 <
5519                     btrfs_header_nritems(path->nodes[level])) {
5520                         path->slots[level]++;
5521                         return 0;
5522                 } else {
5523                         ret = walk_up_proc(trans, root, path, wc);
5524                         if (ret > 0)
5525                                 return 0;
5526                         if (ret < 0)
5527                                 return ret;
5528
5529                         if (path->locks[level]) {
5530                                 btrfs_tree_unlock_rw(path->nodes[level],
5531                                                      path->locks[level]);
5532                                 path->locks[level] = 0;
5533                         }
5534                         free_extent_buffer(path->nodes[level]);
5535                         path->nodes[level] = NULL;
5536                         level++;
5537                 }
5538         }
5539         return 1;
5540 }
5541
5542 /*
5543  * drop a subvolume tree.
5544  *
5545  * this function traverses the tree freeing any blocks that only
5546  * referenced by the tree.
5547  *
5548  * when a shared tree block is found. this function decreases its
5549  * reference count by one. if update_ref is true, this function
5550  * also make sure backrefs for the shared block and all lower level
5551  * blocks are properly updated.
5552  *
5553  * If called with for_reloc == 0, may exit early with -EAGAIN
5554  */
5555 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5556 {
5557         const bool is_reloc_root = (root->root_key.objectid ==
5558                                     BTRFS_TREE_RELOC_OBJECTID);
5559         struct btrfs_fs_info *fs_info = root->fs_info;
5560         struct btrfs_path *path;
5561         struct btrfs_trans_handle *trans;
5562         struct btrfs_root *tree_root = fs_info->tree_root;
5563         struct btrfs_root_item *root_item = &root->root_item;
5564         struct walk_control *wc;
5565         struct btrfs_key key;
5566         int err = 0;
5567         int ret;
5568         int level;
5569         bool root_dropped = false;
5570         bool unfinished_drop = false;
5571
5572         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5573
5574         path = btrfs_alloc_path();
5575         if (!path) {
5576                 err = -ENOMEM;
5577                 goto out;
5578         }
5579
5580         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5581         if (!wc) {
5582                 btrfs_free_path(path);
5583                 err = -ENOMEM;
5584                 goto out;
5585         }
5586
5587         /*
5588          * Use join to avoid potential EINTR from transaction start. See
5589          * wait_reserve_ticket and the whole reservation callchain.
5590          */
5591         if (for_reloc)
5592                 trans = btrfs_join_transaction(tree_root);
5593         else
5594                 trans = btrfs_start_transaction(tree_root, 0);
5595         if (IS_ERR(trans)) {
5596                 err = PTR_ERR(trans);
5597                 goto out_free;
5598         }
5599
5600         err = btrfs_run_delayed_items(trans);
5601         if (err)
5602                 goto out_end_trans;
5603
5604         /*
5605          * This will help us catch people modifying the fs tree while we're
5606          * dropping it.  It is unsafe to mess with the fs tree while it's being
5607          * dropped as we unlock the root node and parent nodes as we walk down
5608          * the tree, assuming nothing will change.  If something does change
5609          * then we'll have stale information and drop references to blocks we've
5610          * already dropped.
5611          */
5612         set_bit(BTRFS_ROOT_DELETING, &root->state);
5613         unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5614
5615         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5616                 level = btrfs_header_level(root->node);
5617                 path->nodes[level] = btrfs_lock_root_node(root);
5618                 path->slots[level] = 0;
5619                 path->locks[level] = BTRFS_WRITE_LOCK;
5620                 memset(&wc->update_progress, 0,
5621                        sizeof(wc->update_progress));
5622         } else {
5623                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5624                 memcpy(&wc->update_progress, &key,
5625                        sizeof(wc->update_progress));
5626
5627                 level = btrfs_root_drop_level(root_item);
5628                 BUG_ON(level == 0);
5629                 path->lowest_level = level;
5630                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5631                 path->lowest_level = 0;
5632                 if (ret < 0) {
5633                         err = ret;
5634                         goto out_end_trans;
5635                 }
5636                 WARN_ON(ret > 0);
5637
5638                 /*
5639                  * unlock our path, this is safe because only this
5640                  * function is allowed to delete this snapshot
5641                  */
5642                 btrfs_unlock_up_safe(path, 0);
5643
5644                 level = btrfs_header_level(root->node);
5645                 while (1) {
5646                         btrfs_tree_lock(path->nodes[level]);
5647                         path->locks[level] = BTRFS_WRITE_LOCK;
5648
5649                         ret = btrfs_lookup_extent_info(trans, fs_info,
5650                                                 path->nodes[level]->start,
5651                                                 level, 1, &wc->refs[level],
5652                                                 &wc->flags[level]);
5653                         if (ret < 0) {
5654                                 err = ret;
5655                                 goto out_end_trans;
5656                         }
5657                         BUG_ON(wc->refs[level] == 0);
5658
5659                         if (level == btrfs_root_drop_level(root_item))
5660                                 break;
5661
5662                         btrfs_tree_unlock(path->nodes[level]);
5663                         path->locks[level] = 0;
5664                         WARN_ON(wc->refs[level] != 1);
5665                         level--;
5666                 }
5667         }
5668
5669         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5670         wc->level = level;
5671         wc->shared_level = -1;
5672         wc->stage = DROP_REFERENCE;
5673         wc->update_ref = update_ref;
5674         wc->keep_locks = 0;
5675         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5676
5677         while (1) {
5678
5679                 ret = walk_down_tree(trans, root, path, wc);
5680                 if (ret < 0) {
5681                         btrfs_abort_transaction(trans, ret);
5682                         err = ret;
5683                         break;
5684                 }
5685
5686                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5687                 if (ret < 0) {
5688                         btrfs_abort_transaction(trans, ret);
5689                         err = ret;
5690                         break;
5691                 }
5692
5693                 if (ret > 0) {
5694                         BUG_ON(wc->stage != DROP_REFERENCE);
5695                         break;
5696                 }
5697
5698                 if (wc->stage == DROP_REFERENCE) {
5699                         wc->drop_level = wc->level;
5700                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5701                                               &wc->drop_progress,
5702                                               path->slots[wc->drop_level]);
5703                 }
5704                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
5705                                       &wc->drop_progress);
5706                 btrfs_set_root_drop_level(root_item, wc->drop_level);
5707
5708                 BUG_ON(wc->level == 0);
5709                 if (btrfs_should_end_transaction(trans) ||
5710                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5711                         ret = btrfs_update_root(trans, tree_root,
5712                                                 &root->root_key,
5713                                                 root_item);
5714                         if (ret) {
5715                                 btrfs_abort_transaction(trans, ret);
5716                                 err = ret;
5717                                 goto out_end_trans;
5718                         }
5719
5720                         if (!is_reloc_root)
5721                                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5722
5723                         btrfs_end_transaction_throttle(trans);
5724                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5725                                 btrfs_debug(fs_info,
5726                                             "drop snapshot early exit");
5727                                 err = -EAGAIN;
5728                                 goto out_free;
5729                         }
5730
5731                        /*
5732                         * Use join to avoid potential EINTR from transaction
5733                         * start. See wait_reserve_ticket and the whole
5734                         * reservation callchain.
5735                         */
5736                         if (for_reloc)
5737                                 trans = btrfs_join_transaction(tree_root);
5738                         else
5739                                 trans = btrfs_start_transaction(tree_root, 0);
5740                         if (IS_ERR(trans)) {
5741                                 err = PTR_ERR(trans);
5742                                 goto out_free;
5743                         }
5744                 }
5745         }
5746         btrfs_release_path(path);
5747         if (err)
5748                 goto out_end_trans;
5749
5750         ret = btrfs_del_root(trans, &root->root_key);
5751         if (ret) {
5752                 btrfs_abort_transaction(trans, ret);
5753                 err = ret;
5754                 goto out_end_trans;
5755         }
5756
5757         if (!is_reloc_root) {
5758                 ret = btrfs_find_root(tree_root, &root->root_key, path,
5759                                       NULL, NULL);
5760                 if (ret < 0) {
5761                         btrfs_abort_transaction(trans, ret);
5762                         err = ret;
5763                         goto out_end_trans;
5764                 } else if (ret > 0) {
5765                         /* if we fail to delete the orphan item this time
5766                          * around, it'll get picked up the next time.
5767                          *
5768                          * The most common failure here is just -ENOENT.
5769                          */
5770                         btrfs_del_orphan_item(trans, tree_root,
5771                                               root->root_key.objectid);
5772                 }
5773         }
5774
5775         /*
5776          * This subvolume is going to be completely dropped, and won't be
5777          * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5778          * commit transaction time.  So free it here manually.
5779          */
5780         btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5781         btrfs_qgroup_free_meta_all_pertrans(root);
5782
5783         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5784                 btrfs_add_dropped_root(trans, root);
5785         else
5786                 btrfs_put_root(root);
5787         root_dropped = true;
5788 out_end_trans:
5789         if (!is_reloc_root)
5790                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5791
5792         btrfs_end_transaction_throttle(trans);
5793 out_free:
5794         kfree(wc);
5795         btrfs_free_path(path);
5796 out:
5797         /*
5798          * We were an unfinished drop root, check to see if there are any
5799          * pending, and if not clear and wake up any waiters.
5800          */
5801         if (!err && unfinished_drop)
5802                 btrfs_maybe_wake_unfinished_drop(fs_info);
5803
5804         /*
5805          * So if we need to stop dropping the snapshot for whatever reason we
5806          * need to make sure to add it back to the dead root list so that we
5807          * keep trying to do the work later.  This also cleans up roots if we
5808          * don't have it in the radix (like when we recover after a power fail
5809          * or unmount) so we don't leak memory.
5810          */
5811         if (!for_reloc && !root_dropped)
5812                 btrfs_add_dead_root(root);
5813         return err;
5814 }
5815
5816 /*
5817  * drop subtree rooted at tree block 'node'.
5818  *
5819  * NOTE: this function will unlock and release tree block 'node'
5820  * only used by relocation code
5821  */
5822 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5823                         struct btrfs_root *root,
5824                         struct extent_buffer *node,
5825                         struct extent_buffer *parent)
5826 {
5827         struct btrfs_fs_info *fs_info = root->fs_info;
5828         struct btrfs_path *path;
5829         struct walk_control *wc;
5830         int level;
5831         int parent_level;
5832         int ret = 0;
5833         int wret;
5834
5835         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5836
5837         path = btrfs_alloc_path();
5838         if (!path)
5839                 return -ENOMEM;
5840
5841         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5842         if (!wc) {
5843                 btrfs_free_path(path);
5844                 return -ENOMEM;
5845         }
5846
5847         btrfs_assert_tree_write_locked(parent);
5848         parent_level = btrfs_header_level(parent);
5849         atomic_inc(&parent->refs);
5850         path->nodes[parent_level] = parent;
5851         path->slots[parent_level] = btrfs_header_nritems(parent);
5852
5853         btrfs_assert_tree_write_locked(node);
5854         level = btrfs_header_level(node);
5855         path->nodes[level] = node;
5856         path->slots[level] = 0;
5857         path->locks[level] = BTRFS_WRITE_LOCK;
5858
5859         wc->refs[parent_level] = 1;
5860         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5861         wc->level = level;
5862         wc->shared_level = -1;
5863         wc->stage = DROP_REFERENCE;
5864         wc->update_ref = 0;
5865         wc->keep_locks = 1;
5866         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5867
5868         while (1) {
5869                 wret = walk_down_tree(trans, root, path, wc);
5870                 if (wret < 0) {
5871                         ret = wret;
5872                         break;
5873                 }
5874
5875                 wret = walk_up_tree(trans, root, path, wc, parent_level);
5876                 if (wret < 0)
5877                         ret = wret;
5878                 if (wret != 0)
5879                         break;
5880         }
5881
5882         kfree(wc);
5883         btrfs_free_path(path);
5884         return ret;
5885 }
5886
5887 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5888                                    u64 start, u64 end)
5889 {
5890         return unpin_extent_range(fs_info, start, end, false);
5891 }
5892
5893 /*
5894  * It used to be that old block groups would be left around forever.
5895  * Iterating over them would be enough to trim unused space.  Since we
5896  * now automatically remove them, we also need to iterate over unallocated
5897  * space.
5898  *
5899  * We don't want a transaction for this since the discard may take a
5900  * substantial amount of time.  We don't require that a transaction be
5901  * running, but we do need to take a running transaction into account
5902  * to ensure that we're not discarding chunks that were released or
5903  * allocated in the current transaction.
5904  *
5905  * Holding the chunks lock will prevent other threads from allocating
5906  * or releasing chunks, but it won't prevent a running transaction
5907  * from committing and releasing the memory that the pending chunks
5908  * list head uses.  For that, we need to take a reference to the
5909  * transaction and hold the commit root sem.  We only need to hold
5910  * it while performing the free space search since we have already
5911  * held back allocations.
5912  */
5913 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5914 {
5915         u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
5916         int ret;
5917
5918         *trimmed = 0;
5919
5920         /* Discard not supported = nothing to do. */
5921         if (!bdev_max_discard_sectors(device->bdev))
5922                 return 0;
5923
5924         /* Not writable = nothing to do. */
5925         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
5926                 return 0;
5927
5928         /* No free space = nothing to do. */
5929         if (device->total_bytes <= device->bytes_used)
5930                 return 0;
5931
5932         ret = 0;
5933
5934         while (1) {
5935                 struct btrfs_fs_info *fs_info = device->fs_info;
5936                 u64 bytes;
5937
5938                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5939                 if (ret)
5940                         break;
5941
5942                 find_first_clear_extent_bit(&device->alloc_state, start,
5943                                             &start, &end,
5944                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
5945
5946                 /* Check if there are any CHUNK_* bits left */
5947                 if (start > device->total_bytes) {
5948                         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5949                         btrfs_warn_in_rcu(fs_info,
5950 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5951                                           start, end - start + 1,
5952                                           btrfs_dev_name(device),
5953                                           device->total_bytes);
5954                         mutex_unlock(&fs_info->chunk_mutex);
5955                         ret = 0;
5956                         break;
5957                 }
5958
5959                 /* Ensure we skip the reserved space on each device. */
5960                 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
5961
5962                 /*
5963                  * If find_first_clear_extent_bit find a range that spans the
5964                  * end of the device it will set end to -1, in this case it's up
5965                  * to the caller to trim the value to the size of the device.
5966                  */
5967                 end = min(end, device->total_bytes - 1);
5968
5969                 len = end - start + 1;
5970
5971                 /* We didn't find any extents */
5972                 if (!len) {
5973                         mutex_unlock(&fs_info->chunk_mutex);
5974                         ret = 0;
5975                         break;
5976                 }
5977
5978                 ret = btrfs_issue_discard(device->bdev, start, len,
5979                                           &bytes);
5980                 if (!ret)
5981                         set_extent_bit(&device->alloc_state, start,
5982                                        start + bytes - 1, CHUNK_TRIMMED, NULL);
5983                 mutex_unlock(&fs_info->chunk_mutex);
5984
5985                 if (ret)
5986                         break;
5987
5988                 start += len;
5989                 *trimmed += bytes;
5990
5991                 if (fatal_signal_pending(current)) {
5992                         ret = -ERESTARTSYS;
5993                         break;
5994                 }
5995
5996                 cond_resched();
5997         }
5998
5999         return ret;
6000 }
6001
6002 /*
6003  * Trim the whole filesystem by:
6004  * 1) trimming the free space in each block group
6005  * 2) trimming the unallocated space on each device
6006  *
6007  * This will also continue trimming even if a block group or device encounters
6008  * an error.  The return value will be the last error, or 0 if nothing bad
6009  * happens.
6010  */
6011 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6012 {
6013         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6014         struct btrfs_block_group *cache = NULL;
6015         struct btrfs_device *device;
6016         u64 group_trimmed;
6017         u64 range_end = U64_MAX;
6018         u64 start;
6019         u64 end;
6020         u64 trimmed = 0;
6021         u64 bg_failed = 0;
6022         u64 dev_failed = 0;
6023         int bg_ret = 0;
6024         int dev_ret = 0;
6025         int ret = 0;
6026
6027         if (range->start == U64_MAX)
6028                 return -EINVAL;
6029
6030         /*
6031          * Check range overflow if range->len is set.
6032          * The default range->len is U64_MAX.
6033          */
6034         if (range->len != U64_MAX &&
6035             check_add_overflow(range->start, range->len, &range_end))
6036                 return -EINVAL;
6037
6038         cache = btrfs_lookup_first_block_group(fs_info, range->start);
6039         for (; cache; cache = btrfs_next_block_group(cache)) {
6040                 if (cache->start >= range_end) {
6041                         btrfs_put_block_group(cache);
6042                         break;
6043                 }
6044
6045                 start = max(range->start, cache->start);
6046                 end = min(range_end, cache->start + cache->length);
6047
6048                 if (end - start >= range->minlen) {
6049                         if (!btrfs_block_group_done(cache)) {
6050                                 ret = btrfs_cache_block_group(cache, true);
6051                                 if (ret) {
6052                                         bg_failed++;
6053                                         bg_ret = ret;
6054                                         continue;
6055                                 }
6056                         }
6057                         ret = btrfs_trim_block_group(cache,
6058                                                      &group_trimmed,
6059                                                      start,
6060                                                      end,
6061                                                      range->minlen);
6062
6063                         trimmed += group_trimmed;
6064                         if (ret) {
6065                                 bg_failed++;
6066                                 bg_ret = ret;
6067                                 continue;
6068                         }
6069                 }
6070         }
6071
6072         if (bg_failed)
6073                 btrfs_warn(fs_info,
6074                         "failed to trim %llu block group(s), last error %d",
6075                         bg_failed, bg_ret);
6076
6077         mutex_lock(&fs_devices->device_list_mutex);
6078         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6079                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6080                         continue;
6081
6082                 ret = btrfs_trim_free_extents(device, &group_trimmed);
6083                 if (ret) {
6084                         dev_failed++;
6085                         dev_ret = ret;
6086                         break;
6087                 }
6088
6089                 trimmed += group_trimmed;
6090         }
6091         mutex_unlock(&fs_devices->device_list_mutex);
6092
6093         if (dev_failed)
6094                 btrfs_warn(fs_info,
6095                         "failed to trim %llu device(s), last error %d",
6096                         dev_failed, dev_ret);
6097         range->len = trimmed;
6098         if (bg_ret)
6099                 return bg_ret;
6100         return dev_ret;
6101 }