fs/btrfs/extent-tree.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/signal.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/writeback.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/sort.h>
  12 #include <linux/rcupdate.h>
  13 #include <linux/kthread.h>
  14 #include <linux/slab.h>
  15 #include <linux/ratelimit.h>
  16 #include <linux/percpu_counter.h>
  17 #include <linux/lockdep.h>
  18 #include <linux/crc32c.h>
  19 #include "tree-log.h"
  20 #include "disk-io.h"
  21 #include "print-tree.h"
  22 #include "volumes.h"
  23 #include "raid56.h"
  24 #include "locking.h"
  25 #include "free-space-cache.h"
  26 #include "free-space-tree.h"
  27 #include "math.h"
  28 #include "sysfs.h"
  29 #include "qgroup.h"
  30 #include "ref-verify.h"
  31 #include "space-info.h"
  32 #include "block-rsv.h"
  33 #include "delalloc-space.h"
  34
  35 #undef SCRAMBLE_DELAYED_REFS
  36
  37
  38 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  39                                struct btrfs_delayed_ref_node *node, u64 parent,
  40                                u64 root_objectid, u64 owner_objectid,
  41                                u64 owner_offset, int refs_to_drop,
  42                                struct btrfs_delayed_extent_op *extra_op);
  43 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  44                                     struct extent_buffer *leaf,
  45                                     struct btrfs_extent_item *ei);
  46 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  47                                       u64 parent, u64 root_objectid,
  48                                       u64 flags, u64 owner, u64 offset,
  49                                       struct btrfs_key *ins, int ref_mod);
  50 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  51                                      struct btrfs_delayed_ref_node *node,
  52                                      struct btrfs_delayed_extent_op *extent_op);
  53 static int find_next_key(struct btrfs_path *path, int level,
  54                          struct btrfs_key *key);
  55
  56 static noinline int
  57 block_group_cache_done(struct btrfs_block_group_cache *cache)
  58 {
  59         smp_mb();
  60         return cache->cached == BTRFS_CACHE_FINISHED ||
  61                 cache->cached == BTRFS_CACHE_ERROR;
  62 }
  63
  64 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  65 {
  66         return (cache->flags & bits) == bits;
  67 }
  68
  69 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  70 {
  71         atomic_inc(&cache->count);
  72 }
  73
  74 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
  75 {
  76         if (atomic_dec_and_test(&cache->count)) {
  77                 WARN_ON(cache->pinned > 0);
  78                 WARN_ON(cache->reserved > 0);
  79
  80                 /*
  81                  * If not empty, someone is still holding mutex of
  82                  * full_stripe_lock, which can only be released by caller.
  83                  * And it will definitely cause use-after-free when caller
  84                  * tries to release full stripe lock.
  85                  *
  86                  * No better way to resolve, but only to warn.
  87                  */
  88                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
  89                 kfree(cache->free_space_ctl);
  90                 kfree(cache);
  91         }
  92 }
  93
  94 /*
  95  * this adds the block group to the fs_info rb tree for the block group
  96  * cache
  97  */
  98 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
  99                                 struct btrfs_block_group_cache *block_group)
 100 {
 101         struct rb_node **p;
 102         struct rb_node *parent = NULL;
 103         struct btrfs_block_group_cache *cache;
 104
 105         spin_lock(&info->block_group_cache_lock);
 106         p = &info->block_group_cache_tree.rb_node;
 107
 108         while (*p) {
 109                 parent = *p;
 110                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 111                                  cache_node);
 112                 if (block_group->key.objectid < cache->key.objectid) {
 113                         p = &(*p)->rb_left;
 114                 } else if (block_group->key.objectid > cache->key.objectid) {
 115                         p = &(*p)->rb_right;
 116                 } else {
 117                         spin_unlock(&info->block_group_cache_lock);
 118                         return -EEXIST;
 119                 }
 120         }
 121
 122         rb_link_node(&block_group->cache_node, parent, p);
 123         rb_insert_color(&block_group->cache_node,
 124                         &info->block_group_cache_tree);
 125
 126         if (info->first_logical_byte > block_group->key.objectid)
 127                 info->first_logical_byte = block_group->key.objectid;
 128
 129         spin_unlock(&info->block_group_cache_lock);
 130
 131         return 0;
 132 }
 133
 134 /*
 135  * This will return the block group at or after bytenr if contains is 0, else
 136  * it will return the block group that contains the bytenr
 137  */
 138 static struct btrfs_block_group_cache *
 139 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 140                               int contains)
 141 {
 142         struct btrfs_block_group_cache *cache, *ret = NULL;
 143         struct rb_node *n;
 144         u64 end, start;
 145
 146         spin_lock(&info->block_group_cache_lock);
 147         n = info->block_group_cache_tree.rb_node;
 148
 149         while (n) {
 150                 cache = rb_entry(n, struct btrfs_block_group_cache,
 151                                  cache_node);
 152                 end = cache->key.objectid + cache->key.offset - 1;
 153                 start = cache->key.objectid;
 154
 155                 if (bytenr < start) {
 156                         if (!contains && (!ret || start < ret->key.objectid))
 157                                 ret = cache;
 158                         n = n->rb_left;
 159                 } else if (bytenr > start) {
 160                         if (contains && bytenr <= end) {
 161                                 ret = cache;
 162                                 break;
 163                         }
 164                         n = n->rb_right;
 165                 } else {
 166                         ret = cache;
 167                         break;
 168                 }
 169         }
 170         if (ret) {
 171                 btrfs_get_block_group(ret);
 172                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 173                         info->first_logical_byte = ret->key.objectid;
 174         }
 175         spin_unlock(&info->block_group_cache_lock);
 176
 177         return ret;
 178 }
 179
 180 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
 181                                u64 start, u64 num_bytes)
 182 {
 183         u64 end = start + num_bytes - 1;
 184         set_extent_bits(&fs_info->freed_extents[0],
 185                         start, end, EXTENT_UPTODATE);
 186         set_extent_bits(&fs_info->freed_extents[1],
 187                         start, end, EXTENT_UPTODATE);
 188         return 0;
 189 }
 190
 191 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
 192 {
 193         struct btrfs_fs_info *fs_info = cache->fs_info;
 194         u64 start, end;
 195
 196         start = cache->key.objectid;
 197         end = start + cache->key.offset - 1;
 198
 199         clear_extent_bits(&fs_info->freed_extents[0],
 200                           start, end, EXTENT_UPTODATE);
 201         clear_extent_bits(&fs_info->freed_extents[1],
 202                           start, end, EXTENT_UPTODATE);
 203 }
 204
 205 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
 206 {
 207         struct btrfs_fs_info *fs_info = cache->fs_info;
 208         u64 bytenr;
 209         u64 *logical;
 210         int stripe_len;
 211         int i, nr, ret;
 212
 213         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 214                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 215                 cache->bytes_super += stripe_len;
 216                 ret = add_excluded_extent(fs_info, cache->key.objectid,
 217                                           stripe_len);
 218                 if (ret)
 219                         return ret;
 220         }
 221
 222         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 223                 bytenr = btrfs_sb_offset(i);
 224                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
 225                                        bytenr, &logical, &nr, &stripe_len);
 226                 if (ret)
 227                         return ret;
 228
 229                 while (nr--) {
 230                         u64 start, len;
 231
 232                         if (logical[nr] > cache->key.objectid +
 233                             cache->key.offset)
 234                                 continue;
 235
 236                         if (logical[nr] + stripe_len <= cache->key.objectid)
 237                                 continue;
 238
 239                         start = logical[nr];
 240                         if (start < cache->key.objectid) {
 241                                 start = cache->key.objectid;
 242                                 len = (logical[nr] + stripe_len) - start;
 243                         } else {
 244                                 len = min_t(u64, stripe_len,
 245                                             cache->key.objectid +
 246                                             cache->key.offset - start);
 247                         }
 248
 249                         cache->bytes_super += len;
 250                         ret = add_excluded_extent(fs_info, start, len);
 251                         if (ret) {
 252                                 kfree(logical);
 253                                 return ret;
 254                         }
 255                 }
 256
 257                 kfree(logical);
 258         }
 259         return 0;
 260 }
 261
 262 static struct btrfs_caching_control *
 263 get_caching_control(struct btrfs_block_group_cache *cache)
 264 {
 265         struct btrfs_caching_control *ctl;
 266
 267         spin_lock(&cache->lock);
 268         if (!cache->caching_ctl) {
 269                 spin_unlock(&cache->lock);
 270                 return NULL;
 271         }
 272
 273         ctl = cache->caching_ctl;
 274         refcount_inc(&ctl->count);
 275         spin_unlock(&cache->lock);
 276         return ctl;
 277 }
 278
 279 static void put_caching_control(struct btrfs_caching_control *ctl)
 280 {
 281         if (refcount_dec_and_test(&ctl->count))
 282                 kfree(ctl);
 283 }
 284
 285 #ifdef CONFIG_BTRFS_DEBUG
 286 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
 287 {
 288         struct btrfs_fs_info *fs_info = block_group->fs_info;
 289         u64 start = block_group->key.objectid;
 290         u64 len = block_group->key.offset;
 291         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 292                 fs_info->nodesize : fs_info->sectorsize;
 293         u64 step = chunk << 1;
 294
 295         while (len > chunk) {
 296                 btrfs_remove_free_space(block_group, start, chunk);
 297                 start += step;
 298                 if (len < step)
 299                         len = 0;
 300                 else
 301                         len -= step;
 302         }
 303 }
 304 #endif
 305
 306 /*
 307  * this is only called by cache_block_group, since we could have freed extents
 308  * we need to check the pinned_extents for any extents that can't be used yet
 309  * since their free space will be released as soon as the transaction commits.
 310  */
 311 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 312                        u64 start, u64 end)
 313 {
 314         struct btrfs_fs_info *info = block_group->fs_info;
 315         u64 extent_start, extent_end, size, total_added = 0;
 316         int ret;
 317
 318         while (start < end) {
 319                 ret = find_first_extent_bit(info->pinned_extents, start,
 320                                             &extent_start, &extent_end,
 321                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 322                                             NULL);
 323                 if (ret)
 324                         break;
 325
 326                 if (extent_start <= start) {
 327                         start = extent_end + 1;
 328                 } else if (extent_start > start && extent_start < end) {
 329                         size = extent_start - start;
 330                         total_added += size;
 331                         ret = btrfs_add_free_space(block_group, start,
 332                                                    size);
 333                         BUG_ON(ret); /* -ENOMEM or logic error */
 334                         start = extent_end + 1;
 335                 } else {
 336                         break;
 337                 }
 338         }
 339
 340         if (start < end) {
 341                 size = end - start;
 342                 total_added += size;
 343                 ret = btrfs_add_free_space(block_group, start, size);
 344                 BUG_ON(ret); /* -ENOMEM or logic error */
 345         }
 346
 347         return total_added;
 348 }
 349
 350 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 351 {
 352         struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
 353         struct btrfs_fs_info *fs_info = block_group->fs_info;
 354         struct btrfs_root *extent_root = fs_info->extent_root;
 355         struct btrfs_path *path;
 356         struct extent_buffer *leaf;
 357         struct btrfs_key key;
 358         u64 total_found = 0;
 359         u64 last = 0;
 360         u32 nritems;
 361         int ret;
 362         bool wakeup = true;
 363
 364         path = btrfs_alloc_path();
 365         if (!path)
 366                 return -ENOMEM;
 367
 368         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 369
 370 #ifdef CONFIG_BTRFS_DEBUG
 371         /*
 372          * If we're fragmenting we don't want to make anybody think we can
 373          * allocate from this block group until we've had a chance to fragment
 374          * the free space.
 375          */
 376         if (btrfs_should_fragment_free_space(block_group))
 377                 wakeup = false;
 378 #endif
 379         /*
 380          * We don't want to deadlock with somebody trying to allocate a new
 381          * extent for the extent root while also trying to search the extent
 382          * root to add free space.  So we skip locking and search the commit
 383          * root, since its read-only
 384          */
 385         path->skip_locking = 1;
 386         path->search_commit_root = 1;
 387         path->reada = READA_FORWARD;
 388
 389         key.objectid = last;
 390         key.offset = 0;
 391         key.type = BTRFS_EXTENT_ITEM_KEY;
 392
 393 next:
 394         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 395         if (ret < 0)
 396                 goto out;
 397
 398         leaf = path->nodes[0];
 399         nritems = btrfs_header_nritems(leaf);
 400
 401         while (1) {
 402                 if (btrfs_fs_closing(fs_info) > 1) {
 403                         last = (u64)-1;
 404                         break;
 405                 }
 406
 407                 if (path->slots[0] < nritems) {
 408                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 409                 } else {
 410                         ret = find_next_key(path, 0, &key);
 411                         if (ret)
 412                                 break;
 413
 414                         if (need_resched() ||
 415                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 416                                 if (wakeup)
 417                                         caching_ctl->progress = last;
 418                                 btrfs_release_path(path);
 419                                 up_read(&fs_info->commit_root_sem);
 420                                 mutex_unlock(&caching_ctl->mutex);
 421                                 cond_resched();
 422                                 mutex_lock(&caching_ctl->mutex);
 423                                 down_read(&fs_info->commit_root_sem);
 424                                 goto next;
 425                         }
 426
 427                         ret = btrfs_next_leaf(extent_root, path);
 428                         if (ret < 0)
 429                                 goto out;
 430                         if (ret)
 431                                 break;
 432                         leaf = path->nodes[0];
 433                         nritems = btrfs_header_nritems(leaf);
 434                         continue;
 435                 }
 436
 437                 if (key.objectid < last) {
 438                         key.objectid = last;
 439                         key.offset = 0;
 440                         key.type = BTRFS_EXTENT_ITEM_KEY;
 441
 442                         if (wakeup)
 443                                 caching_ctl->progress = last;
 444                         btrfs_release_path(path);
 445                         goto next;
 446                 }
 447
 448                 if (key.objectid < block_group->key.objectid) {
 449                         path->slots[0]++;
 450                         continue;
 451                 }
 452
 453                 if (key.objectid >= block_group->key.objectid +
 454                     block_group->key.offset)
 455                         break;
 456
 457                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 458                     key.type == BTRFS_METADATA_ITEM_KEY) {
 459                         total_found += add_new_free_space(block_group, last,
 460                                                           key.objectid);
 461                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 462                                 last = key.objectid +
 463                                         fs_info->nodesize;
 464                         else
 465                                 last = key.objectid + key.offset;
 466
 467                         if (total_found > CACHING_CTL_WAKE_UP) {
 468                                 total_found = 0;
 469                                 if (wakeup)
 470                                         wake_up(&caching_ctl->wait);
 471                         }
 472                 }
 473                 path->slots[0]++;
 474         }
 475         ret = 0;
 476
 477         total_found += add_new_free_space(block_group, last,
 478                                           block_group->key.objectid +
 479                                           block_group->key.offset);
 480         caching_ctl->progress = (u64)-1;
 481
 482 out:
 483         btrfs_free_path(path);
 484         return ret;
 485 }
 486
 487 static noinline void caching_thread(struct btrfs_work *work)
 488 {
 489         struct btrfs_block_group_cache *block_group;
 490         struct btrfs_fs_info *fs_info;
 491         struct btrfs_caching_control *caching_ctl;
 492         int ret;
 493
 494         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 495         block_group = caching_ctl->block_group;
 496         fs_info = block_group->fs_info;
 497
 498         mutex_lock(&caching_ctl->mutex);
 499         down_read(&fs_info->commit_root_sem);
 500
 501         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 502                 ret = load_free_space_tree(caching_ctl);
 503         else
 504                 ret = load_extent_tree_free(caching_ctl);
 505
 506         spin_lock(&block_group->lock);
 507         block_group->caching_ctl = NULL;
 508         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 509         spin_unlock(&block_group->lock);
 510
 511 #ifdef CONFIG_BTRFS_DEBUG
 512         if (btrfs_should_fragment_free_space(block_group)) {
 513                 u64 bytes_used;
 514
 515                 spin_lock(&block_group->space_info->lock);
 516                 spin_lock(&block_group->lock);
 517                 bytes_used = block_group->key.offset -
 518                         btrfs_block_group_used(&block_group->item);
 519                 block_group->space_info->bytes_used += bytes_used >> 1;
 520                 spin_unlock(&block_group->lock);
 521                 spin_unlock(&block_group->space_info->lock);
 522                 fragment_free_space(block_group);
 523         }
 524 #endif
 525
 526         caching_ctl->progress = (u64)-1;
 527
 528         up_read(&fs_info->commit_root_sem);
 529         free_excluded_extents(block_group);
 530         mutex_unlock(&caching_ctl->mutex);
 531
 532         wake_up(&caching_ctl->wait);
 533
 534         put_caching_control(caching_ctl);
 535         btrfs_put_block_group(block_group);
 536 }
 537
 538 static int cache_block_group(struct btrfs_block_group_cache *cache,
 539                              int load_cache_only)
 540 {
 541         DEFINE_WAIT(wait);
 542         struct btrfs_fs_info *fs_info = cache->fs_info;
 543         struct btrfs_caching_control *caching_ctl;
 544         int ret = 0;
 545
 546         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 547         if (!caching_ctl)
 548                 return -ENOMEM;
 549
 550         INIT_LIST_HEAD(&caching_ctl->list);
 551         mutex_init(&caching_ctl->mutex);
 552         init_waitqueue_head(&caching_ctl->wait);
 553         caching_ctl->block_group = cache;
 554         caching_ctl->progress = cache->key.objectid;
 555         refcount_set(&caching_ctl->count, 1);
 556         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 557                         caching_thread, NULL, NULL);
 558
 559         spin_lock(&cache->lock);
 560         /*
 561          * This should be a rare occasion, but this could happen I think in the
 562          * case where one thread starts to load the space cache info, and then
 563          * some other thread starts a transaction commit which tries to do an
 564          * allocation while the other thread is still loading the space cache
 565          * info.  The previous loop should have kept us from choosing this block
 566          * group, but if we've moved to the state where we will wait on caching
 567          * block groups we need to first check if we're doing a fast load here,
 568          * so we can wait for it to finish, otherwise we could end up allocating
 569          * from a block group who's cache gets evicted for one reason or
 570          * another.
 571          */
 572         while (cache->cached == BTRFS_CACHE_FAST) {
 573                 struct btrfs_caching_control *ctl;
 574
 575                 ctl = cache->caching_ctl;
 576                 refcount_inc(&ctl->count);
 577                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 578                 spin_unlock(&cache->lock);
 579
 580                 schedule();
 581
 582                 finish_wait(&ctl->wait, &wait);
 583                 put_caching_control(ctl);
 584                 spin_lock(&cache->lock);
 585         }
 586
 587         if (cache->cached != BTRFS_CACHE_NO) {
 588                 spin_unlock(&cache->lock);
 589                 kfree(caching_ctl);
 590                 return 0;
 591         }
 592         WARN_ON(cache->caching_ctl);
 593         cache->caching_ctl = caching_ctl;
 594         cache->cached = BTRFS_CACHE_FAST;
 595         spin_unlock(&cache->lock);
 596
 597         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 598                 mutex_lock(&caching_ctl->mutex);
 599                 ret = load_free_space_cache(cache);
 600
 601                 spin_lock(&cache->lock);
 602                 if (ret == 1) {
 603                         cache->caching_ctl = NULL;
 604                         cache->cached = BTRFS_CACHE_FINISHED;
 605                         cache->last_byte_to_unpin = (u64)-1;
 606                         caching_ctl->progress = (u64)-1;
 607                 } else {
 608                         if (load_cache_only) {
 609                                 cache->caching_ctl = NULL;
 610                                 cache->cached = BTRFS_CACHE_NO;
 611                         } else {
 612                                 cache->cached = BTRFS_CACHE_STARTED;
 613                                 cache->has_caching_ctl = 1;
 614                         }
 615                 }
 616                 spin_unlock(&cache->lock);
 617 #ifdef CONFIG_BTRFS_DEBUG
 618                 if (ret == 1 &&
 619                     btrfs_should_fragment_free_space(cache)) {
 620                         u64 bytes_used;
 621
 622                         spin_lock(&cache->space_info->lock);
 623                         spin_lock(&cache->lock);
 624                         bytes_used = cache->key.offset -
 625                                 btrfs_block_group_used(&cache->item);
 626                         cache->space_info->bytes_used += bytes_used >> 1;
 627                         spin_unlock(&cache->lock);
 628                         spin_unlock(&cache->space_info->lock);
 629                         fragment_free_space(cache);
 630                 }
 631 #endif
 632                 mutex_unlock(&caching_ctl->mutex);
 633
 634                 wake_up(&caching_ctl->wait);
 635                 if (ret == 1) {
 636                         put_caching_control(caching_ctl);
 637                         free_excluded_extents(cache);
 638                         return 0;
 639                 }
 640         } else {
 641                 /*
 642                  * We're either using the free space tree or no caching at all.
 643                  * Set cached to the appropriate value and wakeup any waiters.
 644                  */
 645                 spin_lock(&cache->lock);
 646                 if (load_cache_only) {
 647                         cache->caching_ctl = NULL;
 648                         cache->cached = BTRFS_CACHE_NO;
 649                 } else {
 650                         cache->cached = BTRFS_CACHE_STARTED;
 651                         cache->has_caching_ctl = 1;
 652                 }
 653                 spin_unlock(&cache->lock);
 654                 wake_up(&caching_ctl->wait);
 655         }
 656
 657         if (load_cache_only) {
 658                 put_caching_control(caching_ctl);
 659                 return 0;
 660         }
 661
 662         down_write(&fs_info->commit_root_sem);
 663         refcount_inc(&caching_ctl->count);
 664         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 665         up_write(&fs_info->commit_root_sem);
 666
 667         btrfs_get_block_group(cache);
 668
 669         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 670
 671         return ret;
 672 }
 673
 674 /*
 675  * return the block group that starts at or after bytenr
 676  */
 677 static struct btrfs_block_group_cache *
 678 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 679 {
 680         return block_group_cache_tree_search(info, bytenr, 0);
 681 }
 682
 683 /*
 684  * return the block group that contains the given bytenr
 685  */
 686 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 687                                                  struct btrfs_fs_info *info,
 688                                                  u64 bytenr)
 689 {
 690         return block_group_cache_tree_search(info, bytenr, 1);
 691 }
 692
 693 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
 694 {
 695         if (ref->type == BTRFS_REF_METADATA) {
 696                 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
 697                         return BTRFS_BLOCK_GROUP_SYSTEM;
 698                 else
 699                         return BTRFS_BLOCK_GROUP_METADATA;
 700         }
 701         return BTRFS_BLOCK_GROUP_DATA;
 702 }
 703
 704 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
 705                              struct btrfs_ref *ref)
 706 {
 707         struct btrfs_space_info *space_info;
 708         u64 flags = generic_ref_to_space_flags(ref);
 709
 710         space_info = btrfs_find_space_info(fs_info, flags);
 711         ASSERT(space_info);
 712         percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
 713                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
 714 }
 715
 716 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
 717                              struct btrfs_ref *ref)
 718 {
 719         struct btrfs_space_info *space_info;
 720         u64 flags = generic_ref_to_space_flags(ref);
 721
 722         space_info = btrfs_find_space_info(fs_info, flags);
 723         ASSERT(space_info);
 724         percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
 725                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
 726 }
 727
 728 /* simple helper to search for an existing data extent at a given offset */
 729 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 730 {
 731         int ret;
 732         struct btrfs_key key;
 733         struct btrfs_path *path;
 734
 735         path = btrfs_alloc_path();
 736         if (!path)
 737                 return -ENOMEM;
 738
 739         key.objectid = start;
 740         key.offset = len;
 741         key.type = BTRFS_EXTENT_ITEM_KEY;
 742         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 743         btrfs_free_path(path);
 744         return ret;
 745 }
 746
 747 /*
 748  * helper function to lookup reference count and flags of a tree block.
 749  *
 750  * the head node for delayed ref is used to store the sum of all the
 751  * reference count modifications queued up in the rbtree. the head
 752  * node may also store the extent flags to set. This way you can check
 753  * to see what the reference count and extent flags would be if all of
 754  * the delayed refs are not processed.
 755  */
 756 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 757                              struct btrfs_fs_info *fs_info, u64 bytenr,
 758                              u64 offset, int metadata, u64 *refs, u64 *flags)
 759 {
 760         struct btrfs_delayed_ref_head *head;
 761         struct btrfs_delayed_ref_root *delayed_refs;
 762         struct btrfs_path *path;
 763         struct btrfs_extent_item *ei;
 764         struct extent_buffer *leaf;
 765         struct btrfs_key key;
 766         u32 item_size;
 767         u64 num_refs;
 768         u64 extent_flags;
 769         int ret;
 770
 771         /*
 772          * If we don't have skinny metadata, don't bother doing anything
 773          * different
 774          */
 775         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
 776                 offset = fs_info->nodesize;
 777                 metadata = 0;
 778         }
 779
 780         path = btrfs_alloc_path();
 781         if (!path)
 782                 return -ENOMEM;
 783
 784         if (!trans) {
 785                 path->skip_locking = 1;
 786                 path->search_commit_root = 1;
 787         }
 788
 789 search_again:
 790         key.objectid = bytenr;
 791         key.offset = offset;
 792         if (metadata)
 793                 key.type = BTRFS_METADATA_ITEM_KEY;
 794         else
 795                 key.type = BTRFS_EXTENT_ITEM_KEY;
 796
 797         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 798         if (ret < 0)
 799                 goto out_free;
 800
 801         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 802                 if (path->slots[0]) {
 803                         path->slots[0]--;
 804                         btrfs_item_key_to_cpu(path->nodes[0], &key,
 805                                               path->slots[0]);
 806                         if (key.objectid == bytenr &&
 807                             key.type == BTRFS_EXTENT_ITEM_KEY &&
 808                             key.offset == fs_info->nodesize)
 809                                 ret = 0;
 810                 }
 811         }
 812
 813         if (ret == 0) {
 814                 leaf = path->nodes[0];
 815                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 816                 if (item_size >= sizeof(*ei)) {
 817                         ei = btrfs_item_ptr(leaf, path->slots[0],
 818                                             struct btrfs_extent_item);
 819                         num_refs = btrfs_extent_refs(leaf, ei);
 820                         extent_flags = btrfs_extent_flags(leaf, ei);
 821                 } else {
 822                         ret = -EINVAL;
 823                         btrfs_print_v0_err(fs_info);
 824                         if (trans)
 825                                 btrfs_abort_transaction(trans, ret);
 826                         else
 827                                 btrfs_handle_fs_error(fs_info, ret, NULL);
 828
 829                         goto out_free;
 830                 }
 831
 832                 BUG_ON(num_refs == 0);
 833         } else {
 834                 num_refs = 0;
 835                 extent_flags = 0;
 836                 ret = 0;
 837         }
 838
 839         if (!trans)
 840                 goto out;
 841
 842         delayed_refs = &trans->transaction->delayed_refs;
 843         spin_lock(&delayed_refs->lock);
 844         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 845         if (head) {
 846                 if (!mutex_trylock(&head->mutex)) {
 847                         refcount_inc(&head->refs);
 848                         spin_unlock(&delayed_refs->lock);
 849
 850                         btrfs_release_path(path);
 851
 852                         /*
 853                          * Mutex was contended, block until it's released and try
 854                          * again
 855                          */
 856                         mutex_lock(&head->mutex);
 857                         mutex_unlock(&head->mutex);
 858                         btrfs_put_delayed_ref_head(head);
 859                         goto search_again;
 860                 }
 861                 spin_lock(&head->lock);
 862                 if (head->extent_op && head->extent_op->update_flags)
 863                         extent_flags |= head->extent_op->flags_to_set;
 864                 else
 865                         BUG_ON(num_refs == 0);
 866
 867                 num_refs += head->ref_mod;
 868                 spin_unlock(&head->lock);
 869                 mutex_unlock(&head->mutex);
 870         }
 871         spin_unlock(&delayed_refs->lock);
 872 out:
 873         WARN_ON(num_refs == 0);
 874         if (refs)
 875                 *refs = num_refs;
 876         if (flags)
 877                 *flags = extent_flags;
 878 out_free:
 879         btrfs_free_path(path);
 880         return ret;
 881 }
 882
 883 /*
 884  * Back reference rules.  Back refs have three main goals:
 885  *
 886  * 1) differentiate between all holders of references to an extent so that
 887  *    when a reference is dropped we can make sure it was a valid reference
 888  *    before freeing the extent.
 889  *
 890  * 2) Provide enough information to quickly find the holders of an extent
 891  *    if we notice a given block is corrupted or bad.
 892  *
 893  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 894  *    maintenance.  This is actually the same as #2, but with a slightly
 895  *    different use case.
 896  *
 897  * There are two kinds of back refs. The implicit back refs is optimized
 898  * for pointers in non-shared tree blocks. For a given pointer in a block,
 899  * back refs of this kind provide information about the block's owner tree
 900  * and the pointer's key. These information allow us to find the block by
 901  * b-tree searching. The full back refs is for pointers in tree blocks not
 902  * referenced by their owner trees. The location of tree block is recorded
 903  * in the back refs. Actually the full back refs is generic, and can be
 904  * used in all cases the implicit back refs is used. The major shortcoming
 905  * of the full back refs is its overhead. Every time a tree block gets
 906  * COWed, we have to update back refs entry for all pointers in it.
 907  *
 908  * For a newly allocated tree block, we use implicit back refs for
 909  * pointers in it. This means most tree related operations only involve
 910  * implicit back refs. For a tree block created in old transaction, the
 911  * only way to drop a reference to it is COW it. So we can detect the
 912  * event that tree block loses its owner tree's reference and do the
 913  * back refs conversion.
 914  *
 915  * When a tree block is COWed through a tree, there are four cases:
 916  *
 917  * The reference count of the block is one and the tree is the block's
 918  * owner tree. Nothing to do in this case.
 919  *
 920  * The reference count of the block is one and the tree is not the
 921  * block's owner tree. In this case, full back refs is used for pointers
 922  * in the block. Remove these full back refs, add implicit back refs for
 923  * every pointers in the new block.
 924  *
 925  * The reference count of the block is greater than one and the tree is
 926  * the block's owner tree. In this case, implicit back refs is used for
 927  * pointers in the block. Add full back refs for every pointers in the
 928  * block, increase lower level extents' reference counts. The original
 929  * implicit back refs are entailed to the new block.
 930  *
 931  * The reference count of the block is greater than one and the tree is
 932  * not the block's owner tree. Add implicit back refs for every pointer in
 933  * the new block, increase lower level extents' reference count.
 934  *
 935  * Back Reference Key composing:
 936  *
 937  * The key objectid corresponds to the first byte in the extent,
 938  * The key type is used to differentiate between types of back refs.
 939  * There are different meanings of the key offset for different types
 940  * of back refs.
 941  *
 942  * File extents can be referenced by:
 943  *
 944  * - multiple snapshots, subvolumes, or different generations in one subvol
 945  * - different files inside a single subvolume
 946  * - different offsets inside a file (bookend extents in file.c)
 947  *
 948  * The extent ref structure for the implicit back refs has fields for:
 949  *
 950  * - Objectid of the subvolume root
 951  * - objectid of the file holding the reference
 952  * - original offset in the file
 953  * - how many bookend extents
 954  *
 955  * The key offset for the implicit back refs is hash of the first
 956  * three fields.
 957  *
 958  * The extent ref structure for the full back refs has field for:
 959  *
 960  * - number of pointers in the tree leaf
 961  *
 962  * The key offset for the implicit back refs is the first byte of
 963  * the tree leaf
 964  *
 965  * When a file extent is allocated, The implicit back refs is used.
 966  * the fields are filled in:
 967  *
 968  *     (root_key.objectid, inode objectid, offset in file, 1)
 969  *
 970  * When a file extent is removed file truncation, we find the
 971  * corresponding implicit back refs and check the following fields:
 972  *
 973  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 974  *
 975  * Btree extents can be referenced by:
 976  *
 977  * - Different subvolumes
 978  *
 979  * Both the implicit back refs and the full back refs for tree blocks
 980  * only consist of key. The key offset for the implicit back refs is
 981  * objectid of block's owner tree. The key offset for the full back refs
 982  * is the first byte of parent block.
 983  *
 984  * When implicit back refs is used, information about the lowest key and
 985  * level of the tree block are required. These information are stored in
 986  * tree block info structure.
 987  */
 988
 989 /*
 990  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 991  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
 992  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 993  */
 994 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 995                                      struct btrfs_extent_inline_ref *iref,
 996                                      enum btrfs_inline_ref_type is_data)
 997 {
 998         int type = btrfs_extent_inline_ref_type(eb, iref);
 999         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1000
1001         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1002             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1003             type == BTRFS_SHARED_DATA_REF_KEY ||
1004             type == BTRFS_EXTENT_DATA_REF_KEY) {
1005                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1006                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1007                                 return type;
1008                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1009                                 ASSERT(eb->fs_info);
1010                                 /*
1011                                  * Every shared one has parent tree
1012                                  * block, which must be aligned to
1013                                  * nodesize.
1014                                  */
1015                                 if (offset &&
1016                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1017                                         return type;
1018                         }
1019                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1020                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1021                                 return type;
1022                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1023                                 ASSERT(eb->fs_info);
1024                                 /*
1025                                  * Every shared one has parent tree
1026                                  * block, which must be aligned to
1027                                  * nodesize.
1028                                  */
1029                                 if (offset &&
1030                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1031                                         return type;
1032                         }
1033                 } else {
1034                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1035                         return type;
1036                 }
1037         }
1038
1039         btrfs_print_leaf((struct extent_buffer *)eb);
1040         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1041                   eb->start, type);
1042         WARN_ON(1);
1043
1044         return BTRFS_REF_TYPE_INVALID;
1045 }
1046
1047 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1048 {
1049         u32 high_crc = ~(u32)0;
1050         u32 low_crc = ~(u32)0;
1051         __le64 lenum;
1052
1053         lenum = cpu_to_le64(root_objectid);
1054         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1055         lenum = cpu_to_le64(owner);
1056         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1057         lenum = cpu_to_le64(offset);
1058         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1059
1060         return ((u64)high_crc << 31) ^ (u64)low_crc;
1061 }
1062
1063 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1064                                      struct btrfs_extent_data_ref *ref)
1065 {
1066         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1067                                     btrfs_extent_data_ref_objectid(leaf, ref),
1068                                     btrfs_extent_data_ref_offset(leaf, ref));
1069 }
1070
1071 static int match_extent_data_ref(struct extent_buffer *leaf,
1072                                  struct btrfs_extent_data_ref *ref,
1073                                  u64 root_objectid, u64 owner, u64 offset)
1074 {
1075         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1076             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1077             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1078                 return 0;
1079         return 1;
1080 }
1081
1082 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1083                                            struct btrfs_path *path,
1084                                            u64 bytenr, u64 parent,
1085                                            u64 root_objectid,
1086                                            u64 owner, u64 offset)
1087 {
1088         struct btrfs_root *root = trans->fs_info->extent_root;
1089         struct btrfs_key key;
1090         struct btrfs_extent_data_ref *ref;
1091         struct extent_buffer *leaf;
1092         u32 nritems;
1093         int ret;
1094         int recow;
1095         int err = -ENOENT;
1096
1097         key.objectid = bytenr;
1098         if (parent) {
1099                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1100                 key.offset = parent;
1101         } else {
1102                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1103                 key.offset = hash_extent_data_ref(root_objectid,
1104                                                   owner, offset);
1105         }
1106 again:
1107         recow = 0;
1108         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1109         if (ret < 0) {
1110                 err = ret;
1111                 goto fail;
1112         }
1113
1114         if (parent) {
1115                 if (!ret)
1116                         return 0;
1117                 goto fail;
1118         }
1119
1120         leaf = path->nodes[0];
1121         nritems = btrfs_header_nritems(leaf);
1122         while (1) {
1123                 if (path->slots[0] >= nritems) {
1124                         ret = btrfs_next_leaf(root, path);
1125                         if (ret < 0)
1126                                 err = ret;
1127                         if (ret)
1128                                 goto fail;
1129
1130                         leaf = path->nodes[0];
1131                         nritems = btrfs_header_nritems(leaf);
1132                         recow = 1;
1133                 }
1134
1135                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1136                 if (key.objectid != bytenr ||
1137                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1138                         goto fail;
1139
1140                 ref = btrfs_item_ptr(leaf, path->slots[0],
1141                                      struct btrfs_extent_data_ref);
1142
1143                 if (match_extent_data_ref(leaf, ref, root_objectid,
1144                                           owner, offset)) {
1145                         if (recow) {
1146                                 btrfs_release_path(path);
1147                                 goto again;
1148                         }
1149                         err = 0;
1150                         break;
1151                 }
1152                 path->slots[0]++;
1153         }
1154 fail:
1155         return err;
1156 }
1157
1158 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1159                                            struct btrfs_path *path,
1160                                            u64 bytenr, u64 parent,
1161                                            u64 root_objectid, u64 owner,
1162                                            u64 offset, int refs_to_add)
1163 {
1164         struct btrfs_root *root = trans->fs_info->extent_root;
1165         struct btrfs_key key;
1166         struct extent_buffer *leaf;
1167         u32 size;
1168         u32 num_refs;
1169         int ret;
1170
1171         key.objectid = bytenr;
1172         if (parent) {
1173                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1174                 key.offset = parent;
1175                 size = sizeof(struct btrfs_shared_data_ref);
1176         } else {
1177                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1178                 key.offset = hash_extent_data_ref(root_objectid,
1179                                                   owner, offset);
1180                 size = sizeof(struct btrfs_extent_data_ref);
1181         }
1182
1183         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1184         if (ret && ret != -EEXIST)
1185                 goto fail;
1186
1187         leaf = path->nodes[0];
1188         if (parent) {
1189                 struct btrfs_shared_data_ref *ref;
1190                 ref = btrfs_item_ptr(leaf, path->slots[0],
1191                                      struct btrfs_shared_data_ref);
1192                 if (ret == 0) {
1193                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1194                 } else {
1195                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1196                         num_refs += refs_to_add;
1197                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1198                 }
1199         } else {
1200                 struct btrfs_extent_data_ref *ref;
1201                 while (ret == -EEXIST) {
1202                         ref = btrfs_item_ptr(leaf, path->slots[0],
1203                                              struct btrfs_extent_data_ref);
1204                         if (match_extent_data_ref(leaf, ref, root_objectid,
1205                                                   owner, offset))
1206                                 break;
1207                         btrfs_release_path(path);
1208                         key.offset++;
1209                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1210                                                       size);
1211                         if (ret && ret != -EEXIST)
1212                                 goto fail;
1213
1214                         leaf = path->nodes[0];
1215                 }
1216                 ref = btrfs_item_ptr(leaf, path->slots[0],
1217                                      struct btrfs_extent_data_ref);
1218                 if (ret == 0) {
1219                         btrfs_set_extent_data_ref_root(leaf, ref,
1220                                                        root_objectid);
1221                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1222                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1223                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1224                 } else {
1225                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1226                         num_refs += refs_to_add;
1227                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1228                 }
1229         }
1230         btrfs_mark_buffer_dirty(leaf);
1231         ret = 0;
1232 fail:
1233         btrfs_release_path(path);
1234         return ret;
1235 }
1236
1237 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1238                                            struct btrfs_path *path,
1239                                            int refs_to_drop, int *last_ref)
1240 {
1241         struct btrfs_key key;
1242         struct btrfs_extent_data_ref *ref1 = NULL;
1243         struct btrfs_shared_data_ref *ref2 = NULL;
1244         struct extent_buffer *leaf;
1245         u32 num_refs = 0;
1246         int ret = 0;
1247
1248         leaf = path->nodes[0];
1249         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1250
1251         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1252                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1253                                       struct btrfs_extent_data_ref);
1254                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1255         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1256                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1257                                       struct btrfs_shared_data_ref);
1258                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1259         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1260                 btrfs_print_v0_err(trans->fs_info);
1261                 btrfs_abort_transaction(trans, -EINVAL);
1262                 return -EINVAL;
1263         } else {
1264                 BUG();
1265         }
1266
1267         BUG_ON(num_refs < refs_to_drop);
1268         num_refs -= refs_to_drop;
1269
1270         if (num_refs == 0) {
1271                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1272                 *last_ref = 1;
1273         } else {
1274                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1275                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1276                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1277                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1278                 btrfs_mark_buffer_dirty(leaf);
1279         }
1280         return ret;
1281 }
1282
1283 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1284                                           struct btrfs_extent_inline_ref *iref)
1285 {
1286         struct btrfs_key key;
1287         struct extent_buffer *leaf;
1288         struct btrfs_extent_data_ref *ref1;
1289         struct btrfs_shared_data_ref *ref2;
1290         u32 num_refs = 0;
1291         int type;
1292
1293         leaf = path->nodes[0];
1294         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1295
1296         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1297         if (iref) {
1298                 /*
1299                  * If type is invalid, we should have bailed out earlier than
1300                  * this call.
1301                  */
1302                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1303                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1304                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1305                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1306                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1307                 } else {
1308                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1309                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1310                 }
1311         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1312                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1313                                       struct btrfs_extent_data_ref);
1314                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1315         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1316                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1317                                       struct btrfs_shared_data_ref);
1318                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1319         } else {
1320                 WARN_ON(1);
1321         }
1322         return num_refs;
1323 }
1324
1325 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1326                                           struct btrfs_path *path,
1327                                           u64 bytenr, u64 parent,
1328                                           u64 root_objectid)
1329 {
1330         struct btrfs_root *root = trans->fs_info->extent_root;
1331         struct btrfs_key key;
1332         int ret;
1333
1334         key.objectid = bytenr;
1335         if (parent) {
1336                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1337                 key.offset = parent;
1338         } else {
1339                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1340                 key.offset = root_objectid;
1341         }
1342
1343         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1344         if (ret > 0)
1345                 ret = -ENOENT;
1346         return ret;
1347 }
1348
1349 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1350                                           struct btrfs_path *path,
1351                                           u64 bytenr, u64 parent,
1352                                           u64 root_objectid)
1353 {
1354         struct btrfs_key key;
1355         int ret;
1356
1357         key.objectid = bytenr;
1358         if (parent) {
1359                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1360                 key.offset = parent;
1361         } else {
1362                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1363                 key.offset = root_objectid;
1364         }
1365
1366         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1367                                       path, &key, 0);
1368         btrfs_release_path(path);
1369         return ret;
1370 }
1371
1372 static inline int extent_ref_type(u64 parent, u64 owner)
1373 {
1374         int type;
1375         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1376                 if (parent > 0)
1377                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1378                 else
1379                         type = BTRFS_TREE_BLOCK_REF_KEY;
1380         } else {
1381                 if (parent > 0)
1382                         type = BTRFS_SHARED_DATA_REF_KEY;
1383                 else
1384                         type = BTRFS_EXTENT_DATA_REF_KEY;
1385         }
1386         return type;
1387 }
1388
1389 static int find_next_key(struct btrfs_path *path, int level,
1390                          struct btrfs_key *key)
1391
1392 {
1393         for (; level < BTRFS_MAX_LEVEL; level++) {
1394                 if (!path->nodes[level])
1395                         break;
1396                 if (path->slots[level] + 1 >=
1397                     btrfs_header_nritems(path->nodes[level]))
1398                         continue;
1399                 if (level == 0)
1400                         btrfs_item_key_to_cpu(path->nodes[level], key,
1401                                               path->slots[level] + 1);
1402                 else
1403                         btrfs_node_key_to_cpu(path->nodes[level], key,
1404                                               path->slots[level] + 1);
1405                 return 0;
1406         }
1407         return 1;
1408 }
1409
1410 /*
1411  * look for inline back ref. if back ref is found, *ref_ret is set
1412  * to the address of inline back ref, and 0 is returned.
1413  *
1414  * if back ref isn't found, *ref_ret is set to the address where it
1415  * should be inserted, and -ENOENT is returned.
1416  *
1417  * if insert is true and there are too many inline back refs, the path
1418  * points to the extent item, and -EAGAIN is returned.
1419  *
1420  * NOTE: inline back refs are ordered in the same way that back ref
1421  *       items in the tree are ordered.
1422  */
1423 static noinline_for_stack
1424 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1425                                  struct btrfs_path *path,
1426                                  struct btrfs_extent_inline_ref **ref_ret,
1427                                  u64 bytenr, u64 num_bytes,
1428                                  u64 parent, u64 root_objectid,
1429                                  u64 owner, u64 offset, int insert)
1430 {
1431         struct btrfs_fs_info *fs_info = trans->fs_info;
1432         struct btrfs_root *root = fs_info->extent_root;
1433         struct btrfs_key key;
1434         struct extent_buffer *leaf;
1435         struct btrfs_extent_item *ei;
1436         struct btrfs_extent_inline_ref *iref;
1437         u64 flags;
1438         u64 item_size;
1439         unsigned long ptr;
1440         unsigned long end;
1441         int extra_size;
1442         int type;
1443         int want;
1444         int ret;
1445         int err = 0;
1446         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1447         int needed;
1448
1449         key.objectid = bytenr;
1450         key.type = BTRFS_EXTENT_ITEM_KEY;
1451         key.offset = num_bytes;
1452
1453         want = extent_ref_type(parent, owner);
1454         if (insert) {
1455                 extra_size = btrfs_extent_inline_ref_size(want);
1456                 path->keep_locks = 1;
1457         } else
1458                 extra_size = -1;
1459
1460         /*
1461          * Owner is our level, so we can just add one to get the level for the
1462          * block we are interested in.
1463          */
1464         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1465                 key.type = BTRFS_METADATA_ITEM_KEY;
1466                 key.offset = owner;
1467         }
1468
1469 again:
1470         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1471         if (ret < 0) {
1472                 err = ret;
1473                 goto out;
1474         }
1475
1476         /*
1477          * We may be a newly converted file system which still has the old fat
1478          * extent entries for metadata, so try and see if we have one of those.
1479          */
1480         if (ret > 0 && skinny_metadata) {
1481                 skinny_metadata = false;
1482                 if (path->slots[0]) {
1483                         path->slots[0]--;
1484                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1485                                               path->slots[0]);
1486                         if (key.objectid == bytenr &&
1487                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1488                             key.offset == num_bytes)
1489                                 ret = 0;
1490                 }
1491                 if (ret) {
1492                         key.objectid = bytenr;
1493                         key.type = BTRFS_EXTENT_ITEM_KEY;
1494                         key.offset = num_bytes;
1495                         btrfs_release_path(path);
1496                         goto again;
1497                 }
1498         }
1499
1500         if (ret && !insert) {
1501                 err = -ENOENT;
1502                 goto out;
1503         } else if (WARN_ON(ret)) {
1504                 err = -EIO;
1505                 goto out;
1506         }
1507
1508         leaf = path->nodes[0];
1509         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1510         if (unlikely(item_size < sizeof(*ei))) {
1511                 err = -EINVAL;
1512                 btrfs_print_v0_err(fs_info);
1513                 btrfs_abort_transaction(trans, err);
1514                 goto out;
1515         }
1516
1517         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1518         flags = btrfs_extent_flags(leaf, ei);
1519
1520         ptr = (unsigned long)(ei + 1);
1521         end = (unsigned long)ei + item_size;
1522
1523         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1524                 ptr += sizeof(struct btrfs_tree_block_info);
1525                 BUG_ON(ptr > end);
1526         }
1527
1528         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1529                 needed = BTRFS_REF_TYPE_DATA;
1530         else
1531                 needed = BTRFS_REF_TYPE_BLOCK;
1532
1533         err = -ENOENT;
1534         while (1) {
1535                 if (ptr >= end) {
1536                         WARN_ON(ptr > end);
1537                         break;
1538                 }
1539                 iref = (struct btrfs_extent_inline_ref *)ptr;
1540                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1541                 if (type == BTRFS_REF_TYPE_INVALID) {
1542                         err = -EUCLEAN;
1543                         goto out;
1544                 }
1545
1546                 if (want < type)
1547                         break;
1548                 if (want > type) {
1549                         ptr += btrfs_extent_inline_ref_size(type);
1550                         continue;
1551                 }
1552
1553                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1554                         struct btrfs_extent_data_ref *dref;
1555                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1556                         if (match_extent_data_ref(leaf, dref, root_objectid,
1557                                                   owner, offset)) {
1558                                 err = 0;
1559                                 break;
1560                         }
1561                         if (hash_extent_data_ref_item(leaf, dref) <
1562                             hash_extent_data_ref(root_objectid, owner, offset))
1563                                 break;
1564                 } else {
1565                         u64 ref_offset;
1566                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1567                         if (parent > 0) {
1568                                 if (parent == ref_offset) {
1569                                         err = 0;
1570                                         break;
1571                                 }
1572                                 if (ref_offset < parent)
1573                                         break;
1574                         } else {
1575                                 if (root_objectid == ref_offset) {
1576                                         err = 0;
1577                                         break;
1578                                 }
1579                                 if (ref_offset < root_objectid)
1580                                         break;
1581                         }
1582                 }
1583                 ptr += btrfs_extent_inline_ref_size(type);
1584         }
1585         if (err == -ENOENT && insert) {
1586                 if (item_size + extra_size >=
1587                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1588                         err = -EAGAIN;
1589                         goto out;
1590                 }
1591                 /*
1592                  * To add new inline back ref, we have to make sure
1593                  * there is no corresponding back ref item.
1594                  * For simplicity, we just do not add new inline back
1595                  * ref if there is any kind of item for this block
1596                  */
1597                 if (find_next_key(path, 0, &key) == 0 &&
1598                     key.objectid == bytenr &&
1599                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1600                         err = -EAGAIN;
1601                         goto out;
1602                 }
1603         }
1604         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1605 out:
1606         if (insert) {
1607                 path->keep_locks = 0;
1608                 btrfs_unlock_up_safe(path, 1);
1609         }
1610         return err;
1611 }
1612
1613 /*
1614  * helper to add new inline back ref
1615  */
1616 static noinline_for_stack
1617 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1618                                  struct btrfs_path *path,
1619                                  struct btrfs_extent_inline_ref *iref,
1620                                  u64 parent, u64 root_objectid,
1621                                  u64 owner, u64 offset, int refs_to_add,
1622                                  struct btrfs_delayed_extent_op *extent_op)
1623 {
1624         struct extent_buffer *leaf;
1625         struct btrfs_extent_item *ei;
1626         unsigned long ptr;
1627         unsigned long end;
1628         unsigned long item_offset;
1629         u64 refs;
1630         int size;
1631         int type;
1632
1633         leaf = path->nodes[0];
1634         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1635         item_offset = (unsigned long)iref - (unsigned long)ei;
1636
1637         type = extent_ref_type(parent, owner);
1638         size = btrfs_extent_inline_ref_size(type);
1639
1640         btrfs_extend_item(path, size);
1641
1642         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1643         refs = btrfs_extent_refs(leaf, ei);
1644         refs += refs_to_add;
1645         btrfs_set_extent_refs(leaf, ei, refs);
1646         if (extent_op)
1647                 __run_delayed_extent_op(extent_op, leaf, ei);
1648
1649         ptr = (unsigned long)ei + item_offset;
1650         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1651         if (ptr < end - size)
1652                 memmove_extent_buffer(leaf, ptr + size, ptr,
1653                                       end - size - ptr);
1654
1655         iref = (struct btrfs_extent_inline_ref *)ptr;
1656         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1657         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1658                 struct btrfs_extent_data_ref *dref;
1659                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1660                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1661                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1662                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1663                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1664         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1665                 struct btrfs_shared_data_ref *sref;
1666                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1667                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1668                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1669         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1670                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1671         } else {
1672                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1673         }
1674         btrfs_mark_buffer_dirty(leaf);
1675 }
1676
1677 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1678                                  struct btrfs_path *path,
1679                                  struct btrfs_extent_inline_ref **ref_ret,
1680                                  u64 bytenr, u64 num_bytes, u64 parent,
1681                                  u64 root_objectid, u64 owner, u64 offset)
1682 {
1683         int ret;
1684
1685         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1686                                            num_bytes, parent, root_objectid,
1687                                            owner, offset, 0);
1688         if (ret != -ENOENT)
1689                 return ret;
1690
1691         btrfs_release_path(path);
1692         *ref_ret = NULL;
1693
1694         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1695                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1696                                             root_objectid);
1697         } else {
1698                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1699                                              root_objectid, owner, offset);
1700         }
1701         return ret;
1702 }
1703
1704 /*
1705  * helper to update/remove inline back ref
1706  */
1707 static noinline_for_stack
1708 void update_inline_extent_backref(struct btrfs_path *path,
1709                                   struct btrfs_extent_inline_ref *iref,
1710                                   int refs_to_mod,
1711                                   struct btrfs_delayed_extent_op *extent_op,
1712                                   int *last_ref)
1713 {
1714         struct extent_buffer *leaf = path->nodes[0];
1715         struct btrfs_extent_item *ei;
1716         struct btrfs_extent_data_ref *dref = NULL;
1717         struct btrfs_shared_data_ref *sref = NULL;
1718         unsigned long ptr;
1719         unsigned long end;
1720         u32 item_size;
1721         int size;
1722         int type;
1723         u64 refs;
1724
1725         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1726         refs = btrfs_extent_refs(leaf, ei);
1727         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1728         refs += refs_to_mod;
1729         btrfs_set_extent_refs(leaf, ei, refs);
1730         if (extent_op)
1731                 __run_delayed_extent_op(extent_op, leaf, ei);
1732
1733         /*
1734          * If type is invalid, we should have bailed out after
1735          * lookup_inline_extent_backref().
1736          */
1737         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1738         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1739
1740         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1741                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1742                 refs = btrfs_extent_data_ref_count(leaf, dref);
1743         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1744                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1745                 refs = btrfs_shared_data_ref_count(leaf, sref);
1746         } else {
1747                 refs = 1;
1748                 BUG_ON(refs_to_mod != -1);
1749         }
1750
1751         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1752         refs += refs_to_mod;
1753
1754         if (refs > 0) {
1755                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1756                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1757                 else
1758                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1759         } else {
1760                 *last_ref = 1;
1761                 size =  btrfs_extent_inline_ref_size(type);
1762                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1763                 ptr = (unsigned long)iref;
1764                 end = (unsigned long)ei + item_size;
1765                 if (ptr + size < end)
1766                         memmove_extent_buffer(leaf, ptr, ptr + size,
1767                                               end - ptr - size);
1768                 item_size -= size;
1769                 btrfs_truncate_item(path, item_size, 1);
1770         }
1771         btrfs_mark_buffer_dirty(leaf);
1772 }
1773
1774 static noinline_for_stack
1775 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1776                                  struct btrfs_path *path,
1777                                  u64 bytenr, u64 num_bytes, u64 parent,
1778                                  u64 root_objectid, u64 owner,
1779                                  u64 offset, int refs_to_add,
1780                                  struct btrfs_delayed_extent_op *extent_op)
1781 {
1782         struct btrfs_extent_inline_ref *iref;
1783         int ret;
1784
1785         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1786                                            num_bytes, parent, root_objectid,
1787                                            owner, offset, 1);
1788         if (ret == 0) {
1789                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1790                 update_inline_extent_backref(path, iref, refs_to_add,
1791                                              extent_op, NULL);
1792         } else if (ret == -ENOENT) {
1793                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1794                                             root_objectid, owner, offset,
1795                                             refs_to_add, extent_op);
1796                 ret = 0;
1797         }
1798         return ret;
1799 }
1800
1801 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1802                                  struct btrfs_path *path,
1803                                  u64 bytenr, u64 parent, u64 root_objectid,
1804                                  u64 owner, u64 offset, int refs_to_add)
1805 {
1806         int ret;
1807         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1808                 BUG_ON(refs_to_add != 1);
1809                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1810                                             root_objectid);
1811         } else {
1812                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1813                                              root_objectid, owner, offset,
1814                                              refs_to_add);
1815         }
1816         return ret;
1817 }
1818
1819 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1820                                  struct btrfs_path *path,
1821                                  struct btrfs_extent_inline_ref *iref,
1822                                  int refs_to_drop, int is_data, int *last_ref)
1823 {
1824         int ret = 0;
1825
1826         BUG_ON(!is_data && refs_to_drop != 1);
1827         if (iref) {
1828                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1829                                              last_ref);
1830         } else if (is_data) {
1831                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1832                                              last_ref);
1833         } else {
1834                 *last_ref = 1;
1835                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1836         }
1837         return ret;
1838 }
1839
1840 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1841                                u64 *discarded_bytes)
1842 {
1843         int j, ret = 0;
1844         u64 bytes_left, end;
1845         u64 aligned_start = ALIGN(start, 1 << 9);
1846
1847         if (WARN_ON(start != aligned_start)) {
1848                 len -= aligned_start - start;
1849                 len = round_down(len, 1 << 9);
1850                 start = aligned_start;
1851         }
1852
1853         *discarded_bytes = 0;
1854
1855         if (!len)
1856                 return 0;
1857
1858         end = start + len;
1859         bytes_left = len;
1860
1861         /* Skip any superblocks on this device. */
1862         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1863                 u64 sb_start = btrfs_sb_offset(j);
1864                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1865                 u64 size = sb_start - start;
1866
1867                 if (!in_range(sb_start, start, bytes_left) &&
1868                     !in_range(sb_end, start, bytes_left) &&
1869                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1870                         continue;
1871
1872                 /*
1873                  * Superblock spans beginning of range.  Adjust start and
1874                  * try again.
1875                  */
1876                 if (sb_start <= start) {
1877                         start += sb_end - start;
1878                         if (start > end) {
1879                                 bytes_left = 0;
1880                                 break;
1881                         }
1882                         bytes_left = end - start;
1883                         continue;
1884                 }
1885
1886                 if (size) {
1887                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1888                                                    GFP_NOFS, 0);
1889                         if (!ret)
1890                                 *discarded_bytes += size;
1891                         else if (ret != -EOPNOTSUPP)
1892                                 return ret;
1893                 }
1894
1895                 start = sb_end;
1896                 if (start > end) {
1897                         bytes_left = 0;
1898                         break;
1899                 }
1900                 bytes_left = end - start;
1901         }
1902
1903         if (bytes_left) {
1904                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1905                                            GFP_NOFS, 0);
1906                 if (!ret)
1907                         *discarded_bytes += bytes_left;
1908         }
1909         return ret;
1910 }
1911
1912 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1913                          u64 num_bytes, u64 *actual_bytes)
1914 {
1915         int ret;
1916         u64 discarded_bytes = 0;
1917         struct btrfs_bio *bbio = NULL;
1918
1919
1920         /*
1921          * Avoid races with device replace and make sure our bbio has devices
1922          * associated to its stripes that don't go away while we are discarding.
1923          */
1924         btrfs_bio_counter_inc_blocked(fs_info);
1925         /* Tell the block device(s) that the sectors can be discarded */
1926         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1927                               &bbio, 0);
1928         /* Error condition is -ENOMEM */
1929         if (!ret) {
1930                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1931                 int i;
1932
1933
1934                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1935                         u64 bytes;
1936                         struct request_queue *req_q;
1937
1938                         if (!stripe->dev->bdev) {
1939                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1940                                 continue;
1941                         }
1942                         req_q = bdev_get_queue(stripe->dev->bdev);
1943                         if (!blk_queue_discard(req_q))
1944                                 continue;
1945
1946                         ret = btrfs_issue_discard(stripe->dev->bdev,
1947                                                   stripe->physical,
1948                                                   stripe->length,
1949                                                   &bytes);
1950                         if (!ret)
1951                                 discarded_bytes += bytes;
1952                         else if (ret != -EOPNOTSUPP)
1953                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1954
1955                         /*
1956                          * Just in case we get back EOPNOTSUPP for some reason,
1957                          * just ignore the return value so we don't screw up
1958                          * people calling discard_extent.
1959                          */
1960                         ret = 0;
1961                 }
1962                 btrfs_put_bbio(bbio);
1963         }
1964         btrfs_bio_counter_dec(fs_info);
1965
1966         if (actual_bytes)
1967                 *actual_bytes = discarded_bytes;
1968
1969
1970         if (ret == -EOPNOTSUPP)
1971                 ret = 0;
1972         return ret;
1973 }
1974
1975 /* Can return -ENOMEM */
1976 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1977                          struct btrfs_ref *generic_ref)
1978 {
1979         struct btrfs_fs_info *fs_info = trans->fs_info;
1980         int old_ref_mod, new_ref_mod;
1981         int ret;
1982
1983         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1984                generic_ref->action);
1985         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1986                generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1987
1988         if (generic_ref->type == BTRFS_REF_METADATA)
1989                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
1990                                 NULL, &old_ref_mod, &new_ref_mod);
1991         else
1992                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
1993                                                  &old_ref_mod, &new_ref_mod);
1994
1995         btrfs_ref_tree_mod(fs_info, generic_ref);
1996
1997         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
1998                 sub_pinned_bytes(fs_info, generic_ref);
1999
2000         return ret;
2001 }
2002
2003 /*
2004  * __btrfs_inc_extent_ref - insert backreference for a given extent
2005  *
2006  * @trans:          Handle of transaction
2007  *
2008  * @node:           The delayed ref node used to get the bytenr/length for
2009  *                  extent whose references are incremented.
2010  *
2011  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2012  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2013  *                  bytenr of the parent block. Since new extents are always
2014  *                  created with indirect references, this will only be the case
2015  *                  when relocating a shared extent. In that case, root_objectid
2016  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
2017  *                  be 0
2018  *
2019  * @root_objectid:  The id of the root where this modification has originated,
2020  *                  this can be either one of the well-known metadata trees or
2021  *                  the subvolume id which references this extent.
2022  *
2023  * @owner:          For data extents it is the inode number of the owning file.
2024  *                  For metadata extents this parameter holds the level in the
2025  *                  tree of the extent.
2026  *
2027  * @offset:         For metadata extents the offset is ignored and is currently
2028  *                  always passed as 0. For data extents it is the fileoffset
2029  *                  this extent belongs to.
2030  *
2031  * @refs_to_add     Number of references to add
2032  *
2033  * @extent_op       Pointer to a structure, holding information necessary when
2034  *                  updating a tree block's flags
2035  *
2036  */
2037 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2038                                   struct btrfs_delayed_ref_node *node,
2039                                   u64 parent, u64 root_objectid,
2040                                   u64 owner, u64 offset, int refs_to_add,
2041                                   struct btrfs_delayed_extent_op *extent_op)
2042 {
2043         struct btrfs_path *path;
2044         struct extent_buffer *leaf;
2045         struct btrfs_extent_item *item;
2046         struct btrfs_key key;
2047         u64 bytenr = node->bytenr;
2048         u64 num_bytes = node->num_bytes;
2049         u64 refs;
2050         int ret;
2051
2052         path = btrfs_alloc_path();
2053         if (!path)
2054                 return -ENOMEM;
2055
2056         path->reada = READA_FORWARD;
2057         path->leave_spinning = 1;
2058         /* this will setup the path even if it fails to insert the back ref */
2059         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2060                                            parent, root_objectid, owner,
2061                                            offset, refs_to_add, extent_op);
2062         if ((ret < 0 && ret != -EAGAIN) || !ret)
2063                 goto out;
2064
2065         /*
2066          * Ok we had -EAGAIN which means we didn't have space to insert and
2067          * inline extent ref, so just update the reference count and add a
2068          * normal backref.
2069          */
2070         leaf = path->nodes[0];
2071         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2072         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2073         refs = btrfs_extent_refs(leaf, item);
2074         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2075         if (extent_op)
2076                 __run_delayed_extent_op(extent_op, leaf, item);
2077
2078         btrfs_mark_buffer_dirty(leaf);
2079         btrfs_release_path(path);
2080
2081         path->reada = READA_FORWARD;
2082         path->leave_spinning = 1;
2083         /* now insert the actual backref */
2084         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2085                                     owner, offset, refs_to_add);
2086         if (ret)
2087                 btrfs_abort_transaction(trans, ret);
2088 out:
2089         btrfs_free_path(path);
2090         return ret;
2091 }
2092
2093 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2094                                 struct btrfs_delayed_ref_node *node,
2095                                 struct btrfs_delayed_extent_op *extent_op,
2096                                 int insert_reserved)
2097 {
2098         int ret = 0;
2099         struct btrfs_delayed_data_ref *ref;
2100         struct btrfs_key ins;
2101         u64 parent = 0;
2102         u64 ref_root = 0;
2103         u64 flags = 0;
2104
2105         ins.objectid = node->bytenr;
2106         ins.offset = node->num_bytes;
2107         ins.type = BTRFS_EXTENT_ITEM_KEY;
2108
2109         ref = btrfs_delayed_node_to_data_ref(node);
2110         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2111
2112         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2113                 parent = ref->parent;
2114         ref_root = ref->root;
2115
2116         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2117                 if (extent_op)
2118                         flags |= extent_op->flags_to_set;
2119                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2120                                                  flags, ref->objectid,
2121                                                  ref->offset, &ins,
2122                                                  node->ref_mod);
2123         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2124                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2125                                              ref->objectid, ref->offset,
2126                                              node->ref_mod, extent_op);
2127         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2128                 ret = __btrfs_free_extent(trans, node, parent,
2129                                           ref_root, ref->objectid,
2130                                           ref->offset, node->ref_mod,
2131                                           extent_op);
2132         } else {
2133                 BUG();
2134         }
2135         return ret;
2136 }
2137
2138 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2139                                     struct extent_buffer *leaf,
2140                                     struct btrfs_extent_item *ei)
2141 {
2142         u64 flags = btrfs_extent_flags(leaf, ei);
2143         if (extent_op->update_flags) {
2144                 flags |= extent_op->flags_to_set;
2145                 btrfs_set_extent_flags(leaf, ei, flags);
2146         }
2147
2148         if (extent_op->update_key) {
2149                 struct btrfs_tree_block_info *bi;
2150                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2151                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2152                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2153         }
2154 }
2155
2156 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2157                                  struct btrfs_delayed_ref_head *head,
2158                                  struct btrfs_delayed_extent_op *extent_op)
2159 {
2160         struct btrfs_fs_info *fs_info = trans->fs_info;
2161         struct btrfs_key key;
2162         struct btrfs_path *path;
2163         struct btrfs_extent_item *ei;
2164         struct extent_buffer *leaf;
2165         u32 item_size;
2166         int ret;
2167         int err = 0;
2168         int metadata = !extent_op->is_data;
2169
2170         if (trans->aborted)
2171                 return 0;
2172
2173         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2174                 metadata = 0;
2175
2176         path = btrfs_alloc_path();
2177         if (!path)
2178                 return -ENOMEM;
2179
2180         key.objectid = head->bytenr;
2181
2182         if (metadata) {
2183                 key.type = BTRFS_METADATA_ITEM_KEY;
2184                 key.offset = extent_op->level;
2185         } else {
2186                 key.type = BTRFS_EXTENT_ITEM_KEY;
2187                 key.offset = head->num_bytes;
2188         }
2189
2190 again:
2191         path->reada = READA_FORWARD;
2192         path->leave_spinning = 1;
2193         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2194         if (ret < 0) {
2195                 err = ret;
2196                 goto out;
2197         }
2198         if (ret > 0) {
2199                 if (metadata) {
2200                         if (path->slots[0] > 0) {
2201                                 path->slots[0]--;
2202                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2203                                                       path->slots[0]);
2204                                 if (key.objectid == head->bytenr &&
2205                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2206                                     key.offset == head->num_bytes)
2207                                         ret = 0;
2208                         }
2209                         if (ret > 0) {
2210                                 btrfs_release_path(path);
2211                                 metadata = 0;
2212
2213                                 key.objectid = head->bytenr;
2214                                 key.offset = head->num_bytes;
2215                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2216                                 goto again;
2217                         }
2218                 } else {
2219                         err = -EIO;
2220                         goto out;
2221                 }
2222         }
2223
2224         leaf = path->nodes[0];
2225         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2226
2227         if (unlikely(item_size < sizeof(*ei))) {
2228                 err = -EINVAL;
2229                 btrfs_print_v0_err(fs_info);
2230                 btrfs_abort_transaction(trans, err);
2231                 goto out;
2232         }
2233
2234         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2235         __run_delayed_extent_op(extent_op, leaf, ei);
2236
2237         btrfs_mark_buffer_dirty(leaf);
2238 out:
2239         btrfs_free_path(path);
2240         return err;
2241 }
2242
2243 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2244                                 struct btrfs_delayed_ref_node *node,
2245                                 struct btrfs_delayed_extent_op *extent_op,
2246                                 int insert_reserved)
2247 {
2248         int ret = 0;
2249         struct btrfs_delayed_tree_ref *ref;
2250         u64 parent = 0;
2251         u64 ref_root = 0;
2252
2253         ref = btrfs_delayed_node_to_tree_ref(node);
2254         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2255
2256         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2257                 parent = ref->parent;
2258         ref_root = ref->root;
2259
2260         if (node->ref_mod != 1) {
2261                 btrfs_err(trans->fs_info,
2262         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2263                           node->bytenr, node->ref_mod, node->action, ref_root,
2264                           parent);
2265                 return -EIO;
2266         }
2267         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2268                 BUG_ON(!extent_op || !extent_op->update_flags);
2269                 ret = alloc_reserved_tree_block(trans, node, extent_op);
2270         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2271                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2272                                              ref->level, 0, 1, extent_op);
2273         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2274                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2275                                           ref->level, 0, 1, extent_op);
2276         } else {
2277                 BUG();
2278         }
2279         return ret;
2280 }
2281
2282 /* helper function to actually process a single delayed ref entry */
2283 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2284                                struct btrfs_delayed_ref_node *node,
2285                                struct btrfs_delayed_extent_op *extent_op,
2286                                int insert_reserved)
2287 {
2288         int ret = 0;
2289
2290         if (trans->aborted) {
2291                 if (insert_reserved)
2292                         btrfs_pin_extent(trans->fs_info, node->bytenr,
2293                                          node->num_bytes, 1);
2294                 return 0;
2295         }
2296
2297         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2298             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2299                 ret = run_delayed_tree_ref(trans, node, extent_op,
2300                                            insert_reserved);
2301         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2302                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2303                 ret = run_delayed_data_ref(trans, node, extent_op,
2304                                            insert_reserved);
2305         else
2306                 BUG();
2307         if (ret && insert_reserved)
2308                 btrfs_pin_extent(trans->fs_info, node->bytenr,
2309                                  node->num_bytes, 1);
2310         return ret;
2311 }
2312
2313 static inline struct btrfs_delayed_ref_node *
2314 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2315 {
2316         struct btrfs_delayed_ref_node *ref;
2317
2318         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2319                 return NULL;
2320
2321         /*
2322          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2323          * This is to prevent a ref count from going down to zero, which deletes
2324          * the extent item from the extent tree, when there still are references
2325          * to add, which would fail because they would not find the extent item.
2326          */
2327         if (!list_empty(&head->ref_add_list))
2328                 return list_first_entry(&head->ref_add_list,
2329                                 struct btrfs_delayed_ref_node, add_list);
2330
2331         ref = rb_entry(rb_first_cached(&head->ref_tree),
2332                        struct btrfs_delayed_ref_node, ref_node);
2333         ASSERT(list_empty(&ref->add_list));
2334         return ref;
2335 }
2336
2337 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2338                                       struct btrfs_delayed_ref_head *head)
2339 {
2340         spin_lock(&delayed_refs->lock);
2341         head->processing = 0;
2342         delayed_refs->num_heads_ready++;
2343         spin_unlock(&delayed_refs->lock);
2344         btrfs_delayed_ref_unlock(head);
2345 }
2346
2347 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2348                                 struct btrfs_delayed_ref_head *head)
2349 {
2350         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2351
2352         if (!extent_op)
2353                 return NULL;
2354
2355         if (head->must_insert_reserved) {
2356                 head->extent_op = NULL;
2357                 btrfs_free_delayed_extent_op(extent_op);
2358                 return NULL;
2359         }
2360         return extent_op;
2361 }
2362
2363 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2364                                      struct btrfs_delayed_ref_head *head)
2365 {
2366         struct btrfs_delayed_extent_op *extent_op;
2367         int ret;
2368
2369         extent_op = cleanup_extent_op(head);
2370         if (!extent_op)
2371                 return 0;
2372         head->extent_op = NULL;
2373         spin_unlock(&head->lock);
2374         ret = run_delayed_extent_op(trans, head, extent_op);
2375         btrfs_free_delayed_extent_op(extent_op);
2376         return ret ? ret : 1;
2377 }
2378
2379 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2380                                   struct btrfs_delayed_ref_root *delayed_refs,
2381                                   struct btrfs_delayed_ref_head *head)
2382 {
2383         int nr_items = 1;       /* Dropping this ref head update. */
2384
2385         if (head->total_ref_mod < 0) {
2386                 struct btrfs_space_info *space_info;
2387                 u64 flags;
2388
2389                 if (head->is_data)
2390                         flags = BTRFS_BLOCK_GROUP_DATA;
2391                 else if (head->is_system)
2392                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
2393                 else
2394                         flags = BTRFS_BLOCK_GROUP_METADATA;
2395                 space_info = btrfs_find_space_info(fs_info, flags);
2396                 ASSERT(space_info);
2397                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2398                                    -head->num_bytes,
2399                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
2400
2401                 /*
2402                  * We had csum deletions accounted for in our delayed refs rsv,
2403                  * we need to drop the csum leaves for this update from our
2404                  * delayed_refs_rsv.
2405                  */
2406                 if (head->is_data) {
2407                         spin_lock(&delayed_refs->lock);
2408                         delayed_refs->pending_csums -= head->num_bytes;
2409                         spin_unlock(&delayed_refs->lock);
2410                         nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2411                                 head->num_bytes);
2412                 }
2413         }
2414
2415         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2416 }
2417
2418 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2419                             struct btrfs_delayed_ref_head *head)
2420 {
2421
2422         struct btrfs_fs_info *fs_info = trans->fs_info;
2423         struct btrfs_delayed_ref_root *delayed_refs;
2424         int ret;
2425
2426         delayed_refs = &trans->transaction->delayed_refs;
2427
2428         ret = run_and_cleanup_extent_op(trans, head);
2429         if (ret < 0) {
2430                 unselect_delayed_ref_head(delayed_refs, head);
2431                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2432                 return ret;
2433         } else if (ret) {
2434                 return ret;
2435         }
2436
2437         /*
2438          * Need to drop our head ref lock and re-acquire the delayed ref lock
2439          * and then re-check to make sure nobody got added.
2440          */
2441         spin_unlock(&head->lock);
2442         spin_lock(&delayed_refs->lock);
2443         spin_lock(&head->lock);
2444         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2445                 spin_unlock(&head->lock);
2446                 spin_unlock(&delayed_refs->lock);
2447                 return 1;
2448         }
2449         btrfs_delete_ref_head(delayed_refs, head);
2450         spin_unlock(&head->lock);
2451         spin_unlock(&delayed_refs->lock);
2452
2453         if (head->must_insert_reserved) {
2454                 btrfs_pin_extent(fs_info, head->bytenr,
2455                                  head->num_bytes, 1);
2456                 if (head->is_data) {
2457                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2458                                               head->num_bytes);
2459                 }
2460         }
2461
2462         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2463
2464         trace_run_delayed_ref_head(fs_info, head, 0);
2465         btrfs_delayed_ref_unlock(head);
2466         btrfs_put_delayed_ref_head(head);
2467         return 0;
2468 }
2469
2470 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2471                                         struct btrfs_trans_handle *trans)
2472 {
2473         struct btrfs_delayed_ref_root *delayed_refs =
2474                 &trans->transaction->delayed_refs;
2475         struct btrfs_delayed_ref_head *head = NULL;
2476         int ret;
2477
2478         spin_lock(&delayed_refs->lock);
2479         head = btrfs_select_ref_head(delayed_refs);
2480         if (!head) {
2481                 spin_unlock(&delayed_refs->lock);
2482                 return head;
2483         }
2484
2485         /*
2486          * Grab the lock that says we are going to process all the refs for
2487          * this head
2488          */
2489         ret = btrfs_delayed_ref_lock(delayed_refs, head);
2490         spin_unlock(&delayed_refs->lock);
2491
2492         /*
2493          * We may have dropped the spin lock to get the head mutex lock, and
2494          * that might have given someone else time to free the head.  If that's
2495          * true, it has been removed from our list and we can move on.
2496          */
2497         if (ret == -EAGAIN)
2498                 head = ERR_PTR(-EAGAIN);
2499
2500         return head;
2501 }
2502
2503 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2504                                     struct btrfs_delayed_ref_head *locked_ref,
2505                                     unsigned long *run_refs)
2506 {
2507         struct btrfs_fs_info *fs_info = trans->fs_info;
2508         struct btrfs_delayed_ref_root *delayed_refs;
2509         struct btrfs_delayed_extent_op *extent_op;
2510         struct btrfs_delayed_ref_node *ref;
2511         int must_insert_reserved = 0;
2512         int ret;
2513
2514         delayed_refs = &trans->transaction->delayed_refs;
2515
2516         lockdep_assert_held(&locked_ref->mutex);
2517         lockdep_assert_held(&locked_ref->lock);
2518
2519         while ((ref = select_delayed_ref(locked_ref))) {
2520                 if (ref->seq &&
2521                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2522                         spin_unlock(&locked_ref->lock);
2523                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2524                         return -EAGAIN;
2525                 }
2526
2527                 (*run_refs)++;
2528                 ref->in_tree = 0;
2529                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2530                 RB_CLEAR_NODE(&ref->ref_node);
2531                 if (!list_empty(&ref->add_list))
2532                         list_del(&ref->add_list);
2533                 /*
2534                  * When we play the delayed ref, also correct the ref_mod on
2535                  * head
2536                  */
2537                 switch (ref->action) {
2538                 case BTRFS_ADD_DELAYED_REF:
2539                 case BTRFS_ADD_DELAYED_EXTENT:
2540                         locked_ref->ref_mod -= ref->ref_mod;
2541                         break;
2542                 case BTRFS_DROP_DELAYED_REF:
2543                         locked_ref->ref_mod += ref->ref_mod;
2544                         break;
2545                 default:
2546                         WARN_ON(1);
2547                 }
2548                 atomic_dec(&delayed_refs->num_entries);
2549
2550                 /*
2551                  * Record the must_insert_reserved flag before we drop the
2552                  * spin lock.
2553                  */
2554                 must_insert_reserved = locked_ref->must_insert_reserved;
2555                 locked_ref->must_insert_reserved = 0;
2556
2557                 extent_op = locked_ref->extent_op;
2558                 locked_ref->extent_op = NULL;
2559                 spin_unlock(&locked_ref->lock);
2560
2561                 ret = run_one_delayed_ref(trans, ref, extent_op,
2562                                           must_insert_reserved);
2563
2564                 btrfs_free_delayed_extent_op(extent_op);
2565                 if (ret) {
2566                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2567                         btrfs_put_delayed_ref(ref);
2568                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2569                                     ret);
2570                         return ret;
2571                 }
2572
2573                 btrfs_put_delayed_ref(ref);
2574                 cond_resched();
2575
2576                 spin_lock(&locked_ref->lock);
2577                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2578         }
2579
2580         return 0;
2581 }
2582
2583 /*
2584  * Returns 0 on success or if called with an already aborted transaction.
2585  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2586  */
2587 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2588                                              unsigned long nr)
2589 {
2590         struct btrfs_fs_info *fs_info = trans->fs_info;
2591         struct btrfs_delayed_ref_root *delayed_refs;
2592         struct btrfs_delayed_ref_head *locked_ref = NULL;
2593         ktime_t start = ktime_get();
2594         int ret;
2595         unsigned long count = 0;
2596         unsigned long actual_count = 0;
2597
2598         delayed_refs = &trans->transaction->delayed_refs;
2599         do {
2600                 if (!locked_ref) {
2601                         locked_ref = btrfs_obtain_ref_head(trans);
2602                         if (IS_ERR_OR_NULL(locked_ref)) {
2603                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2604                                         continue;
2605                                 } else {
2606                                         break;
2607                                 }
2608                         }
2609                         count++;
2610                 }
2611                 /*
2612                  * We need to try and merge add/drops of the same ref since we
2613                  * can run into issues with relocate dropping the implicit ref
2614                  * and then it being added back again before the drop can
2615                  * finish.  If we merged anything we need to re-loop so we can
2616                  * get a good ref.
2617                  * Or we can get node references of the same type that weren't
2618                  * merged when created due to bumps in the tree mod seq, and
2619                  * we need to merge them to prevent adding an inline extent
2620                  * backref before dropping it (triggering a BUG_ON at
2621                  * insert_inline_extent_backref()).
2622                  */
2623                 spin_lock(&locked_ref->lock);
2624                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2625
2626                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2627                                                       &actual_count);
2628                 if (ret < 0 && ret != -EAGAIN) {
2629                         /*
2630                          * Error, btrfs_run_delayed_refs_for_head already
2631                          * unlocked everything so just bail out
2632                          */
2633                         return ret;
2634                 } else if (!ret) {
2635                         /*
2636                          * Success, perform the usual cleanup of a processed
2637                          * head
2638                          */
2639                         ret = cleanup_ref_head(trans, locked_ref);
2640                         if (ret > 0 ) {
2641                                 /* We dropped our lock, we need to loop. */
2642                                 ret = 0;
2643                                 continue;
2644                         } else if (ret) {
2645                                 return ret;
2646                         }
2647                 }
2648
2649                 /*
2650                  * Either success case or btrfs_run_delayed_refs_for_head
2651                  * returned -EAGAIN, meaning we need to select another head
2652                  */
2653
2654                 locked_ref = NULL;
2655                 cond_resched();
2656         } while ((nr != -1 && count < nr) || locked_ref);
2657
2658         /*
2659          * We don't want to include ref heads since we can have empty ref heads
2660          * and those will drastically skew our runtime down since we just do
2661          * accounting, no actual extent tree updates.
2662          */
2663         if (actual_count > 0) {
2664                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2665                 u64 avg;
2666
2667                 /*
2668                  * We weigh the current average higher than our current runtime
2669                  * to avoid large swings in the average.
2670                  */
2671                 spin_lock(&delayed_refs->lock);
2672                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2673                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2674                 spin_unlock(&delayed_refs->lock);
2675         }
2676         return 0;
2677 }
2678
2679 #ifdef SCRAMBLE_DELAYED_REFS
2680 /*
2681  * Normally delayed refs get processed in ascending bytenr order. This
2682  * correlates in most cases to the order added. To expose dependencies on this
2683  * order, we start to process the tree in the middle instead of the beginning
2684  */
2685 static u64 find_middle(struct rb_root *root)
2686 {
2687         struct rb_node *n = root->rb_node;
2688         struct btrfs_delayed_ref_node *entry;
2689         int alt = 1;
2690         u64 middle;
2691         u64 first = 0, last = 0;
2692
2693         n = rb_first(root);
2694         if (n) {
2695                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2696                 first = entry->bytenr;
2697         }
2698         n = rb_last(root);
2699         if (n) {
2700                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2701                 last = entry->bytenr;
2702         }
2703         n = root->rb_node;
2704
2705         while (n) {
2706                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2707                 WARN_ON(!entry->in_tree);
2708
2709                 middle = entry->bytenr;
2710
2711                 if (alt)
2712                         n = n->rb_left;
2713                 else
2714                         n = n->rb_right;
2715
2716                 alt = 1 - alt;
2717         }
2718         return middle;
2719 }
2720 #endif
2721
2722 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2723 {
2724         u64 num_bytes;
2725
2726         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2727                              sizeof(struct btrfs_extent_inline_ref));
2728         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2729                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2730
2731         /*
2732          * We don't ever fill up leaves all the way so multiply by 2 just to be
2733          * closer to what we're really going to want to use.
2734          */
2735         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2736 }
2737
2738 /*
2739  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2740  * would require to store the csums for that many bytes.
2741  */
2742 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2743 {
2744         u64 csum_size;
2745         u64 num_csums_per_leaf;
2746         u64 num_csums;
2747
2748         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2749         num_csums_per_leaf = div64_u64(csum_size,
2750                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2751         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2752         num_csums += num_csums_per_leaf - 1;
2753         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2754         return num_csums;
2755 }
2756
2757 /*
2758  * this starts processing the delayed reference count updates and
2759  * extent insertions we have queued up so far.  count can be
2760  * 0, which means to process everything in the tree at the start
2761  * of the run (but not newly added entries), or it can be some target
2762  * number you'd like to process.
2763  *
2764  * Returns 0 on success or if called with an aborted transaction
2765  * Returns <0 on error and aborts the transaction
2766  */
2767 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2768                            unsigned long count)
2769 {
2770         struct btrfs_fs_info *fs_info = trans->fs_info;
2771         struct rb_node *node;
2772         struct btrfs_delayed_ref_root *delayed_refs;
2773         struct btrfs_delayed_ref_head *head;
2774         int ret;
2775         int run_all = count == (unsigned long)-1;
2776
2777         /* We'll clean this up in btrfs_cleanup_transaction */
2778         if (trans->aborted)
2779                 return 0;
2780
2781         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2782                 return 0;
2783
2784         delayed_refs = &trans->transaction->delayed_refs;
2785         if (count == 0)
2786                 count = atomic_read(&delayed_refs->num_entries) * 2;
2787
2788 again:
2789 #ifdef SCRAMBLE_DELAYED_REFS
2790         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2791 #endif
2792         ret = __btrfs_run_delayed_refs(trans, count);
2793         if (ret < 0) {
2794                 btrfs_abort_transaction(trans, ret);
2795                 return ret;
2796         }
2797
2798         if (run_all) {
2799                 btrfs_create_pending_block_groups(trans);
2800
2801                 spin_lock(&delayed_refs->lock);
2802                 node = rb_first_cached(&delayed_refs->href_root);
2803                 if (!node) {
2804                         spin_unlock(&delayed_refs->lock);
2805                         goto out;
2806                 }
2807                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2808                                 href_node);
2809                 refcount_inc(&head->refs);
2810                 spin_unlock(&delayed_refs->lock);
2811
2812                 /* Mutex was contended, block until it's released and retry. */
2813                 mutex_lock(&head->mutex);
2814                 mutex_unlock(&head->mutex);
2815
2816                 btrfs_put_delayed_ref_head(head);
2817                 cond_resched();
2818                 goto again;
2819         }
2820 out:
2821         return 0;
2822 }
2823
2824 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2825                                 u64 bytenr, u64 num_bytes, u64 flags,
2826                                 int level, int is_data)
2827 {
2828         struct btrfs_delayed_extent_op *extent_op;
2829         int ret;
2830
2831         extent_op = btrfs_alloc_delayed_extent_op();
2832         if (!extent_op)
2833                 return -ENOMEM;
2834
2835         extent_op->flags_to_set = flags;
2836         extent_op->update_flags = true;
2837         extent_op->update_key = false;
2838         extent_op->is_data = is_data ? true : false;
2839         extent_op->level = level;
2840
2841         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2842         if (ret)
2843                 btrfs_free_delayed_extent_op(extent_op);
2844         return ret;
2845 }
2846
2847 static noinline int check_delayed_ref(struct btrfs_root *root,
2848                                       struct btrfs_path *path,
2849                                       u64 objectid, u64 offset, u64 bytenr)
2850 {
2851         struct btrfs_delayed_ref_head *head;
2852         struct btrfs_delayed_ref_node *ref;
2853         struct btrfs_delayed_data_ref *data_ref;
2854         struct btrfs_delayed_ref_root *delayed_refs;
2855         struct btrfs_transaction *cur_trans;
2856         struct rb_node *node;
2857         int ret = 0;
2858
2859         spin_lock(&root->fs_info->trans_lock);
2860         cur_trans = root->fs_info->running_transaction;
2861         if (cur_trans)
2862                 refcount_inc(&cur_trans->use_count);
2863         spin_unlock(&root->fs_info->trans_lock);
2864         if (!cur_trans)
2865                 return 0;
2866
2867         delayed_refs = &cur_trans->delayed_refs;
2868         spin_lock(&delayed_refs->lock);
2869         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2870         if (!head) {
2871                 spin_unlock(&delayed_refs->lock);
2872                 btrfs_put_transaction(cur_trans);
2873                 return 0;
2874         }
2875
2876         if (!mutex_trylock(&head->mutex)) {
2877                 refcount_inc(&head->refs);
2878                 spin_unlock(&delayed_refs->lock);
2879
2880                 btrfs_release_path(path);
2881
2882                 /*
2883                  * Mutex was contended, block until it's released and let
2884                  * caller try again
2885                  */
2886                 mutex_lock(&head->mutex);
2887                 mutex_unlock(&head->mutex);
2888                 btrfs_put_delayed_ref_head(head);
2889                 btrfs_put_transaction(cur_trans);
2890                 return -EAGAIN;
2891         }
2892         spin_unlock(&delayed_refs->lock);
2893
2894         spin_lock(&head->lock);
2895         /*
2896          * XXX: We should replace this with a proper search function in the
2897          * future.
2898          */
2899         for (node = rb_first_cached(&head->ref_tree); node;
2900              node = rb_next(node)) {
2901                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2902                 /* If it's a shared ref we know a cross reference exists */
2903                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2904                         ret = 1;
2905                         break;
2906                 }
2907
2908                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2909
2910                 /*
2911                  * If our ref doesn't match the one we're currently looking at
2912                  * then we have a cross reference.
2913                  */
2914                 if (data_ref->root != root->root_key.objectid ||
2915                     data_ref->objectid != objectid ||
2916                     data_ref->offset != offset) {
2917                         ret = 1;
2918                         break;
2919                 }
2920         }
2921         spin_unlock(&head->lock);
2922         mutex_unlock(&head->mutex);
2923         btrfs_put_transaction(cur_trans);
2924         return ret;
2925 }
2926
2927 static noinline int check_committed_ref(struct btrfs_root *root,
2928                                         struct btrfs_path *path,
2929                                         u64 objectid, u64 offset, u64 bytenr)
2930 {
2931         struct btrfs_fs_info *fs_info = root->fs_info;
2932         struct btrfs_root *extent_root = fs_info->extent_root;
2933         struct extent_buffer *leaf;
2934         struct btrfs_extent_data_ref *ref;
2935         struct btrfs_extent_inline_ref *iref;
2936         struct btrfs_extent_item *ei;
2937         struct btrfs_key key;
2938         u32 item_size;
2939         int type;
2940         int ret;
2941
2942         key.objectid = bytenr;
2943         key.offset = (u64)-1;
2944         key.type = BTRFS_EXTENT_ITEM_KEY;
2945
2946         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2947         if (ret < 0)
2948                 goto out;
2949         BUG_ON(ret == 0); /* Corruption */
2950
2951         ret = -ENOENT;
2952         if (path->slots[0] == 0)
2953                 goto out;
2954
2955         path->slots[0]--;
2956         leaf = path->nodes[0];
2957         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2958
2959         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2960                 goto out;
2961
2962         ret = 1;
2963         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2964         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2965
2966         if (item_size != sizeof(*ei) +
2967             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2968                 goto out;
2969
2970         if (btrfs_extent_generation(leaf, ei) <=
2971             btrfs_root_last_snapshot(&root->root_item))
2972                 goto out;
2973
2974         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2975
2976         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2977         if (type != BTRFS_EXTENT_DATA_REF_KEY)
2978                 goto out;
2979
2980         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2981         if (btrfs_extent_refs(leaf, ei) !=
2982             btrfs_extent_data_ref_count(leaf, ref) ||
2983             btrfs_extent_data_ref_root(leaf, ref) !=
2984             root->root_key.objectid ||
2985             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2986             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2987                 goto out;
2988
2989         ret = 0;
2990 out:
2991         return ret;
2992 }
2993
2994 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2995                           u64 bytenr)
2996 {
2997         struct btrfs_path *path;
2998         int ret;
2999
3000         path = btrfs_alloc_path();
3001         if (!path)
3002                 return -ENOMEM;
3003
3004         do {
3005                 ret = check_committed_ref(root, path, objectid,
3006                                           offset, bytenr);
3007                 if (ret && ret != -ENOENT)
3008                         goto out;
3009
3010                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3011         } while (ret == -EAGAIN);
3012
3013 out:
3014         btrfs_free_path(path);
3015         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3016                 WARN_ON(ret > 0);
3017         return ret;
3018 }
3019
3020 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3021                            struct btrfs_root *root,
3022                            struct extent_buffer *buf,
3023                            int full_backref, int inc)
3024 {
3025         struct btrfs_fs_info *fs_info = root->fs_info;
3026         u64 bytenr;
3027         u64 num_bytes;
3028         u64 parent;
3029         u64 ref_root;
3030         u32 nritems;
3031         struct btrfs_key key;
3032         struct btrfs_file_extent_item *fi;
3033         struct btrfs_ref generic_ref = { 0 };
3034         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
3035         int i;
3036         int action;
3037         int level;
3038         int ret = 0;
3039
3040         if (btrfs_is_testing(fs_info))
3041                 return 0;
3042
3043         ref_root = btrfs_header_owner(buf);
3044         nritems = btrfs_header_nritems(buf);
3045         level = btrfs_header_level(buf);
3046
3047         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3048                 return 0;
3049
3050         if (full_backref)
3051                 parent = buf->start;
3052         else
3053                 parent = 0;
3054         if (inc)
3055                 action = BTRFS_ADD_DELAYED_REF;
3056         else
3057                 action = BTRFS_DROP_DELAYED_REF;
3058
3059         for (i = 0; i < nritems; i++) {
3060                 if (level == 0) {
3061                         btrfs_item_key_to_cpu(buf, &key, i);
3062                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3063                                 continue;
3064                         fi = btrfs_item_ptr(buf, i,
3065                                             struct btrfs_file_extent_item);
3066                         if (btrfs_file_extent_type(buf, fi) ==
3067                             BTRFS_FILE_EXTENT_INLINE)
3068                                 continue;
3069                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3070                         if (bytenr == 0)
3071                                 continue;
3072
3073                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3074                         key.offset -= btrfs_file_extent_offset(buf, fi);
3075                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
3076                                                num_bytes, parent);
3077                         generic_ref.real_root = root->root_key.objectid;
3078                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
3079                                             key.offset);
3080                         generic_ref.skip_qgroup = for_reloc;
3081                         if (inc)
3082                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3083                         else
3084                                 ret = btrfs_free_extent(trans, &generic_ref);
3085                         if (ret)
3086                                 goto fail;
3087                 } else {
3088                         bytenr = btrfs_node_blockptr(buf, i);
3089                         num_bytes = fs_info->nodesize;
3090                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
3091                                                num_bytes, parent);
3092                         generic_ref.real_root = root->root_key.objectid;
3093                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
3094                         generic_ref.skip_qgroup = for_reloc;
3095                         if (inc)
3096                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3097                         else
3098                                 ret = btrfs_free_extent(trans, &generic_ref);
3099                         if (ret)
3100                                 goto fail;
3101                 }
3102         }
3103         return 0;
3104 fail:
3105         return ret;
3106 }
3107
3108 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3109                   struct extent_buffer *buf, int full_backref)
3110 {
3111         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3112 }
3113
3114 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3115                   struct extent_buffer *buf, int full_backref)
3116 {
3117         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3118 }
3119
3120 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3121                                  struct btrfs_path *path,
3122                                  struct btrfs_block_group_cache *cache)
3123 {
3124         struct btrfs_fs_info *fs_info = trans->fs_info;
3125         int ret;
3126         struct btrfs_root *extent_root = fs_info->extent_root;
3127         unsigned long bi;
3128         struct extent_buffer *leaf;
3129
3130         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3131         if (ret) {
3132                 if (ret > 0)
3133                         ret = -ENOENT;
3134                 goto fail;
3135         }
3136
3137         leaf = path->nodes[0];
3138         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3139         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3140         btrfs_mark_buffer_dirty(leaf);
3141 fail:
3142         btrfs_release_path(path);
3143         return ret;
3144
3145 }
3146
3147 static struct btrfs_block_group_cache *next_block_group(
3148                 struct btrfs_block_group_cache *cache)
3149 {
3150         struct btrfs_fs_info *fs_info = cache->fs_info;
3151         struct rb_node *node;
3152
3153         spin_lock(&fs_info->block_group_cache_lock);
3154
3155         /* If our block group was removed, we need a full search. */
3156         if (RB_EMPTY_NODE(&cache->cache_node)) {
3157                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3158
3159                 spin_unlock(&fs_info->block_group_cache_lock);
3160                 btrfs_put_block_group(cache);
3161                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3162         }
3163         node = rb_next(&cache->cache_node);
3164         btrfs_put_block_group(cache);
3165         if (node) {
3166                 cache = rb_entry(node, struct btrfs_block_group_cache,
3167                                  cache_node);
3168                 btrfs_get_block_group(cache);
3169         } else
3170                 cache = NULL;
3171         spin_unlock(&fs_info->block_group_cache_lock);
3172         return cache;
3173 }
3174
3175 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3176                             struct btrfs_trans_handle *trans,
3177                             struct btrfs_path *path)
3178 {
3179         struct btrfs_fs_info *fs_info = block_group->fs_info;
3180         struct btrfs_root *root = fs_info->tree_root;
3181         struct inode *inode = NULL;
3182         struct extent_changeset *data_reserved = NULL;
3183         u64 alloc_hint = 0;
3184         int dcs = BTRFS_DC_ERROR;
3185         u64 num_pages = 0;
3186         int retries = 0;
3187         int ret = 0;
3188
3189         /*
3190          * If this block group is smaller than 100 megs don't bother caching the
3191          * block group.
3192          */
3193         if (block_group->key.offset < (100 * SZ_1M)) {
3194                 spin_lock(&block_group->lock);
3195                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3196                 spin_unlock(&block_group->lock);
3197                 return 0;
3198         }
3199
3200         if (trans->aborted)
3201                 return 0;
3202 again:
3203         inode = lookup_free_space_inode(block_group, path);
3204         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3205                 ret = PTR_ERR(inode);
3206                 btrfs_release_path(path);
3207                 goto out;
3208         }
3209
3210         if (IS_ERR(inode)) {
3211                 BUG_ON(retries);
3212                 retries++;
3213
3214                 if (block_group->ro)
3215                         goto out_free;
3216
3217                 ret = create_free_space_inode(trans, block_group, path);
3218                 if (ret)
3219                         goto out_free;
3220                 goto again;
3221         }
3222
3223         /*
3224          * We want to set the generation to 0, that way if anything goes wrong
3225          * from here on out we know not to trust this cache when we load up next
3226          * time.
3227          */
3228         BTRFS_I(inode)->generation = 0;
3229         ret = btrfs_update_inode(trans, root, inode);
3230         if (ret) {
3231                 /*
3232                  * So theoretically we could recover from this, simply set the
3233                  * super cache generation to 0 so we know to invalidate the
3234                  * cache, but then we'd have to keep track of the block groups
3235                  * that fail this way so we know we _have_ to reset this cache
3236                  * before the next commit or risk reading stale cache.  So to
3237                  * limit our exposure to horrible edge cases lets just abort the
3238                  * transaction, this only happens in really bad situations
3239                  * anyway.
3240                  */
3241                 btrfs_abort_transaction(trans, ret);
3242                 goto out_put;
3243         }
3244         WARN_ON(ret);
3245
3246         /* We've already setup this transaction, go ahead and exit */
3247         if (block_group->cache_generation == trans->transid &&
3248             i_size_read(inode)) {
3249                 dcs = BTRFS_DC_SETUP;
3250                 goto out_put;
3251         }
3252
3253         if (i_size_read(inode) > 0) {
3254                 ret = btrfs_check_trunc_cache_free_space(fs_info,
3255                                         &fs_info->global_block_rsv);
3256                 if (ret)
3257                         goto out_put;
3258
3259                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3260                 if (ret)
3261                         goto out_put;
3262         }
3263
3264         spin_lock(&block_group->lock);
3265         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3266             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3267                 /*
3268                  * don't bother trying to write stuff out _if_
3269                  * a) we're not cached,
3270                  * b) we're with nospace_cache mount option,
3271                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
3272                  */
3273                 dcs = BTRFS_DC_WRITTEN;
3274                 spin_unlock(&block_group->lock);
3275                 goto out_put;
3276         }
3277         spin_unlock(&block_group->lock);
3278
3279         /*
3280          * We hit an ENOSPC when setting up the cache in this transaction, just
3281          * skip doing the setup, we've already cleared the cache so we're safe.
3282          */
3283         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3284                 ret = -ENOSPC;
3285                 goto out_put;
3286         }
3287
3288         /*
3289          * Try to preallocate enough space based on how big the block group is.
3290          * Keep in mind this has to include any pinned space which could end up
3291          * taking up quite a bit since it's not folded into the other space
3292          * cache.
3293          */
3294         num_pages = div_u64(block_group->key.offset, SZ_256M);
3295         if (!num_pages)
3296                 num_pages = 1;
3297
3298         num_pages *= 16;
3299         num_pages *= PAGE_SIZE;
3300
3301         ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3302         if (ret)
3303                 goto out_put;
3304
3305         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3306                                               num_pages, num_pages,
3307                                               &alloc_hint);
3308         /*
3309          * Our cache requires contiguous chunks so that we don't modify a bunch
3310          * of metadata or split extents when writing the cache out, which means
3311          * we can enospc if we are heavily fragmented in addition to just normal
3312          * out of space conditions.  So if we hit this just skip setting up any
3313          * other block groups for this transaction, maybe we'll unpin enough
3314          * space the next time around.
3315          */
3316         if (!ret)
3317                 dcs = BTRFS_DC_SETUP;
3318         else if (ret == -ENOSPC)
3319                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3320
3321 out_put:
3322         iput(inode);
3323 out_free:
3324         btrfs_release_path(path);
3325 out:
3326         spin_lock(&block_group->lock);
3327         if (!ret && dcs == BTRFS_DC_SETUP)
3328                 block_group->cache_generation = trans->transid;
3329         block_group->disk_cache_state = dcs;
3330         spin_unlock(&block_group->lock);
3331
3332         extent_changeset_free(data_reserved);
3333         return ret;
3334 }
3335
3336 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3337 {
3338         struct btrfs_fs_info *fs_info = trans->fs_info;
3339         struct btrfs_block_group_cache *cache, *tmp;
3340         struct btrfs_transaction *cur_trans = trans->transaction;
3341         struct btrfs_path *path;
3342
3343         if (list_empty(&cur_trans->dirty_bgs) ||
3344             !btrfs_test_opt(fs_info, SPACE_CACHE))
3345                 return 0;
3346
3347         path = btrfs_alloc_path();
3348         if (!path)
3349                 return -ENOMEM;
3350
3351         /* Could add new block groups, use _safe just in case */
3352         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3353                                  dirty_list) {
3354                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3355                         cache_save_setup(cache, trans, path);
3356         }
3357
3358         btrfs_free_path(path);
3359         return 0;
3360 }
3361
3362 /*
3363  * transaction commit does final block group cache writeback during a
3364  * critical section where nothing is allowed to change the FS.  This is
3365  * required in order for the cache to actually match the block group,
3366  * but can introduce a lot of latency into the commit.
3367  *
3368  * So, btrfs_start_dirty_block_groups is here to kick off block group
3369  * cache IO.  There's a chance we'll have to redo some of it if the
3370  * block group changes again during the commit, but it greatly reduces
3371  * the commit latency by getting rid of the easy block groups while
3372  * we're still allowing others to join the commit.
3373  */
3374 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3375 {
3376         struct btrfs_fs_info *fs_info = trans->fs_info;
3377         struct btrfs_block_group_cache *cache;
3378         struct btrfs_transaction *cur_trans = trans->transaction;
3379         int ret = 0;
3380         int should_put;
3381         struct btrfs_path *path = NULL;
3382         LIST_HEAD(dirty);
3383         struct list_head *io = &cur_trans->io_bgs;
3384         int num_started = 0;
3385         int loops = 0;
3386
3387         spin_lock(&cur_trans->dirty_bgs_lock);
3388         if (list_empty(&cur_trans->dirty_bgs)) {
3389                 spin_unlock(&cur_trans->dirty_bgs_lock);
3390                 return 0;
3391         }
3392         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3393         spin_unlock(&cur_trans->dirty_bgs_lock);
3394
3395 again:
3396         /*
3397          * make sure all the block groups on our dirty list actually
3398          * exist
3399          */
3400         btrfs_create_pending_block_groups(trans);
3401
3402         if (!path) {
3403                 path = btrfs_alloc_path();
3404                 if (!path)
3405                         return -ENOMEM;
3406         }
3407
3408         /*
3409          * cache_write_mutex is here only to save us from balance or automatic
3410          * removal of empty block groups deleting this block group while we are
3411          * writing out the cache
3412          */
3413         mutex_lock(&trans->transaction->cache_write_mutex);
3414         while (!list_empty(&dirty)) {
3415                 bool drop_reserve = true;
3416
3417                 cache = list_first_entry(&dirty,
3418                                          struct btrfs_block_group_cache,
3419                                          dirty_list);
3420                 /*
3421                  * this can happen if something re-dirties a block
3422                  * group that is already under IO.  Just wait for it to
3423                  * finish and then do it all again
3424                  */
3425                 if (!list_empty(&cache->io_list)) {
3426                         list_del_init(&cache->io_list);
3427                         btrfs_wait_cache_io(trans, cache, path);
3428                         btrfs_put_block_group(cache);
3429                 }
3430
3431
3432                 /*
3433                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3434                  * if it should update the cache_state.  Don't delete
3435                  * until after we wait.
3436                  *
3437                  * Since we're not running in the commit critical section
3438                  * we need the dirty_bgs_lock to protect from update_block_group
3439                  */
3440                 spin_lock(&cur_trans->dirty_bgs_lock);
3441                 list_del_init(&cache->dirty_list);
3442                 spin_unlock(&cur_trans->dirty_bgs_lock);
3443
3444                 should_put = 1;
3445
3446                 cache_save_setup(cache, trans, path);
3447
3448                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3449                         cache->io_ctl.inode = NULL;
3450                         ret = btrfs_write_out_cache(trans, cache, path);
3451                         if (ret == 0 && cache->io_ctl.inode) {
3452                                 num_started++;
3453                                 should_put = 0;
3454
3455                                 /*
3456                                  * The cache_write_mutex is protecting the
3457                                  * io_list, also refer to the definition of
3458                                  * btrfs_transaction::io_bgs for more details
3459                                  */
3460                                 list_add_tail(&cache->io_list, io);
3461                         } else {
3462                                 /*
3463                                  * if we failed to write the cache, the
3464                                  * generation will be bad and life goes on
3465                                  */
3466                                 ret = 0;
3467                         }
3468                 }
3469                 if (!ret) {
3470                         ret = write_one_cache_group(trans, path, cache);
3471                         /*
3472                          * Our block group might still be attached to the list
3473                          * of new block groups in the transaction handle of some
3474                          * other task (struct btrfs_trans_handle->new_bgs). This
3475                          * means its block group item isn't yet in the extent
3476                          * tree. If this happens ignore the error, as we will
3477                          * try again later in the critical section of the
3478                          * transaction commit.
3479                          */
3480                         if (ret == -ENOENT) {
3481                                 ret = 0;
3482                                 spin_lock(&cur_trans->dirty_bgs_lock);
3483                                 if (list_empty(&cache->dirty_list)) {
3484                                         list_add_tail(&cache->dirty_list,
3485                                                       &cur_trans->dirty_bgs);
3486                                         btrfs_get_block_group(cache);
3487                                         drop_reserve = false;
3488                                 }
3489                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3490                         } else if (ret) {
3491                                 btrfs_abort_transaction(trans, ret);
3492                         }
3493                 }
3494
3495                 /* if it's not on the io list, we need to put the block group */
3496                 if (should_put)
3497                         btrfs_put_block_group(cache);
3498                 if (drop_reserve)
3499                         btrfs_delayed_refs_rsv_release(fs_info, 1);
3500
3501                 if (ret)
3502                         break;
3503
3504                 /*
3505                  * Avoid blocking other tasks for too long. It might even save
3506                  * us from writing caches for block groups that are going to be
3507                  * removed.
3508                  */
3509                 mutex_unlock(&trans->transaction->cache_write_mutex);
3510                 mutex_lock(&trans->transaction->cache_write_mutex);
3511         }
3512         mutex_unlock(&trans->transaction->cache_write_mutex);
3513
3514         /*
3515          * go through delayed refs for all the stuff we've just kicked off
3516          * and then loop back (just once)
3517          */
3518         ret = btrfs_run_delayed_refs(trans, 0);
3519         if (!ret && loops == 0) {
3520                 loops++;
3521                 spin_lock(&cur_trans->dirty_bgs_lock);
3522                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3523                 /*
3524                  * dirty_bgs_lock protects us from concurrent block group
3525                  * deletes too (not just cache_write_mutex).
3526                  */
3527                 if (!list_empty(&dirty)) {
3528                         spin_unlock(&cur_trans->dirty_bgs_lock);
3529                         goto again;
3530                 }
3531                 spin_unlock(&cur_trans->dirty_bgs_lock);
3532         } else if (ret < 0) {
3533                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3534         }
3535
3536         btrfs_free_path(path);
3537         return ret;
3538 }
3539
3540 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3541 {
3542         struct btrfs_fs_info *fs_info = trans->fs_info;
3543         struct btrfs_block_group_cache *cache;
3544         struct btrfs_transaction *cur_trans = trans->transaction;
3545         int ret = 0;
3546         int should_put;
3547         struct btrfs_path *path;
3548         struct list_head *io = &cur_trans->io_bgs;
3549         int num_started = 0;
3550
3551         path = btrfs_alloc_path();
3552         if (!path)
3553                 return -ENOMEM;
3554
3555         /*
3556          * Even though we are in the critical section of the transaction commit,
3557          * we can still have concurrent tasks adding elements to this
3558          * transaction's list of dirty block groups. These tasks correspond to
3559          * endio free space workers started when writeback finishes for a
3560          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3561          * allocate new block groups as a result of COWing nodes of the root
3562          * tree when updating the free space inode. The writeback for the space
3563          * caches is triggered by an earlier call to
3564          * btrfs_start_dirty_block_groups() and iterations of the following
3565          * loop.
3566          * Also we want to do the cache_save_setup first and then run the
3567          * delayed refs to make sure we have the best chance at doing this all
3568          * in one shot.
3569          */
3570         spin_lock(&cur_trans->dirty_bgs_lock);
3571         while (!list_empty(&cur_trans->dirty_bgs)) {
3572                 cache = list_first_entry(&cur_trans->dirty_bgs,
3573                                          struct btrfs_block_group_cache,
3574                                          dirty_list);
3575
3576                 /*
3577                  * this can happen if cache_save_setup re-dirties a block
3578                  * group that is already under IO.  Just wait for it to
3579                  * finish and then do it all again
3580                  */
3581                 if (!list_empty(&cache->io_list)) {
3582                         spin_unlock(&cur_trans->dirty_bgs_lock);
3583                         list_del_init(&cache->io_list);
3584                         btrfs_wait_cache_io(trans, cache, path);
3585                         btrfs_put_block_group(cache);
3586                         spin_lock(&cur_trans->dirty_bgs_lock);
3587                 }
3588
3589                 /*
3590                  * don't remove from the dirty list until after we've waited
3591                  * on any pending IO
3592                  */
3593                 list_del_init(&cache->dirty_list);
3594                 spin_unlock(&cur_trans->dirty_bgs_lock);
3595                 should_put = 1;
3596
3597                 cache_save_setup(cache, trans, path);
3598
3599                 if (!ret)
3600                         ret = btrfs_run_delayed_refs(trans,
3601                                                      (unsigned long) -1);
3602
3603                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3604                         cache->io_ctl.inode = NULL;
3605                         ret = btrfs_write_out_cache(trans, cache, path);
3606                         if (ret == 0 && cache->io_ctl.inode) {
3607                                 num_started++;
3608                                 should_put = 0;
3609                                 list_add_tail(&cache->io_list, io);
3610                         } else {
3611                                 /*
3612                                  * if we failed to write the cache, the
3613                                  * generation will be bad and life goes on
3614                                  */
3615                                 ret = 0;
3616                         }
3617                 }
3618                 if (!ret) {
3619                         ret = write_one_cache_group(trans, path, cache);
3620                         /*
3621                          * One of the free space endio workers might have
3622                          * created a new block group while updating a free space
3623                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3624                          * and hasn't released its transaction handle yet, in
3625                          * which case the new block group is still attached to
3626                          * its transaction handle and its creation has not
3627                          * finished yet (no block group item in the extent tree
3628                          * yet, etc). If this is the case, wait for all free
3629                          * space endio workers to finish and retry. This is a
3630                          * a very rare case so no need for a more efficient and
3631                          * complex approach.
3632                          */
3633                         if (ret == -ENOENT) {
3634                                 wait_event(cur_trans->writer_wait,
3635                                    atomic_read(&cur_trans->num_writers) == 1);
3636                                 ret = write_one_cache_group(trans, path, cache);
3637                         }
3638                         if (ret)
3639                                 btrfs_abort_transaction(trans, ret);
3640                 }
3641
3642                 /* if its not on the io list, we need to put the block group */
3643                 if (should_put)
3644                         btrfs_put_block_group(cache);
3645                 btrfs_delayed_refs_rsv_release(fs_info, 1);
3646                 spin_lock(&cur_trans->dirty_bgs_lock);
3647         }
3648         spin_unlock(&cur_trans->dirty_bgs_lock);
3649
3650         /*
3651          * Refer to the definition of io_bgs member for details why it's safe
3652          * to use it without any locking
3653          */
3654         while (!list_empty(io)) {
3655                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3656                                          io_list);
3657                 list_del_init(&cache->io_list);
3658                 btrfs_wait_cache_io(trans, cache, path);
3659                 btrfs_put_block_group(cache);
3660         }
3661
3662         btrfs_free_path(path);
3663         return ret;
3664 }
3665
3666 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3667 {
3668         struct btrfs_block_group_cache *block_group;
3669         int readonly = 0;
3670
3671         block_group = btrfs_lookup_block_group(fs_info, bytenr);
3672         if (!block_group || block_group->ro)
3673                 readonly = 1;
3674         if (block_group)
3675                 btrfs_put_block_group(block_group);
3676         return readonly;
3677 }
3678
3679 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3680 {
3681         struct btrfs_block_group_cache *bg;
3682         bool ret = true;
3683
3684         bg = btrfs_lookup_block_group(fs_info, bytenr);
3685         if (!bg)
3686                 return false;
3687
3688         spin_lock(&bg->lock);
3689         if (bg->ro)
3690                 ret = false;
3691         else
3692                 atomic_inc(&bg->nocow_writers);
3693         spin_unlock(&bg->lock);
3694
3695         /* no put on block group, done by btrfs_dec_nocow_writers */
3696         if (!ret)
3697                 btrfs_put_block_group(bg);
3698
3699         return ret;
3700
3701 }
3702
3703 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3704 {
3705         struct btrfs_block_group_cache *bg;
3706
3707         bg = btrfs_lookup_block_group(fs_info, bytenr);
3708         ASSERT(bg);
3709         if (atomic_dec_and_test(&bg->nocow_writers))
3710                 wake_up_var(&bg->nocow_writers);
3711         /*
3712          * Once for our lookup and once for the lookup done by a previous call
3713          * to btrfs_inc_nocow_writers()
3714          */
3715         btrfs_put_block_group(bg);
3716         btrfs_put_block_group(bg);
3717 }
3718
3719 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3720 {
3721         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3722 }
3723
3724 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3725 {
3726         u64 extra_flags = chunk_to_extended(flags) &
3727                                 BTRFS_EXTENDED_PROFILE_MASK;
3728
3729         write_seqlock(&fs_info->profiles_lock);
3730         if (flags & BTRFS_BLOCK_GROUP_DATA)
3731                 fs_info->avail_data_alloc_bits |= extra_flags;
3732         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3733                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3734         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3735                 fs_info->avail_system_alloc_bits |= extra_flags;
3736         write_sequnlock(&fs_info->profiles_lock);
3737 }
3738
3739 /*
3740  * returns target flags in extended format or 0 if restripe for this
3741  * chunk_type is not in progress
3742  *
3743  * should be called with balance_lock held
3744  */
3745 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3746 {
3747         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3748         u64 target = 0;
3749
3750         if (!bctl)
3751                 return 0;
3752
3753         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3754             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3755                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3756         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3757                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3758                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3759         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3760                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3761                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3762         }
3763
3764         return target;
3765 }
3766
3767 /*
3768  * @flags: available profiles in extended format (see ctree.h)
3769  *
3770  * Returns reduced profile in chunk format.  If profile changing is in
3771  * progress (either running or paused) picks the target profile (if it's
3772  * already available), otherwise falls back to plain reducing.
3773  */
3774 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
3775 {
3776         u64 num_devices = fs_info->fs_devices->rw_devices;
3777         u64 target;
3778         u64 raid_type;
3779         u64 allowed = 0;
3780
3781         /*
3782          * see if restripe for this chunk_type is in progress, if so
3783          * try to reduce to the target profile
3784          */
3785         spin_lock(&fs_info->balance_lock);
3786         target = get_restripe_target(fs_info, flags);
3787         if (target) {
3788                 /* pick target profile only if it's already available */
3789                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3790                         spin_unlock(&fs_info->balance_lock);
3791                         return extended_to_chunk(target);
3792                 }
3793         }
3794         spin_unlock(&fs_info->balance_lock);
3795
3796         /* First, mask out the RAID levels which aren't possible */
3797         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3798                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3799                         allowed |= btrfs_raid_array[raid_type].bg_flag;
3800         }
3801         allowed &= flags;
3802
3803         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3804                 allowed = BTRFS_BLOCK_GROUP_RAID6;
3805         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3806                 allowed = BTRFS_BLOCK_GROUP_RAID5;
3807         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3808                 allowed = BTRFS_BLOCK_GROUP_RAID10;
3809         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3810                 allowed = BTRFS_BLOCK_GROUP_RAID1;
3811         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3812                 allowed = BTRFS_BLOCK_GROUP_RAID0;
3813
3814         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3815
3816         return extended_to_chunk(flags | allowed);
3817 }
3818
3819 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
3820 {
3821         unsigned seq;
3822         u64 flags;
3823
3824         do {
3825                 flags = orig_flags;
3826                 seq = read_seqbegin(&fs_info->profiles_lock);
3827
3828                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3829                         flags |= fs_info->avail_data_alloc_bits;
3830                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3831                         flags |= fs_info->avail_system_alloc_bits;
3832                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3833                         flags |= fs_info->avail_metadata_alloc_bits;
3834         } while (read_seqretry(&fs_info->profiles_lock, seq));
3835
3836         return btrfs_reduce_alloc_profile(fs_info, flags);
3837 }
3838
3839 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
3840 {
3841         struct btrfs_fs_info *fs_info = root->fs_info;
3842         u64 flags;
3843         u64 ret;
3844
3845         if (data)
3846                 flags = BTRFS_BLOCK_GROUP_DATA;
3847         else if (root == fs_info->chunk_root)
3848                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3849         else
3850                 flags = BTRFS_BLOCK_GROUP_METADATA;
3851
3852         ret = get_alloc_profile(fs_info, flags);
3853         return ret;
3854 }
3855
3856 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
3857 {
3858         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
3859 }
3860
3861 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
3862 {
3863         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3864 }
3865
3866 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
3867 {
3868         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3869 }
3870
3871 static void force_metadata_allocation(struct btrfs_fs_info *info)
3872 {
3873         struct list_head *head = &info->space_info;
3874         struct btrfs_space_info *found;
3875
3876         rcu_read_lock();
3877         list_for_each_entry_rcu(found, head, list) {
3878                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3879                         found->force_alloc = CHUNK_ALLOC_FORCE;
3880         }
3881         rcu_read_unlock();
3882 }
3883
3884 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3885                               struct btrfs_space_info *sinfo, int force)
3886 {
3887         u64 bytes_used = btrfs_space_info_used(sinfo, false);
3888         u64 thresh;
3889
3890         if (force == CHUNK_ALLOC_FORCE)
3891                 return 1;
3892
3893         /*
3894          * in limited mode, we want to have some free space up to
3895          * about 1% of the FS size.
3896          */
3897         if (force == CHUNK_ALLOC_LIMITED) {
3898                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3899                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3900
3901                 if (sinfo->total_bytes - bytes_used < thresh)
3902                         return 1;
3903         }
3904
3905         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3906                 return 0;
3907         return 1;
3908 }
3909
3910 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3911 {
3912         u64 num_dev;
3913
3914         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3915         if (!num_dev)
3916                 num_dev = fs_info->fs_devices->rw_devices;
3917
3918         return num_dev;
3919 }
3920
3921 /*
3922  * If @is_allocation is true, reserve space in the system space info necessary
3923  * for allocating a chunk, otherwise if it's false, reserve space necessary for
3924  * removing a chunk.
3925  */
3926 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3927 {
3928         struct btrfs_fs_info *fs_info = trans->fs_info;
3929         struct btrfs_space_info *info;
3930         u64 left;
3931         u64 thresh;
3932         int ret = 0;
3933         u64 num_devs;
3934
3935         /*
3936          * Needed because we can end up allocating a system chunk and for an
3937          * atomic and race free space reservation in the chunk block reserve.
3938          */
3939         lockdep_assert_held(&fs_info->chunk_mutex);
3940
3941         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3942         spin_lock(&info->lock);
3943         left = info->total_bytes - btrfs_space_info_used(info, true);
3944         spin_unlock(&info->lock);
3945
3946         num_devs = get_profile_num_devs(fs_info, type);
3947
3948         /* num_devs device items to update and 1 chunk item to add or remove */
3949         thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
3950                 btrfs_calc_trans_metadata_size(fs_info, 1);
3951
3952         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3953                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3954                            left, thresh, type);
3955                 btrfs_dump_space_info(fs_info, info, 0, 0);
3956         }
3957
3958         if (left < thresh) {
3959                 u64 flags = btrfs_system_alloc_profile(fs_info);
3960
3961                 /*
3962                  * Ignore failure to create system chunk. We might end up not
3963                  * needing it, as we might not need to COW all nodes/leafs from
3964                  * the paths we visit in the chunk tree (they were already COWed
3965                  * or created in the current transaction for example).
3966                  */
3967                 ret = btrfs_alloc_chunk(trans, flags);
3968         }
3969
3970         if (!ret) {
3971                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
3972                                           &fs_info->chunk_block_rsv,
3973                                           thresh, BTRFS_RESERVE_NO_FLUSH);
3974                 if (!ret)
3975                         trans->chunk_bytes_reserved += thresh;
3976         }
3977 }
3978
3979 /*
3980  * If force is CHUNK_ALLOC_FORCE:
3981  *    - return 1 if it successfully allocates a chunk,
3982  *    - return errors including -ENOSPC otherwise.
3983  * If force is NOT CHUNK_ALLOC_FORCE:
3984  *    - return 0 if it doesn't need to allocate a new chunk,
3985  *    - return 1 if it successfully allocates a chunk,
3986  *    - return errors including -ENOSPC otherwise.
3987  */
3988 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3989                       enum btrfs_chunk_alloc_enum force)
3990 {
3991         struct btrfs_fs_info *fs_info = trans->fs_info;
3992         struct btrfs_space_info *space_info;
3993         bool wait_for_alloc = false;
3994         bool should_alloc = false;
3995         int ret = 0;
3996
3997         /* Don't re-enter if we're already allocating a chunk */
3998         if (trans->allocating_chunk)
3999                 return -ENOSPC;
4000
4001         space_info = btrfs_find_space_info(fs_info, flags);
4002         ASSERT(space_info);
4003
4004         do {
4005                 spin_lock(&space_info->lock);
4006                 if (force < space_info->force_alloc)
4007                         force = space_info->force_alloc;
4008                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4009                 if (space_info->full) {
4010                         /* No more free physical space */
4011                         if (should_alloc)
4012                                 ret = -ENOSPC;
4013                         else
4014                                 ret = 0;
4015                         spin_unlock(&space_info->lock);
4016                         return ret;
4017                 } else if (!should_alloc) {
4018                         spin_unlock(&space_info->lock);
4019                         return 0;
4020                 } else if (space_info->chunk_alloc) {
4021                         /*
4022                          * Someone is already allocating, so we need to block
4023                          * until this someone is finished and then loop to
4024                          * recheck if we should continue with our allocation
4025                          * attempt.
4026                          */
4027                         wait_for_alloc = true;
4028                         spin_unlock(&space_info->lock);
4029                         mutex_lock(&fs_info->chunk_mutex);
4030                         mutex_unlock(&fs_info->chunk_mutex);
4031                 } else {
4032                         /* Proceed with allocation */
4033                         space_info->chunk_alloc = 1;
4034                         wait_for_alloc = false;
4035                         spin_unlock(&space_info->lock);
4036                 }
4037
4038                 cond_resched();
4039         } while (wait_for_alloc);
4040
4041         mutex_lock(&fs_info->chunk_mutex);
4042         trans->allocating_chunk = true;
4043
4044         /*
4045          * If we have mixed data/metadata chunks we want to make sure we keep
4046          * allocating mixed chunks instead of individual chunks.
4047          */
4048         if (btrfs_mixed_space_info(space_info))
4049                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4050
4051         /*
4052          * if we're doing a data chunk, go ahead and make sure that
4053          * we keep a reasonable number of metadata chunks allocated in the
4054          * FS as well.
4055          */
4056         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4057                 fs_info->data_chunk_allocations++;
4058                 if (!(fs_info->data_chunk_allocations %
4059                       fs_info->metadata_ratio))
4060                         force_metadata_allocation(fs_info);
4061         }
4062
4063         /*
4064          * Check if we have enough space in SYSTEM chunk because we may need
4065          * to update devices.
4066          */
4067         check_system_chunk(trans, flags);
4068
4069         ret = btrfs_alloc_chunk(trans, flags);
4070         trans->allocating_chunk = false;
4071
4072         spin_lock(&space_info->lock);
4073         if (ret < 0) {
4074                 if (ret == -ENOSPC)
4075                         space_info->full = 1;
4076                 else
4077                         goto out;
4078         } else {
4079                 ret = 1;
4080                 space_info->max_extent_size = 0;
4081         }
4082
4083         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4084 out:
4085         space_info->chunk_alloc = 0;
4086         spin_unlock(&space_info->lock);
4087         mutex_unlock(&fs_info->chunk_mutex);
4088         /*
4089          * When we allocate a new chunk we reserve space in the chunk block
4090          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4091          * add new nodes/leafs to it if we end up needing to do it when
4092          * inserting the chunk item and updating device items as part of the
4093          * second phase of chunk allocation, performed by
4094          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4095          * large number of new block groups to create in our transaction
4096          * handle's new_bgs list to avoid exhausting the chunk block reserve
4097          * in extreme cases - like having a single transaction create many new
4098          * block groups when starting to write out the free space caches of all
4099          * the block groups that were made dirty during the lifetime of the
4100          * transaction.
4101          */
4102         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4103                 btrfs_create_pending_block_groups(trans);
4104
4105         return ret;
4106 }
4107
4108 static int update_block_group(struct btrfs_trans_handle *trans,
4109                               u64 bytenr, u64 num_bytes, int alloc)
4110 {
4111         struct btrfs_fs_info *info = trans->fs_info;
4112         struct btrfs_block_group_cache *cache = NULL;
4113         u64 total = num_bytes;
4114         u64 old_val;
4115         u64 byte_in_group;
4116         int factor;
4117         int ret = 0;
4118
4119         /* block accounting for super block */
4120         spin_lock(&info->delalloc_root_lock);
4121         old_val = btrfs_super_bytes_used(info->super_copy);
4122         if (alloc)
4123                 old_val += num_bytes;
4124         else
4125                 old_val -= num_bytes;
4126         btrfs_set_super_bytes_used(info->super_copy, old_val);
4127         spin_unlock(&info->delalloc_root_lock);
4128
4129         while (total) {
4130                 cache = btrfs_lookup_block_group(info, bytenr);
4131                 if (!cache) {
4132                         ret = -ENOENT;
4133                         break;
4134                 }
4135                 factor = btrfs_bg_type_to_factor(cache->flags);
4136
4137                 /*
4138                  * If this block group has free space cache written out, we
4139                  * need to make sure to load it if we are removing space.  This
4140                  * is because we need the unpinning stage to actually add the
4141                  * space back to the block group, otherwise we will leak space.
4142                  */
4143                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4144                         cache_block_group(cache, 1);
4145
4146                 byte_in_group = bytenr - cache->key.objectid;
4147                 WARN_ON(byte_in_group > cache->key.offset);
4148
4149                 spin_lock(&cache->space_info->lock);
4150                 spin_lock(&cache->lock);
4151
4152                 if (btrfs_test_opt(info, SPACE_CACHE) &&
4153                     cache->disk_cache_state < BTRFS_DC_CLEAR)
4154                         cache->disk_cache_state = BTRFS_DC_CLEAR;
4155
4156                 old_val = btrfs_block_group_used(&cache->item);
4157                 num_bytes = min(total, cache->key.offset - byte_in_group);
4158                 if (alloc) {
4159                         old_val += num_bytes;
4160                         btrfs_set_block_group_used(&cache->item, old_val);
4161                         cache->reserved -= num_bytes;
4162                         cache->space_info->bytes_reserved -= num_bytes;
4163                         cache->space_info->bytes_used += num_bytes;
4164                         cache->space_info->disk_used += num_bytes * factor;
4165                         spin_unlock(&cache->lock);
4166                         spin_unlock(&cache->space_info->lock);
4167                 } else {
4168                         old_val -= num_bytes;
4169                         btrfs_set_block_group_used(&cache->item, old_val);
4170                         cache->pinned += num_bytes;
4171                         btrfs_space_info_update_bytes_pinned(info,
4172                                         cache->space_info, num_bytes);
4173                         cache->space_info->bytes_used -= num_bytes;
4174                         cache->space_info->disk_used -= num_bytes * factor;
4175                         spin_unlock(&cache->lock);
4176                         spin_unlock(&cache->space_info->lock);
4177
4178                         trace_btrfs_space_reservation(info, "pinned",
4179                                                       cache->space_info->flags,
4180                                                       num_bytes, 1);
4181                         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
4182                                            num_bytes,
4183                                            BTRFS_TOTAL_BYTES_PINNED_BATCH);
4184                         set_extent_dirty(info->pinned_extents,
4185                                          bytenr, bytenr + num_bytes - 1,
4186                                          GFP_NOFS | __GFP_NOFAIL);
4187                 }
4188
4189                 spin_lock(&trans->transaction->dirty_bgs_lock);
4190                 if (list_empty(&cache->dirty_list)) {
4191                         list_add_tail(&cache->dirty_list,
4192                                       &trans->transaction->dirty_bgs);
4193                         trans->delayed_ref_updates++;
4194                         btrfs_get_block_group(cache);
4195                 }
4196                 spin_unlock(&trans->transaction->dirty_bgs_lock);
4197
4198                 /*
4199                  * No longer have used bytes in this block group, queue it for
4200                  * deletion. We do this after adding the block group to the
4201                  * dirty list to avoid races between cleaner kthread and space
4202                  * cache writeout.
4203                  */
4204                 if (!alloc && old_val == 0)
4205                         btrfs_mark_bg_unused(cache);
4206
4207                 btrfs_put_block_group(cache);
4208                 total -= num_bytes;
4209                 bytenr += num_bytes;
4210         }
4211
4212         /* Modified block groups are accounted for in the delayed_refs_rsv. */
4213         btrfs_update_delayed_refs_rsv(trans);
4214         return ret;
4215 }
4216
4217 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
4218 {
4219         struct btrfs_block_group_cache *cache;
4220         u64 bytenr;
4221
4222         spin_lock(&fs_info->block_group_cache_lock);
4223         bytenr = fs_info->first_logical_byte;
4224         spin_unlock(&fs_info->block_group_cache_lock);
4225
4226         if (bytenr < (u64)-1)
4227                 return bytenr;
4228
4229         cache = btrfs_lookup_first_block_group(fs_info, search_start);
4230         if (!cache)
4231                 return 0;
4232
4233         bytenr = cache->key.objectid;
4234         btrfs_put_block_group(cache);
4235
4236         return bytenr;
4237 }
4238
4239 static int pin_down_extent(struct btrfs_block_group_cache *cache,
4240                            u64 bytenr, u64 num_bytes, int reserved)
4241 {
4242         struct btrfs_fs_info *fs_info = cache->fs_info;
4243
4244         spin_lock(&cache->space_info->lock);
4245         spin_lock(&cache->lock);
4246         cache->pinned += num_bytes;
4247         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
4248                                              num_bytes);
4249         if (reserved) {
4250                 cache->reserved -= num_bytes;
4251                 cache->space_info->bytes_reserved -= num_bytes;
4252         }
4253         spin_unlock(&cache->lock);
4254         spin_unlock(&cache->space_info->lock);
4255
4256         trace_btrfs_space_reservation(fs_info, "pinned",
4257                                       cache->space_info->flags, num_bytes, 1);
4258         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
4259                     num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
4260         set_extent_dirty(fs_info->pinned_extents, bytenr,
4261                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4262         return 0;
4263 }
4264
4265 /*
4266  * this function must be called within transaction
4267  */
4268 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
4269                      u64 bytenr, u64 num_bytes, int reserved)
4270 {
4271         struct btrfs_block_group_cache *cache;
4272
4273         cache = btrfs_lookup_block_group(fs_info, bytenr);
4274         BUG_ON(!cache); /* Logic error */
4275
4276         pin_down_extent(cache, bytenr, num_bytes, reserved);
4277
4278         btrfs_put_block_group(cache);
4279         return 0;
4280 }
4281
4282 /*
4283  * this function must be called within transaction
4284  */
4285 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
4286                                     u64 bytenr, u64 num_bytes)
4287 {
4288         struct btrfs_block_group_cache *cache;
4289         int ret;
4290
4291         cache = btrfs_lookup_block_group(fs_info, bytenr);
4292         if (!cache)
4293                 return -EINVAL;
4294
4295         /*
4296          * pull in the free space cache (if any) so that our pin
4297          * removes the free space from the cache.  We have load_only set
4298          * to one because the slow code to read in the free extents does check
4299          * the pinned extents.
4300          */
4301         cache_block_group(cache, 1);
4302
4303         pin_down_extent(cache, bytenr, num_bytes, 0);
4304
4305         /* remove us from the free space cache (if we're there at all) */
4306         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
4307         btrfs_put_block_group(cache);
4308         return ret;
4309 }
4310
4311 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
4312                                    u64 start, u64 num_bytes)
4313 {
4314         int ret;
4315         struct btrfs_block_group_cache *block_group;
4316         struct btrfs_caching_control *caching_ctl;
4317
4318         block_group = btrfs_lookup_block_group(fs_info, start);
4319         if (!block_group)
4320                 return -EINVAL;
4321
4322         cache_block_group(block_group, 0);
4323         caching_ctl = get_caching_control(block_group);
4324
4325         if (!caching_ctl) {
4326                 /* Logic error */
4327                 BUG_ON(!block_group_cache_done(block_group));
4328                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
4329         } else {
4330                 mutex_lock(&caching_ctl->mutex);
4331
4332                 if (start >= caching_ctl->progress) {
4333                         ret = add_excluded_extent(fs_info, start, num_bytes);
4334                 } else if (start + num_bytes <= caching_ctl->progress) {
4335                         ret = btrfs_remove_free_space(block_group,
4336                                                       start, num_bytes);
4337                 } else {
4338                         num_bytes = caching_ctl->progress - start;
4339                         ret = btrfs_remove_free_space(block_group,
4340                                                       start, num_bytes);
4341                         if (ret)
4342                                 goto out_lock;
4343
4344                         num_bytes = (start + num_bytes) -
4345                                 caching_ctl->progress;
4346                         start = caching_ctl->progress;
4347                         ret = add_excluded_extent(fs_info, start, num_bytes);
4348                 }
4349 out_lock:
4350                 mutex_unlock(&caching_ctl->mutex);
4351                 put_caching_control(caching_ctl);
4352         }
4353         btrfs_put_block_group(block_group);
4354         return ret;
4355 }
4356
4357 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
4358 {
4359         struct btrfs_fs_info *fs_info = eb->fs_info;
4360         struct btrfs_file_extent_item *item;
4361         struct btrfs_key key;
4362         int found_type;
4363         int i;
4364         int ret = 0;
4365
4366         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
4367                 return 0;
4368
4369         for (i = 0; i < btrfs_header_nritems(eb); i++) {
4370                 btrfs_item_key_to_cpu(eb, &key, i);
4371                 if (key.type != BTRFS_EXTENT_DATA_KEY)
4372                         continue;
4373                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
4374                 found_type = btrfs_file_extent_type(eb, item);
4375                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
4376                         continue;
4377                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
4378                         continue;
4379                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
4380                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
4381                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
4382                 if (ret)
4383                         break;
4384         }
4385
4386         return ret;
4387 }
4388
4389 static void
4390 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
4391 {
4392         atomic_inc(&bg->reservations);
4393 }
4394
4395 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
4396                                         const u64 start)
4397 {
4398         struct btrfs_block_group_cache *bg;
4399
4400         bg = btrfs_lookup_block_group(fs_info, start);
4401         ASSERT(bg);
4402         if (atomic_dec_and_test(&bg->reservations))
4403                 wake_up_var(&bg->reservations);
4404         btrfs_put_block_group(bg);
4405 }
4406
4407 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
4408 {
4409         struct btrfs_space_info *space_info = bg->space_info;
4410
4411         ASSERT(bg->ro);
4412
4413         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
4414                 return;
4415
4416         /*
4417          * Our block group is read only but before we set it to read only,
4418          * some task might have had allocated an extent from it already, but it
4419          * has not yet created a respective ordered extent (and added it to a
4420          * root's list of ordered extents).
4421          * Therefore wait for any task currently allocating extents, since the
4422          * block group's reservations counter is incremented while a read lock
4423          * on the groups' semaphore is held and decremented after releasing
4424          * the read access on that semaphore and creating the ordered extent.
4425          */
4426         down_write(&space_info->groups_sem);
4427         up_write(&space_info->groups_sem);
4428
4429         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
4430 }
4431
4432 /**
4433  * btrfs_add_reserved_bytes - update the block_group and space info counters
4434  * @cache:      The cache we are manipulating
4435  * @ram_bytes:  The number of bytes of file content, and will be same to
4436  *              @num_bytes except for the compress path.
4437  * @num_bytes:  The number of bytes in question
4438  * @delalloc:   The blocks are allocated for the delalloc write
4439  *
4440  * This is called by the allocator when it reserves space. If this is a
4441  * reservation and the block group has become read only we cannot make the
4442  * reservation and return -EAGAIN, otherwise this function always succeeds.
4443  */
4444 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
4445                                     u64 ram_bytes, u64 num_bytes, int delalloc)
4446 {
4447         struct btrfs_space_info *space_info = cache->space_info;
4448         int ret = 0;
4449
4450         spin_lock(&space_info->lock);
4451         spin_lock(&cache->lock);
4452         if (cache->ro) {
4453                 ret = -EAGAIN;
4454         } else {
4455                 cache->reserved += num_bytes;
4456                 space_info->bytes_reserved += num_bytes;
4457                 btrfs_space_info_update_bytes_may_use(cache->fs_info,
4458                                                       space_info, -ram_bytes);
4459                 if (delalloc)
4460                         cache->delalloc_bytes += num_bytes;
4461         }
4462         spin_unlock(&cache->lock);
4463         spin_unlock(&space_info->lock);
4464         return ret;
4465 }
4466
4467 /**
4468  * btrfs_free_reserved_bytes - update the block_group and space info counters
4469  * @cache:      The cache we are manipulating
4470  * @num_bytes:  The number of bytes in question
4471  * @delalloc:   The blocks are allocated for the delalloc write
4472  *
4473  * This is called by somebody who is freeing space that was never actually used
4474  * on disk.  For example if you reserve some space for a new leaf in transaction
4475  * A and before transaction A commits you free that leaf, you call this with
4476  * reserve set to 0 in order to clear the reservation.
4477  */
4478
4479 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
4480                                       u64 num_bytes, int delalloc)
4481 {
4482         struct btrfs_space_info *space_info = cache->space_info;
4483
4484         spin_lock(&space_info->lock);
4485         spin_lock(&cache->lock);
4486         if (cache->ro)
4487                 space_info->bytes_readonly += num_bytes;
4488         cache->reserved -= num_bytes;
4489         space_info->bytes_reserved -= num_bytes;
4490         space_info->max_extent_size = 0;
4491
4492         if (delalloc)
4493                 cache->delalloc_bytes -= num_bytes;
4494         spin_unlock(&cache->lock);
4495         spin_unlock(&space_info->lock);
4496 }
4497 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
4498 {
4499         struct btrfs_caching_control *next;
4500         struct btrfs_caching_control *caching_ctl;
4501         struct btrfs_block_group_cache *cache;
4502
4503         down_write(&fs_info->commit_root_sem);
4504
4505         list_for_each_entry_safe(caching_ctl, next,
4506                                  &fs_info->caching_block_groups, list) {
4507                 cache = caching_ctl->block_group;
4508                 if (block_group_cache_done(cache)) {
4509                         cache->last_byte_to_unpin = (u64)-1;
4510                         list_del_init(&caching_ctl->list);
4511                         put_caching_control(caching_ctl);
4512                 } else {
4513                         cache->last_byte_to_unpin = caching_ctl->progress;
4514                 }
4515         }
4516
4517         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4518                 fs_info->pinned_extents = &fs_info->freed_extents[1];
4519         else
4520                 fs_info->pinned_extents = &fs_info->freed_extents[0];
4521
4522         up_write(&fs_info->commit_root_sem);
4523
4524         btrfs_update_global_block_rsv(fs_info);
4525 }
4526
4527 /*
4528  * Returns the free cluster for the given space info and sets empty_cluster to
4529  * what it should be based on the mount options.
4530  */
4531 static struct btrfs_free_cluster *
4532 fetch_cluster_info(struct btrfs_fs_info *fs_info,
4533                    struct btrfs_space_info *space_info, u64 *empty_cluster)
4534 {
4535         struct btrfs_free_cluster *ret = NULL;
4536
4537         *empty_cluster = 0;
4538         if (btrfs_mixed_space_info(space_info))
4539                 return ret;
4540
4541         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4542                 ret = &fs_info->meta_alloc_cluster;
4543                 if (btrfs_test_opt(fs_info, SSD))
4544                         *empty_cluster = SZ_2M;
4545                 else
4546                         *empty_cluster = SZ_64K;
4547         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
4548                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
4549                 *empty_cluster = SZ_2M;
4550                 ret = &fs_info->data_alloc_cluster;
4551         }
4552
4553         return ret;
4554 }
4555
4556 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
4557                               u64 start, u64 end,
4558                               const bool return_free_space)
4559 {
4560         struct btrfs_block_group_cache *cache = NULL;
4561         struct btrfs_space_info *space_info;
4562         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4563         struct btrfs_free_cluster *cluster = NULL;
4564         u64 len;
4565         u64 total_unpinned = 0;
4566         u64 empty_cluster = 0;
4567         bool readonly;
4568
4569         while (start <= end) {
4570                 readonly = false;
4571                 if (!cache ||
4572                     start >= cache->key.objectid + cache->key.offset) {
4573                         if (cache)
4574                                 btrfs_put_block_group(cache);
4575                         total_unpinned = 0;
4576                         cache = btrfs_lookup_block_group(fs_info, start);
4577                         BUG_ON(!cache); /* Logic error */
4578
4579                         cluster = fetch_cluster_info(fs_info,
4580                                                      cache->space_info,
4581                                                      &empty_cluster);
4582                         empty_cluster <<= 1;
4583                 }
4584
4585                 len = cache->key.objectid + cache->key.offset - start;
4586                 len = min(len, end + 1 - start);
4587
4588                 if (start < cache->last_byte_to_unpin) {
4589                         len = min(len, cache->last_byte_to_unpin - start);
4590                         if (return_free_space)
4591                                 btrfs_add_free_space(cache, start, len);
4592                 }
4593
4594                 start += len;
4595                 total_unpinned += len;
4596                 space_info = cache->space_info;
4597
4598                 /*
4599                  * If this space cluster has been marked as fragmented and we've
4600                  * unpinned enough in this block group to potentially allow a
4601                  * cluster to be created inside of it go ahead and clear the
4602                  * fragmented check.
4603                  */
4604                 if (cluster && cluster->fragmented &&
4605                     total_unpinned > empty_cluster) {
4606                         spin_lock(&cluster->lock);
4607                         cluster->fragmented = 0;
4608                         spin_unlock(&cluster->lock);
4609                 }
4610
4611                 spin_lock(&space_info->lock);
4612                 spin_lock(&cache->lock);
4613                 cache->pinned -= len;
4614                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
4615
4616                 trace_btrfs_space_reservation(fs_info, "pinned",
4617                                               space_info->flags, len, 0);
4618                 space_info->max_extent_size = 0;
4619                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
4620                             -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
4621                 if (cache->ro) {
4622                         space_info->bytes_readonly += len;
4623                         readonly = true;
4624                 }
4625                 spin_unlock(&cache->lock);
4626                 if (!readonly && return_free_space &&
4627                     global_rsv->space_info == space_info) {
4628                         u64 to_add = len;
4629
4630                         spin_lock(&global_rsv->lock);
4631                         if (!global_rsv->full) {
4632                                 to_add = min(len, global_rsv->size -
4633                                              global_rsv->reserved);
4634                                 global_rsv->reserved += to_add;
4635                                 btrfs_space_info_update_bytes_may_use(fs_info,
4636                                                 space_info, to_add);
4637                                 if (global_rsv->reserved >= global_rsv->size)
4638                                         global_rsv->full = 1;
4639                                 trace_btrfs_space_reservation(fs_info,
4640                                                               "space_info",
4641                                                               space_info->flags,
4642                                                               to_add, 1);
4643                                 len -= to_add;
4644                         }
4645                         spin_unlock(&global_rsv->lock);
4646                         /* Add to any tickets we may have */
4647                         if (len)
4648                                 btrfs_space_info_add_new_bytes(fs_info,
4649                                                 space_info, len);
4650                 }
4651                 spin_unlock(&space_info->lock);
4652         }
4653
4654         if (cache)
4655                 btrfs_put_block_group(cache);
4656         return 0;
4657 }
4658
4659 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
4660 {
4661         struct btrfs_fs_info *fs_info = trans->fs_info;
4662         struct btrfs_block_group_cache *block_group, *tmp;
4663         struct list_head *deleted_bgs;
4664         struct extent_io_tree *unpin;
4665         u64 start;
4666         u64 end;
4667         int ret;
4668
4669         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4670                 unpin = &fs_info->freed_extents[1];
4671         else
4672                 unpin = &fs_info->freed_extents[0];
4673
4674         while (!trans->aborted) {
4675                 struct extent_state *cached_state = NULL;
4676
4677                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
4678                 ret = find_first_extent_bit(unpin, 0, &start, &end,
4679                                             EXTENT_DIRTY, &cached_state);
4680                 if (ret) {
4681                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4682                         break;
4683                 }
4684
4685                 if (btrfs_test_opt(fs_info, DISCARD))
4686                         ret = btrfs_discard_extent(fs_info, start,
4687                                                    end + 1 - start, NULL);
4688
4689                 clear_extent_dirty(unpin, start, end, &cached_state);
4690                 unpin_extent_range(fs_info, start, end, true);
4691                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4692                 free_extent_state(cached_state);
4693                 cond_resched();
4694         }
4695
4696         /*
4697          * Transaction is finished.  We don't need the lock anymore.  We
4698          * do need to clean up the block groups in case of a transaction
4699          * abort.
4700          */
4701         deleted_bgs = &trans->transaction->deleted_bgs;
4702         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
4703                 u64 trimmed = 0;
4704
4705                 ret = -EROFS;
4706                 if (!trans->aborted)
4707                         ret = btrfs_discard_extent(fs_info,
4708                                                    block_group->key.objectid,
4709                                                    block_group->key.offset,
4710                                                    &trimmed);
4711
4712                 list_del_init(&block_group->bg_list);
4713                 btrfs_put_block_group_trimming(block_group);
4714                 btrfs_put_block_group(block_group);
4715
4716                 if (ret) {
4717                         const char *errstr = btrfs_decode_error(ret);
4718                         btrfs_warn(fs_info,
4719                            "discard failed while removing blockgroup: errno=%d %s",
4720                                    ret, errstr);
4721                 }
4722         }
4723
4724         return 0;
4725 }
4726
4727 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4728                                struct btrfs_delayed_ref_node *node, u64 parent,
4729                                u64 root_objectid, u64 owner_objectid,
4730                                u64 owner_offset, int refs_to_drop,
4731                                struct btrfs_delayed_extent_op *extent_op)
4732 {
4733         struct btrfs_fs_info *info = trans->fs_info;
4734         struct btrfs_key key;
4735         struct btrfs_path *path;
4736         struct btrfs_root *extent_root = info->extent_root;
4737         struct extent_buffer *leaf;
4738         struct btrfs_extent_item *ei;
4739         struct btrfs_extent_inline_ref *iref;
4740         int ret;
4741         int is_data;
4742         int extent_slot = 0;
4743         int found_extent = 0;
4744         int num_to_del = 1;
4745         u32 item_size;
4746         u64 refs;
4747         u64 bytenr = node->bytenr;
4748         u64 num_bytes = node->num_bytes;
4749         int last_ref = 0;
4750         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
4751
4752         path = btrfs_alloc_path();
4753         if (!path)
4754                 return -ENOMEM;
4755
4756         path->reada = READA_FORWARD;
4757         path->leave_spinning = 1;
4758
4759         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4760         BUG_ON(!is_data && refs_to_drop != 1);
4761
4762         if (is_data)
4763                 skinny_metadata = false;
4764
4765         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
4766                                     parent, root_objectid, owner_objectid,
4767                                     owner_offset);
4768         if (ret == 0) {
4769                 extent_slot = path->slots[0];
4770                 while (extent_slot >= 0) {
4771                         btrfs_item_key_to_cpu(path->nodes[0], &key,
4772                                               extent_slot);
4773                         if (key.objectid != bytenr)
4774                                 break;
4775                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4776                             key.offset == num_bytes) {
4777                                 found_extent = 1;
4778                                 break;
4779                         }
4780                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
4781                             key.offset == owner_objectid) {
4782                                 found_extent = 1;
4783                                 break;
4784                         }
4785                         if (path->slots[0] - extent_slot > 5)
4786                                 break;
4787                         extent_slot--;
4788                 }
4789
4790                 if (!found_extent) {
4791                         BUG_ON(iref);
4792                         ret = remove_extent_backref(trans, path, NULL,
4793                                                     refs_to_drop,
4794                                                     is_data, &last_ref);
4795                         if (ret) {
4796                                 btrfs_abort_transaction(trans, ret);
4797                                 goto out;
4798                         }
4799                         btrfs_release_path(path);
4800                         path->leave_spinning = 1;
4801
4802                         key.objectid = bytenr;
4803                         key.type = BTRFS_EXTENT_ITEM_KEY;
4804                         key.offset = num_bytes;
4805
4806                         if (!is_data && skinny_metadata) {
4807                                 key.type = BTRFS_METADATA_ITEM_KEY;
4808                                 key.offset = owner_objectid;
4809                         }
4810
4811                         ret = btrfs_search_slot(trans, extent_root,
4812                                                 &key, path, -1, 1);
4813                         if (ret > 0 && skinny_metadata && path->slots[0]) {
4814                                 /*
4815                                  * Couldn't find our skinny metadata item,
4816                                  * see if we have ye olde extent item.
4817                                  */
4818                                 path->slots[0]--;
4819                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
4820                                                       path->slots[0]);
4821                                 if (key.objectid == bytenr &&
4822                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
4823                                     key.offset == num_bytes)
4824                                         ret = 0;
4825                         }
4826
4827                         if (ret > 0 && skinny_metadata) {
4828                                 skinny_metadata = false;
4829                                 key.objectid = bytenr;
4830                                 key.type = BTRFS_EXTENT_ITEM_KEY;
4831                                 key.offset = num_bytes;
4832                                 btrfs_release_path(path);
4833                                 ret = btrfs_search_slot(trans, extent_root,
4834                                                         &key, path, -1, 1);
4835                         }
4836
4837                         if (ret) {
4838                                 btrfs_err(info,
4839                                           "umm, got %d back from search, was looking for %llu",
4840                                           ret, bytenr);
4841                                 if (ret > 0)
4842                                         btrfs_print_leaf(path->nodes[0]);
4843                         }
4844                         if (ret < 0) {
4845                                 btrfs_abort_transaction(trans, ret);
4846                                 goto out;
4847                         }
4848                         extent_slot = path->slots[0];
4849                 }
4850         } else if (WARN_ON(ret == -ENOENT)) {
4851                 btrfs_print_leaf(path->nodes[0]);
4852                 btrfs_err(info,
4853                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
4854                         bytenr, parent, root_objectid, owner_objectid,
4855                         owner_offset);
4856                 btrfs_abort_transaction(trans, ret);
4857                 goto out;
4858         } else {
4859                 btrfs_abort_transaction(trans, ret);
4860                 goto out;
4861         }
4862
4863         leaf = path->nodes[0];
4864         item_size = btrfs_item_size_nr(leaf, extent_slot);
4865         if (unlikely(item_size < sizeof(*ei))) {
4866                 ret = -EINVAL;
4867                 btrfs_print_v0_err(info);
4868                 btrfs_abort_transaction(trans, ret);
4869                 goto out;
4870         }
4871         ei = btrfs_item_ptr(leaf, extent_slot,
4872                             struct btrfs_extent_item);
4873         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
4874             key.type == BTRFS_EXTENT_ITEM_KEY) {
4875                 struct btrfs_tree_block_info *bi;
4876                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4877                 bi = (struct btrfs_tree_block_info *)(ei + 1);
4878                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4879         }
4880
4881         refs = btrfs_extent_refs(leaf, ei);
4882         if (refs < refs_to_drop) {
4883                 btrfs_err(info,
4884                           "trying to drop %d refs but we only have %Lu for bytenr %Lu",
4885                           refs_to_drop, refs, bytenr);
4886                 ret = -EINVAL;
4887                 btrfs_abort_transaction(trans, ret);
4888                 goto out;
4889         }
4890         refs -= refs_to_drop;
4891
4892         if (refs > 0) {
4893                 if (extent_op)
4894                         __run_delayed_extent_op(extent_op, leaf, ei);
4895                 /*
4896                  * In the case of inline back ref, reference count will
4897                  * be updated by remove_extent_backref
4898                  */
4899                 if (iref) {
4900                         BUG_ON(!found_extent);
4901                 } else {
4902                         btrfs_set_extent_refs(leaf, ei, refs);
4903                         btrfs_mark_buffer_dirty(leaf);
4904                 }
4905                 if (found_extent) {
4906                         ret = remove_extent_backref(trans, path, iref,
4907                                                     refs_to_drop, is_data,
4908                                                     &last_ref);
4909                         if (ret) {
4910                                 btrfs_abort_transaction(trans, ret);
4911                                 goto out;
4912                         }
4913                 }
4914         } else {
4915                 if (found_extent) {
4916                         BUG_ON(is_data && refs_to_drop !=
4917                                extent_data_ref_count(path, iref));
4918                         if (iref) {
4919                                 BUG_ON(path->slots[0] != extent_slot);
4920                         } else {
4921                                 BUG_ON(path->slots[0] != extent_slot + 1);
4922                                 path->slots[0] = extent_slot;
4923                                 num_to_del = 2;
4924                         }
4925                 }
4926
4927                 last_ref = 1;
4928                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4929                                       num_to_del);
4930                 if (ret) {
4931                         btrfs_abort_transaction(trans, ret);
4932                         goto out;
4933                 }
4934                 btrfs_release_path(path);
4935
4936                 if (is_data) {
4937                         ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
4938                         if (ret) {
4939                                 btrfs_abort_transaction(trans, ret);
4940                                 goto out;
4941                         }
4942                 }
4943
4944                 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
4945                 if (ret) {
4946                         btrfs_abort_transaction(trans, ret);
4947                         goto out;
4948                 }
4949
4950                 ret = update_block_group(trans, bytenr, num_bytes, 0);
4951                 if (ret) {
4952                         btrfs_abort_transaction(trans, ret);
4953                         goto out;
4954                 }
4955         }
4956         btrfs_release_path(path);
4957
4958 out:
4959         btrfs_free_path(path);
4960         return ret;
4961 }
4962
4963 /*
4964  * when we free an block, it is possible (and likely) that we free the last
4965  * delayed ref for that extent as well.  This searches the delayed ref tree for
4966  * a given extent, and if there are no other delayed refs to be processed, it
4967  * removes it from the tree.
4968  */
4969 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4970                                       u64 bytenr)
4971 {
4972         struct btrfs_delayed_ref_head *head;
4973         struct btrfs_delayed_ref_root *delayed_refs;
4974         int ret = 0;
4975
4976         delayed_refs = &trans->transaction->delayed_refs;
4977         spin_lock(&delayed_refs->lock);
4978         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
4979         if (!head)
4980                 goto out_delayed_unlock;
4981
4982         spin_lock(&head->lock);
4983         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
4984                 goto out;
4985
4986         if (cleanup_extent_op(head) != NULL)
4987                 goto out;
4988
4989         /*
4990          * waiting for the lock here would deadlock.  If someone else has it
4991          * locked they are already in the process of dropping it anyway
4992          */
4993         if (!mutex_trylock(&head->mutex))
4994                 goto out;
4995
4996         btrfs_delete_ref_head(delayed_refs, head);
4997         head->processing = 0;
4998
4999         spin_unlock(&head->lock);
5000         spin_unlock(&delayed_refs->lock);
5001
5002         BUG_ON(head->extent_op);
5003         if (head->must_insert_reserved)
5004                 ret = 1;
5005
5006         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
5007         mutex_unlock(&head->mutex);
5008         btrfs_put_delayed_ref_head(head);
5009         return ret;
5010 out:
5011         spin_unlock(&head->lock);
5012
5013 out_delayed_unlock:
5014         spin_unlock(&delayed_refs->lock);
5015         return 0;
5016 }
5017
5018 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5019                            struct btrfs_root *root,
5020                            struct extent_buffer *buf,
5021                            u64 parent, int last_ref)
5022 {
5023         struct btrfs_fs_info *fs_info = root->fs_info;
5024         struct btrfs_ref generic_ref = { 0 };
5025         int pin = 1;
5026         int ret;
5027
5028         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
5029                                buf->start, buf->len, parent);
5030         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
5031                             root->root_key.objectid);
5032
5033         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5034                 int old_ref_mod, new_ref_mod;
5035
5036                 btrfs_ref_tree_mod(fs_info, &generic_ref);
5037                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
5038                                                  &old_ref_mod, &new_ref_mod);
5039                 BUG_ON(ret); /* -ENOMEM */
5040                 pin = old_ref_mod >= 0 && new_ref_mod < 0;
5041         }
5042
5043         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
5044                 struct btrfs_block_group_cache *cache;
5045
5046                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5047                         ret = check_ref_cleanup(trans, buf->start);
5048                         if (!ret)
5049                                 goto out;
5050                 }
5051
5052                 pin = 0;
5053                 cache = btrfs_lookup_block_group(fs_info, buf->start);
5054
5055                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5056                         pin_down_extent(cache, buf->start, buf->len, 1);
5057                         btrfs_put_block_group(cache);
5058                         goto out;
5059                 }
5060
5061                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5062
5063                 btrfs_add_free_space(cache, buf->start, buf->len);
5064                 btrfs_free_reserved_bytes(cache, buf->len, 0);
5065                 btrfs_put_block_group(cache);
5066                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
5067         }
5068 out:
5069         if (pin)
5070                 add_pinned_bytes(fs_info, &generic_ref);
5071
5072         if (last_ref) {
5073                 /*
5074                  * Deleting the buffer, clear the corrupt flag since it doesn't
5075                  * matter anymore.
5076                  */
5077                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5078         }
5079 }
5080
5081 /* Can return -ENOMEM */
5082 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
5083 {
5084         struct btrfs_fs_info *fs_info = trans->fs_info;
5085         int old_ref_mod, new_ref_mod;
5086         int ret;
5087
5088         if (btrfs_is_testing(fs_info))
5089                 return 0;
5090
5091         /*
5092          * tree log blocks never actually go into the extent allocation
5093          * tree, just update pinning info and exit early.
5094          */
5095         if ((ref->type == BTRFS_REF_METADATA &&
5096              ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
5097             (ref->type == BTRFS_REF_DATA &&
5098              ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
5099                 /* unlocks the pinned mutex */
5100                 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
5101                 old_ref_mod = new_ref_mod = 0;
5102                 ret = 0;
5103         } else if (ref->type == BTRFS_REF_METADATA) {
5104                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
5105                                                  &old_ref_mod, &new_ref_mod);
5106         } else {
5107                 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
5108                                                  &old_ref_mod, &new_ref_mod);
5109         }
5110
5111         if (!((ref->type == BTRFS_REF_METADATA &&
5112                ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
5113               (ref->type == BTRFS_REF_DATA &&
5114                ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
5115                 btrfs_ref_tree_mod(fs_info, ref);
5116
5117         if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
5118                 add_pinned_bytes(fs_info, ref);
5119
5120         return ret;
5121 }
5122
5123 /*
5124  * when we wait for progress in the block group caching, its because
5125  * our allocation attempt failed at least once.  So, we must sleep
5126  * and let some progress happen before we try again.
5127  *
5128  * This function will sleep at least once waiting for new free space to
5129  * show up, and then it will check the block group free space numbers
5130  * for our min num_bytes.  Another option is to have it go ahead
5131  * and look in the rbtree for a free extent of a given size, but this
5132  * is a good start.
5133  *
5134  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
5135  * any of the information in this block group.
5136  */
5137 static noinline void
5138 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5139                                 u64 num_bytes)
5140 {
5141         struct btrfs_caching_control *caching_ctl;
5142
5143         caching_ctl = get_caching_control(cache);
5144         if (!caching_ctl)
5145                 return;
5146
5147         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5148                    (cache->free_space_ctl->free_space >= num_bytes));
5149
5150         put_caching_control(caching_ctl);
5151 }
5152
5153 static noinline int
5154 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5155 {
5156         struct btrfs_caching_control *caching_ctl;
5157         int ret = 0;
5158
5159         caching_ctl = get_caching_control(cache);
5160         if (!caching_ctl)
5161                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
5162
5163         wait_event(caching_ctl->wait, block_group_cache_done(cache));
5164         if (cache->cached == BTRFS_CACHE_ERROR)
5165                 ret = -EIO;
5166         put_caching_control(caching_ctl);
5167         return ret;
5168 }
5169
5170 enum btrfs_loop_type {
5171         LOOP_CACHING_NOWAIT,
5172         LOOP_CACHING_WAIT,
5173         LOOP_ALLOC_CHUNK,
5174         LOOP_NO_EMPTY_SIZE,
5175 };
5176
5177 static inline void
5178 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
5179                        int delalloc)
5180 {
5181         if (delalloc)
5182                 down_read(&cache->data_rwsem);
5183 }
5184
5185 static inline void
5186 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
5187                        int delalloc)
5188 {
5189         btrfs_get_block_group(cache);
5190         if (delalloc)
5191                 down_read(&cache->data_rwsem);
5192 }
5193
5194 static struct btrfs_block_group_cache *
5195 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
5196                    struct btrfs_free_cluster *cluster,
5197                    int delalloc)
5198 {
5199         struct btrfs_block_group_cache *used_bg = NULL;
5200
5201         spin_lock(&cluster->refill_lock);
5202         while (1) {
5203                 used_bg = cluster->block_group;
5204                 if (!used_bg)
5205                         return NULL;
5206
5207                 if (used_bg == block_group)
5208                         return used_bg;
5209
5210                 btrfs_get_block_group(used_bg);
5211
5212                 if (!delalloc)
5213                         return used_bg;
5214
5215                 if (down_read_trylock(&used_bg->data_rwsem))
5216                         return used_bg;
5217
5218                 spin_unlock(&cluster->refill_lock);
5219
5220                 /* We should only have one-level nested. */
5221                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
5222
5223                 spin_lock(&cluster->refill_lock);
5224                 if (used_bg == cluster->block_group)
5225                         return used_bg;
5226
5227                 up_read(&used_bg->data_rwsem);
5228                 btrfs_put_block_group(used_bg);
5229         }
5230 }
5231
5232 static inline void
5233 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
5234                          int delalloc)
5235 {
5236         if (delalloc)
5237                 up_read(&cache->data_rwsem);
5238         btrfs_put_block_group(cache);
5239 }
5240
5241 /*
5242  * Structure used internally for find_free_extent() function.  Wraps needed
5243  * parameters.
5244  */
5245 struct find_free_extent_ctl {
5246         /* Basic allocation info */
5247         u64 ram_bytes;
5248         u64 num_bytes;
5249         u64 empty_size;
5250         u64 flags;
5251         int delalloc;
5252
5253         /* Where to start the search inside the bg */
5254         u64 search_start;
5255
5256         /* For clustered allocation */
5257         u64 empty_cluster;
5258
5259         bool have_caching_bg;
5260         bool orig_have_caching_bg;
5261
5262         /* RAID index, converted from flags */
5263         int index;
5264
5265         /*
5266          * Current loop number, check find_free_extent_update_loop() for details
5267          */
5268         int loop;
5269
5270         /*
5271          * Whether we're refilling a cluster, if true we need to re-search
5272          * current block group but don't try to refill the cluster again.
5273          */
5274         bool retry_clustered;
5275
5276         /*
5277          * Whether we're updating free space cache, if true we need to re-search
5278          * current block group but don't try updating free space cache again.
5279          */
5280         bool retry_unclustered;
5281
5282         /* If current block group is cached */
5283         int cached;
5284
5285         /* Max contiguous hole found */
5286         u64 max_extent_size;
5287
5288         /* Total free space from free space cache, not always contiguous */
5289         u64 total_free_space;
5290
5291         /* Found result */
5292         u64 found_offset;
5293 };
5294
5295
5296 /*
5297  * Helper function for find_free_extent().
5298  *
5299  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
5300  * Return -EAGAIN to inform caller that we need to re-search this block group
5301  * Return >0 to inform caller that we find nothing
5302  * Return 0 means we have found a location and set ffe_ctl->found_offset.
5303  */
5304 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
5305                 struct btrfs_free_cluster *last_ptr,
5306                 struct find_free_extent_ctl *ffe_ctl,
5307                 struct btrfs_block_group_cache **cluster_bg_ret)
5308 {
5309         struct btrfs_block_group_cache *cluster_bg;
5310         u64 aligned_cluster;
5311         u64 offset;
5312         int ret;
5313
5314         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
5315         if (!cluster_bg)
5316                 goto refill_cluster;
5317         if (cluster_bg != bg && (cluster_bg->ro ||
5318             !block_group_bits(cluster_bg, ffe_ctl->flags)))
5319                 goto release_cluster;
5320
5321         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
5322                         ffe_ctl->num_bytes, cluster_bg->key.objectid,
5323                         &ffe_ctl->max_extent_size);
5324         if (offset) {
5325                 /* We have a block, we're done */
5326                 spin_unlock(&last_ptr->refill_lock);
5327                 trace_btrfs_reserve_extent_cluster(cluster_bg,
5328                                 ffe_ctl->search_start, ffe_ctl->num_bytes);
5329                 *cluster_bg_ret = cluster_bg;
5330                 ffe_ctl->found_offset = offset;
5331                 return 0;
5332         }
5333         WARN_ON(last_ptr->block_group != cluster_bg);
5334
5335 release_cluster:
5336         /*
5337          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
5338          * lets just skip it and let the allocator find whatever block it can
5339          * find. If we reach this point, we will have tried the cluster
5340          * allocator plenty of times and not have found anything, so we are
5341          * likely way too fragmented for the clustering stuff to find anything.
5342          *
5343          * However, if the cluster is taken from the current block group,
5344          * release the cluster first, so that we stand a better chance of
5345          * succeeding in the unclustered allocation.
5346          */
5347         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
5348                 spin_unlock(&last_ptr->refill_lock);
5349                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
5350                 return -ENOENT;
5351         }
5352
5353         /* This cluster didn't work out, free it and start over */
5354         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5355
5356         if (cluster_bg != bg)
5357                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
5358
5359 refill_cluster:
5360         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
5361                 spin_unlock(&last_ptr->refill_lock);
5362                 return -ENOENT;
5363         }
5364
5365         aligned_cluster = max_t(u64,
5366                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
5367                         bg->full_stripe_len);
5368         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
5369                         ffe_ctl->num_bytes, aligned_cluster);
5370         if (ret == 0) {
5371                 /* Now pull our allocation out of this cluster */
5372                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
5373                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
5374                                 &ffe_ctl->max_extent_size);
5375                 if (offset) {
5376                         /* We found one, proceed */
5377                         spin_unlock(&last_ptr->refill_lock);
5378                         trace_btrfs_reserve_extent_cluster(bg,
5379                                         ffe_ctl->search_start,
5380                                         ffe_ctl->num_bytes);
5381                         ffe_ctl->found_offset = offset;
5382                         return 0;
5383                 }
5384         } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
5385                    !ffe_ctl->retry_clustered) {
5386                 spin_unlock(&last_ptr->refill_lock);
5387
5388                 ffe_ctl->retry_clustered = true;
5389                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
5390                                 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
5391                 return -EAGAIN;
5392         }
5393         /*
5394          * At this point we either didn't find a cluster or we weren't able to
5395          * allocate a block from our cluster.  Free the cluster we've been
5396          * trying to use, and go to the next block group.
5397          */
5398         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5399         spin_unlock(&last_ptr->refill_lock);
5400         return 1;
5401 }
5402
5403 /*
5404  * Return >0 to inform caller that we find nothing
5405  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
5406  * Return -EAGAIN to inform caller that we need to re-search this block group
5407  */
5408 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
5409                 struct btrfs_free_cluster *last_ptr,
5410                 struct find_free_extent_ctl *ffe_ctl)
5411 {
5412         u64 offset;
5413
5414         /*
5415          * We are doing an unclustered allocation, set the fragmented flag so
5416          * we don't bother trying to setup a cluster again until we get more
5417          * space.
5418          */
5419         if (unlikely(last_ptr)) {
5420                 spin_lock(&last_ptr->lock);
5421                 last_ptr->fragmented = 1;
5422                 spin_unlock(&last_ptr->lock);
5423         }
5424         if (ffe_ctl->cached) {
5425                 struct btrfs_free_space_ctl *free_space_ctl;
5426
5427                 free_space_ctl = bg->free_space_ctl;
5428                 spin_lock(&free_space_ctl->tree_lock);
5429                 if (free_space_ctl->free_space <
5430                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
5431                     ffe_ctl->empty_size) {
5432                         ffe_ctl->total_free_space = max_t(u64,
5433                                         ffe_ctl->total_free_space,
5434                                         free_space_ctl->free_space);
5435                         spin_unlock(&free_space_ctl->tree_lock);
5436                         return 1;
5437                 }
5438                 spin_unlock(&free_space_ctl->tree_lock);
5439         }
5440
5441         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
5442                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
5443                         &ffe_ctl->max_extent_size);
5444
5445         /*
5446          * If we didn't find a chunk, and we haven't failed on this block group
5447          * before, and this block group is in the middle of caching and we are
5448          * ok with waiting, then go ahead and wait for progress to be made, and
5449          * set @retry_unclustered to true.
5450          *
5451          * If @retry_unclustered is true then we've already waited on this
5452          * block group once and should move on to the next block group.
5453          */
5454         if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
5455             ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
5456                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
5457                                                 ffe_ctl->empty_size);
5458                 ffe_ctl->retry_unclustered = true;
5459                 return -EAGAIN;
5460         } else if (!offset) {
5461                 return 1;
5462         }
5463         ffe_ctl->found_offset = offset;
5464         return 0;
5465 }
5466
5467 /*
5468  * Return >0 means caller needs to re-search for free extent
5469  * Return 0 means we have the needed free extent.
5470  * Return <0 means we failed to locate any free extent.
5471  */
5472 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
5473                                         struct btrfs_free_cluster *last_ptr,
5474                                         struct btrfs_key *ins,
5475                                         struct find_free_extent_ctl *ffe_ctl,
5476                                         int full_search, bool use_cluster)
5477 {
5478         struct btrfs_root *root = fs_info->extent_root;
5479         int ret;
5480
5481         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
5482             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
5483                 ffe_ctl->orig_have_caching_bg = true;
5484
5485         if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
5486             ffe_ctl->have_caching_bg)
5487                 return 1;
5488
5489         if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
5490                 return 1;
5491
5492         if (ins->objectid) {
5493                 if (!use_cluster && last_ptr) {
5494                         spin_lock(&last_ptr->lock);
5495                         last_ptr->window_start = ins->objectid;
5496                         spin_unlock(&last_ptr->lock);
5497                 }
5498                 return 0;
5499         }
5500
5501         /*
5502          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5503          *                      caching kthreads as we move along
5504          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5505          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5506          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5507          *                     again
5508          */
5509         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
5510                 ffe_ctl->index = 0;
5511                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
5512                         /*
5513                          * We want to skip the LOOP_CACHING_WAIT step if we
5514                          * don't have any uncached bgs and we've already done a
5515                          * full search through.
5516                          */
5517                         if (ffe_ctl->orig_have_caching_bg || !full_search)
5518                                 ffe_ctl->loop = LOOP_CACHING_WAIT;
5519                         else
5520                                 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
5521                 } else {
5522                         ffe_ctl->loop++;
5523                 }
5524
5525                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
5526                         struct btrfs_trans_handle *trans;
5527                         int exist = 0;
5528
5529                         trans = current->journal_info;
5530                         if (trans)
5531                                 exist = 1;
5532                         else
5533                                 trans = btrfs_join_transaction(root);
5534
5535                         if (IS_ERR(trans)) {
5536                                 ret = PTR_ERR(trans);
5537                                 return ret;
5538                         }
5539
5540                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
5541                                                 CHUNK_ALLOC_FORCE);
5542
5543                         /*
5544                          * If we can't allocate a new chunk we've already looped
5545                          * through at least once, move on to the NO_EMPTY_SIZE
5546                          * case.
5547                          */
5548                         if (ret == -ENOSPC)
5549                                 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
5550
5551                         /* Do not bail out on ENOSPC since we can do more. */
5552                         if (ret < 0 && ret != -ENOSPC)
5553                                 btrfs_abort_transaction(trans, ret);
5554                         else
5555                                 ret = 0;
5556                         if (!exist)
5557                                 btrfs_end_transaction(trans);
5558                         if (ret)
5559                                 return ret;
5560                 }
5561
5562                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
5563                         /*
5564                          * Don't loop again if we already have no empty_size and
5565                          * no empty_cluster.
5566                          */
5567                         if (ffe_ctl->empty_size == 0 &&
5568                             ffe_ctl->empty_cluster == 0)
5569                                 return -ENOSPC;
5570                         ffe_ctl->empty_size = 0;
5571                         ffe_ctl->empty_cluster = 0;
5572                 }
5573                 return 1;
5574         }
5575         return -ENOSPC;
5576 }
5577
5578 /*
5579  * walks the btree of allocated extents and find a hole of a given size.
5580  * The key ins is changed to record the hole:
5581  * ins->objectid == start position
5582  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5583  * ins->offset == the size of the hole.
5584  * Any available blocks before search_start are skipped.
5585  *
5586  * If there is no suitable free space, we will record the max size of
5587  * the free space extent currently.
5588  *
5589  * The overall logic and call chain:
5590  *
5591  * find_free_extent()
5592  * |- Iterate through all block groups
5593  * |  |- Get a valid block group
5594  * |  |- Try to do clustered allocation in that block group
5595  * |  |- Try to do unclustered allocation in that block group
5596  * |  |- Check if the result is valid
5597  * |  |  |- If valid, then exit
5598  * |  |- Jump to next block group
5599  * |
5600  * |- Push harder to find free extents
5601  *    |- If not found, re-iterate all block groups
5602  */
5603 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
5604                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
5605                                 u64 hint_byte, struct btrfs_key *ins,
5606                                 u64 flags, int delalloc)
5607 {
5608         int ret = 0;
5609         struct btrfs_free_cluster *last_ptr = NULL;
5610         struct btrfs_block_group_cache *block_group = NULL;
5611         struct find_free_extent_ctl ffe_ctl = {0};
5612         struct btrfs_space_info *space_info;
5613         bool use_cluster = true;
5614         bool full_search = false;
5615
5616         WARN_ON(num_bytes < fs_info->sectorsize);
5617
5618         ffe_ctl.ram_bytes = ram_bytes;
5619         ffe_ctl.num_bytes = num_bytes;
5620         ffe_ctl.empty_size = empty_size;
5621         ffe_ctl.flags = flags;
5622         ffe_ctl.search_start = 0;
5623         ffe_ctl.retry_clustered = false;
5624         ffe_ctl.retry_unclustered = false;
5625         ffe_ctl.delalloc = delalloc;
5626         ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
5627         ffe_ctl.have_caching_bg = false;
5628         ffe_ctl.orig_have_caching_bg = false;
5629         ffe_ctl.found_offset = 0;
5630
5631         ins->type = BTRFS_EXTENT_ITEM_KEY;
5632         ins->objectid = 0;
5633         ins->offset = 0;
5634
5635         trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
5636
5637         space_info = btrfs_find_space_info(fs_info, flags);
5638         if (!space_info) {
5639                 btrfs_err(fs_info, "No space info for %llu", flags);
5640                 return -ENOSPC;
5641         }
5642
5643         /*
5644          * If our free space is heavily fragmented we may not be able to make
5645          * big contiguous allocations, so instead of doing the expensive search
5646          * for free space, simply return ENOSPC with our max_extent_size so we
5647          * can go ahead and search for a more manageable chunk.
5648          *
5649          * If our max_extent_size is large enough for our allocation simply
5650          * disable clustering since we will likely not be able to find enough
5651          * space to create a cluster and induce latency trying.
5652          */
5653         if (unlikely(space_info->max_extent_size)) {
5654                 spin_lock(&space_info->lock);
5655                 if (space_info->max_extent_size &&
5656                     num_bytes > space_info->max_extent_size) {
5657                         ins->offset = space_info->max_extent_size;
5658                         spin_unlock(&space_info->lock);
5659                         return -ENOSPC;
5660                 } else if (space_info->max_extent_size) {
5661                         use_cluster = false;
5662                 }
5663                 spin_unlock(&space_info->lock);
5664         }
5665
5666         last_ptr = fetch_cluster_info(fs_info, space_info,
5667                                       &ffe_ctl.empty_cluster);
5668         if (last_ptr) {
5669                 spin_lock(&last_ptr->lock);
5670                 if (last_ptr->block_group)
5671                         hint_byte = last_ptr->window_start;
5672                 if (last_ptr->fragmented) {
5673                         /*
5674                          * We still set window_start so we can keep track of the
5675                          * last place we found an allocation to try and save
5676                          * some time.
5677                          */
5678                         hint_byte = last_ptr->window_start;
5679                         use_cluster = false;
5680                 }
5681                 spin_unlock(&last_ptr->lock);
5682         }
5683
5684         ffe_ctl.search_start = max(ffe_ctl.search_start,
5685                                    first_logical_byte(fs_info, 0));
5686         ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
5687         if (ffe_ctl.search_start == hint_byte) {
5688                 block_group = btrfs_lookup_block_group(fs_info,
5689                                                        ffe_ctl.search_start);
5690                 /*
5691                  * we don't want to use the block group if it doesn't match our
5692                  * allocation bits, or if its not cached.
5693                  *
5694                  * However if we are re-searching with an ideal block group
5695                  * picked out then we don't care that the block group is cached.
5696                  */
5697                 if (block_group && block_group_bits(block_group, flags) &&
5698                     block_group->cached != BTRFS_CACHE_NO) {
5699                         down_read(&space_info->groups_sem);
5700                         if (list_empty(&block_group->list) ||
5701                             block_group->ro) {
5702                                 /*
5703                                  * someone is removing this block group,
5704                                  * we can't jump into the have_block_group
5705                                  * target because our list pointers are not
5706                                  * valid
5707                                  */
5708                                 btrfs_put_block_group(block_group);
5709                                 up_read(&space_info->groups_sem);
5710                         } else {
5711                                 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
5712                                                 block_group->flags);
5713                                 btrfs_lock_block_group(block_group, delalloc);
5714                                 goto have_block_group;
5715                         }
5716                 } else if (block_group) {
5717                         btrfs_put_block_group(block_group);
5718                 }
5719         }
5720 search:
5721         ffe_ctl.have_caching_bg = false;
5722         if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
5723             ffe_ctl.index == 0)
5724                 full_search = true;
5725         down_read(&space_info->groups_sem);
5726         list_for_each_entry(block_group,
5727                             &space_info->block_groups[ffe_ctl.index], list) {
5728                 /* If the block group is read-only, we can skip it entirely. */
5729                 if (unlikely(block_group->ro))
5730                         continue;
5731
5732                 btrfs_grab_block_group(block_group, delalloc);
5733                 ffe_ctl.search_start = block_group->key.objectid;
5734
5735                 /*
5736                  * this can happen if we end up cycling through all the
5737                  * raid types, but we want to make sure we only allocate
5738                  * for the proper type.
5739                  */
5740                 if (!block_group_bits(block_group, flags)) {
5741                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
5742                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
5743                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
5744                                 BTRFS_BLOCK_GROUP_RAID10;
5745
5746                         /*
5747                          * if they asked for extra copies and this block group
5748                          * doesn't provide them, bail.  This does allow us to
5749                          * fill raid0 from raid1.
5750                          */
5751                         if ((flags & extra) && !(block_group->flags & extra))
5752                                 goto loop;
5753                 }
5754
5755 have_block_group:
5756                 ffe_ctl.cached = block_group_cache_done(block_group);
5757                 if (unlikely(!ffe_ctl.cached)) {
5758                         ffe_ctl.have_caching_bg = true;
5759                         ret = cache_block_group(block_group, 0);
5760                         BUG_ON(ret < 0);
5761                         ret = 0;
5762                 }
5763
5764                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
5765                         goto loop;
5766
5767                 /*
5768                  * Ok we want to try and use the cluster allocator, so
5769                  * lets look there
5770                  */
5771                 if (last_ptr && use_cluster) {
5772                         struct btrfs_block_group_cache *cluster_bg = NULL;
5773
5774                         ret = find_free_extent_clustered(block_group, last_ptr,
5775                                                          &ffe_ctl, &cluster_bg);
5776
5777                         if (ret == 0) {
5778                                 if (cluster_bg && cluster_bg != block_group) {
5779                                         btrfs_release_block_group(block_group,
5780                                                                   delalloc);
5781                                         block_group = cluster_bg;
5782                                 }
5783                                 goto checks;
5784                         } else if (ret == -EAGAIN) {
5785                                 goto have_block_group;
5786                         } else if (ret > 0) {
5787                                 goto loop;
5788                         }
5789                         /* ret == -ENOENT case falls through */
5790                 }
5791
5792                 ret = find_free_extent_unclustered(block_group, last_ptr,
5793                                                    &ffe_ctl);
5794                 if (ret == -EAGAIN)
5795                         goto have_block_group;
5796                 else if (ret > 0)
5797                         goto loop;
5798                 /* ret == 0 case falls through */
5799 checks:
5800                 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
5801                                              fs_info->stripesize);
5802
5803                 /* move on to the next group */
5804                 if (ffe_ctl.search_start + num_bytes >
5805                     block_group->key.objectid + block_group->key.offset) {
5806                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
5807                                              num_bytes);
5808                         goto loop;
5809                 }
5810
5811                 if (ffe_ctl.found_offset < ffe_ctl.search_start)
5812                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
5813                                 ffe_ctl.search_start - ffe_ctl.found_offset);
5814
5815                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
5816                                 num_bytes, delalloc);
5817                 if (ret == -EAGAIN) {
5818                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
5819                                              num_bytes);
5820                         goto loop;
5821                 }
5822                 btrfs_inc_block_group_reservations(block_group);
5823
5824                 /* we are all good, lets return */
5825                 ins->objectid = ffe_ctl.search_start;
5826                 ins->offset = num_bytes;
5827
5828                 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
5829                                            num_bytes);
5830                 btrfs_release_block_group(block_group, delalloc);
5831                 break;
5832 loop:
5833                 ffe_ctl.retry_clustered = false;
5834                 ffe_ctl.retry_unclustered = false;
5835                 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
5836                        ffe_ctl.index);
5837                 btrfs_release_block_group(block_group, delalloc);
5838                 cond_resched();
5839         }
5840         up_read(&space_info->groups_sem);
5841
5842         ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
5843                                            full_search, use_cluster);
5844         if (ret > 0)
5845                 goto search;
5846
5847         if (ret == -ENOSPC) {
5848                 /*
5849                  * Use ffe_ctl->total_free_space as fallback if we can't find
5850                  * any contiguous hole.
5851                  */
5852                 if (!ffe_ctl.max_extent_size)
5853                         ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
5854                 spin_lock(&space_info->lock);
5855                 space_info->max_extent_size = ffe_ctl.max_extent_size;
5856                 spin_unlock(&space_info->lock);
5857                 ins->offset = ffe_ctl.max_extent_size;
5858         }
5859         return ret;
5860 }
5861
5862 /*
5863  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
5864  *                        hole that is at least as big as @num_bytes.
5865  *
5866  * @root           -    The root that will contain this extent
5867  *
5868  * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
5869  *                      is used for accounting purposes. This value differs
5870  *                      from @num_bytes only in the case of compressed extents.
5871  *
5872  * @num_bytes      -    Number of bytes to allocate on-disk.
5873  *
5874  * @min_alloc_size -    Indicates the minimum amount of space that the
5875  *                      allocator should try to satisfy. In some cases
5876  *                      @num_bytes may be larger than what is required and if
5877  *                      the filesystem is fragmented then allocation fails.
5878  *                      However, the presence of @min_alloc_size gives a
5879  *                      chance to try and satisfy the smaller allocation.
5880  *
5881  * @empty_size     -    A hint that you plan on doing more COW. This is the
5882  *                      size in bytes the allocator should try to find free
5883  *                      next to the block it returns.  This is just a hint and
5884  *                      may be ignored by the allocator.
5885  *
5886  * @hint_byte      -    Hint to the allocator to start searching above the byte
5887  *                      address passed. It might be ignored.
5888  *
5889  * @ins            -    This key is modified to record the found hole. It will
5890  *                      have the following values:
5891  *                      ins->objectid == start position
5892  *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
5893  *                      ins->offset == the size of the hole.
5894  *
5895  * @is_data        -    Boolean flag indicating whether an extent is
5896  *                      allocated for data (true) or metadata (false)
5897  *
5898  * @delalloc       -    Boolean flag indicating whether this allocation is for
5899  *                      delalloc or not. If 'true' data_rwsem of block groups
5900  *                      is going to be acquired.
5901  *
5902  *
5903  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
5904  * case -ENOSPC is returned then @ins->offset will contain the size of the
5905  * largest available hole the allocator managed to find.
5906  */
5907 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
5908                          u64 num_bytes, u64 min_alloc_size,
5909                          u64 empty_size, u64 hint_byte,
5910                          struct btrfs_key *ins, int is_data, int delalloc)
5911 {
5912         struct btrfs_fs_info *fs_info = root->fs_info;
5913         bool final_tried = num_bytes == min_alloc_size;
5914         u64 flags;
5915         int ret;
5916
5917         flags = get_alloc_profile_by_root(root, is_data);
5918 again:
5919         WARN_ON(num_bytes < fs_info->sectorsize);
5920         ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
5921                                hint_byte, ins, flags, delalloc);
5922         if (!ret && !is_data) {
5923                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
5924         } else if (ret == -ENOSPC) {
5925                 if (!final_tried && ins->offset) {
5926                         num_bytes = min(num_bytes >> 1, ins->offset);
5927                         num_bytes = round_down(num_bytes,
5928                                                fs_info->sectorsize);
5929                         num_bytes = max(num_bytes, min_alloc_size);
5930                         ram_bytes = num_bytes;
5931                         if (num_bytes == min_alloc_size)
5932                                 final_tried = true;
5933                         goto again;
5934                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
5935                         struct btrfs_space_info *sinfo;
5936
5937                         sinfo = btrfs_find_space_info(fs_info, flags);
5938                         btrfs_err(fs_info,
5939                                   "allocation failed flags %llu, wanted %llu",
5940                                   flags, num_bytes);
5941                         if (sinfo)
5942                                 btrfs_dump_space_info(fs_info, sinfo,
5943                                                       num_bytes, 1);
5944                 }
5945         }
5946
5947         return ret;
5948 }
5949
5950 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
5951                                         u64 start, u64 len,
5952                                         int pin, int delalloc)
5953 {
5954         struct btrfs_block_group_cache *cache;
5955         int ret = 0;
5956
5957         cache = btrfs_lookup_block_group(fs_info, start);
5958         if (!cache) {
5959                 btrfs_err(fs_info, "Unable to find block group for %llu",
5960                           start);
5961                 return -ENOSPC;
5962         }
5963
5964         if (pin)
5965                 pin_down_extent(cache, start, len, 1);
5966         else {
5967                 if (btrfs_test_opt(fs_info, DISCARD))
5968                         ret = btrfs_discard_extent(fs_info, start, len, NULL);
5969                 btrfs_add_free_space(cache, start, len);
5970                 btrfs_free_reserved_bytes(cache, len, delalloc);
5971                 trace_btrfs_reserved_extent_free(fs_info, start, len);
5972         }
5973
5974         btrfs_put_block_group(cache);
5975         return ret;
5976 }
5977
5978 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
5979                                u64 start, u64 len, int delalloc)
5980 {
5981         return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
5982 }
5983
5984 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
5985                                        u64 start, u64 len)
5986 {
5987         return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
5988 }
5989
5990 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5991                                       u64 parent, u64 root_objectid,
5992                                       u64 flags, u64 owner, u64 offset,
5993                                       struct btrfs_key *ins, int ref_mod)
5994 {
5995         struct btrfs_fs_info *fs_info = trans->fs_info;
5996         int ret;
5997         struct btrfs_extent_item *extent_item;
5998         struct btrfs_extent_inline_ref *iref;
5999         struct btrfs_path *path;
6000         struct extent_buffer *leaf;
6001         int type;
6002         u32 size;
6003
6004         if (parent > 0)
6005                 type = BTRFS_SHARED_DATA_REF_KEY;
6006         else
6007                 type = BTRFS_EXTENT_DATA_REF_KEY;
6008
6009         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6010
6011         path = btrfs_alloc_path();
6012         if (!path)
6013                 return -ENOMEM;
6014
6015         path->leave_spinning = 1;
6016         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6017                                       ins, size);
6018         if (ret) {
6019                 btrfs_free_path(path);
6020                 return ret;
6021         }
6022
6023         leaf = path->nodes[0];
6024         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6025                                      struct btrfs_extent_item);
6026         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6027         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6028         btrfs_set_extent_flags(leaf, extent_item,
6029                                flags | BTRFS_EXTENT_FLAG_DATA);
6030
6031         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6032         btrfs_set_extent_inline_ref_type(leaf, iref, type);
6033         if (parent > 0) {
6034                 struct btrfs_shared_data_ref *ref;
6035                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6036                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6037                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6038         } else {
6039                 struct btrfs_extent_data_ref *ref;
6040                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6041                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6042                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6043                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6044                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6045         }
6046
6047         btrfs_mark_buffer_dirty(path->nodes[0]);
6048         btrfs_free_path(path);
6049
6050         ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
6051         if (ret)
6052                 return ret;
6053
6054         ret = update_block_group(trans, ins->objectid, ins->offset, 1);
6055         if (ret) { /* -ENOENT, logic error */
6056                 btrfs_err(fs_info, "update block group failed for %llu %llu",
6057                         ins->objectid, ins->offset);
6058                 BUG();
6059         }
6060         trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
6061         return ret;
6062 }
6063
6064 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6065                                      struct btrfs_delayed_ref_node *node,
6066                                      struct btrfs_delayed_extent_op *extent_op)
6067 {
6068         struct btrfs_fs_info *fs_info = trans->fs_info;
6069         int ret;
6070         struct btrfs_extent_item *extent_item;
6071         struct btrfs_key extent_key;
6072         struct btrfs_tree_block_info *block_info;
6073         struct btrfs_extent_inline_ref *iref;
6074         struct btrfs_path *path;
6075         struct extent_buffer *leaf;
6076         struct btrfs_delayed_tree_ref *ref;
6077         u32 size = sizeof(*extent_item) + sizeof(*iref);
6078         u64 num_bytes;
6079         u64 flags = extent_op->flags_to_set;
6080         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
6081
6082         ref = btrfs_delayed_node_to_tree_ref(node);
6083
6084         extent_key.objectid = node->bytenr;
6085         if (skinny_metadata) {
6086                 extent_key.offset = ref->level;
6087                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
6088                 num_bytes = fs_info->nodesize;
6089         } else {
6090                 extent_key.offset = node->num_bytes;
6091                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
6092                 size += sizeof(*block_info);
6093                 num_bytes = node->num_bytes;
6094         }
6095
6096         path = btrfs_alloc_path();
6097         if (!path)
6098                 return -ENOMEM;
6099
6100         path->leave_spinning = 1;
6101         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6102                                       &extent_key, size);
6103         if (ret) {
6104                 btrfs_free_path(path);
6105                 return ret;
6106         }
6107
6108         leaf = path->nodes[0];
6109         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6110                                      struct btrfs_extent_item);
6111         btrfs_set_extent_refs(leaf, extent_item, 1);
6112         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6113         btrfs_set_extent_flags(leaf, extent_item,
6114                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6115
6116         if (skinny_metadata) {
6117                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6118         } else {
6119                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6120                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
6121                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
6122                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6123         }
6124
6125         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
6126                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6127                 btrfs_set_extent_inline_ref_type(leaf, iref,
6128                                                  BTRFS_SHARED_BLOCK_REF_KEY);
6129                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
6130         } else {
6131                 btrfs_set_extent_inline_ref_type(leaf, iref,
6132                                                  BTRFS_TREE_BLOCK_REF_KEY);
6133                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
6134         }
6135
6136         btrfs_mark_buffer_dirty(leaf);
6137         btrfs_free_path(path);
6138
6139         ret = remove_from_free_space_tree(trans, extent_key.objectid,
6140                                           num_bytes);
6141         if (ret)
6142                 return ret;
6143
6144         ret = update_block_group(trans, extent_key.objectid,
6145                                  fs_info->nodesize, 1);
6146         if (ret) { /* -ENOENT, logic error */
6147                 btrfs_err(fs_info, "update block group failed for %llu %llu",
6148                         extent_key.objectid, extent_key.offset);
6149                 BUG();
6150         }
6151
6152         trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
6153                                           fs_info->nodesize);
6154         return ret;
6155 }
6156
6157 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6158                                      struct btrfs_root *root, u64 owner,
6159                                      u64 offset, u64 ram_bytes,
6160                                      struct btrfs_key *ins)
6161 {
6162         struct btrfs_ref generic_ref = { 0 };
6163         int ret;
6164
6165         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
6166
6167         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
6168                                ins->objectid, ins->offset, 0);
6169         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
6170         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
6171         ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
6172                                          ram_bytes, NULL, NULL);
6173         return ret;
6174 }
6175
6176 /*
6177  * this is used by the tree logging recovery code.  It records that
6178  * an extent has been allocated and makes sure to clear the free
6179  * space cache bits as well
6180  */
6181 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6182                                    u64 root_objectid, u64 owner, u64 offset,
6183                                    struct btrfs_key *ins)
6184 {
6185         struct btrfs_fs_info *fs_info = trans->fs_info;
6186         int ret;
6187         struct btrfs_block_group_cache *block_group;
6188         struct btrfs_space_info *space_info;
6189
6190         /*
6191          * Mixed block groups will exclude before processing the log so we only
6192          * need to do the exclude dance if this fs isn't mixed.
6193          */
6194         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
6195                 ret = __exclude_logged_extent(fs_info, ins->objectid,
6196                                               ins->offset);
6197                 if (ret)
6198                         return ret;
6199         }
6200
6201         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
6202         if (!block_group)
6203                 return -EINVAL;
6204
6205         space_info = block_group->space_info;
6206         spin_lock(&space_info->lock);
6207         spin_lock(&block_group->lock);
6208         space_info->bytes_reserved += ins->offset;
6209         block_group->reserved += ins->offset;
6210         spin_unlock(&block_group->lock);
6211         spin_unlock(&space_info->lock);
6212
6213         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
6214                                          offset, ins, 1);
6215         btrfs_put_block_group(block_group);
6216         return ret;
6217 }
6218
6219 static struct extent_buffer *
6220 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6221                       u64 bytenr, int level, u64 owner)
6222 {
6223         struct btrfs_fs_info *fs_info = root->fs_info;
6224         struct extent_buffer *buf;
6225
6226         buf = btrfs_find_create_tree_block(fs_info, bytenr);
6227         if (IS_ERR(buf))
6228                 return buf;
6229
6230         /*
6231          * Extra safety check in case the extent tree is corrupted and extent
6232          * allocator chooses to use a tree block which is already used and
6233          * locked.
6234          */
6235         if (buf->lock_owner == current->pid) {
6236                 btrfs_err_rl(fs_info,
6237 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
6238                         buf->start, btrfs_header_owner(buf), current->pid);
6239                 free_extent_buffer(buf);
6240                 return ERR_PTR(-EUCLEAN);
6241         }
6242
6243         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6244         btrfs_tree_lock(buf);
6245         btrfs_clean_tree_block(buf);
6246         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6247
6248         btrfs_set_lock_blocking_write(buf);
6249         set_extent_buffer_uptodate(buf);
6250
6251         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
6252         btrfs_set_header_level(buf, level);
6253         btrfs_set_header_bytenr(buf, buf->start);
6254         btrfs_set_header_generation(buf, trans->transid);
6255         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
6256         btrfs_set_header_owner(buf, owner);
6257         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
6258         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
6259         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6260                 buf->log_index = root->log_transid % 2;
6261                 /*
6262                  * we allow two log transactions at a time, use different
6263                  * EXTENT bit to differentiate dirty pages.
6264                  */
6265                 if (buf->log_index == 0)
6266                         set_extent_dirty(&root->dirty_log_pages, buf->start,
6267                                         buf->start + buf->len - 1, GFP_NOFS);
6268                 else
6269                         set_extent_new(&root->dirty_log_pages, buf->start,
6270                                         buf->start + buf->len - 1);
6271         } else {
6272                 buf->log_index = -1;
6273                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6274                          buf->start + buf->len - 1, GFP_NOFS);
6275         }
6276         trans->dirty = true;
6277         /* this returns a buffer locked for blocking */
6278         return buf;
6279 }
6280
6281 /*
6282  * finds a free extent and does all the dirty work required for allocation
6283  * returns the tree buffer or an ERR_PTR on error.
6284  */
6285 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
6286                                              struct btrfs_root *root,
6287                                              u64 parent, u64 root_objectid,
6288                                              const struct btrfs_disk_key *key,
6289                                              int level, u64 hint,
6290                                              u64 empty_size)
6291 {
6292         struct btrfs_fs_info *fs_info = root->fs_info;
6293         struct btrfs_key ins;
6294         struct btrfs_block_rsv *block_rsv;
6295         struct extent_buffer *buf;
6296         struct btrfs_delayed_extent_op *extent_op;
6297         struct btrfs_ref generic_ref = { 0 };
6298         u64 flags = 0;
6299         int ret;
6300         u32 blocksize = fs_info->nodesize;
6301         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
6302
6303 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6304         if (btrfs_is_testing(fs_info)) {
6305                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
6306                                             level, root_objectid);
6307                 if (!IS_ERR(buf))
6308                         root->alloc_bytenr += blocksize;
6309                 return buf;
6310         }
6311 #endif
6312
6313         block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
6314         if (IS_ERR(block_rsv))
6315                 return ERR_CAST(block_rsv);
6316
6317         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
6318                                    empty_size, hint, &ins, 0, 0);
6319         if (ret)
6320                 goto out_unuse;
6321
6322         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
6323                                     root_objectid);
6324         if (IS_ERR(buf)) {
6325                 ret = PTR_ERR(buf);
6326                 goto out_free_reserved;
6327         }
6328
6329         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6330                 if (parent == 0)
6331                         parent = ins.objectid;
6332                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6333         } else
6334                 BUG_ON(parent > 0);
6335
6336         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6337                 extent_op = btrfs_alloc_delayed_extent_op();
6338                 if (!extent_op) {
6339                         ret = -ENOMEM;
6340                         goto out_free_buf;
6341                 }
6342                 if (key)
6343                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
6344                 else
6345                         memset(&extent_op->key, 0, sizeof(extent_op->key));
6346                 extent_op->flags_to_set = flags;
6347                 extent_op->update_key = skinny_metadata ? false : true;
6348                 extent_op->update_flags = true;
6349                 extent_op->is_data = false;
6350                 extent_op->level = level;
6351
6352                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
6353                                        ins.objectid, ins.offset, parent);
6354                 generic_ref.real_root = root->root_key.objectid;
6355                 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
6356                 btrfs_ref_tree_mod(fs_info, &generic_ref);
6357                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
6358                                                  extent_op, NULL, NULL);
6359                 if (ret)
6360                         goto out_free_delayed;
6361         }
6362         return buf;
6363
6364 out_free_delayed:
6365         btrfs_free_delayed_extent_op(extent_op);
6366 out_free_buf:
6367         free_extent_buffer(buf);
6368 out_free_reserved:
6369         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
6370 out_unuse:
6371         btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
6372         return ERR_PTR(ret);
6373 }
6374
6375 struct walk_control {
6376         u64 refs[BTRFS_MAX_LEVEL];
6377         u64 flags[BTRFS_MAX_LEVEL];
6378         struct btrfs_key update_progress;
6379         struct btrfs_key drop_progress;
6380         int drop_level;
6381         int stage;
6382         int level;
6383         int shared_level;
6384         int update_ref;
6385         int keep_locks;
6386         int reada_slot;
6387         int reada_count;
6388         int restarted;
6389 };
6390
6391 #define DROP_REFERENCE  1
6392 #define UPDATE_BACKREF  2
6393
6394 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6395                                      struct btrfs_root *root,
6396                                      struct walk_control *wc,
6397                                      struct btrfs_path *path)
6398 {
6399         struct btrfs_fs_info *fs_info = root->fs_info;
6400         u64 bytenr;
6401         u64 generation;
6402         u64 refs;
6403         u64 flags;
6404         u32 nritems;
6405         struct btrfs_key key;
6406         struct extent_buffer *eb;
6407         int ret;
6408         int slot;
6409         int nread = 0;
6410
6411         if (path->slots[wc->level] < wc->reada_slot) {
6412                 wc->reada_count = wc->reada_count * 2 / 3;
6413                 wc->reada_count = max(wc->reada_count, 2);
6414         } else {
6415                 wc->reada_count = wc->reada_count * 3 / 2;
6416                 wc->reada_count = min_t(int, wc->reada_count,
6417                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
6418         }
6419
6420         eb = path->nodes[wc->level];
6421         nritems = btrfs_header_nritems(eb);
6422
6423         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6424                 if (nread >= wc->reada_count)
6425                         break;
6426
6427                 cond_resched();
6428                 bytenr = btrfs_node_blockptr(eb, slot);
6429                 generation = btrfs_node_ptr_generation(eb, slot);
6430
6431                 if (slot == path->slots[wc->level])
6432                         goto reada;
6433
6434                 if (wc->stage == UPDATE_BACKREF &&
6435                     generation <= root->root_key.offset)
6436                         continue;
6437
6438                 /* We don't lock the tree block, it's OK to be racy here */
6439                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
6440                                                wc->level - 1, 1, &refs,
6441                                                &flags);
6442                 /* We don't care about errors in readahead. */
6443                 if (ret < 0)
6444                         continue;
6445                 BUG_ON(refs == 0);
6446
6447                 if (wc->stage == DROP_REFERENCE) {
6448                         if (refs == 1)
6449                                 goto reada;
6450
6451                         if (wc->level == 1 &&
6452                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6453                                 continue;
6454                         if (!wc->update_ref ||
6455                             generation <= root->root_key.offset)
6456                                 continue;
6457                         btrfs_node_key_to_cpu(eb, &key, slot);
6458                         ret = btrfs_comp_cpu_keys(&key,
6459                                                   &wc->update_progress);
6460                         if (ret < 0)
6461                                 continue;
6462                 } else {
6463                         if (wc->level == 1 &&
6464                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6465                                 continue;
6466                 }
6467 reada:
6468                 readahead_tree_block(fs_info, bytenr);
6469                 nread++;
6470         }
6471         wc->reada_slot = slot;
6472 }
6473
6474 /*
6475  * helper to process tree block while walking down the tree.
6476  *
6477  * when wc->stage == UPDATE_BACKREF, this function updates
6478  * back refs for pointers in the block.
6479  *
6480  * NOTE: return value 1 means we should stop walking down.
6481  */
6482 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6483                                    struct btrfs_root *root,
6484                                    struct btrfs_path *path,
6485                                    struct walk_control *wc, int lookup_info)
6486 {
6487         struct btrfs_fs_info *fs_info = root->fs_info;
6488         int level = wc->level;
6489         struct extent_buffer *eb = path->nodes[level];
6490         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6491         int ret;
6492
6493         if (wc->stage == UPDATE_BACKREF &&
6494             btrfs_header_owner(eb) != root->root_key.objectid)
6495                 return 1;
6496
6497         /*
6498          * when reference count of tree block is 1, it won't increase
6499          * again. once full backref flag is set, we never clear it.
6500          */
6501         if (lookup_info &&
6502             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6503              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6504                 BUG_ON(!path->locks[level]);
6505                 ret = btrfs_lookup_extent_info(trans, fs_info,
6506                                                eb->start, level, 1,
6507                                                &wc->refs[level],
6508                                                &wc->flags[level]);
6509                 BUG_ON(ret == -ENOMEM);
6510                 if (ret)
6511                         return ret;
6512                 BUG_ON(wc->refs[level] == 0);
6513         }
6514
6515         if (wc->stage == DROP_REFERENCE) {
6516                 if (wc->refs[level] > 1)
6517                         return 1;
6518
6519                 if (path->locks[level] && !wc->keep_locks) {
6520                         btrfs_tree_unlock_rw(eb, path->locks[level]);
6521                         path->locks[level] = 0;
6522                 }
6523                 return 0;
6524         }
6525
6526         /* wc->stage == UPDATE_BACKREF */
6527         if (!(wc->flags[level] & flag)) {
6528                 BUG_ON(!path->locks[level]);
6529                 ret = btrfs_inc_ref(trans, root, eb, 1);
6530                 BUG_ON(ret); /* -ENOMEM */
6531                 ret = btrfs_dec_ref(trans, root, eb, 0);
6532                 BUG_ON(ret); /* -ENOMEM */
6533                 ret = btrfs_set_disk_extent_flags(trans, eb->start,
6534                                                   eb->len, flag,
6535                                                   btrfs_header_level(eb), 0);
6536                 BUG_ON(ret); /* -ENOMEM */
6537                 wc->flags[level] |= flag;
6538         }
6539
6540         /*
6541          * the block is shared by multiple trees, so it's not good to
6542          * keep the tree lock
6543          */
6544         if (path->locks[level] && level > 0) {
6545                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6546                 path->locks[level] = 0;
6547         }
6548         return 0;
6549 }
6550
6551 /*
6552  * This is used to verify a ref exists for this root to deal with a bug where we
6553  * would have a drop_progress key that hadn't been updated properly.
6554  */
6555 static int check_ref_exists(struct btrfs_trans_handle *trans,
6556                             struct btrfs_root *root, u64 bytenr, u64 parent,
6557                             int level)
6558 {
6559         struct btrfs_path *path;
6560         struct btrfs_extent_inline_ref *iref;
6561         int ret;
6562
6563         path = btrfs_alloc_path();
6564         if (!path)
6565                 return -ENOMEM;
6566
6567         ret = lookup_extent_backref(trans, path, &iref, bytenr,
6568                                     root->fs_info->nodesize, parent,
6569                                     root->root_key.objectid, level, 0);
6570         btrfs_free_path(path);
6571         if (ret == -ENOENT)
6572                 return 0;
6573         if (ret < 0)
6574                 return ret;
6575         return 1;
6576 }
6577
6578 /*
6579  * helper to process tree block pointer.
6580  *
6581  * when wc->stage == DROP_REFERENCE, this function checks
6582  * reference count of the block pointed to. if the block
6583  * is shared and we need update back refs for the subtree
6584  * rooted at the block, this function changes wc->stage to
6585  * UPDATE_BACKREF. if the block is shared and there is no
6586  * need to update back, this function drops the reference
6587  * to the block.
6588  *
6589  * NOTE: return value 1 means we should stop walking down.
6590  */
6591 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6592                                  struct btrfs_root *root,
6593                                  struct btrfs_path *path,
6594                                  struct walk_control *wc, int *lookup_info)
6595 {
6596         struct btrfs_fs_info *fs_info = root->fs_info;
6597         u64 bytenr;
6598         u64 generation;
6599         u64 parent;
6600         struct btrfs_key key;
6601         struct btrfs_key first_key;
6602         struct btrfs_ref ref = { 0 };
6603         struct extent_buffer *next;
6604         int level = wc->level;
6605         int reada = 0;
6606         int ret = 0;
6607         bool need_account = false;
6608
6609         generation = btrfs_node_ptr_generation(path->nodes[level],
6610                                                path->slots[level]);
6611         /*
6612          * if the lower level block was created before the snapshot
6613          * was created, we know there is no need to update back refs
6614          * for the subtree
6615          */
6616         if (wc->stage == UPDATE_BACKREF &&
6617             generation <= root->root_key.offset) {
6618                 *lookup_info = 1;
6619                 return 1;
6620         }
6621
6622         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6623         btrfs_node_key_to_cpu(path->nodes[level], &first_key,
6624                               path->slots[level]);
6625
6626         next = find_extent_buffer(fs_info, bytenr);
6627         if (!next) {
6628                 next = btrfs_find_create_tree_block(fs_info, bytenr);
6629                 if (IS_ERR(next))
6630                         return PTR_ERR(next);
6631
6632                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
6633                                                level - 1);
6634                 reada = 1;
6635         }
6636         btrfs_tree_lock(next);
6637         btrfs_set_lock_blocking_write(next);
6638
6639         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
6640                                        &wc->refs[level - 1],
6641                                        &wc->flags[level - 1]);
6642         if (ret < 0)
6643                 goto out_unlock;
6644
6645         if (unlikely(wc->refs[level - 1] == 0)) {
6646                 btrfs_err(fs_info, "Missing references.");
6647                 ret = -EIO;
6648                 goto out_unlock;
6649         }
6650         *lookup_info = 0;
6651
6652         if (wc->stage == DROP_REFERENCE) {
6653                 if (wc->refs[level - 1] > 1) {
6654                         need_account = true;
6655                         if (level == 1 &&
6656                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6657                                 goto skip;
6658
6659                         if (!wc->update_ref ||
6660                             generation <= root->root_key.offset)
6661                                 goto skip;
6662
6663                         btrfs_node_key_to_cpu(path->nodes[level], &key,
6664                                               path->slots[level]);
6665                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6666                         if (ret < 0)
6667                                 goto skip;
6668
6669                         wc->stage = UPDATE_BACKREF;
6670                         wc->shared_level = level - 1;
6671                 }
6672         } else {
6673                 if (level == 1 &&
6674                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6675                         goto skip;
6676         }
6677
6678         if (!btrfs_buffer_uptodate(next, generation, 0)) {
6679                 btrfs_tree_unlock(next);
6680                 free_extent_buffer(next);
6681                 next = NULL;
6682                 *lookup_info = 1;
6683         }
6684
6685         if (!next) {
6686                 if (reada && level == 1)
6687                         reada_walk_down(trans, root, wc, path);
6688                 next = read_tree_block(fs_info, bytenr, generation, level - 1,
6689                                        &first_key);
6690                 if (IS_ERR(next)) {
6691                         return PTR_ERR(next);
6692                 } else if (!extent_buffer_uptodate(next)) {
6693                         free_extent_buffer(next);
6694                         return -EIO;
6695                 }
6696                 btrfs_tree_lock(next);
6697                 btrfs_set_lock_blocking_write(next);
6698         }
6699
6700         level--;
6701         ASSERT(level == btrfs_header_level(next));
6702         if (level != btrfs_header_level(next)) {
6703                 btrfs_err(root->fs_info, "mismatched level");
6704                 ret = -EIO;
6705                 goto out_unlock;
6706         }
6707         path->nodes[level] = next;
6708         path->slots[level] = 0;
6709         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6710         wc->level = level;
6711         if (wc->level == 1)
6712                 wc->reada_slot = 0;
6713         return 0;
6714 skip:
6715         wc->refs[level - 1] = 0;
6716         wc->flags[level - 1] = 0;
6717         if (wc->stage == DROP_REFERENCE) {
6718                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6719                         parent = path->nodes[level]->start;
6720                 } else {
6721                         ASSERT(root->root_key.objectid ==
6722                                btrfs_header_owner(path->nodes[level]));
6723                         if (root->root_key.objectid !=
6724                             btrfs_header_owner(path->nodes[level])) {
6725                                 btrfs_err(root->fs_info,
6726                                                 "mismatched block owner");
6727                                 ret = -EIO;
6728                                 goto out_unlock;
6729                         }
6730                         parent = 0;
6731                 }
6732
6733                 /*
6734                  * If we had a drop_progress we need to verify the refs are set
6735                  * as expected.  If we find our ref then we know that from here
6736                  * on out everything should be correct, and we can clear the
6737                  * ->restarted flag.
6738                  */
6739                 if (wc->restarted) {
6740                         ret = check_ref_exists(trans, root, bytenr, parent,
6741                                                level - 1);
6742                         if (ret < 0)
6743                                 goto out_unlock;
6744                         if (ret == 0)
6745                                 goto no_delete;
6746                         ret = 0;
6747                         wc->restarted = 0;
6748                 }
6749
6750                 /*
6751                  * Reloc tree doesn't contribute to qgroup numbers, and we have
6752                  * already accounted them at merge time (replace_path),
6753                  * thus we could skip expensive subtree trace here.
6754                  */
6755                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
6756                     need_account) {
6757                         ret = btrfs_qgroup_trace_subtree(trans, next,
6758                                                          generation, level - 1);
6759                         if (ret) {
6760                                 btrfs_err_rl(fs_info,
6761                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
6762                                              ret);
6763                         }
6764                 }
6765
6766                 /*
6767                  * We need to update the next key in our walk control so we can
6768                  * update the drop_progress key accordingly.  We don't care if
6769                  * find_next_key doesn't find a key because that means we're at
6770                  * the end and are going to clean up now.
6771                  */
6772                 wc->drop_level = level;
6773                 find_next_key(path, level, &wc->drop_progress);
6774
6775                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
6776                                        fs_info->nodesize, parent);
6777                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
6778                 ret = btrfs_free_extent(trans, &ref);
6779                 if (ret)
6780                         goto out_unlock;
6781         }
6782 no_delete:
6783         *lookup_info = 1;
6784         ret = 1;
6785
6786 out_unlock:
6787         btrfs_tree_unlock(next);
6788         free_extent_buffer(next);
6789
6790         return ret;
6791 }
6792
6793 /*
6794  * helper to process tree block while walking up the tree.
6795  *
6796  * when wc->stage == DROP_REFERENCE, this function drops
6797  * reference count on the block.
6798  *
6799  * when wc->stage == UPDATE_BACKREF, this function changes
6800  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6801  * to UPDATE_BACKREF previously while processing the block.
6802  *
6803  * NOTE: return value 1 means we should stop walking up.
6804  */
6805 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6806                                  struct btrfs_root *root,
6807                                  struct btrfs_path *path,
6808                                  struct walk_control *wc)
6809 {
6810         struct btrfs_fs_info *fs_info = root->fs_info;
6811         int ret;
6812         int level = wc->level;
6813         struct extent_buffer *eb = path->nodes[level];
6814         u64 parent = 0;
6815
6816         if (wc->stage == UPDATE_BACKREF) {
6817                 BUG_ON(wc->shared_level < level);
6818                 if (level < wc->shared_level)
6819                         goto out;
6820
6821                 ret = find_next_key(path, level + 1, &wc->update_progress);
6822                 if (ret > 0)
6823                         wc->update_ref = 0;
6824
6825                 wc->stage = DROP_REFERENCE;
6826                 wc->shared_level = -1;
6827                 path->slots[level] = 0;
6828
6829                 /*
6830                  * check reference count again if the block isn't locked.
6831                  * we should start walking down the tree again if reference
6832                  * count is one.
6833                  */
6834                 if (!path->locks[level]) {
6835                         BUG_ON(level == 0);
6836                         btrfs_tree_lock(eb);
6837                         btrfs_set_lock_blocking_write(eb);
6838                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6839
6840                         ret = btrfs_lookup_extent_info(trans, fs_info,
6841                                                        eb->start, level, 1,
6842                                                        &wc->refs[level],
6843                                                        &wc->flags[level]);
6844                         if (ret < 0) {
6845                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6846                                 path->locks[level] = 0;
6847                                 return ret;
6848                         }
6849                         BUG_ON(wc->refs[level] == 0);
6850                         if (wc->refs[level] == 1) {
6851                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6852                                 path->locks[level] = 0;
6853                                 return 1;
6854                         }
6855                 }
6856         }
6857
6858         /* wc->stage == DROP_REFERENCE */
6859         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6860
6861         if (wc->refs[level] == 1) {
6862                 if (level == 0) {
6863                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6864                                 ret = btrfs_dec_ref(trans, root, eb, 1);
6865                         else
6866                                 ret = btrfs_dec_ref(trans, root, eb, 0);
6867                         BUG_ON(ret); /* -ENOMEM */
6868                         if (is_fstree(root->root_key.objectid)) {
6869                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
6870                                 if (ret) {
6871                                         btrfs_err_rl(fs_info,
6872         "error %d accounting leaf items, quota is out of sync, rescan required",
6873                                              ret);
6874                                 }
6875                         }
6876                 }
6877                 /* make block locked assertion in btrfs_clean_tree_block happy */
6878                 if (!path->locks[level] &&
6879                     btrfs_header_generation(eb) == trans->transid) {
6880                         btrfs_tree_lock(eb);
6881                         btrfs_set_lock_blocking_write(eb);
6882                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6883                 }
6884                 btrfs_clean_tree_block(eb);
6885         }
6886
6887         if (eb == root->node) {
6888                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6889                         parent = eb->start;
6890                 else if (root->root_key.objectid != btrfs_header_owner(eb))
6891                         goto owner_mismatch;
6892         } else {
6893                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6894                         parent = path->nodes[level + 1]->start;
6895                 else if (root->root_key.objectid !=
6896                          btrfs_header_owner(path->nodes[level + 1]))
6897                         goto owner_mismatch;
6898         }
6899
6900         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6901 out:
6902         wc->refs[level] = 0;
6903         wc->flags[level] = 0;
6904         return 0;
6905
6906 owner_mismatch:
6907         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
6908                      btrfs_header_owner(eb), root->root_key.objectid);
6909         return -EUCLEAN;
6910 }
6911
6912 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6913                                    struct btrfs_root *root,
6914                                    struct btrfs_path *path,
6915                                    struct walk_control *wc)
6916 {
6917         int level = wc->level;
6918         int lookup_info = 1;
6919         int ret;
6920
6921         while (level >= 0) {
6922                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
6923                 if (ret > 0)
6924                         break;
6925
6926                 if (level == 0)
6927                         break;
6928
6929                 if (path->slots[level] >=
6930                     btrfs_header_nritems(path->nodes[level]))
6931                         break;
6932
6933                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
6934                 if (ret > 0) {
6935                         path->slots[level]++;
6936                         continue;
6937                 } else if (ret < 0)
6938                         return ret;
6939                 level = wc->level;
6940         }
6941         return 0;
6942 }
6943
6944 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6945                                  struct btrfs_root *root,
6946                                  struct btrfs_path *path,
6947                                  struct walk_control *wc, int max_level)
6948 {
6949         int level = wc->level;
6950         int ret;
6951
6952         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6953         while (level < max_level && path->nodes[level]) {
6954                 wc->level = level;
6955                 if (path->slots[level] + 1 <
6956                     btrfs_header_nritems(path->nodes[level])) {
6957                         path->slots[level]++;
6958                         return 0;
6959                 } else {
6960                         ret = walk_up_proc(trans, root, path, wc);
6961                         if (ret > 0)
6962                                 return 0;
6963                         if (ret < 0)
6964                                 return ret;
6965
6966                         if (path->locks[level]) {
6967                                 btrfs_tree_unlock_rw(path->nodes[level],
6968                                                      path->locks[level]);
6969                                 path->locks[level] = 0;
6970                         }
6971                         free_extent_buffer(path->nodes[level]);
6972                         path->nodes[level] = NULL;
6973                         level++;
6974                 }
6975         }
6976         return 1;
6977 }
6978
6979 /*
6980  * drop a subvolume tree.
6981  *
6982  * this function traverses the tree freeing any blocks that only
6983  * referenced by the tree.
6984  *
6985  * when a shared tree block is found. this function decreases its
6986  * reference count by one. if update_ref is true, this function
6987  * also make sure backrefs for the shared block and all lower level
6988  * blocks are properly updated.
6989  *
6990  * If called with for_reloc == 0, may exit early with -EAGAIN
6991  */
6992 int btrfs_drop_snapshot(struct btrfs_root *root,
6993                          struct btrfs_block_rsv *block_rsv, int update_ref,
6994                          int for_reloc)
6995 {
6996         struct btrfs_fs_info *fs_info = root->fs_info;
6997         struct btrfs_path *path;
6998         struct btrfs_trans_handle *trans;
6999         struct btrfs_root *tree_root = fs_info->tree_root;
7000         struct btrfs_root_item *root_item = &root->root_item;
7001         struct walk_control *wc;
7002         struct btrfs_key key;
7003         int err = 0;
7004         int ret;
7005         int level;
7006         bool root_dropped = false;
7007
7008         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
7009
7010         path = btrfs_alloc_path();
7011         if (!path) {
7012                 err = -ENOMEM;
7013                 goto out;
7014         }
7015
7016         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7017         if (!wc) {
7018                 btrfs_free_path(path);
7019                 err = -ENOMEM;
7020                 goto out;
7021         }
7022
7023         trans = btrfs_start_transaction(tree_root, 0);
7024         if (IS_ERR(trans)) {
7025                 err = PTR_ERR(trans);
7026                 goto out_free;
7027         }
7028
7029         err = btrfs_run_delayed_items(trans);
7030         if (err)
7031                 goto out_end_trans;
7032
7033         if (block_rsv)
7034                 trans->block_rsv = block_rsv;
7035
7036         /*
7037          * This will help us catch people modifying the fs tree while we're
7038          * dropping it.  It is unsafe to mess with the fs tree while it's being
7039          * dropped as we unlock the root node and parent nodes as we walk down
7040          * the tree, assuming nothing will change.  If something does change
7041          * then we'll have stale information and drop references to blocks we've
7042          * already dropped.
7043          */
7044         set_bit(BTRFS_ROOT_DELETING, &root->state);
7045         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7046                 level = btrfs_header_level(root->node);
7047                 path->nodes[level] = btrfs_lock_root_node(root);
7048                 btrfs_set_lock_blocking_write(path->nodes[level]);
7049                 path->slots[level] = 0;
7050                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7051                 memset(&wc->update_progress, 0,
7052                        sizeof(wc->update_progress));
7053         } else {
7054                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7055                 memcpy(&wc->update_progress, &key,
7056                        sizeof(wc->update_progress));
7057
7058                 level = root_item->drop_level;
7059                 BUG_ON(level == 0);
7060                 path->lowest_level = level;
7061                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7062                 path->lowest_level = 0;
7063                 if (ret < 0) {
7064                         err = ret;
7065                         goto out_end_trans;
7066                 }
7067                 WARN_ON(ret > 0);
7068
7069                 /*
7070                  * unlock our path, this is safe because only this
7071                  * function is allowed to delete this snapshot
7072                  */
7073                 btrfs_unlock_up_safe(path, 0);
7074
7075                 level = btrfs_header_level(root->node);
7076                 while (1) {
7077                         btrfs_tree_lock(path->nodes[level]);
7078                         btrfs_set_lock_blocking_write(path->nodes[level]);
7079                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7080
7081                         ret = btrfs_lookup_extent_info(trans, fs_info,
7082                                                 path->nodes[level]->start,
7083                                                 level, 1, &wc->refs[level],
7084                                                 &wc->flags[level]);
7085                         if (ret < 0) {
7086                                 err = ret;
7087                                 goto out_end_trans;
7088                         }
7089                         BUG_ON(wc->refs[level] == 0);
7090
7091                         if (level == root_item->drop_level)
7092                                 break;
7093
7094                         btrfs_tree_unlock(path->nodes[level]);
7095                         path->locks[level] = 0;
7096                         WARN_ON(wc->refs[level] != 1);
7097                         level--;
7098                 }
7099         }
7100
7101         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
7102         wc->level = level;
7103         wc->shared_level = -1;
7104         wc->stage = DROP_REFERENCE;
7105         wc->update_ref = update_ref;
7106         wc->keep_locks = 0;
7107         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
7108
7109         while (1) {
7110
7111                 ret = walk_down_tree(trans, root, path, wc);
7112                 if (ret < 0) {
7113                         err = ret;
7114                         break;
7115                 }
7116
7117                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7118                 if (ret < 0) {
7119                         err = ret;
7120                         break;
7121                 }
7122
7123                 if (ret > 0) {
7124                         BUG_ON(wc->stage != DROP_REFERENCE);
7125                         break;
7126                 }
7127
7128                 if (wc->stage == DROP_REFERENCE) {
7129                         wc->drop_level = wc->level;
7130                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
7131                                               &wc->drop_progress,
7132                                               path->slots[wc->drop_level]);
7133                 }
7134                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
7135                                       &wc->drop_progress);
7136                 root_item->drop_level = wc->drop_level;
7137
7138                 BUG_ON(wc->level == 0);
7139                 if (btrfs_should_end_transaction(trans) ||
7140                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
7141                         ret = btrfs_update_root(trans, tree_root,
7142                                                 &root->root_key,
7143                                                 root_item);
7144                         if (ret) {
7145                                 btrfs_abort_transaction(trans, ret);
7146                                 err = ret;
7147                                 goto out_end_trans;
7148                         }
7149
7150                         btrfs_end_transaction_throttle(trans);
7151                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
7152                                 btrfs_debug(fs_info,
7153                                             "drop snapshot early exit");
7154                                 err = -EAGAIN;
7155                                 goto out_free;
7156                         }
7157
7158                         trans = btrfs_start_transaction(tree_root, 0);
7159                         if (IS_ERR(trans)) {
7160                                 err = PTR_ERR(trans);
7161                                 goto out_free;
7162                         }
7163                         if (block_rsv)
7164                                 trans->block_rsv = block_rsv;
7165                 }
7166         }
7167         btrfs_release_path(path);
7168         if (err)
7169                 goto out_end_trans;
7170
7171         ret = btrfs_del_root(trans, &root->root_key);
7172         if (ret) {
7173                 btrfs_abort_transaction(trans, ret);
7174                 err = ret;
7175                 goto out_end_trans;
7176         }
7177
7178         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7179                 ret = btrfs_find_root(tree_root, &root->root_key, path,
7180                                       NULL, NULL);
7181                 if (ret < 0) {
7182                         btrfs_abort_transaction(trans, ret);
7183                         err = ret;
7184                         goto out_end_trans;
7185                 } else if (ret > 0) {
7186                         /* if we fail to delete the orphan item this time
7187                          * around, it'll get picked up the next time.
7188                          *
7189                          * The most common failure here is just -ENOENT.
7190                          */
7191                         btrfs_del_orphan_item(trans, tree_root,
7192                                               root->root_key.objectid);
7193                 }
7194         }
7195
7196         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
7197                 btrfs_add_dropped_root(trans, root);
7198         } else {
7199                 free_extent_buffer(root->node);
7200                 free_extent_buffer(root->commit_root);
7201                 btrfs_put_fs_root(root);
7202         }
7203         root_dropped = true;
7204 out_end_trans:
7205         btrfs_end_transaction_throttle(trans);
7206 out_free:
7207         kfree(wc);
7208         btrfs_free_path(path);
7209 out:
7210         /*
7211          * So if we need to stop dropping the snapshot for whatever reason we
7212          * need to make sure to add it back to the dead root list so that we
7213          * keep trying to do the work later.  This also cleans up roots if we
7214          * don't have it in the radix (like when we recover after a power fail
7215          * or unmount) so we don't leak memory.
7216          */
7217         if (!for_reloc && !root_dropped)
7218                 btrfs_add_dead_root(root);
7219         if (err && err != -EAGAIN)
7220                 btrfs_handle_fs_error(fs_info, err, NULL);
7221         return err;
7222 }
7223
7224 /*
7225  * drop subtree rooted at tree block 'node'.
7226  *
7227  * NOTE: this function will unlock and release tree block 'node'
7228  * only used by relocation code
7229  */
7230 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7231                         struct btrfs_root *root,
7232                         struct extent_buffer *node,
7233                         struct extent_buffer *parent)
7234 {
7235         struct btrfs_fs_info *fs_info = root->fs_info;
7236         struct btrfs_path *path;
7237         struct walk_control *wc;
7238         int level;
7239         int parent_level;
7240         int ret = 0;
7241         int wret;
7242
7243         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7244
7245         path = btrfs_alloc_path();
7246         if (!path)
7247                 return -ENOMEM;
7248
7249         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7250         if (!wc) {
7251                 btrfs_free_path(path);
7252                 return -ENOMEM;
7253         }
7254
7255         btrfs_assert_tree_locked(parent);
7256         parent_level = btrfs_header_level(parent);
7257         extent_buffer_get(parent);
7258         path->nodes[parent_level] = parent;
7259         path->slots[parent_level] = btrfs_header_nritems(parent);
7260
7261         btrfs_assert_tree_locked(node);
7262         level = btrfs_header_level(node);
7263         path->nodes[level] = node;
7264         path->slots[level] = 0;
7265         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7266
7267         wc->refs[parent_level] = 1;
7268         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7269         wc->level = level;
7270         wc->shared_level = -1;
7271         wc->stage = DROP_REFERENCE;
7272         wc->update_ref = 0;
7273         wc->keep_locks = 1;
7274         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
7275
7276         while (1) {
7277                 wret = walk_down_tree(trans, root, path, wc);
7278                 if (wret < 0) {
7279                         ret = wret;
7280                         break;
7281                 }
7282
7283                 wret = walk_up_tree(trans, root, path, wc, parent_level);
7284                 if (wret < 0)
7285                         ret = wret;
7286                 if (wret != 0)
7287                         break;
7288         }
7289
7290         kfree(wc);
7291         btrfs_free_path(path);
7292         return ret;
7293 }
7294
7295 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
7296 {
7297         u64 num_devices;
7298         u64 stripped;
7299
7300         /*
7301          * if restripe for this chunk_type is on pick target profile and
7302          * return, otherwise do the usual balance
7303          */
7304         stripped = get_restripe_target(fs_info, flags);
7305         if (stripped)
7306                 return extended_to_chunk(stripped);
7307
7308         num_devices = fs_info->fs_devices->rw_devices;
7309
7310         stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
7311                 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
7312
7313         if (num_devices == 1) {
7314                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7315                 stripped = flags & ~stripped;
7316
7317                 /* turn raid0 into single device chunks */
7318                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7319                         return stripped;
7320
7321                 /* turn mirroring into duplication */
7322                 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
7323                              BTRFS_BLOCK_GROUP_RAID10))
7324                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7325         } else {
7326                 /* they already had raid on here, just return */
7327                 if (flags & stripped)
7328                         return flags;
7329
7330                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7331                 stripped = flags & ~stripped;
7332
7333                 /* switch duplicated blocks with raid1 */
7334                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7335                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7336
7337                 /* this is drive concat, leave it alone */
7338         }
7339
7340         return flags;
7341 }
7342
7343 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7344 {
7345         struct btrfs_space_info *sinfo = cache->space_info;
7346         u64 num_bytes;
7347         u64 sinfo_used;
7348         u64 min_allocable_bytes;
7349         int ret = -ENOSPC;
7350
7351         /*
7352          * We need some metadata space and system metadata space for
7353          * allocating chunks in some corner cases until we force to set
7354          * it to be readonly.
7355          */
7356         if ((sinfo->flags &
7357              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7358             !force)
7359                 min_allocable_bytes = SZ_1M;
7360         else
7361                 min_allocable_bytes = 0;
7362
7363         spin_lock(&sinfo->lock);
7364         spin_lock(&cache->lock);
7365
7366         if (cache->ro) {
7367                 cache->ro++;
7368                 ret = 0;
7369                 goto out;
7370         }
7371
7372         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7373                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7374         sinfo_used = btrfs_space_info_used(sinfo, true);
7375
7376         if (sinfo_used + num_bytes + min_allocable_bytes <=
7377             sinfo->total_bytes) {
7378                 sinfo->bytes_readonly += num_bytes;
7379                 cache->ro++;
7380                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
7381                 ret = 0;
7382         }
7383 out:
7384         spin_unlock(&cache->lock);
7385         spin_unlock(&sinfo->lock);
7386         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
7387                 btrfs_info(cache->fs_info,
7388                         "unable to make block group %llu ro",
7389                         cache->key.objectid);
7390                 btrfs_info(cache->fs_info,
7391                         "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
7392                         sinfo_used, num_bytes, min_allocable_bytes);
7393                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
7394         }
7395         return ret;
7396 }
7397
7398 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
7399
7400 {
7401         struct btrfs_fs_info *fs_info = cache->fs_info;
7402         struct btrfs_trans_handle *trans;
7403         u64 alloc_flags;
7404         int ret;
7405
7406 again:
7407         trans = btrfs_join_transaction(fs_info->extent_root);
7408         if (IS_ERR(trans))
7409                 return PTR_ERR(trans);
7410
7411         /*
7412          * we're not allowed to set block groups readonly after the dirty
7413          * block groups cache has started writing.  If it already started,
7414          * back off and let this transaction commit
7415          */
7416         mutex_lock(&fs_info->ro_block_group_mutex);
7417         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
7418                 u64 transid = trans->transid;
7419
7420                 mutex_unlock(&fs_info->ro_block_group_mutex);
7421                 btrfs_end_transaction(trans);
7422
7423                 ret = btrfs_wait_for_commit(fs_info, transid);
7424                 if (ret)
7425                         return ret;
7426                 goto again;
7427         }
7428
7429         /*
7430          * if we are changing raid levels, try to allocate a corresponding
7431          * block group with the new raid level.
7432          */
7433         alloc_flags = update_block_group_flags(fs_info, cache->flags);
7434         if (alloc_flags != cache->flags) {
7435                 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
7436                 /*
7437                  * ENOSPC is allowed here, we may have enough space
7438                  * already allocated at the new raid level to
7439                  * carry on
7440                  */
7441                 if (ret == -ENOSPC)
7442                         ret = 0;
7443                 if (ret < 0)
7444                         goto out;
7445         }
7446
7447         ret = inc_block_group_ro(cache, 0);
7448         if (!ret)
7449                 goto out;
7450         alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
7451         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
7452         if (ret < 0)
7453                 goto out;
7454         ret = inc_block_group_ro(cache, 0);
7455 out:
7456         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
7457                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
7458                 mutex_lock(&fs_info->chunk_mutex);
7459                 check_system_chunk(trans, alloc_flags);
7460                 mutex_unlock(&fs_info->chunk_mutex);
7461         }
7462         mutex_unlock(&fs_info->ro_block_group_mutex);
7463
7464         btrfs_end_transaction(trans);
7465         return ret;
7466 }
7467
7468 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
7469 {
7470         u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
7471
7472         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
7473 }
7474
7475 /*
7476  * helper to account the unused space of all the readonly block group in the
7477  * space_info. takes mirrors into account.
7478  */
7479 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7480 {
7481         struct btrfs_block_group_cache *block_group;
7482         u64 free_bytes = 0;
7483         int factor;
7484
7485         /* It's df, we don't care if it's racy */
7486         if (list_empty(&sinfo->ro_bgs))
7487                 return 0;
7488
7489         spin_lock(&sinfo->lock);
7490         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
7491                 spin_lock(&block_group->lock);
7492
7493                 if (!block_group->ro) {
7494                         spin_unlock(&block_group->lock);
7495                         continue;
7496                 }
7497
7498                 factor = btrfs_bg_type_to_factor(block_group->flags);
7499                 free_bytes += (block_group->key.offset -
7500                                btrfs_block_group_used(&block_group->item)) *
7501                                factor;
7502
7503                 spin_unlock(&block_group->lock);
7504         }
7505         spin_unlock(&sinfo->lock);
7506
7507         return free_bytes;
7508 }
7509
7510 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
7511 {
7512         struct btrfs_space_info *sinfo = cache->space_info;
7513         u64 num_bytes;
7514
7515         BUG_ON(!cache->ro);
7516
7517         spin_lock(&sinfo->lock);
7518         spin_lock(&cache->lock);
7519         if (!--cache->ro) {
7520                 num_bytes = cache->key.offset - cache->reserved -
7521                             cache->pinned - cache->bytes_super -
7522                             btrfs_block_group_used(&cache->item);
7523                 sinfo->bytes_readonly -= num_bytes;
7524                 list_del_init(&cache->ro_list);
7525         }
7526         spin_unlock(&cache->lock);
7527         spin_unlock(&sinfo->lock);
7528 }
7529
7530 /*
7531  * Checks to see if it's even possible to relocate this block group.
7532  *
7533  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
7534  * ok to go ahead and try.
7535  */
7536 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
7537 {
7538         struct btrfs_block_group_cache *block_group;
7539         struct btrfs_space_info *space_info;
7540         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7541         struct btrfs_device *device;
7542         u64 min_free;
7543         u64 dev_min = 1;
7544         u64 dev_nr = 0;
7545         u64 target;
7546         int debug;
7547         int index;
7548         int full = 0;
7549         int ret = 0;
7550
7551         debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
7552
7553         block_group = btrfs_lookup_block_group(fs_info, bytenr);
7554
7555         /* odd, couldn't find the block group, leave it alone */
7556         if (!block_group) {
7557                 if (debug)
7558                         btrfs_warn(fs_info,
7559                                    "can't find block group for bytenr %llu",
7560                                    bytenr);
7561                 return -1;
7562         }
7563
7564         min_free = btrfs_block_group_used(&block_group->item);
7565
7566         /* no bytes used, we're good */
7567         if (!min_free)
7568                 goto out;
7569
7570         space_info = block_group->space_info;
7571         spin_lock(&space_info->lock);
7572
7573         full = space_info->full;
7574
7575         /*
7576          * if this is the last block group we have in this space, we can't
7577          * relocate it unless we're able to allocate a new chunk below.
7578          *
7579          * Otherwise, we need to make sure we have room in the space to handle
7580          * all of the extents from this block group.  If we can, we're good
7581          */
7582         if ((space_info->total_bytes != block_group->key.offset) &&
7583             (btrfs_space_info_used(space_info, false) + min_free <
7584              space_info->total_bytes)) {
7585                 spin_unlock(&space_info->lock);
7586                 goto out;
7587         }
7588         spin_unlock(&space_info->lock);
7589
7590         /*
7591          * ok we don't have enough space, but maybe we have free space on our
7592          * devices to allocate new chunks for relocation, so loop through our
7593          * alloc devices and guess if we have enough space.  if this block
7594          * group is going to be restriped, run checks against the target
7595          * profile instead of the current one.
7596          */
7597         ret = -1;
7598
7599         /*
7600          * index:
7601          *      0: raid10
7602          *      1: raid1
7603          *      2: dup
7604          *      3: raid0
7605          *      4: single
7606          */
7607         target = get_restripe_target(fs_info, block_group->flags);
7608         if (target) {
7609                 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
7610         } else {
7611                 /*
7612                  * this is just a balance, so if we were marked as full
7613                  * we know there is no space for a new chunk
7614                  */
7615                 if (full) {
7616                         if (debug)
7617                                 btrfs_warn(fs_info,
7618                                            "no space to alloc new chunk for block group %llu",
7619                                            block_group->key.objectid);
7620                         goto out;
7621                 }
7622
7623                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
7624         }
7625
7626         if (index == BTRFS_RAID_RAID10) {
7627                 dev_min = 4;
7628                 /* Divide by 2 */
7629                 min_free >>= 1;
7630         } else if (index == BTRFS_RAID_RAID1) {
7631                 dev_min = 2;
7632         } else if (index == BTRFS_RAID_DUP) {
7633                 /* Multiply by 2 */
7634                 min_free <<= 1;
7635         } else if (index == BTRFS_RAID_RAID0) {
7636                 dev_min = fs_devices->rw_devices;
7637                 min_free = div64_u64(min_free, dev_min);
7638         }
7639
7640         mutex_lock(&fs_info->chunk_mutex);
7641         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7642                 u64 dev_offset;
7643
7644                 /*
7645                  * check to make sure we can actually find a chunk with enough
7646                  * space to fit our block group in.
7647                  */
7648                 if (device->total_bytes > device->bytes_used + min_free &&
7649                     !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7650                         ret = find_free_dev_extent(device, min_free,
7651                                                    &dev_offset, NULL);
7652                         if (!ret)
7653                                 dev_nr++;
7654
7655                         if (dev_nr >= dev_min)
7656                                 break;
7657
7658                         ret = -1;
7659                 }
7660         }
7661         if (debug && ret == -1)
7662                 btrfs_warn(fs_info,
7663                            "no space to allocate a new chunk for block group %llu",
7664                            block_group->key.objectid);
7665         mutex_unlock(&fs_info->chunk_mutex);
7666 out:
7667         btrfs_put_block_group(block_group);
7668         return ret;
7669 }
7670
7671 static int find_first_block_group(struct btrfs_fs_info *fs_info,
7672                                   struct btrfs_path *path,
7673                                   struct btrfs_key *key)
7674 {
7675         struct btrfs_root *root = fs_info->extent_root;
7676         int ret = 0;
7677         struct btrfs_key found_key;
7678         struct extent_buffer *leaf;
7679         struct btrfs_block_group_item bg;
7680         u64 flags;
7681         int slot;
7682
7683         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7684         if (ret < 0)
7685                 goto out;
7686
7687         while (1) {
7688                 slot = path->slots[0];
7689                 leaf = path->nodes[0];
7690                 if (slot >= btrfs_header_nritems(leaf)) {
7691                         ret = btrfs_next_leaf(root, path);
7692                         if (ret == 0)
7693                                 continue;
7694                         if (ret < 0)
7695                                 goto out;
7696                         break;
7697                 }
7698                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7699
7700                 if (found_key.objectid >= key->objectid &&
7701                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7702                         struct extent_map_tree *em_tree;
7703                         struct extent_map *em;
7704
7705                         em_tree = &root->fs_info->mapping_tree;
7706                         read_lock(&em_tree->lock);
7707                         em = lookup_extent_mapping(em_tree, found_key.objectid,
7708                                                    found_key.offset);
7709                         read_unlock(&em_tree->lock);
7710                         if (!em) {
7711                                 btrfs_err(fs_info,
7712                         "logical %llu len %llu found bg but no related chunk",
7713                                           found_key.objectid, found_key.offset);
7714                                 ret = -ENOENT;
7715                         } else if (em->start != found_key.objectid ||
7716                                    em->len != found_key.offset) {
7717                                 btrfs_err(fs_info,
7718                 "block group %llu len %llu mismatch with chunk %llu len %llu",
7719                                           found_key.objectid, found_key.offset,
7720                                           em->start, em->len);
7721                                 ret = -EUCLEAN;
7722                         } else {
7723                                 read_extent_buffer(leaf, &bg,
7724                                         btrfs_item_ptr_offset(leaf, slot),
7725                                         sizeof(bg));
7726                                 flags = btrfs_block_group_flags(&bg) &
7727                                         BTRFS_BLOCK_GROUP_TYPE_MASK;
7728
7729                                 if (flags != (em->map_lookup->type &
7730                                               BTRFS_BLOCK_GROUP_TYPE_MASK)) {
7731                                         btrfs_err(fs_info,
7732 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
7733                                                 found_key.objectid,
7734                                                 found_key.offset, flags,
7735                                                 (BTRFS_BLOCK_GROUP_TYPE_MASK &
7736                                                  em->map_lookup->type));
7737                                         ret = -EUCLEAN;
7738                                 } else {
7739                                         ret = 0;
7740                                 }
7741                         }
7742                         free_extent_map(em);
7743                         goto out;
7744                 }
7745                 path->slots[0]++;
7746         }
7747 out:
7748         return ret;
7749 }
7750
7751 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7752 {
7753         struct btrfs_block_group_cache *block_group;
7754         u64 last = 0;
7755
7756         while (1) {
7757                 struct inode *inode;
7758
7759                 block_group = btrfs_lookup_first_block_group(info, last);
7760                 while (block_group) {
7761                         wait_block_group_cache_done(block_group);
7762                         spin_lock(&block_group->lock);
7763                         if (block_group->iref)
7764                                 break;
7765                         spin_unlock(&block_group->lock);
7766                         block_group = next_block_group(block_group);
7767                 }
7768                 if (!block_group) {
7769                         if (last == 0)
7770                                 break;
7771                         last = 0;
7772                         continue;
7773                 }
7774
7775                 inode = block_group->inode;
7776                 block_group->iref = 0;
7777                 block_group->inode = NULL;
7778                 spin_unlock(&block_group->lock);
7779                 ASSERT(block_group->io_ctl.inode == NULL);
7780                 iput(inode);
7781                 last = block_group->key.objectid + block_group->key.offset;
7782                 btrfs_put_block_group(block_group);
7783         }
7784 }
7785
7786 /*
7787  * Must be called only after stopping all workers, since we could have block
7788  * group caching kthreads running, and therefore they could race with us if we
7789  * freed the block groups before stopping them.
7790  */
7791 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7792 {
7793         struct btrfs_block_group_cache *block_group;
7794         struct btrfs_space_info *space_info;
7795         struct btrfs_caching_control *caching_ctl;
7796         struct rb_node *n;
7797
7798         down_write(&info->commit_root_sem);
7799         while (!list_empty(&info->caching_block_groups)) {
7800                 caching_ctl = list_entry(info->caching_block_groups.next,
7801                                          struct btrfs_caching_control, list);
7802                 list_del(&caching_ctl->list);
7803                 put_caching_control(caching_ctl);
7804         }
7805         up_write(&info->commit_root_sem);
7806
7807         spin_lock(&info->unused_bgs_lock);
7808         while (!list_empty(&info->unused_bgs)) {
7809                 block_group = list_first_entry(&info->unused_bgs,
7810                                                struct btrfs_block_group_cache,
7811                                                bg_list);
7812                 list_del_init(&block_group->bg_list);
7813                 btrfs_put_block_group(block_group);
7814         }
7815         spin_unlock(&info->unused_bgs_lock);
7816
7817         spin_lock(&info->block_group_cache_lock);
7818         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7819                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7820                                        cache_node);
7821                 rb_erase(&block_group->cache_node,
7822                          &info->block_group_cache_tree);
7823                 RB_CLEAR_NODE(&block_group->cache_node);
7824                 spin_unlock(&info->block_group_cache_lock);
7825
7826                 down_write(&block_group->space_info->groups_sem);
7827                 list_del(&block_group->list);
7828                 up_write(&block_group->space_info->groups_sem);
7829
7830                 /*
7831                  * We haven't cached this block group, which means we could
7832                  * possibly have excluded extents on this block group.
7833                  */
7834                 if (block_group->cached == BTRFS_CACHE_NO ||
7835                     block_group->cached == BTRFS_CACHE_ERROR)
7836                         free_excluded_extents(block_group);
7837
7838                 btrfs_remove_free_space_cache(block_group);
7839                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
7840                 ASSERT(list_empty(&block_group->dirty_list));
7841                 ASSERT(list_empty(&block_group->io_list));
7842                 ASSERT(list_empty(&block_group->bg_list));
7843                 ASSERT(atomic_read(&block_group->count) == 1);
7844                 btrfs_put_block_group(block_group);
7845
7846                 spin_lock(&info->block_group_cache_lock);
7847         }
7848         spin_unlock(&info->block_group_cache_lock);
7849
7850         /* now that all the block groups are freed, go through and
7851          * free all the space_info structs.  This is only called during
7852          * the final stages of unmount, and so we know nobody is
7853          * using them.  We call synchronize_rcu() once before we start,
7854          * just to be on the safe side.
7855          */
7856         synchronize_rcu();
7857
7858         btrfs_release_global_block_rsv(info);
7859
7860         while (!list_empty(&info->space_info)) {
7861                 int i;
7862
7863                 space_info = list_entry(info->space_info.next,
7864                                         struct btrfs_space_info,
7865                                         list);
7866
7867                 /*
7868                  * Do not hide this behind enospc_debug, this is actually
7869                  * important and indicates a real bug if this happens.
7870                  */
7871                 if (WARN_ON(space_info->bytes_pinned > 0 ||
7872                             space_info->bytes_reserved > 0 ||
7873                             space_info->bytes_may_use > 0))
7874                         btrfs_dump_space_info(info, space_info, 0, 0);
7875                 list_del(&space_info->list);
7876                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
7877                         struct kobject *kobj;
7878                         kobj = space_info->block_group_kobjs[i];
7879                         space_info->block_group_kobjs[i] = NULL;
7880                         if (kobj) {
7881                                 kobject_del(kobj);
7882                                 kobject_put(kobj);
7883                         }
7884                 }
7885                 kobject_del(&space_info->kobj);
7886                 kobject_put(&space_info->kobj);
7887         }
7888         return 0;
7889 }
7890
7891 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
7892 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
7893 {
7894         struct btrfs_space_info *space_info;
7895         struct raid_kobject *rkobj;
7896         LIST_HEAD(list);
7897         int ret = 0;
7898
7899         spin_lock(&fs_info->pending_raid_kobjs_lock);
7900         list_splice_init(&fs_info->pending_raid_kobjs, &list);
7901         spin_unlock(&fs_info->pending_raid_kobjs_lock);
7902
7903         list_for_each_entry(rkobj, &list, list) {
7904                 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
7905
7906                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
7907                                 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
7908                 if (ret) {
7909                         kobject_put(&rkobj->kobj);
7910                         break;
7911                 }
7912         }
7913         if (ret)
7914                 btrfs_warn(fs_info,
7915                            "failed to add kobject for block cache, ignoring");
7916 }
7917
7918 static void link_block_group(struct btrfs_block_group_cache *cache)
7919 {
7920         struct btrfs_space_info *space_info = cache->space_info;
7921         struct btrfs_fs_info *fs_info = cache->fs_info;
7922         int index = btrfs_bg_flags_to_raid_index(cache->flags);
7923         bool first = false;
7924
7925         down_write(&space_info->groups_sem);
7926         if (list_empty(&space_info->block_groups[index]))
7927                 first = true;
7928         list_add_tail(&cache->list, &space_info->block_groups[index]);
7929         up_write(&space_info->groups_sem);
7930
7931         if (first) {
7932                 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
7933                 if (!rkobj) {
7934                         btrfs_warn(cache->fs_info,
7935                                 "couldn't alloc memory for raid level kobject");
7936                         return;
7937                 }
7938                 rkobj->flags = cache->flags;
7939                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
7940
7941                 spin_lock(&fs_info->pending_raid_kobjs_lock);
7942                 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
7943                 spin_unlock(&fs_info->pending_raid_kobjs_lock);
7944                 space_info->block_group_kobjs[index] = &rkobj->kobj;
7945         }
7946 }
7947
7948 static struct btrfs_block_group_cache *
7949 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
7950                                u64 start, u64 size)
7951 {
7952         struct btrfs_block_group_cache *cache;
7953
7954         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7955         if (!cache)
7956                 return NULL;
7957
7958         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7959                                         GFP_NOFS);
7960         if (!cache->free_space_ctl) {
7961                 kfree(cache);
7962                 return NULL;
7963         }
7964
7965         cache->key.objectid = start;
7966         cache->key.offset = size;
7967         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7968
7969         cache->fs_info = fs_info;
7970         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
7971         set_free_space_tree_thresholds(cache);
7972
7973         atomic_set(&cache->count, 1);
7974         spin_lock_init(&cache->lock);
7975         init_rwsem(&cache->data_rwsem);
7976         INIT_LIST_HEAD(&cache->list);
7977         INIT_LIST_HEAD(&cache->cluster_list);
7978         INIT_LIST_HEAD(&cache->bg_list);
7979         INIT_LIST_HEAD(&cache->ro_list);
7980         INIT_LIST_HEAD(&cache->dirty_list);
7981         INIT_LIST_HEAD(&cache->io_list);
7982         btrfs_init_free_space_ctl(cache);
7983         atomic_set(&cache->trimming, 0);
7984         mutex_init(&cache->free_space_lock);
7985         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
7986
7987         return cache;
7988 }
7989
7990
7991 /*
7992  * Iterate all chunks and verify that each of them has the corresponding block
7993  * group
7994  */
7995 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
7996 {
7997         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7998         struct extent_map *em;
7999         struct btrfs_block_group_cache *bg;
8000         u64 start = 0;
8001         int ret = 0;
8002
8003         while (1) {
8004                 read_lock(&map_tree->lock);
8005                 /*
8006                  * lookup_extent_mapping will return the first extent map
8007                  * intersecting the range, so setting @len to 1 is enough to
8008                  * get the first chunk.
8009                  */
8010                 em = lookup_extent_mapping(map_tree, start, 1);
8011                 read_unlock(&map_tree->lock);
8012                 if (!em)
8013                         break;
8014
8015                 bg = btrfs_lookup_block_group(fs_info, em->start);
8016                 if (!bg) {
8017                         btrfs_err(fs_info,
8018         "chunk start=%llu len=%llu doesn't have corresponding block group",
8019                                      em->start, em->len);
8020                         ret = -EUCLEAN;
8021                         free_extent_map(em);
8022                         break;
8023                 }
8024                 if (bg->key.objectid != em->start ||
8025                     bg->key.offset != em->len ||
8026                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
8027                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
8028                         btrfs_err(fs_info,
8029 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
8030                                 em->start, em->len,
8031                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
8032                                 bg->key.objectid, bg->key.offset,
8033                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
8034                         ret = -EUCLEAN;
8035                         free_extent_map(em);
8036                         btrfs_put_block_group(bg);
8037                         break;
8038                 }
8039                 start = em->start + em->len;
8040                 free_extent_map(em);
8041                 btrfs_put_block_group(bg);
8042         }
8043         return ret;
8044 }
8045
8046 int btrfs_read_block_groups(struct btrfs_fs_info *info)
8047 {
8048         struct btrfs_path *path;
8049         int ret;
8050         struct btrfs_block_group_cache *cache;
8051         struct btrfs_space_info *space_info;
8052         struct btrfs_key key;
8053         struct btrfs_key found_key;
8054         struct extent_buffer *leaf;
8055         int need_clear = 0;
8056         u64 cache_gen;
8057         u64 feature;
8058         int mixed;
8059
8060         feature = btrfs_super_incompat_flags(info->super_copy);
8061         mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
8062
8063         key.objectid = 0;
8064         key.offset = 0;
8065         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8066         path = btrfs_alloc_path();
8067         if (!path)
8068                 return -ENOMEM;
8069         path->reada = READA_FORWARD;
8070
8071         cache_gen = btrfs_super_cache_generation(info->super_copy);
8072         if (btrfs_test_opt(info, SPACE_CACHE) &&
8073             btrfs_super_generation(info->super_copy) != cache_gen)
8074                 need_clear = 1;
8075         if (btrfs_test_opt(info, CLEAR_CACHE))
8076                 need_clear = 1;
8077
8078         while (1) {
8079                 ret = find_first_block_group(info, path, &key);
8080                 if (ret > 0)
8081                         break;
8082                 if (ret != 0)
8083                         goto error;
8084
8085                 leaf = path->nodes[0];
8086                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8087
8088                 cache = btrfs_create_block_group_cache(info, found_key.objectid,
8089                                                        found_key.offset);
8090                 if (!cache) {
8091                         ret = -ENOMEM;
8092                         goto error;
8093                 }
8094
8095                 if (need_clear) {
8096                         /*
8097                          * When we mount with old space cache, we need to
8098                          * set BTRFS_DC_CLEAR and set dirty flag.
8099                          *
8100                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
8101                          *    truncate the old free space cache inode and
8102                          *    setup a new one.
8103                          * b) Setting 'dirty flag' makes sure that we flush
8104                          *    the new space cache info onto disk.
8105                          */
8106                         if (btrfs_test_opt(info, SPACE_CACHE))
8107                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
8108                 }
8109
8110                 read_extent_buffer(leaf, &cache->item,
8111                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
8112                                    sizeof(cache->item));
8113                 cache->flags = btrfs_block_group_flags(&cache->item);
8114                 if (!mixed &&
8115                     ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
8116                     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
8117                         btrfs_err(info,
8118 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
8119                                   cache->key.objectid);
8120                         ret = -EINVAL;
8121                         goto error;
8122                 }
8123
8124                 key.objectid = found_key.objectid + found_key.offset;
8125                 btrfs_release_path(path);
8126
8127                 /*
8128                  * We need to exclude the super stripes now so that the space
8129                  * info has super bytes accounted for, otherwise we'll think
8130                  * we have more space than we actually do.
8131                  */
8132                 ret = exclude_super_stripes(cache);
8133                 if (ret) {
8134                         /*
8135                          * We may have excluded something, so call this just in
8136                          * case.
8137                          */
8138                         free_excluded_extents(cache);
8139                         btrfs_put_block_group(cache);
8140                         goto error;
8141                 }
8142
8143                 /*
8144                  * check for two cases, either we are full, and therefore
8145                  * don't need to bother with the caching work since we won't
8146                  * find any space, or we are empty, and we can just add all
8147                  * the space in and be done with it.  This saves us _a_lot_ of
8148                  * time, particularly in the full case.
8149                  */
8150                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8151                         cache->last_byte_to_unpin = (u64)-1;
8152                         cache->cached = BTRFS_CACHE_FINISHED;
8153                         free_excluded_extents(cache);
8154                 } else if (btrfs_block_group_used(&cache->item) == 0) {
8155                         cache->last_byte_to_unpin = (u64)-1;
8156                         cache->cached = BTRFS_CACHE_FINISHED;
8157                         add_new_free_space(cache, found_key.objectid,
8158                                            found_key.objectid +
8159                                            found_key.offset);
8160                         free_excluded_extents(cache);
8161                 }
8162
8163                 ret = btrfs_add_block_group_cache(info, cache);
8164                 if (ret) {
8165                         btrfs_remove_free_space_cache(cache);
8166                         btrfs_put_block_group(cache);
8167                         goto error;
8168                 }
8169
8170                 trace_btrfs_add_block_group(info, cache, 0);
8171                 btrfs_update_space_info(info, cache->flags, found_key.offset,
8172                                         btrfs_block_group_used(&cache->item),
8173                                         cache->bytes_super, &space_info);
8174
8175                 cache->space_info = space_info;
8176
8177                 link_block_group(cache);
8178
8179                 set_avail_alloc_bits(info, cache->flags);
8180                 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
8181                         inc_block_group_ro(cache, 1);
8182                 } else if (btrfs_block_group_used(&cache->item) == 0) {
8183                         ASSERT(list_empty(&cache->bg_list));
8184                         btrfs_mark_bg_unused(cache);
8185                 }
8186         }
8187
8188         list_for_each_entry_rcu(space_info, &info->space_info, list) {
8189                 if (!(get_alloc_profile(info, space_info->flags) &
8190                       (BTRFS_BLOCK_GROUP_RAID10 |
8191                        BTRFS_BLOCK_GROUP_RAID1_MASK |
8192                        BTRFS_BLOCK_GROUP_RAID56_MASK |
8193                        BTRFS_BLOCK_GROUP_DUP)))
8194                         continue;
8195                 /*
8196                  * avoid allocating from un-mirrored block group if there are
8197                  * mirrored block groups.
8198                  */
8199                 list_for_each_entry(cache,
8200                                 &space_info->block_groups[BTRFS_RAID_RAID0],
8201                                 list)
8202                         inc_block_group_ro(cache, 1);
8203                 list_for_each_entry(cache,
8204                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
8205                                 list)
8206                         inc_block_group_ro(cache, 1);
8207         }
8208
8209         btrfs_add_raid_kobjects(info);
8210         btrfs_init_global_block_rsv(info);
8211         ret = check_chunk_block_group_mappings(info);
8212 error:
8213         btrfs_free_path(path);
8214         return ret;
8215 }
8216
8217 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
8218 {
8219         struct btrfs_fs_info *fs_info = trans->fs_info;
8220         struct btrfs_block_group_cache *block_group;
8221         struct btrfs_root *extent_root = fs_info->extent_root;
8222         struct btrfs_block_group_item item;
8223         struct btrfs_key key;
8224         int ret = 0;
8225
8226         if (!trans->can_flush_pending_bgs)
8227                 return;
8228
8229         while (!list_empty(&trans->new_bgs)) {
8230                 block_group = list_first_entry(&trans->new_bgs,
8231                                                struct btrfs_block_group_cache,
8232                                                bg_list);
8233                 if (ret)
8234                         goto next;
8235
8236                 spin_lock(&block_group->lock);
8237                 memcpy(&item, &block_group->item, sizeof(item));
8238                 memcpy(&key, &block_group->key, sizeof(key));
8239                 spin_unlock(&block_group->lock);
8240
8241                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
8242                                         sizeof(item));
8243                 if (ret)
8244                         btrfs_abort_transaction(trans, ret);
8245                 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
8246                 if (ret)
8247                         btrfs_abort_transaction(trans, ret);
8248                 add_block_group_free_space(trans, block_group);
8249                 /* already aborted the transaction if it failed. */
8250 next:
8251                 btrfs_delayed_refs_rsv_release(fs_info, 1);
8252                 list_del_init(&block_group->bg_list);
8253         }
8254         btrfs_trans_release_chunk_metadata(trans);
8255 }
8256
8257 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
8258                            u64 type, u64 chunk_offset, u64 size)
8259 {
8260         struct btrfs_fs_info *fs_info = trans->fs_info;
8261         struct btrfs_block_group_cache *cache;
8262         int ret;
8263
8264         btrfs_set_log_full_commit(trans);
8265
8266         cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
8267         if (!cache)
8268                 return -ENOMEM;
8269
8270         btrfs_set_block_group_used(&cache->item, bytes_used);
8271         btrfs_set_block_group_chunk_objectid(&cache->item,
8272                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID);
8273         btrfs_set_block_group_flags(&cache->item, type);
8274
8275         cache->flags = type;
8276         cache->last_byte_to_unpin = (u64)-1;
8277         cache->cached = BTRFS_CACHE_FINISHED;
8278         cache->needs_free_space = 1;
8279         ret = exclude_super_stripes(cache);
8280         if (ret) {
8281                 /*
8282                  * We may have excluded something, so call this just in
8283                  * case.
8284                  */
8285                 free_excluded_extents(cache);
8286                 btrfs_put_block_group(cache);
8287                 return ret;
8288         }
8289
8290         add_new_free_space(cache, chunk_offset, chunk_offset + size);
8291
8292         free_excluded_extents(cache);
8293
8294 #ifdef CONFIG_BTRFS_DEBUG
8295         if (btrfs_should_fragment_free_space(cache)) {
8296                 u64 new_bytes_used = size - bytes_used;
8297
8298                 bytes_used += new_bytes_used >> 1;
8299                 fragment_free_space(cache);
8300         }
8301 #endif
8302         /*
8303          * Ensure the corresponding space_info object is created and
8304          * assigned to our block group. We want our bg to be added to the rbtree
8305          * with its ->space_info set.
8306          */
8307         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
8308         ASSERT(cache->space_info);
8309
8310         ret = btrfs_add_block_group_cache(fs_info, cache);
8311         if (ret) {
8312                 btrfs_remove_free_space_cache(cache);
8313                 btrfs_put_block_group(cache);
8314                 return ret;
8315         }
8316
8317         /*
8318          * Now that our block group has its ->space_info set and is inserted in
8319          * the rbtree, update the space info's counters.
8320          */
8321         trace_btrfs_add_block_group(fs_info, cache, 1);
8322         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
8323                                 cache->bytes_super, &cache->space_info);
8324         btrfs_update_global_block_rsv(fs_info);
8325
8326         link_block_group(cache);
8327
8328         list_add_tail(&cache->bg_list, &trans->new_bgs);
8329         trans->delayed_ref_updates++;
8330         btrfs_update_delayed_refs_rsv(trans);
8331
8332         set_avail_alloc_bits(fs_info, type);
8333         return 0;
8334 }
8335
8336 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
8337 {
8338         u64 extra_flags = chunk_to_extended(flags) &
8339                                 BTRFS_EXTENDED_PROFILE_MASK;
8340
8341         write_seqlock(&fs_info->profiles_lock);
8342         if (flags & BTRFS_BLOCK_GROUP_DATA)
8343                 fs_info->avail_data_alloc_bits &= ~extra_flags;
8344         if (flags & BTRFS_BLOCK_GROUP_METADATA)
8345                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8346         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8347                 fs_info->avail_system_alloc_bits &= ~extra_flags;
8348         write_sequnlock(&fs_info->profiles_lock);
8349 }
8350
8351 /*
8352  * Clear incompat bits for the following feature(s):
8353  *
8354  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
8355  *            in the whole filesystem
8356  */
8357 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
8358 {
8359         if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
8360                 struct list_head *head = &fs_info->space_info;
8361                 struct btrfs_space_info *sinfo;
8362
8363                 list_for_each_entry_rcu(sinfo, head, list) {
8364                         bool found = false;
8365
8366                         down_read(&sinfo->groups_sem);
8367                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
8368                                 found = true;
8369                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
8370                                 found = true;
8371                         up_read(&sinfo->groups_sem);
8372
8373                         if (found)
8374                                 return;
8375                 }
8376                 btrfs_clear_fs_incompat(fs_info, RAID56);
8377         }
8378 }
8379
8380 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8381                              u64 group_start, struct extent_map *em)
8382 {
8383         struct btrfs_fs_info *fs_info = trans->fs_info;
8384         struct btrfs_root *root = fs_info->extent_root;
8385         struct btrfs_path *path;
8386         struct btrfs_block_group_cache *block_group;
8387         struct btrfs_free_cluster *cluster;
8388         struct btrfs_root *tree_root = fs_info->tree_root;
8389         struct btrfs_key key;
8390         struct inode *inode;
8391         struct kobject *kobj = NULL;
8392         int ret;
8393         int index;
8394         int factor;
8395         struct btrfs_caching_control *caching_ctl = NULL;
8396         bool remove_em;
8397         bool remove_rsv = false;
8398
8399         block_group = btrfs_lookup_block_group(fs_info, group_start);
8400         BUG_ON(!block_group);
8401         BUG_ON(!block_group->ro);
8402
8403         trace_btrfs_remove_block_group(block_group);
8404         /*
8405          * Free the reserved super bytes from this block group before
8406          * remove it.
8407          */
8408         free_excluded_extents(block_group);
8409         btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
8410                                   block_group->key.offset);
8411
8412         memcpy(&key, &block_group->key, sizeof(key));
8413         index = btrfs_bg_flags_to_raid_index(block_group->flags);
8414         factor = btrfs_bg_type_to_factor(block_group->flags);
8415
8416         /* make sure this block group isn't part of an allocation cluster */
8417         cluster = &fs_info->data_alloc_cluster;
8418         spin_lock(&cluster->refill_lock);
8419         btrfs_return_cluster_to_free_space(block_group, cluster);
8420         spin_unlock(&cluster->refill_lock);
8421
8422         /*
8423          * make sure this block group isn't part of a metadata
8424          * allocation cluster
8425          */
8426         cluster = &fs_info->meta_alloc_cluster;
8427         spin_lock(&cluster->refill_lock);
8428         btrfs_return_cluster_to_free_space(block_group, cluster);
8429         spin_unlock(&cluster->refill_lock);
8430
8431         path = btrfs_alloc_path();
8432         if (!path) {
8433                 ret = -ENOMEM;
8434                 goto out;
8435         }
8436
8437         /*
8438          * get the inode first so any iput calls done for the io_list
8439          * aren't the final iput (no unlinks allowed now)
8440          */
8441         inode = lookup_free_space_inode(block_group, path);
8442
8443         mutex_lock(&trans->transaction->cache_write_mutex);
8444         /*
8445          * Make sure our free space cache IO is done before removing the
8446          * free space inode
8447          */
8448         spin_lock(&trans->transaction->dirty_bgs_lock);
8449         if (!list_empty(&block_group->io_list)) {
8450                 list_del_init(&block_group->io_list);
8451
8452                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
8453
8454                 spin_unlock(&trans->transaction->dirty_bgs_lock);
8455                 btrfs_wait_cache_io(trans, block_group, path);
8456                 btrfs_put_block_group(block_group);
8457                 spin_lock(&trans->transaction->dirty_bgs_lock);
8458         }
8459
8460         if (!list_empty(&block_group->dirty_list)) {
8461                 list_del_init(&block_group->dirty_list);
8462                 remove_rsv = true;
8463                 btrfs_put_block_group(block_group);
8464         }
8465         spin_unlock(&trans->transaction->dirty_bgs_lock);
8466         mutex_unlock(&trans->transaction->cache_write_mutex);
8467
8468         if (!IS_ERR(inode)) {
8469                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
8470                 if (ret) {
8471                         btrfs_add_delayed_iput(inode);
8472                         goto out;
8473                 }
8474                 clear_nlink(inode);
8475                 /* One for the block groups ref */
8476                 spin_lock(&block_group->lock);
8477                 if (block_group->iref) {
8478                         block_group->iref = 0;
8479                         block_group->inode = NULL;
8480                         spin_unlock(&block_group->lock);
8481                         iput(inode);
8482                 } else {
8483                         spin_unlock(&block_group->lock);
8484                 }
8485                 /* One for our lookup ref */
8486                 btrfs_add_delayed_iput(inode);
8487         }
8488
8489         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8490         key.offset = block_group->key.objectid;
8491         key.type = 0;
8492
8493         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8494         if (ret < 0)
8495                 goto out;
8496         if (ret > 0)
8497                 btrfs_release_path(path);
8498         if (ret == 0) {
8499                 ret = btrfs_del_item(trans, tree_root, path);
8500                 if (ret)
8501                         goto out;
8502                 btrfs_release_path(path);
8503         }
8504
8505         spin_lock(&fs_info->block_group_cache_lock);
8506         rb_erase(&block_group->cache_node,
8507                  &fs_info->block_group_cache_tree);
8508         RB_CLEAR_NODE(&block_group->cache_node);
8509
8510         if (fs_info->first_logical_byte == block_group->key.objectid)
8511                 fs_info->first_logical_byte = (u64)-1;
8512         spin_unlock(&fs_info->block_group_cache_lock);
8513
8514         down_write(&block_group->space_info->groups_sem);
8515         /*
8516          * we must use list_del_init so people can check to see if they
8517          * are still on the list after taking the semaphore
8518          */
8519         list_del_init(&block_group->list);
8520         if (list_empty(&block_group->space_info->block_groups[index])) {
8521                 kobj = block_group->space_info->block_group_kobjs[index];
8522                 block_group->space_info->block_group_kobjs[index] = NULL;
8523                 clear_avail_alloc_bits(fs_info, block_group->flags);
8524         }
8525         up_write(&block_group->space_info->groups_sem);
8526         clear_incompat_bg_bits(fs_info, block_group->flags);
8527         if (kobj) {
8528                 kobject_del(kobj);
8529                 kobject_put(kobj);
8530         }
8531
8532         if (block_group->has_caching_ctl)
8533                 caching_ctl = get_caching_control(block_group);
8534         if (block_group->cached == BTRFS_CACHE_STARTED)
8535                 wait_block_group_cache_done(block_group);
8536         if (block_group->has_caching_ctl) {
8537                 down_write(&fs_info->commit_root_sem);
8538                 if (!caching_ctl) {
8539                         struct btrfs_caching_control *ctl;
8540
8541                         list_for_each_entry(ctl,
8542                                     &fs_info->caching_block_groups, list)
8543                                 if (ctl->block_group == block_group) {
8544                                         caching_ctl = ctl;
8545                                         refcount_inc(&caching_ctl->count);
8546                                         break;
8547                                 }
8548                 }
8549                 if (caching_ctl)
8550                         list_del_init(&caching_ctl->list);
8551                 up_write(&fs_info->commit_root_sem);
8552                 if (caching_ctl) {
8553                         /* Once for the caching bgs list and once for us. */
8554                         put_caching_control(caching_ctl);
8555                         put_caching_control(caching_ctl);
8556                 }
8557         }
8558
8559         spin_lock(&trans->transaction->dirty_bgs_lock);
8560         WARN_ON(!list_empty(&block_group->dirty_list));
8561         WARN_ON(!list_empty(&block_group->io_list));
8562         spin_unlock(&trans->transaction->dirty_bgs_lock);
8563
8564         btrfs_remove_free_space_cache(block_group);
8565
8566         spin_lock(&block_group->space_info->lock);
8567         list_del_init(&block_group->ro_list);
8568
8569         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8570                 WARN_ON(block_group->space_info->total_bytes
8571                         < block_group->key.offset);
8572                 WARN_ON(block_group->space_info->bytes_readonly
8573                         < block_group->key.offset);
8574                 WARN_ON(block_group->space_info->disk_total
8575                         < block_group->key.offset * factor);
8576         }
8577         block_group->space_info->total_bytes -= block_group->key.offset;
8578         block_group->space_info->bytes_readonly -= block_group->key.offset;
8579         block_group->space_info->disk_total -= block_group->key.offset * factor;
8580
8581         spin_unlock(&block_group->space_info->lock);
8582
8583         memcpy(&key, &block_group->key, sizeof(key));
8584
8585         mutex_lock(&fs_info->chunk_mutex);
8586         spin_lock(&block_group->lock);
8587         block_group->removed = 1;
8588         /*
8589          * At this point trimming can't start on this block group, because we
8590          * removed the block group from the tree fs_info->block_group_cache_tree
8591          * so no one can't find it anymore and even if someone already got this
8592          * block group before we removed it from the rbtree, they have already
8593          * incremented block_group->trimming - if they didn't, they won't find
8594          * any free space entries because we already removed them all when we
8595          * called btrfs_remove_free_space_cache().
8596          *
8597          * And we must not remove the extent map from the fs_info->mapping_tree
8598          * to prevent the same logical address range and physical device space
8599          * ranges from being reused for a new block group. This is because our
8600          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
8601          * completely transactionless, so while it is trimming a range the
8602          * currently running transaction might finish and a new one start,
8603          * allowing for new block groups to be created that can reuse the same
8604          * physical device locations unless we take this special care.
8605          *
8606          * There may also be an implicit trim operation if the file system
8607          * is mounted with -odiscard. The same protections must remain
8608          * in place until the extents have been discarded completely when
8609          * the transaction commit has completed.
8610          */
8611         remove_em = (atomic_read(&block_group->trimming) == 0);
8612         spin_unlock(&block_group->lock);
8613
8614         mutex_unlock(&fs_info->chunk_mutex);
8615
8616         ret = remove_block_group_free_space(trans, block_group);
8617         if (ret)
8618                 goto out;
8619
8620         btrfs_put_block_group(block_group);
8621         btrfs_put_block_group(block_group);
8622
8623         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8624         if (ret > 0)
8625                 ret = -EIO;
8626         if (ret < 0)
8627                 goto out;
8628
8629         ret = btrfs_del_item(trans, root, path);
8630         if (ret)
8631                 goto out;
8632
8633         if (remove_em) {
8634                 struct extent_map_tree *em_tree;
8635
8636                 em_tree = &fs_info->mapping_tree;
8637                 write_lock(&em_tree->lock);
8638                 remove_extent_mapping(em_tree, em);
8639                 write_unlock(&em_tree->lock);
8640                 /* once for the tree */
8641                 free_extent_map(em);
8642         }
8643 out:
8644         if (remove_rsv)
8645                 btrfs_delayed_refs_rsv_release(fs_info, 1);
8646         btrfs_free_path(path);
8647         return ret;
8648 }
8649
8650 struct btrfs_trans_handle *
8651 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
8652                                      const u64 chunk_offset)
8653 {
8654         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
8655         struct extent_map *em;
8656         struct map_lookup *map;
8657         unsigned int num_items;
8658
8659         read_lock(&em_tree->lock);
8660         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
8661         read_unlock(&em_tree->lock);
8662         ASSERT(em && em->start == chunk_offset);
8663
8664         /*
8665          * We need to reserve 3 + N units from the metadata space info in order
8666          * to remove a block group (done at btrfs_remove_chunk() and at
8667          * btrfs_remove_block_group()), which are used for:
8668          *
8669          * 1 unit for adding the free space inode's orphan (located in the tree
8670          * of tree roots).
8671          * 1 unit for deleting the block group item (located in the extent
8672          * tree).
8673          * 1 unit for deleting the free space item (located in tree of tree
8674          * roots).
8675          * N units for deleting N device extent items corresponding to each
8676          * stripe (located in the device tree).
8677          *
8678          * In order to remove a block group we also need to reserve units in the
8679          * system space info in order to update the chunk tree (update one or
8680          * more device items and remove one chunk item), but this is done at
8681          * btrfs_remove_chunk() through a call to check_system_chunk().
8682          */
8683         map = em->map_lookup;
8684         num_items = 3 + map->num_stripes;
8685         free_extent_map(em);
8686
8687         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
8688                                                            num_items, 1);
8689 }
8690
8691 /*
8692  * Process the unused_bgs list and remove any that don't have any allocated
8693  * space inside of them.
8694  */
8695 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
8696 {
8697         struct btrfs_block_group_cache *block_group;
8698         struct btrfs_space_info *space_info;
8699         struct btrfs_trans_handle *trans;
8700         int ret = 0;
8701
8702         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
8703                 return;
8704
8705         spin_lock(&fs_info->unused_bgs_lock);
8706         while (!list_empty(&fs_info->unused_bgs)) {
8707                 u64 start, end;
8708                 int trimming;
8709
8710                 block_group = list_first_entry(&fs_info->unused_bgs,
8711                                                struct btrfs_block_group_cache,
8712                                                bg_list);
8713                 list_del_init(&block_group->bg_list);
8714
8715                 space_info = block_group->space_info;
8716
8717                 if (ret || btrfs_mixed_space_info(space_info)) {
8718                         btrfs_put_block_group(block_group);
8719                         continue;
8720                 }
8721                 spin_unlock(&fs_info->unused_bgs_lock);
8722
8723                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
8724
8725                 /* Don't want to race with allocators so take the groups_sem */
8726                 down_write(&space_info->groups_sem);
8727                 spin_lock(&block_group->lock);
8728                 if (block_group->reserved || block_group->pinned ||
8729                     btrfs_block_group_used(&block_group->item) ||
8730                     block_group->ro ||
8731                     list_is_singular(&block_group->list)) {
8732                         /*
8733                          * We want to bail if we made new allocations or have
8734                          * outstanding allocations in this block group.  We do
8735                          * the ro check in case balance is currently acting on
8736                          * this block group.
8737                          */
8738                         trace_btrfs_skip_unused_block_group(block_group);
8739                         spin_unlock(&block_group->lock);
8740                         up_write(&space_info->groups_sem);
8741                         goto next;
8742                 }
8743                 spin_unlock(&block_group->lock);
8744
8745                 /* We don't want to force the issue, only flip if it's ok. */
8746                 ret = inc_block_group_ro(block_group, 0);
8747                 up_write(&space_info->groups_sem);
8748                 if (ret < 0) {
8749                         ret = 0;
8750                         goto next;
8751                 }
8752
8753                 /*
8754                  * Want to do this before we do anything else so we can recover
8755                  * properly if we fail to join the transaction.
8756                  */
8757                 trans = btrfs_start_trans_remove_block_group(fs_info,
8758                                                      block_group->key.objectid);
8759                 if (IS_ERR(trans)) {
8760                         btrfs_dec_block_group_ro(block_group);
8761                         ret = PTR_ERR(trans);
8762                         goto next;
8763                 }
8764
8765                 /*
8766                  * We could have pending pinned extents for this block group,
8767                  * just delete them, we don't care about them anymore.
8768                  */
8769                 start = block_group->key.objectid;
8770                 end = start + block_group->key.offset - 1;
8771                 /*
8772                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
8773                  * btrfs_finish_extent_commit(). If we are at transaction N,
8774                  * another task might be running finish_extent_commit() for the
8775                  * previous transaction N - 1, and have seen a range belonging
8776                  * to the block group in freed_extents[] before we were able to
8777                  * clear the whole block group range from freed_extents[]. This
8778                  * means that task can lookup for the block group after we
8779                  * unpinned it from freed_extents[] and removed it, leading to
8780                  * a BUG_ON() at btrfs_unpin_extent_range().
8781                  */
8782                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
8783                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
8784                                   EXTENT_DIRTY);
8785                 if (ret) {
8786                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
8787                         btrfs_dec_block_group_ro(block_group);
8788                         goto end_trans;
8789                 }
8790                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
8791                                   EXTENT_DIRTY);
8792                 if (ret) {
8793                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
8794                         btrfs_dec_block_group_ro(block_group);
8795                         goto end_trans;
8796                 }
8797                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
8798
8799                 /* Reset pinned so btrfs_put_block_group doesn't complain */
8800                 spin_lock(&space_info->lock);
8801                 spin_lock(&block_group->lock);
8802
8803                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
8804                                                      -block_group->pinned);
8805                 space_info->bytes_readonly += block_group->pinned;
8806                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
8807                                    -block_group->pinned,
8808                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
8809                 block_group->pinned = 0;
8810
8811                 spin_unlock(&block_group->lock);
8812                 spin_unlock(&space_info->lock);
8813
8814                 /* DISCARD can flip during remount */
8815                 trimming = btrfs_test_opt(fs_info, DISCARD);
8816
8817                 /* Implicit trim during transaction commit. */
8818                 if (trimming)
8819                         btrfs_get_block_group_trimming(block_group);
8820
8821                 /*
8822                  * Btrfs_remove_chunk will abort the transaction if things go
8823                  * horribly wrong.
8824                  */
8825                 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
8826
8827                 if (ret) {
8828                         if (trimming)
8829                                 btrfs_put_block_group_trimming(block_group);
8830                         goto end_trans;
8831                 }
8832
8833                 /*
8834                  * If we're not mounted with -odiscard, we can just forget
8835                  * about this block group. Otherwise we'll need to wait
8836                  * until transaction commit to do the actual discard.
8837                  */
8838                 if (trimming) {
8839                         spin_lock(&fs_info->unused_bgs_lock);
8840                         /*
8841                          * A concurrent scrub might have added us to the list
8842                          * fs_info->unused_bgs, so use a list_move operation
8843                          * to add the block group to the deleted_bgs list.
8844                          */
8845                         list_move(&block_group->bg_list,
8846                                   &trans->transaction->deleted_bgs);
8847                         spin_unlock(&fs_info->unused_bgs_lock);
8848                         btrfs_get_block_group(block_group);
8849                 }
8850 end_trans:
8851                 btrfs_end_transaction(trans);
8852 next:
8853                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
8854                 btrfs_put_block_group(block_group);
8855                 spin_lock(&fs_info->unused_bgs_lock);
8856         }
8857         spin_unlock(&fs_info->unused_bgs_lock);
8858 }
8859
8860 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
8861                                    u64 start, u64 end)
8862 {
8863         return unpin_extent_range(fs_info, start, end, false);
8864 }
8865
8866 /*
8867  * It used to be that old block groups would be left around forever.
8868  * Iterating over them would be enough to trim unused space.  Since we
8869  * now automatically remove them, we also need to iterate over unallocated
8870  * space.
8871  *
8872  * We don't want a transaction for this since the discard may take a
8873  * substantial amount of time.  We don't require that a transaction be
8874  * running, but we do need to take a running transaction into account
8875  * to ensure that we're not discarding chunks that were released or
8876  * allocated in the current transaction.
8877  *
8878  * Holding the chunks lock will prevent other threads from allocating
8879  * or releasing chunks, but it won't prevent a running transaction
8880  * from committing and releasing the memory that the pending chunks
8881  * list head uses.  For that, we need to take a reference to the
8882  * transaction and hold the commit root sem.  We only need to hold
8883  * it while performing the free space search since we have already
8884  * held back allocations.
8885  */
8886 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
8887 {
8888         u64 start = SZ_1M, len = 0, end = 0;
8889         int ret;
8890
8891         *trimmed = 0;
8892
8893         /* Discard not supported = nothing to do. */
8894         if (!blk_queue_discard(bdev_get_queue(device->bdev)))
8895                 return 0;
8896
8897         /* Not writable = nothing to do. */
8898         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
8899                 return 0;
8900
8901         /* No free space = nothing to do. */
8902         if (device->total_bytes <= device->bytes_used)
8903                 return 0;
8904
8905         ret = 0;
8906
8907         while (1) {
8908                 struct btrfs_fs_info *fs_info = device->fs_info;
8909                 u64 bytes;
8910
8911                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
8912                 if (ret)
8913                         break;
8914
8915                 find_first_clear_extent_bit(&device->alloc_state, start,
8916                                             &start, &end,
8917                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
8918
8919                 /* Ensure we skip the reserved area in the first 1M */
8920                 start = max_t(u64, start, SZ_1M);
8921
8922                 /*
8923                  * If find_first_clear_extent_bit find a range that spans the
8924                  * end of the device it will set end to -1, in this case it's up
8925                  * to the caller to trim the value to the size of the device.
8926                  */
8927                 end = min(end, device->total_bytes - 1);
8928
8929                 len = end - start + 1;
8930
8931                 /* We didn't find any extents */
8932                 if (!len) {
8933                         mutex_unlock(&fs_info->chunk_mutex);
8934                         ret = 0;
8935                         break;
8936                 }
8937
8938                 ret = btrfs_issue_discard(device->bdev, start, len,
8939                                           &bytes);
8940                 if (!ret)
8941                         set_extent_bits(&device->alloc_state, start,
8942                                         start + bytes - 1,
8943                                         CHUNK_TRIMMED);
8944                 mutex_unlock(&fs_info->chunk_mutex);
8945
8946                 if (ret)
8947                         break;
8948
8949                 start += len;
8950                 *trimmed += bytes;
8951
8952                 if (fatal_signal_pending(current)) {
8953                         ret = -ERESTARTSYS;
8954                         break;
8955                 }
8956
8957                 cond_resched();
8958         }
8959
8960         return ret;
8961 }
8962
8963 /*
8964  * Trim the whole filesystem by:
8965  * 1) trimming the free space in each block group
8966  * 2) trimming the unallocated space on each device
8967  *
8968  * This will also continue trimming even if a block group or device encounters
8969  * an error.  The return value will be the last error, or 0 if nothing bad
8970  * happens.
8971  */
8972 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
8973 {
8974         struct btrfs_block_group_cache *cache = NULL;
8975         struct btrfs_device *device;
8976         struct list_head *devices;
8977         u64 group_trimmed;
8978         u64 start;
8979         u64 end;
8980         u64 trimmed = 0;
8981         u64 bg_failed = 0;
8982         u64 dev_failed = 0;
8983         int bg_ret = 0;
8984         int dev_ret = 0;
8985         int ret = 0;
8986
8987         cache = btrfs_lookup_first_block_group(fs_info, range->start);
8988         for (; cache; cache = next_block_group(cache)) {
8989                 if (cache->key.objectid >= (range->start + range->len)) {
8990                         btrfs_put_block_group(cache);
8991                         break;
8992                 }
8993
8994                 start = max(range->start, cache->key.objectid);
8995                 end = min(range->start + range->len,
8996                                 cache->key.objectid + cache->key.offset);
8997
8998                 if (end - start >= range->minlen) {
8999                         if (!block_group_cache_done(cache)) {
9000                                 ret = cache_block_group(cache, 0);
9001                                 if (ret) {
9002                                         bg_failed++;
9003                                         bg_ret = ret;
9004                                         continue;
9005                                 }
9006                                 ret = wait_block_group_cache_done(cache);
9007                                 if (ret) {
9008                                         bg_failed++;
9009                                         bg_ret = ret;
9010                                         continue;
9011                                 }
9012                         }
9013                         ret = btrfs_trim_block_group(cache,
9014                                                      &group_trimmed,
9015                                                      start,
9016                                                      end,
9017                                                      range->minlen);
9018
9019                         trimmed += group_trimmed;
9020                         if (ret) {
9021                                 bg_failed++;
9022                                 bg_ret = ret;
9023                                 continue;
9024                         }
9025                 }
9026         }
9027
9028         if (bg_failed)
9029                 btrfs_warn(fs_info,
9030                         "failed to trim %llu block group(s), last error %d",
9031                         bg_failed, bg_ret);
9032         mutex_lock(&fs_info->fs_devices->device_list_mutex);
9033         devices = &fs_info->fs_devices->devices;
9034         list_for_each_entry(device, devices, dev_list) {
9035                 ret = btrfs_trim_free_extents(device, &group_trimmed);
9036                 if (ret) {
9037                         dev_failed++;
9038                         dev_ret = ret;
9039                         break;
9040                 }
9041
9042                 trimmed += group_trimmed;
9043         }
9044         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
9045
9046         if (dev_failed)
9047                 btrfs_warn(fs_info,
9048                         "failed to trim %llu device(s), last error %d",
9049                         dev_failed, dev_ret);
9050         range->len = trimmed;
9051         if (bg_ret)
9052                 return bg_ret;
9053         return dev_ret;
9054 }
9055
9056 /*
9057  * btrfs_{start,end}_write_no_snapshotting() are similar to
9058  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
9059  * data into the page cache through nocow before the subvolume is snapshoted,
9060  * but flush the data into disk after the snapshot creation, or to prevent
9061  * operations while snapshotting is ongoing and that cause the snapshot to be
9062  * inconsistent (writes followed by expanding truncates for example).
9063  */
9064 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
9065 {
9066         percpu_counter_dec(&root->subv_writers->counter);
9067         cond_wake_up(&root->subv_writers->wait);
9068 }
9069
9070 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
9071 {
9072         if (atomic_read(&root->will_be_snapshotted))
9073                 return 0;
9074
9075         percpu_counter_inc(&root->subv_writers->counter);
9076         /*
9077          * Make sure counter is updated before we check for snapshot creation.
9078          */
9079         smp_mb();
9080         if (atomic_read(&root->will_be_snapshotted)) {
9081                 btrfs_end_write_no_snapshotting(root);
9082                 return 0;
9083         }
9084         return 1;
9085 }
9086
9087 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
9088 {
9089         while (true) {
9090                 int ret;
9091
9092                 ret = btrfs_start_write_no_snapshotting(root);
9093                 if (ret)
9094                         break;
9095                 wait_var_event(&root->will_be_snapshotted,
9096                                !atomic_read(&root->will_be_snapshotted));
9097         }
9098 }
9099
9100 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
9101 {
9102         struct btrfs_fs_info *fs_info = bg->fs_info;
9103
9104         spin_lock(&fs_info->unused_bgs_lock);
9105         if (list_empty(&bg->bg_list)) {
9106                 btrfs_get_block_group(bg);
9107                 trace_btrfs_add_unused_block_group(bg);
9108                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
9109         }
9110         spin_unlock(&fs_info->unused_bgs_lock);
9111 }