fs/btrfs/transaction.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/fs.h>
  20 #include <linux/sched.h>
  21 #include <linux/writeback.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/blkdev.h>
  24 #include "ctree.h"
  25 #include "disk-io.h"
  26 #include "transaction.h"
  27 #include "locking.h"
  28 #include "ref-cache.h"
  29 #include "tree-log.h"
  30
  31 static int total_trans = 0;
  32 extern struct kmem_cache *btrfs_trans_handle_cachep;
  33 extern struct kmem_cache *btrfs_transaction_cachep;
  34
  35 #define BTRFS_ROOT_TRANS_TAG 0
  36
  37 static noinline void put_transaction(struct btrfs_transaction *transaction)
  38 {
  39         WARN_ON(transaction->use_count == 0);
  40         transaction->use_count--;
  41         if (transaction->use_count == 0) {
  42                 WARN_ON(total_trans == 0);
  43                 total_trans--;
  44                 list_del_init(&transaction->list);
  45                 memset(transaction, 0, sizeof(*transaction));
  46                 kmem_cache_free(btrfs_transaction_cachep, transaction);
  47         }
  48 }
  49
  50 /*
  51  * either allocate a new transaction or hop into the existing one
  52  */
  53 static noinline int join_transaction(struct btrfs_root *root)
  54 {
  55         struct btrfs_transaction *cur_trans;
  56         cur_trans = root->fs_info->running_transaction;
  57         if (!cur_trans) {
  58                 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
  59                                              GFP_NOFS);
  60                 total_trans++;
  61                 BUG_ON(!cur_trans);
  62                 root->fs_info->generation++;
  63                 root->fs_info->last_alloc = 0;
  64                 root->fs_info->last_data_alloc = 0;
  65                 cur_trans->num_writers = 1;
  66                 cur_trans->num_joined = 0;
  67                 cur_trans->transid = root->fs_info->generation;
  68                 init_waitqueue_head(&cur_trans->writer_wait);
  69                 init_waitqueue_head(&cur_trans->commit_wait);
  70                 cur_trans->in_commit = 0;
  71                 cur_trans->blocked = 0;
  72                 cur_trans->use_count = 1;
  73                 cur_trans->commit_done = 0;
  74                 cur_trans->start_time = get_seconds();
  75                 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
  76                 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
  77                 extent_io_tree_init(&cur_trans->dirty_pages,
  78                                      root->fs_info->btree_inode->i_mapping,
  79                                      GFP_NOFS);
  80                 spin_lock(&root->fs_info->new_trans_lock);
  81                 root->fs_info->running_transaction = cur_trans;
  82                 spin_unlock(&root->fs_info->new_trans_lock);
  83         } else {
  84                 cur_trans->num_writers++;
  85                 cur_trans->num_joined++;
  86         }
  87
  88         return 0;
  89 }
  90
  91 /*
  92  * this does all the record keeping required to make sure that a
  93  * reference counted root is properly recorded in a given transaction.
  94  * This is required to make sure the old root from before we joined the transaction
  95  * is deleted when the transaction commits
  96  */
  97 noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
  98 {
  99         struct btrfs_dirty_root *dirty;
 100         u64 running_trans_id = root->fs_info->running_transaction->transid;
 101         if (root->ref_cows && root->last_trans < running_trans_id) {
 102                 WARN_ON(root == root->fs_info->extent_root);
 103                 if (root->root_item.refs != 0) {
 104                         radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 105                                    (unsigned long)root->root_key.objectid,
 106                                    BTRFS_ROOT_TRANS_TAG);
 107
 108                         dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 109                         BUG_ON(!dirty);
 110                         dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
 111                         BUG_ON(!dirty->root);
 112                         dirty->latest_root = root;
 113                         INIT_LIST_HEAD(&dirty->list);
 114
 115                         root->commit_root = btrfs_root_node(root);
 116
 117                         memcpy(dirty->root, root, sizeof(*root));
 118                         spin_lock_init(&dirty->root->node_lock);
 119                         spin_lock_init(&dirty->root->list_lock);
 120                         mutex_init(&dirty->root->objectid_mutex);
 121                         mutex_init(&dirty->root->log_mutex);
 122                         INIT_LIST_HEAD(&dirty->root->dead_list);
 123                         dirty->root->node = root->commit_root;
 124                         dirty->root->commit_root = NULL;
 125
 126                         spin_lock(&root->list_lock);
 127                         list_add(&dirty->root->dead_list, &root->dead_list);
 128                         spin_unlock(&root->list_lock);
 129
 130                         root->dirty_root = dirty;
 131                 } else {
 132                         WARN_ON(1);
 133                 }
 134                 root->last_trans = running_trans_id;
 135         }
 136         return 0;
 137 }
 138
 139 /* wait for commit against the current transaction to become unblocked
 140  * when this is done, it is safe to start a new transaction, but the current
 141  * transaction might not be fully on disk.
 142  */
 143 static void wait_current_trans(struct btrfs_root *root)
 144 {
 145         struct btrfs_transaction *cur_trans;
 146
 147         cur_trans = root->fs_info->running_transaction;
 148         if (cur_trans && cur_trans->blocked) {
 149                 DEFINE_WAIT(wait);
 150                 cur_trans->use_count++;
 151                 while(1) {
 152                         prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 153                                         TASK_UNINTERRUPTIBLE);
 154                         if (cur_trans->blocked) {
 155                                 mutex_unlock(&root->fs_info->trans_mutex);
 156                                 schedule();
 157                                 mutex_lock(&root->fs_info->trans_mutex);
 158                                 finish_wait(&root->fs_info->transaction_wait,
 159                                             &wait);
 160                         } else {
 161                                 finish_wait(&root->fs_info->transaction_wait,
 162                                             &wait);
 163                                 break;
 164                         }
 165                 }
 166                 put_transaction(cur_trans);
 167         }
 168 }
 169
 170 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 171                                              int num_blocks, int wait)
 172 {
 173         struct btrfs_trans_handle *h =
 174                 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 175         int ret;
 176
 177         mutex_lock(&root->fs_info->trans_mutex);
 178         if (!root->fs_info->log_root_recovering &&
 179             ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
 180                 wait_current_trans(root);
 181         ret = join_transaction(root);
 182         BUG_ON(ret);
 183
 184         btrfs_record_root_in_trans(root);
 185         h->transid = root->fs_info->running_transaction->transid;
 186         h->transaction = root->fs_info->running_transaction;
 187         h->blocks_reserved = num_blocks;
 188         h->blocks_used = 0;
 189         h->block_group = NULL;
 190         h->alloc_exclude_nr = 0;
 191         h->alloc_exclude_start = 0;
 192         root->fs_info->running_transaction->use_count++;
 193         mutex_unlock(&root->fs_info->trans_mutex);
 194         return h;
 195 }
 196
 197 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 198                                                    int num_blocks)
 199 {
 200         return start_transaction(root, num_blocks, 1);
 201 }
 202 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 203                                                    int num_blocks)
 204 {
 205         return start_transaction(root, num_blocks, 0);
 206 }
 207
 208 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 209                                                          int num_blocks)
 210 {
 211         return start_transaction(r, num_blocks, 2);
 212 }
 213
 214 /* wait for a transaction commit to be fully complete */
 215 static noinline int wait_for_commit(struct btrfs_root *root,
 216                                     struct btrfs_transaction *commit)
 217 {
 218         DEFINE_WAIT(wait);
 219         mutex_lock(&root->fs_info->trans_mutex);
 220         while(!commit->commit_done) {
 221                 prepare_to_wait(&commit->commit_wait, &wait,
 222                                 TASK_UNINTERRUPTIBLE);
 223                 if (commit->commit_done)
 224                         break;
 225                 mutex_unlock(&root->fs_info->trans_mutex);
 226                 schedule();
 227                 mutex_lock(&root->fs_info->trans_mutex);
 228         }
 229         mutex_unlock(&root->fs_info->trans_mutex);
 230         finish_wait(&commit->commit_wait, &wait);
 231         return 0;
 232 }
 233
 234 /*
 235  * rate limit against the drop_snapshot code.  This helps to slow down new operations
 236  * if the drop_snapshot code isn't able to keep up.
 237  */
 238 static void throttle_on_drops(struct btrfs_root *root)
 239 {
 240         struct btrfs_fs_info *info = root->fs_info;
 241         int harder_count = 0;
 242
 243 harder:
 244         if (atomic_read(&info->throttles)) {
 245                 DEFINE_WAIT(wait);
 246                 int thr;
 247                 thr = atomic_read(&info->throttle_gen);
 248
 249                 do {
 250                         prepare_to_wait(&info->transaction_throttle,
 251                                         &wait, TASK_UNINTERRUPTIBLE);
 252                         if (!atomic_read(&info->throttles)) {
 253                                 finish_wait(&info->transaction_throttle, &wait);
 254                                 break;
 255                         }
 256                         schedule();
 257                         finish_wait(&info->transaction_throttle, &wait);
 258                 } while (thr == atomic_read(&info->throttle_gen));
 259                 harder_count++;
 260
 261                 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
 262                     harder_count < 2)
 263                         goto harder;
 264
 265                 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
 266                     harder_count < 10)
 267                         goto harder;
 268
 269                 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
 270                     harder_count < 20)
 271                         goto harder;
 272         }
 273 }
 274
 275 void btrfs_throttle(struct btrfs_root *root)
 276 {
 277         mutex_lock(&root->fs_info->trans_mutex);
 278         if (!root->fs_info->open_ioctl_trans)
 279                 wait_current_trans(root);
 280         mutex_unlock(&root->fs_info->trans_mutex);
 281
 282         throttle_on_drops(root);
 283 }
 284
 285 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 286                           struct btrfs_root *root, int throttle)
 287 {
 288         struct btrfs_transaction *cur_trans;
 289         struct btrfs_fs_info *info = root->fs_info;
 290
 291         mutex_lock(&info->trans_mutex);
 292         cur_trans = info->running_transaction;
 293         WARN_ON(cur_trans != trans->transaction);
 294         WARN_ON(cur_trans->num_writers < 1);
 295         cur_trans->num_writers--;
 296
 297         if (waitqueue_active(&cur_trans->writer_wait))
 298                 wake_up(&cur_trans->writer_wait);
 299         put_transaction(cur_trans);
 300         mutex_unlock(&info->trans_mutex);
 301         memset(trans, 0, sizeof(*trans));
 302         kmem_cache_free(btrfs_trans_handle_cachep, trans);
 303
 304         if (throttle)
 305                 throttle_on_drops(root);
 306
 307         return 0;
 308 }
 309
 310 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 311                           struct btrfs_root *root)
 312 {
 313         return __btrfs_end_transaction(trans, root, 0);
 314 }
 315
 316 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 317                                    struct btrfs_root *root)
 318 {
 319         return __btrfs_end_transaction(trans, root, 1);
 320 }
 321
 322 /*
 323  * when btree blocks are allocated, they have some corresponding bits set for
 324  * them in one of two extent_io trees.  This is used to make sure all of
 325  * those extents are on disk for transaction or log commit
 326  */
 327 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 328                                         struct extent_io_tree *dirty_pages)
 329 {
 330         int ret;
 331         int err = 0;
 332         int werr = 0;
 333         struct page *page;
 334         struct inode *btree_inode = root->fs_info->btree_inode;
 335         struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 336         u64 start = 0;
 337         u64 end;
 338         unsigned long index;
 339
 340         while(1) {
 341                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
 342                                             EXTENT_DIRTY);
 343                 if (ret)
 344                         break;
 345                 while(start <= end) {
 346                         cond_resched();
 347
 348                         index = start >> PAGE_CACHE_SHIFT;
 349                         start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 350                         page = find_get_page(btree_inode->i_mapping, index);
 351                         if (!page)
 352                                 continue;
 353
 354                         btree_lock_page_hook(page);
 355                         if (!page->mapping) {
 356                                 unlock_page(page);
 357                                 page_cache_release(page);
 358                                 continue;
 359                         }
 360
 361                         if (PageWriteback(page)) {
 362                                 if (PageDirty(page))
 363                                         wait_on_page_writeback(page);
 364                                 else {
 365                                         unlock_page(page);
 366                                         page_cache_release(page);
 367                                         continue;
 368                                 }
 369                         }
 370                         err = write_one_page(page, 0);
 371                         if (err)
 372                                 werr = err;
 373                         page_cache_release(page);
 374                 }
 375         }
 376         /*
 377          * we unplug once and then use the wait_on_extent_bit for
 378          * everything else
 379          */
 380         blk_run_address_space(btree_inode->i_mapping);
 381         while(1) {
 382                 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 383                                             EXTENT_DIRTY);
 384                 if (ret)
 385                         break;
 386
 387                 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
 388                 while(start <= end) {
 389                         index = start >> PAGE_CACHE_SHIFT;
 390                         start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 391                         page = find_get_page(btree_inode->i_mapping, index);
 392                         if (!page)
 393                                 continue;
 394                         if (PageDirty(page)) {
 395                                 btree_lock_page_hook(page);
 396                                 wait_on_page_writeback(page);
 397                                 err = write_one_page(page, 0);
 398                                 if (err)
 399                                         werr = err;
 400                         }
 401                         if (PageWriteback(page)) {
 402                                 /*
 403                                  * we don't wait on the page writeback bit
 404                                  * because that triggers a lot of unplugs.
 405                                  * The extent bits are much nicer to
 406                                  * the disks, but come with a slightly
 407                                  * higher latency because we aren't forcing
 408                                  * unplugs.
 409                                  */
 410                                 wait_on_extent_writeback(io_tree,
 411                                          page_offset(page),
 412                                          page_offset(page) +
 413                                          PAGE_CACHE_SIZE - 1);
 414                         }
 415                         if (PageWriteback(page)) {
 416                                 /*
 417                                  * the state bits get cleared before the
 418                                  * page bits, lets add some extra
 419                                  * paranoia here
 420                                  */
 421                                 wait_on_page_writeback(page);
 422                         }
 423                         page_cache_release(page);
 424                         cond_resched();
 425                 }
 426         }
 427         if (err)
 428                 werr = err;
 429         return werr;
 430 }
 431
 432 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 433                                      struct btrfs_root *root)
 434 {
 435         if (!trans || !trans->transaction) {
 436                 struct inode *btree_inode;
 437                 btree_inode = root->fs_info->btree_inode;
 438                 return filemap_write_and_wait(btree_inode->i_mapping);
 439         }
 440         return btrfs_write_and_wait_marked_extents(root,
 441                                            &trans->transaction->dirty_pages);
 442 }
 443
 444 /*
 445  * this is used to update the root pointer in the tree of tree roots.
 446  *
 447  * But, in the case of the extent allocation tree, updating the root
 448  * pointer may allocate blocks which may change the root of the extent
 449  * allocation tree.
 450  *
 451  * So, this loops and repeats and makes sure the cowonly root didn't
 452  * change while the root pointer was being updated in the metadata.
 453  */
 454 static int update_cowonly_root(struct btrfs_trans_handle *trans,
 455                                struct btrfs_root *root)
 456 {
 457         int ret;
 458         u64 old_root_bytenr;
 459         struct btrfs_root *tree_root = root->fs_info->tree_root;
 460
 461         btrfs_extent_post_op(trans, root);
 462         btrfs_write_dirty_block_groups(trans, root);
 463         btrfs_extent_post_op(trans, root);
 464
 465         while(1) {
 466                 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 467                 if (old_root_bytenr == root->node->start)
 468                         break;
 469                 btrfs_set_root_bytenr(&root->root_item,
 470                                        root->node->start);
 471                 btrfs_set_root_level(&root->root_item,
 472                                      btrfs_header_level(root->node));
 473                 btrfs_set_root_generation(&root->root_item, trans->transid);
 474
 475                 btrfs_extent_post_op(trans, root);
 476
 477                 ret = btrfs_update_root(trans, tree_root,
 478                                         &root->root_key,
 479                                         &root->root_item);
 480                 BUG_ON(ret);
 481                 btrfs_write_dirty_block_groups(trans, root);
 482                 btrfs_extent_post_op(trans, root);
 483         }
 484         return 0;
 485 }
 486
 487 /*
 488  * update all the cowonly tree roots on disk
 489  */
 490 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 491                             struct btrfs_root *root)
 492 {
 493         struct btrfs_fs_info *fs_info = root->fs_info;
 494         struct list_head *next;
 495         struct extent_buffer *eb;
 496
 497         btrfs_extent_post_op(trans, fs_info->tree_root);
 498
 499         eb = btrfs_lock_root_node(fs_info->tree_root);
 500         btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
 501         btrfs_tree_unlock(eb);
 502         free_extent_buffer(eb);
 503
 504         btrfs_extent_post_op(trans, fs_info->tree_root);
 505
 506         while(!list_empty(&fs_info->dirty_cowonly_roots)) {
 507                 next = fs_info->dirty_cowonly_roots.next;
 508                 list_del_init(next);
 509                 root = list_entry(next, struct btrfs_root, dirty_list);
 510
 511                 update_cowonly_root(trans, root);
 512         }
 513         return 0;
 514 }
 515
 516 /*
 517  * dead roots are old snapshots that need to be deleted.  This allocates
 518  * a dirty root struct and adds it into the list of dead roots that need to
 519  * be deleted
 520  */
 521 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 522 {
 523         struct btrfs_dirty_root *dirty;
 524
 525         dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 526         if (!dirty)
 527                 return -ENOMEM;
 528         dirty->root = root;
 529         dirty->latest_root = latest;
 530
 531         mutex_lock(&root->fs_info->trans_mutex);
 532         list_add(&dirty->list, &latest->fs_info->dead_roots);
 533         mutex_unlock(&root->fs_info->trans_mutex);
 534         return 0;
 535 }
 536
 537 /*
 538  * at transaction commit time we need to schedule the old roots for
 539  * deletion via btrfs_drop_snapshot.  This runs through all the
 540  * reference counted roots that were modified in the current
 541  * transaction and puts them into the drop list
 542  */
 543 static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 544                                     struct radix_tree_root *radix,
 545                                     struct list_head *list)
 546 {
 547         struct btrfs_dirty_root *dirty;
 548         struct btrfs_root *gang[8];
 549         struct btrfs_root *root;
 550         int i;
 551         int ret;
 552         int err = 0;
 553         u32 refs;
 554
 555         while(1) {
 556                 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
 557                                                  ARRAY_SIZE(gang),
 558                                                  BTRFS_ROOT_TRANS_TAG);
 559                 if (ret == 0)
 560                         break;
 561                 for (i = 0; i < ret; i++) {
 562                         root = gang[i];
 563                         radix_tree_tag_clear(radix,
 564                                      (unsigned long)root->root_key.objectid,
 565                                      BTRFS_ROOT_TRANS_TAG);
 566
 567                         BUG_ON(!root->ref_tree);
 568                         dirty = root->dirty_root;
 569
 570                         btrfs_free_log(trans, root);
 571                         btrfs_free_reloc_root(trans, root);
 572
 573                         if (root->commit_root == root->node) {
 574                                 WARN_ON(root->node->start !=
 575                                         btrfs_root_bytenr(&root->root_item));
 576
 577                                 free_extent_buffer(root->commit_root);
 578                                 root->commit_root = NULL;
 579                                 root->dirty_root = NULL;
 580
 581                                 spin_lock(&root->list_lock);
 582                                 list_del_init(&dirty->root->dead_list);
 583                                 spin_unlock(&root->list_lock);
 584
 585                                 kfree(dirty->root);
 586                                 kfree(dirty);
 587
 588                                 /* make sure to update the root on disk
 589                                  * so we get any updates to the block used
 590                                  * counts
 591                                  */
 592                                 err = btrfs_update_root(trans,
 593                                                 root->fs_info->tree_root,
 594                                                 &root->root_key,
 595                                                 &root->root_item);
 596                                 continue;
 597                         }
 598
 599                         memset(&root->root_item.drop_progress, 0,
 600                                sizeof(struct btrfs_disk_key));
 601                         root->root_item.drop_level = 0;
 602                         root->commit_root = NULL;
 603                         root->dirty_root = NULL;
 604                         root->root_key.offset = root->fs_info->generation;
 605                         btrfs_set_root_bytenr(&root->root_item,
 606                                               root->node->start);
 607                         btrfs_set_root_level(&root->root_item,
 608                                              btrfs_header_level(root->node));
 609                         btrfs_set_root_generation(&root->root_item,
 610                                                   root->root_key.offset);
 611
 612                         err = btrfs_insert_root(trans, root->fs_info->tree_root,
 613                                                 &root->root_key,
 614                                                 &root->root_item);
 615                         if (err)
 616                                 break;
 617
 618                         refs = btrfs_root_refs(&dirty->root->root_item);
 619                         btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
 620                         err = btrfs_update_root(trans, root->fs_info->tree_root,
 621                                                 &dirty->root->root_key,
 622                                                 &dirty->root->root_item);
 623
 624                         BUG_ON(err);
 625                         if (refs == 1) {
 626                                 list_add(&dirty->list, list);
 627                         } else {
 628                                 WARN_ON(1);
 629                                 free_extent_buffer(dirty->root->node);
 630                                 kfree(dirty->root);
 631                                 kfree(dirty);
 632                         }
 633                 }
 634         }
 635         return err;
 636 }
 637
 638 /*
 639  * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 640  * otherwise every leaf in the btree is read and defragged.
 641  */
 642 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 643 {
 644         struct btrfs_fs_info *info = root->fs_info;
 645         int ret;
 646         struct btrfs_trans_handle *trans;
 647         unsigned long nr;
 648
 649         smp_mb();
 650         if (root->defrag_running)
 651                 return 0;
 652         trans = btrfs_start_transaction(root, 1);
 653         while (1) {
 654                 root->defrag_running = 1;
 655                 ret = btrfs_defrag_leaves(trans, root, cacheonly);
 656                 nr = trans->blocks_used;
 657                 btrfs_end_transaction(trans, root);
 658                 btrfs_btree_balance_dirty(info->tree_root, nr);
 659                 cond_resched();
 660
 661                 trans = btrfs_start_transaction(root, 1);
 662                 if (root->fs_info->closing || ret != -EAGAIN)
 663                         break;
 664         }
 665         root->defrag_running = 0;
 666         smp_mb();
 667         btrfs_end_transaction(trans, root);
 668         return 0;
 669 }
 670
 671 /*
 672  * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 673  * all of them
 674  */
 675 static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 676                                      struct list_head *list)
 677 {
 678         struct btrfs_dirty_root *dirty;
 679         struct btrfs_trans_handle *trans;
 680         unsigned long nr;
 681         u64 num_bytes;
 682         u64 bytes_used;
 683         u64 max_useless;
 684         int ret = 0;
 685         int err;
 686
 687         while(!list_empty(list)) {
 688                 struct btrfs_root *root;
 689
 690                 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
 691                 list_del_init(&dirty->list);
 692
 693                 num_bytes = btrfs_root_used(&dirty->root->root_item);
 694                 root = dirty->latest_root;
 695                 atomic_inc(&root->fs_info->throttles);
 696
 697                 while(1) {
 698                         trans = btrfs_start_transaction(tree_root, 1);
 699                         mutex_lock(&root->fs_info->drop_mutex);
 700                         ret = btrfs_drop_snapshot(trans, dirty->root);
 701                         if (ret != -EAGAIN) {
 702                                 break;
 703                         }
 704                         mutex_unlock(&root->fs_info->drop_mutex);
 705
 706                         err = btrfs_update_root(trans,
 707                                         tree_root,
 708                                         &dirty->root->root_key,
 709                                         &dirty->root->root_item);
 710                         if (err)
 711                                 ret = err;
 712                         nr = trans->blocks_used;
 713                         ret = btrfs_end_transaction(trans, tree_root);
 714                         BUG_ON(ret);
 715
 716                         btrfs_btree_balance_dirty(tree_root, nr);
 717                         cond_resched();
 718                 }
 719                 BUG_ON(ret);
 720                 atomic_dec(&root->fs_info->throttles);
 721                 wake_up(&root->fs_info->transaction_throttle);
 722
 723                 num_bytes -= btrfs_root_used(&dirty->root->root_item);
 724                 bytes_used = btrfs_root_used(&root->root_item);
 725                 if (num_bytes) {
 726                         btrfs_record_root_in_trans(root);
 727                         btrfs_set_root_used(&root->root_item,
 728                                             bytes_used - num_bytes);
 729                 }
 730
 731                 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 732                 if (ret) {
 733                         BUG();
 734                         break;
 735                 }
 736                 mutex_unlock(&root->fs_info->drop_mutex);
 737
 738                 spin_lock(&root->list_lock);
 739                 list_del_init(&dirty->root->dead_list);
 740                 if (!list_empty(&root->dead_list)) {
 741                         struct btrfs_root *oldest;
 742                         oldest = list_entry(root->dead_list.prev,
 743                                             struct btrfs_root, dead_list);
 744                         max_useless = oldest->root_key.offset - 1;
 745                 } else {
 746                         max_useless = root->root_key.offset - 1;
 747                 }
 748                 spin_unlock(&root->list_lock);
 749
 750                 nr = trans->blocks_used;
 751                 ret = btrfs_end_transaction(trans, tree_root);
 752                 BUG_ON(ret);
 753
 754                 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
 755                 BUG_ON(ret);
 756
 757                 free_extent_buffer(dirty->root->node);
 758                 kfree(dirty->root);
 759                 kfree(dirty);
 760
 761                 btrfs_btree_balance_dirty(tree_root, nr);
 762                 cond_resched();
 763         }
 764         return ret;
 765 }
 766
 767 /*
 768  * new snapshots need to be created at a very specific time in the
 769  * transaction commit.  This does the actual creation
 770  */
 771 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 772                                    struct btrfs_fs_info *fs_info,
 773                                    struct btrfs_pending_snapshot *pending)
 774 {
 775         struct btrfs_key key;
 776         struct btrfs_root_item *new_root_item;
 777         struct btrfs_root *tree_root = fs_info->tree_root;
 778         struct btrfs_root *root = pending->root;
 779         struct extent_buffer *tmp;
 780         struct extent_buffer *old;
 781         int ret;
 782         u64 objectid;
 783
 784         new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 785         if (!new_root_item) {
 786                 ret = -ENOMEM;
 787                 goto fail;
 788         }
 789         ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
 790         if (ret)
 791                 goto fail;
 792
 793         btrfs_record_root_in_trans(root);
 794         btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 795         memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 796
 797         key.objectid = objectid;
 798         key.offset = trans->transid;
 799         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 800
 801         old = btrfs_lock_root_node(root);
 802         btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
 803
 804         btrfs_copy_root(trans, root, old, &tmp, objectid);
 805         btrfs_tree_unlock(old);
 806         free_extent_buffer(old);
 807
 808         btrfs_set_root_bytenr(new_root_item, tmp->start);
 809         btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
 810         btrfs_set_root_generation(new_root_item, trans->transid);
 811         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 812                                 new_root_item);
 813         btrfs_tree_unlock(tmp);
 814         free_extent_buffer(tmp);
 815         if (ret)
 816                 goto fail;
 817
 818         key.offset = (u64)-1;
 819         memcpy(&pending->root_key, &key, sizeof(key));
 820 fail:
 821         kfree(new_root_item);
 822         return ret;
 823 }
 824
 825 static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 826                                    struct btrfs_pending_snapshot *pending)
 827 {
 828         int ret;
 829         int namelen;
 830         u64 index = 0;
 831         struct btrfs_trans_handle *trans;
 832         struct inode *parent_inode;
 833         struct inode *inode;
 834
 835         parent_inode = pending->dentry->d_parent->d_inode;
 836         trans = btrfs_start_transaction(BTRFS_I(parent_inode)->root, 1);
 837
 838         /*
 839          * insert the directory item
 840          */
 841         namelen = strlen(pending->name);
 842         ret = btrfs_set_inode_index(parent_inode, &index);
 843         ret = btrfs_insert_dir_item(trans,
 844                             BTRFS_I(parent_inode)->root,
 845                             pending->name, namelen,
 846                             parent_inode->i_ino,
 847                             &pending->root_key, BTRFS_FT_DIR, index);
 848
 849         if (ret)
 850                 goto fail;
 851 #if 0
 852         ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 853                              pending->name, strlen(pending->name), objectid,
 854                              root->fs_info->sb->s_root->d_inode->i_ino, 0);
 855 #endif
 856         inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
 857         d_instantiate(pending->dentry, inode);
 858 fail:
 859         btrfs_end_transaction(trans, fs_info->fs_root);
 860         return ret;
 861 }
 862
 863 /*
 864  * create all the snapshots we've scheduled for creation
 865  */
 866 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 867                                              struct btrfs_fs_info *fs_info)
 868 {
 869         struct btrfs_pending_snapshot *pending;
 870         struct list_head *head = &trans->transaction->pending_snapshots;
 871         struct list_head *cur;
 872         int ret;
 873
 874         list_for_each(cur, head) {
 875                 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
 876                 ret = create_pending_snapshot(trans, fs_info, pending);
 877                 BUG_ON(ret);
 878         }
 879         return 0;
 880 }
 881
 882 static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
 883                                              struct btrfs_fs_info *fs_info)
 884 {
 885         struct btrfs_pending_snapshot *pending;
 886         struct list_head *head = &trans->transaction->pending_snapshots;
 887         int ret;
 888
 889         while(!list_empty(head)) {
 890                 pending = list_entry(head->next,
 891                                      struct btrfs_pending_snapshot, list);
 892                 ret = finish_pending_snapshot(fs_info, pending);
 893                 BUG_ON(ret);
 894                 list_del(&pending->list);
 895                 kfree(pending->name);
 896                 kfree(pending);
 897         }
 898         return 0;
 899 }
 900
 901 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 902                              struct btrfs_root *root)
 903 {
 904         unsigned long joined = 0;
 905         unsigned long timeout = 1;
 906         struct btrfs_transaction *cur_trans;
 907         struct btrfs_transaction *prev_trans = NULL;
 908         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
 909         struct list_head dirty_fs_roots;
 910         struct extent_io_tree *pinned_copy;
 911         DEFINE_WAIT(wait);
 912         int ret;
 913
 914         INIT_LIST_HEAD(&dirty_fs_roots);
 915         mutex_lock(&root->fs_info->trans_mutex);
 916         if (trans->transaction->in_commit) {
 917                 cur_trans = trans->transaction;
 918                 trans->transaction->use_count++;
 919                 mutex_unlock(&root->fs_info->trans_mutex);
 920                 btrfs_end_transaction(trans, root);
 921
 922                 ret = wait_for_commit(root, cur_trans);
 923                 BUG_ON(ret);
 924
 925                 mutex_lock(&root->fs_info->trans_mutex);
 926                 put_transaction(cur_trans);
 927                 mutex_unlock(&root->fs_info->trans_mutex);
 928
 929                 return 0;
 930         }
 931
 932         pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
 933         if (!pinned_copy)
 934                 return -ENOMEM;
 935
 936         extent_io_tree_init(pinned_copy,
 937                              root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 938
 939         trans->transaction->in_commit = 1;
 940         trans->transaction->blocked = 1;
 941         cur_trans = trans->transaction;
 942         if (cur_trans->list.prev != &root->fs_info->trans_list) {
 943                 prev_trans = list_entry(cur_trans->list.prev,
 944                                         struct btrfs_transaction, list);
 945                 if (!prev_trans->commit_done) {
 946                         prev_trans->use_count++;
 947                         mutex_unlock(&root->fs_info->trans_mutex);
 948
 949                         wait_for_commit(root, prev_trans);
 950
 951                         mutex_lock(&root->fs_info->trans_mutex);
 952                         put_transaction(prev_trans);
 953                 }
 954         }
 955
 956         do {
 957                 int snap_pending = 0;
 958                 joined = cur_trans->num_joined;
 959                 if (!list_empty(&trans->transaction->pending_snapshots))
 960                         snap_pending = 1;
 961
 962                 WARN_ON(cur_trans != trans->transaction);
 963                 prepare_to_wait(&cur_trans->writer_wait, &wait,
 964                                 TASK_UNINTERRUPTIBLE);
 965
 966                 if (cur_trans->num_writers > 1)
 967                         timeout = MAX_SCHEDULE_TIMEOUT;
 968                 else
 969                         timeout = 1;
 970
 971                 mutex_unlock(&root->fs_info->trans_mutex);
 972
 973                 if (snap_pending) {
 974                         ret = btrfs_wait_ordered_extents(root, 1);
 975                         BUG_ON(ret);
 976                 }
 977
 978                 schedule_timeout(timeout);
 979
 980                 mutex_lock(&root->fs_info->trans_mutex);
 981                 finish_wait(&cur_trans->writer_wait, &wait);
 982         } while (cur_trans->num_writers > 1 ||
 983                  (cur_trans->num_joined != joined));
 984
 985         ret = create_pending_snapshots(trans, root->fs_info);
 986         BUG_ON(ret);
 987
 988         WARN_ON(cur_trans != trans->transaction);
 989
 990         /* btrfs_commit_tree_roots is responsible for getting the
 991          * various roots consistent with each other.  Every pointer
 992          * in the tree of tree roots has to point to the most up to date
 993          * root for every subvolume and other tree.  So, we have to keep
 994          * the tree logging code from jumping in and changing any
 995          * of the trees.
 996          *
 997          * At this point in the commit, there can't be any tree-log
 998          * writers, but a little lower down we drop the trans mutex
 999          * and let new people in.  By holding the tree_log_mutex
1000          * from now until after the super is written, we avoid races
1001          * with the tree-log code.
1002          */
1003         mutex_lock(&root->fs_info->tree_log_mutex);
1004         /*
1005          * keep tree reloc code from adding new reloc trees
1006          */
1007         mutex_lock(&root->fs_info->tree_reloc_mutex);
1008
1009
1010         ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
1011                               &dirty_fs_roots);
1012         BUG_ON(ret);
1013
1014         /* add_dirty_roots gets rid of all the tree log roots, it is now
1015          * safe to free the root of tree log roots
1016          */
1017         btrfs_free_log_root_tree(trans, root->fs_info);
1018
1019         ret = btrfs_commit_tree_roots(trans, root);
1020         BUG_ON(ret);
1021
1022         cur_trans = root->fs_info->running_transaction;
1023         spin_lock(&root->fs_info->new_trans_lock);
1024         root->fs_info->running_transaction = NULL;
1025         spin_unlock(&root->fs_info->new_trans_lock);
1026         btrfs_set_super_generation(&root->fs_info->super_copy,
1027                                    cur_trans->transid);
1028         btrfs_set_super_root(&root->fs_info->super_copy,
1029                              root->fs_info->tree_root->node->start);
1030         btrfs_set_super_root_level(&root->fs_info->super_copy,
1031                            btrfs_header_level(root->fs_info->tree_root->node));
1032
1033         btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1034                                    chunk_root->node->start);
1035         btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1036                                          btrfs_header_level(chunk_root->node));
1037         btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1038                                 btrfs_header_generation(chunk_root->node));
1039
1040         if (!root->fs_info->log_root_recovering) {
1041                 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1042                 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1043         }
1044
1045         memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1046                sizeof(root->fs_info->super_copy));
1047
1048         btrfs_copy_pinned(root, pinned_copy);
1049
1050         trans->transaction->blocked = 0;
1051         wake_up(&root->fs_info->transaction_throttle);
1052         wake_up(&root->fs_info->transaction_wait);
1053
1054         mutex_unlock(&root->fs_info->trans_mutex);
1055         ret = btrfs_write_and_wait_transaction(trans, root);
1056         BUG_ON(ret);
1057         write_ctree_super(trans, root);
1058
1059         /*
1060          * the super is written, we can safely allow the tree-loggers
1061          * to go about their business
1062          */
1063         mutex_unlock(&root->fs_info->tree_log_mutex);
1064
1065         btrfs_finish_extent_commit(trans, root, pinned_copy);
1066         kfree(pinned_copy);
1067
1068         btrfs_drop_dead_reloc_roots(root);
1069         mutex_unlock(&root->fs_info->tree_reloc_mutex);
1070
1071         /* do the directory inserts of any pending snapshot creations */
1072         finish_pending_snapshots(trans, root->fs_info);
1073
1074         mutex_lock(&root->fs_info->trans_mutex);
1075
1076         cur_trans->commit_done = 1;
1077         root->fs_info->last_trans_committed = cur_trans->transid;
1078         wake_up(&cur_trans->commit_wait);
1079
1080         put_transaction(cur_trans);
1081         put_transaction(cur_trans);
1082
1083         list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1084         if (root->fs_info->closing)
1085                 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1086
1087         mutex_unlock(&root->fs_info->trans_mutex);
1088
1089         kmem_cache_free(btrfs_trans_handle_cachep, trans);
1090
1091         if (root->fs_info->closing) {
1092                 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1093         }
1094         return ret;
1095 }
1096
1097 /*
1098  * interface function to delete all the snapshots we have scheduled for deletion
1099  */
1100 int btrfs_clean_old_snapshots(struct btrfs_root *root)
1101 {
1102         struct list_head dirty_roots;
1103         INIT_LIST_HEAD(&dirty_roots);
1104 again:
1105         mutex_lock(&root->fs_info->trans_mutex);
1106         list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1107         mutex_unlock(&root->fs_info->trans_mutex);
1108
1109         if (!list_empty(&dirty_roots)) {
1110                 drop_dirty_roots(root, &dirty_roots);
1111                 goto again;
1112         }
1113         return 0;
1114 }