Merge branch 'raid56-experimental' into for-linus-3.9
[linux-2.6-block.git] / fs / btrfs / disk-io.c
index e9fa7b4d18e38b2aaaf5fed4add96085e43a5f0b..eb7c1430852106034fac0848ee3b5ff8a3903624 100644 (file)
@@ -57,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                                    int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+                                            struct btrfs_root *root);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_root *root);
@@ -421,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
        struct extent_io_tree *tree;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 found_start;
        struct extent_buffer *eb;
 
@@ -962,18 +963,20 @@ static int btree_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
+       struct btrfs_fs_info *fs_info;
+       int ret;
+
        tree = &BTRFS_I(mapping->host)->io_tree;
        if (wbc->sync_mode == WB_SYNC_NONE) {
-               struct btrfs_root *root = BTRFS_I(mapping->host)->root;
-               u64 num_dirty;
-               unsigned long thresh = 32 * 1024 * 1024;
 
                if (wbc->for_kupdate)
                        return 0;
 
+               fs_info = BTRFS_I(mapping->host)->root->fs_info;
                /* this is a bit racy, but that's ok */
-               num_dirty = root->fs_info->dirty_metadata_bytes;
-               if (num_dirty < thresh)
+               ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+                                            BTRFS_DIRTY_METADATA_THRESH);
+               if (ret < 0)
                        return 0;
        }
        return btree_write_cache_pages(mapping, wbc);
@@ -1141,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      struct extent_buffer *buf)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
        if (btrfs_header_generation(buf) ==
-           root->fs_info->running_transaction->transid) {
+           fs_info->running_transaction->transid) {
                btrfs_assert_tree_locked(buf);
 
                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (root->fs_info->dirty_metadata_bytes >= buf->len)
-                               root->fs_info->dirty_metadata_bytes -= buf->len;
-                       else {
-                               spin_unlock(&root->fs_info->delalloc_lock);
-                               btrfs_panic(root->fs_info, -EOVERFLOW,
-                                         "Can't clear %lu bytes from "
-                                         " dirty_mdatadata_bytes (%llu)",
-                                         buf->len,
-                                         root->fs_info->dirty_metadata_bytes);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-
+                       __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+                                            -buf->len,
+                                            fs_info->dirty_metadata_batch);
                        /* ugh, clear_extent_buffer_dirty needs to lock the page */
                        btrfs_set_lock_blocking(buf);
                        clear_extent_buffer_dirty(buf);
@@ -1194,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
+       INIT_LIST_HEAD(&root->logged_list[0]);
+       INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
        spin_lock_init(&root->accounting_lock);
+       spin_lock_init(&root->log_extents_lock[0]);
+       spin_lock_init(&root->log_extents_lock[1]);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        init_waitqueue_head(&root->log_writer_wait);
@@ -2020,10 +2019,24 @@ int open_ctree(struct super_block *sb,
                goto fail_srcu;
        }
 
+       ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+       if (ret) {
+               err = ret;
+               goto fail_bdi;
+       }
+       fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+                                       (1 + ilog2(nr_cpu_ids));
+
+       ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+       if (ret) {
+               err = ret;
+               goto fail_dirty_metadata_bytes;
+       }
+
        fs_info->btree_inode = new_inode(sb);
        if (!fs_info->btree_inode) {
                err = -ENOMEM;
-               goto fail_bdi;
+               goto fail_delalloc_bytes;
        }
 
        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2033,7 +2046,6 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
-       INIT_LIST_HEAD(&fs_info->ordered_operations);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->trans_lock);
@@ -2044,6 +2056,7 @@ int open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->tree_mod_seq_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
+       seqlock_init(&fs_info->profiles_lock);
 
        init_completion(&fs_info->kobj_unregister);
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2142,6 +2155,7 @@ int open_ctree(struct super_block *sb,
 
        spin_lock_init(&fs_info->block_group_cache_lock);
        fs_info->block_group_cache_tree = RB_ROOT;
+       fs_info->first_logical_byte = (u64)-1;
 
        extent_io_tree_init(&fs_info->freed_extents[0],
                             fs_info->btree_inode->i_mapping);
@@ -2209,7 +2223,8 @@ int open_ctree(struct super_block *sb,
                goto fail_alloc;
 
        /* check FS state, whether FS is broken. */
-       fs_info->fs_state |= btrfs_super_flags(disk_super);
+       if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+               set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 
        ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
        if (ret) {
@@ -2283,6 +2298,8 @@ int open_ctree(struct super_block *sb,
        leafsize = btrfs_super_leafsize(disk_super);
        sectorsize = btrfs_super_sectorsize(disk_super);
        stripesize = btrfs_super_stripesize(disk_super);
+       fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+       fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
        /*
         * mixed block groups end up with duplicate but slightly offset
@@ -2422,8 +2439,7 @@ int open_ctree(struct super_block *sb,
        sb->s_blocksize = sectorsize;
        sb->s_blocksize_bits = blksize_bits(sectorsize);
 
-       if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-                   sizeof(disk_super->magic))) {
+       if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
                printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
                goto fail_sb_buffer;
        }
@@ -2726,13 +2742,13 @@ fail_cleaner:
         * kthreads
         */
        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
-       invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_block_groups:
        btrfs_free_block_groups(fs_info);
 
 fail_tree_roots:
        free_root_pointers(fs_info, 1);
+       invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_sb_buffer:
        btrfs_stop_workers(&fs_info->generic_worker);
@@ -2755,8 +2771,11 @@ fail_alloc:
 fail_iput:
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-       invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        iput(fs_info->btree_inode);
+fail_delalloc_bytes:
+       percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+       percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 fail_bdi:
        bdi_destroy(&fs_info->bdi);
 fail_srcu:
@@ -2830,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 
                super = (struct btrfs_super_block *)bh->b_data;
                if (btrfs_super_bytenr(super) != bytenr ||
-                   strncmp((char *)(&super->magic), BTRFS_MAGIC,
-                           sizeof(super->magic))) {
+                   super->magic != cpu_to_le64(BTRFS_MAGIC)) {
                        brelse(bh);
                        continue;
                }
@@ -3379,7 +3397,7 @@ int close_ctree(struct btrfs_root *root)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
 
-       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                btrfs_error_commit_super(root);
 
        btrfs_put_block_group_cache(fs_info);
@@ -3392,9 +3410,9 @@ int close_ctree(struct btrfs_root *root)
 
        btrfs_free_qgroup_config(root->fs_info);
 
-       if (fs_info->delalloc_bytes) {
-               printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-                      (unsigned long long)fs_info->delalloc_bytes);
+       if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+               printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+                      percpu_counter_sum(&fs_info->delalloc_bytes));
        }
 
        free_extent_buffer(fs_info->extent_root->node);
@@ -3443,6 +3461,8 @@ int close_ctree(struct btrfs_root *root)
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
+       percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+       percpu_counter_destroy(&fs_info->delalloc_bytes);
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 
@@ -3487,11 +3507,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
                        (unsigned long long)transid,
                        (unsigned long long)root->fs_info->generation);
        was_dirty = set_extent_buffer_dirty(buf);
-       if (!was_dirty) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               root->fs_info->dirty_metadata_bytes += buf->len;
-               spin_unlock(&root->fs_info->delalloc_lock);
-       }
+       if (!was_dirty)
+               __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+                                    buf->len,
+                                    root->fs_info->dirty_metadata_batch);
 }
 
 static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3501,8 +3520,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
         * looks as though older kernels can get into trouble with
         * this code, they end up stuck in balance_dirty_pages forever
         */
-       u64 num_dirty;
-       unsigned long thresh = 32 * 1024 * 1024;
+       int ret;
 
        if (current->flags & PF_MEMALLOC)
                return;
@@ -3510,11 +3528,11 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
        if (flush_delayed)
                btrfs_balance_delayed_items(root);
 
-       num_dirty = root->fs_info->dirty_metadata_bytes;
-
-       if (num_dirty > thresh) {
-               balance_dirty_pages_ratelimited_nr(
-                                  root->fs_info->btree_inode->i_mapping, 1);
+       ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+                                    BTRFS_DIRTY_METADATA_THRESH);
+       if (ret > 0) {
+               balance_dirty_pages_ratelimited(
+                                  root->fs_info->btree_inode->i_mapping);
        }
        return;
 }
@@ -3562,7 +3580,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
        btrfs_cleanup_transaction(root);
 }
 
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+                                            struct btrfs_root *root)
 {
        struct btrfs_inode *btrfs_inode;
        struct list_head splice;
@@ -3572,7 +3591,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
        mutex_lock(&root->fs_info->ordered_operations_mutex);
        spin_lock(&root->fs_info->ordered_extent_lock);
 
-       list_splice_init(&root->fs_info->ordered_operations, &splice);
+       list_splice_init(&t->ordered_operations, &splice);
        while (!list_empty(&splice)) {
                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
                                         ordered_operations);
@@ -3588,35 +3607,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
 
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
-       struct list_head splice;
        struct btrfs_ordered_extent *ordered;
-       struct inode *inode;
-
-       INIT_LIST_HEAD(&splice);
 
        spin_lock(&root->fs_info->ordered_extent_lock);
-
-       list_splice_init(&root->fs_info->ordered_extents, &splice);
-       while (!list_empty(&splice)) {
-               ordered = list_entry(splice.next, struct btrfs_ordered_extent,
-                                    root_extent_list);
-
-               list_del_init(&ordered->root_extent_list);
-               atomic_inc(&ordered->refs);
-
-               /* the inode may be getting freed (in sys_unlink path). */
-               inode = igrab(ordered->inode);
-
-               spin_unlock(&root->fs_info->ordered_extent_lock);
-               if (inode)
-                       iput(inode);
-
-               atomic_set(&ordered->refs, 1);
-               btrfs_put_ordered_extent(ordered);
-
-               spin_lock(&root->fs_info->ordered_extent_lock);
-       }
-
+       /*
+        * This will just short circuit the ordered completion stuff which will
+        * make sure the ordered extent gets properly cleaned up.
+        */
+       list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+                           root_extent_list)
+               set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
        spin_unlock(&root->fs_info->ordered_extent_lock);
 }
 
@@ -3638,11 +3638,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
        }
 
        while ((node = rb_first(&delayed_refs->root)) != NULL) {
-               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               struct btrfs_delayed_ref_head *head = NULL;
 
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
                atomic_set(&ref->refs, 1);
                if (btrfs_delayed_ref_is_head(ref)) {
-                       struct btrfs_delayed_ref_head *head;
 
                        head = btrfs_delayed_node_to_head(ref);
                        if (!mutex_trylock(&head->mutex)) {
@@ -3658,16 +3658,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                continue;
                        }
 
-                       kfree(head->extent_op);
+                       btrfs_free_delayed_extent_op(head->extent_op);
                        delayed_refs->num_heads--;
                        if (list_empty(&head->cluster))
                                delayed_refs->num_heads_ready--;
                        list_del_init(&head->cluster);
                }
+
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-
+               if (head)
+                       mutex_unlock(&head->mutex);
                spin_unlock(&delayed_refs->lock);
                btrfs_put_delayed_ref(ref);
 
@@ -3715,6 +3717,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
                                    delalloc_inodes);
 
                list_del_init(&btrfs_inode->delalloc_inodes);
+               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                         &btrfs_inode->runtime_flags);
 
                btrfs_invalidate_inodes(btrfs_inode->root);
        }
@@ -3867,10 +3871,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 
        while (!list_empty(&list)) {
                t = list_entry(list.next, struct btrfs_transaction, list);
-               if (!t)
-                       break;
 
-               btrfs_destroy_ordered_operations(root);
+               btrfs_destroy_ordered_operations(t, root);
 
                btrfs_destroy_ordered_extents(root);