Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
authorChris Mason <chris.mason@oracle.com>
Thu, 1 Oct 2009 16:58:13 +0000 (12:58 -0400)
committerChris Mason <chris.mason@oracle.com>
Thu, 1 Oct 2009 16:58:13 +0000 (12:58 -0400)
1  2 
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/super.c
fs/btrfs/volumes.c

diff --combined fs/btrfs/disk-io.c
index 644e796fd643e045ca0b0ed057743b7eb5e88438,69dce50aabd210ed669fc22246d26cfdc5856eb0..d20dc05208febce95e6f0753abe3ec90373ba2fc
@@@ -773,7 -773,7 +773,7 @@@ static void btree_invalidatepage(struc
        }
  }
  
 -static struct address_space_operations btree_aops = {
 +static const struct address_space_operations btree_aops = {
        .readpage       = btree_readpage,
        .writepage      = btree_writepage,
        .writepages     = btree_writepages,
@@@ -829,7 -829,9 +829,9 @@@ int btrfs_write_tree_block(struct exten
  int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
  {
        return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
-                                 buf->start, buf->start + buf->len - 1);
+                                 buf->start >> PAGE_CACHE_SHIFT,
+                                 (buf->start + buf->len - 1) >>
+                                  PAGE_CACHE_SHIFT);
  }
  
  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@@ -1371,7 -1373,6 +1373,7 @@@ static int setup_bdi(struct btrfs_fs_in
  {
        int err;
  
 +      bdi->name = "btrfs";
        bdi->capabilities = BDI_CAP_MAP_COPY;
        err = bdi_init(bdi);
        if (err)
@@@ -1630,7 -1631,7 +1632,7 @@@ struct btrfs_root *open_ctree(struct su
        fs_info->sb = sb;
        fs_info->max_extent = (u64)-1;
        fs_info->max_inline = 8192 * 1024;
-       fs_info->metadata_ratio = 8;
+       fs_info->metadata_ratio = 0;
  
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
  
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
 +      sb->s_bdi = &fs_info->bdi;
  
        fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
        fs_info->btree_inode->i_nlink = 1;
diff --combined fs/btrfs/extent-tree.c
index 993f93ff7ba695c97b490f5e2d5d6d3b5939980c,d119c0388af1b22b6f39a50bb1e3f890106d6b64..359a754c782cd8338f0a1bf0d456306e3240c32c
@@@ -68,6 -68,8 +68,8 @@@ static int pin_down_bytes(struct btrfs_
                          struct extent_buffer **must_clean);
  static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
+ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+                           int dump_block_groups);
  
  static noinline int
  block_group_cache_done(struct btrfs_block_group_cache *cache)
@@@ -1570,8 -1572,7 +1572,8 @@@ static int remove_extent_backref(struc
  static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
  {
 -      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
 +      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
 +                           DISCARD_FL_BARRIER);
  }
  #endif
  
@@@ -2765,67 -2766,346 +2767,346 @@@ void btrfs_set_inode_space_info(struct 
                                                       alloc_target);
  }
  
+ static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+ {
+       u64 num_bytes;
+       int level;
+       level = BTRFS_MAX_LEVEL - 2;
+       /*
+        * NOTE: these calculations are absolutely the worst possible case.
+        * This assumes that _every_ item we insert will require a new leaf, and
+        * that the tree has grown to its maximum level size.
+        */
+       /*
+        * for every item we insert we could insert both an extent item and a
+        * extent ref item.  Then for every item we insert, we will need to cow
+        * both the original leaf, plus the leaf to the left and right of it.
+        *
+        * Unless we are talking about the extent root, then we just want the
+        * number of items * 2, since we just need the extent item plus its ref.
+        */
+       if (root == root->fs_info->extent_root)
+               num_bytes = num_items * 2;
+       else
+               num_bytes = (num_items + (2 * num_items)) * 3;
+       /*
+        * num_bytes is total number of leaves we could need times the leaf
+        * size, and then for every leaf we could end up cow'ing 2 nodes per
+        * level, down to the leaf level.
+        */
+       num_bytes = (num_bytes * root->leafsize) +
+               (num_bytes * (level * 2)) * root->nodesize;
+       return num_bytes;
+ }
  /*
-  * for now this just makes sure we have at least 5% of our metadata space free
-  * for use.
+  * Unreserve metadata space for delalloc.  If we have less reserved credits than
+  * we have extents, this function does nothing.
   */
- int btrfs_check_metadata_free_space(struct btrfs_root *root)
+ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+                                         struct inode *inode, int num_items)
  {
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_space_info *meta_sinfo;
-       u64 alloc_target, thresh;
-       int committed = 0, ret;
+       u64 num_bytes;
+       u64 alloc_target;
+       bool bug = false;
  
        /* get the space info for where the metadata will live */
        alloc_target = btrfs_get_alloc_profile(root, 0);
        meta_sinfo = __find_space_info(info, alloc_target);
-       if (!meta_sinfo)
-               goto alloc;
  
- again:
+       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+                                          num_items);
        spin_lock(&meta_sinfo->lock);
-       if (!meta_sinfo->full)
-               thresh = meta_sinfo->total_bytes * 80;
-       else
-               thresh = meta_sinfo->total_bytes * 95;
+       if (BTRFS_I(inode)->delalloc_reserved_extents <=
+           BTRFS_I(inode)->delalloc_extents) {
+               spin_unlock(&meta_sinfo->lock);
+               return 0;
+       }
+       BTRFS_I(inode)->delalloc_reserved_extents--;
+       BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+       if (meta_sinfo->bytes_delalloc < num_bytes) {
+               bug = true;
+               meta_sinfo->bytes_delalloc = 0;
+       } else {
+               meta_sinfo->bytes_delalloc -= num_bytes;
+       }
+       spin_unlock(&meta_sinfo->lock);
+       BUG_ON(bug);
+       return 0;
+ }
+ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+ {
+       u64 thresh;
+       thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+               meta_sinfo->bytes_may_use;
  
+       thresh = meta_sinfo->total_bytes - thresh;
+       thresh *= 80;
        do_div(thresh, 100);
+       if (thresh <= meta_sinfo->bytes_delalloc)
+               meta_sinfo->force_delalloc = 1;
+       else
+               meta_sinfo->force_delalloc = 0;
+ }
  
-       if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-           meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-           meta_sinfo->bytes_super > thresh) {
-               struct btrfs_trans_handle *trans;
-               if (!meta_sinfo->full) {
-                       meta_sinfo->force_alloc = 1;
+ static int maybe_allocate_chunk(struct btrfs_root *root,
+                                struct btrfs_space_info *info)
+ {
+       struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+       struct btrfs_trans_handle *trans;
+       bool wait = false;
+       int ret = 0;
+       u64 min_metadata;
+       u64 free_space;
+       free_space = btrfs_super_total_bytes(disk_super);
+       /*
+        * we allow the metadata to grow to a max of either 5gb or 5% of the
+        * space in the volume.
+        */
+       min_metadata = min((u64)5 * 1024 * 1024 * 1024,
+                            div64_u64(free_space * 5, 100));
+       if (info->total_bytes >= min_metadata) {
+               spin_unlock(&info->lock);
+               return 0;
+       }
+       if (info->full) {
+               spin_unlock(&info->lock);
+               return 0;
+       }
+       if (!info->allocating_chunk) {
+               info->force_alloc = 1;
+               info->allocating_chunk = 1;
+               init_waitqueue_head(&info->wait);
+       } else {
+               wait = true;
+       }
+       spin_unlock(&info->lock);
+       if (wait) {
+               wait_event(info->wait,
+                          !info->allocating_chunk);
+               return 1;
+       }
+       trans = btrfs_start_transaction(root, 1);
+       if (!trans) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                            4096 + 2 * 1024 * 1024,
+                            info->flags, 0);
+       btrfs_end_transaction(trans, root);
+       if (ret)
+               goto out;
+ out:
+       spin_lock(&info->lock);
+       info->allocating_chunk = 0;
+       spin_unlock(&info->lock);
+       wake_up(&info->wait);
+       if (ret)
+               return 0;
+       return 1;
+ }
+ /*
+  * Reserve metadata space for delalloc.
+  */
+ int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+                                       struct inode *inode, int num_items)
+ {
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_space_info *meta_sinfo;
+       u64 num_bytes;
+       u64 used;
+       u64 alloc_target;
+       int flushed = 0;
+       int force_delalloc;
+       /* get the space info for where the metadata will live */
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       meta_sinfo = __find_space_info(info, alloc_target);
+       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+                                          num_items);
+ again:
+       spin_lock(&meta_sinfo->lock);
+       force_delalloc = meta_sinfo->force_delalloc;
+       if (unlikely(!meta_sinfo->bytes_root))
+               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+       if (!flushed)
+               meta_sinfo->bytes_delalloc += num_bytes;
+       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+       if (used > meta_sinfo->total_bytes) {
+               flushed++;
+               if (flushed == 1) {
+                       if (maybe_allocate_chunk(root, meta_sinfo))
+                               goto again;
+                       flushed++;
+               } else {
                        spin_unlock(&meta_sinfo->lock);
- alloc:
-                       trans = btrfs_start_transaction(root, 1);
-                       if (!trans)
-                               return -ENOMEM;
+               }
  
-                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                            2 * 1024 * 1024, alloc_target, 0);
-                       btrfs_end_transaction(trans, root);
-                       if (!meta_sinfo) {
-                               meta_sinfo = __find_space_info(info,
-                                                              alloc_target);
-                       }
+               if (flushed == 2) {
+                       filemap_flush(inode->i_mapping);
+                       goto again;
+               } else if (flushed == 3) {
+                       btrfs_start_delalloc_inodes(root);
+                       btrfs_wait_ordered_extents(root, 0);
                        goto again;
                }
+               spin_lock(&meta_sinfo->lock);
+               meta_sinfo->bytes_delalloc -= num_bytes;
                spin_unlock(&meta_sinfo->lock);
+               printk(KERN_ERR "enospc, has %d, reserved %d\n",
+                      BTRFS_I(inode)->delalloc_extents,
+                      BTRFS_I(inode)->delalloc_reserved_extents);
+               dump_space_info(meta_sinfo, 0, 0);
+               return -ENOSPC;
+       }
  
-               if (!committed) {
-                       committed = 1;
-                       trans = btrfs_join_transaction(root, 1);
-                       if (!trans)
-                               return -ENOMEM;
-                       ret = btrfs_commit_transaction(trans, root);
-                       if (ret)
-                               return ret;
+       BTRFS_I(inode)->delalloc_reserved_extents++;
+       check_force_delalloc(meta_sinfo);
+       spin_unlock(&meta_sinfo->lock);
+       if (!flushed && force_delalloc)
+               filemap_flush(inode->i_mapping);
+       return 0;
+ }
+ /*
+  * unreserve num_items number of items worth of metadata space.  This needs to
+  * be paired with btrfs_reserve_metadata_space.
+  *
+  * NOTE: if you have the option, run this _AFTER_ you do a
+  * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+  * operations which will result in more used metadata, so we want to make sure we
+  * can do that without issue.
+  */
+ int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+ {
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_space_info *meta_sinfo;
+       u64 num_bytes;
+       u64 alloc_target;
+       bool bug = false;
+       /* get the space info for where the metadata will live */
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       meta_sinfo = __find_space_info(info, alloc_target);
+       num_bytes = calculate_bytes_needed(root, num_items);
+       spin_lock(&meta_sinfo->lock);
+       if (meta_sinfo->bytes_may_use < num_bytes) {
+               bug = true;
+               meta_sinfo->bytes_may_use = 0;
+       } else {
+               meta_sinfo->bytes_may_use -= num_bytes;
+       }
+       spin_unlock(&meta_sinfo->lock);
+       BUG_ON(bug);
+       return 0;
+ }
+ /*
+  * Reserve some metadata space for use.  We'll calculate the worst case number
+  * of bytes that would be needed to modify num_items number of items.  If we
+  * have space, fantastic, if not, you get -ENOSPC.  Please call
+  * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+  * items you reserved, since whatever metadata you needed should have already
+  * been allocated.
+  *
+  * This will commit the transaction to make more space if we don't have enough
+  * metadata space.  The only time we don't do this is if we're reserving space
+  * inside of a transaction, then we will just return -ENOSPC and it is the
+  * caller's responsibility to handle it properly.
+  */
+ int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+ {
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_space_info *meta_sinfo;
+       u64 num_bytes;
+       u64 used;
+       u64 alloc_target;
+       int retries = 0;
+       /* get the space info for where the metadata will live */
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       meta_sinfo = __find_space_info(info, alloc_target);
+       num_bytes = calculate_bytes_needed(root, num_items);
+ again:
+       spin_lock(&meta_sinfo->lock);
+       if (unlikely(!meta_sinfo->bytes_root))
+               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+       if (!retries)
+               meta_sinfo->bytes_may_use += num_bytes;
+       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+       if (used > meta_sinfo->total_bytes) {
+               retries++;
+               if (retries == 1) {
+                       if (maybe_allocate_chunk(root, meta_sinfo))
+                               goto again;
+                       retries++;
+               } else {
+                       spin_unlock(&meta_sinfo->lock);
+               }
+               if (retries == 2) {
+                       btrfs_start_delalloc_inodes(root);
+                       btrfs_wait_ordered_extents(root, 0);
                        goto again;
                }
+               spin_lock(&meta_sinfo->lock);
+               meta_sinfo->bytes_may_use -= num_bytes;
+               spin_unlock(&meta_sinfo->lock);
+               dump_space_info(meta_sinfo, 0, 0);
                return -ENOSPC;
        }
+       check_force_delalloc(meta_sinfo);
        spin_unlock(&meta_sinfo->lock);
  
        return 0;
@@@ -2888,7 -3168,7 +3169,7 @@@ alloc
                spin_unlock(&data_sinfo->lock);
  
                /* commit the current transaction and try again */
-               if (!committed) {
+               if (!committed && !root->fs_info->open_ioctl_trans) {
                        committed = 1;
                        trans = btrfs_join_transaction(root, 1);
                        if (!trans)
        BTRFS_I(inode)->reserved_bytes += bytes;
        spin_unlock(&data_sinfo->lock);
  
-       return btrfs_check_metadata_free_space(root);
+       return 0;
  }
  
  /*
@@@ -3015,17 -3295,15 +3296,15 @@@ static int do_chunk_alloc(struct btrfs_
        BUG_ON(!space_info);
  
        spin_lock(&space_info->lock);
-       if (space_info->force_alloc) {
+       if (space_info->force_alloc)
                force = 1;
-               space_info->force_alloc = 0;
-       }
        if (space_info->full) {
                spin_unlock(&space_info->lock);
                goto out;
        }
  
        thresh = space_info->total_bytes - space_info->bytes_readonly;
-       thresh = div_factor(thresh, 6);
+       thresh = div_factor(thresh, 8);
        if (!force &&
           (space_info->bytes_used + space_info->bytes_pinned +
            space_info->bytes_reserved + alloc_bytes) < thresh) {
         * we keep a reasonable number of metadata chunks allocated in the
         * FS as well.
         */
-       if (flags & BTRFS_BLOCK_GROUP_DATA) {
+       if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
                fs_info->data_chunk_allocations++;
                if (!(fs_info->data_chunk_allocations %
                      fs_info->metadata_ratio))
        }
  
        ret = btrfs_alloc_chunk(trans, extent_root, flags);
+       spin_lock(&space_info->lock);
        if (ret)
                space_info->full = 1;
+       space_info->force_alloc = 0;
+       spin_unlock(&space_info->lock);
  out:
        mutex_unlock(&extent_root->fs_info->chunk_mutex);
        return ret;
@@@ -4063,21 -4344,32 +4345,32 @@@ loop
        return ret;
  }
  
- static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+                           int dump_block_groups)
  {
        struct btrfs_block_group_cache *cache;
  
+       spin_lock(&info->lock);
        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
               (unsigned long long)(info->total_bytes - info->bytes_used -
-                                   info->bytes_pinned - info->bytes_reserved),
+                                   info->bytes_pinned - info->bytes_reserved -
+                                   info->bytes_super),
               (info->full) ? "" : "not ");
        printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
-              " may_use=%llu, used=%llu\n",
+              " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+              "\n",
               (unsigned long long)info->total_bytes,
               (unsigned long long)info->bytes_pinned,
               (unsigned long long)info->bytes_delalloc,
               (unsigned long long)info->bytes_may_use,
-              (unsigned long long)info->bytes_used);
+              (unsigned long long)info->bytes_used,
+              (unsigned long long)info->bytes_root,
+              (unsigned long long)info->bytes_super,
+              (unsigned long long)info->bytes_reserved);
+       spin_unlock(&info->lock);
+       if (!dump_block_groups)
+               return;
  
        down_read(&info->groups_sem);
        list_for_each_entry(cache, &info->block_groups, list) {
@@@ -4145,7 -4437,7 +4438,7 @@@ again
                printk(KERN_ERR "btrfs allocation failed flags %llu, "
                       "wanted %llu\n", (unsigned long long)data,
                       (unsigned long long)num_bytes);
-               dump_space_info(sinfo, num_bytes);
+               dump_space_info(sinfo, num_bytes, 1);
        }
  
        return ret;
diff --combined fs/btrfs/file.c
index a3492a3ad96b960fecec513d4023f7ee641fcf81,f155179877a6ff81cb741a8a47b13a6aebc992a4..7351bdbca26f9eef18d024683067a80f5786c2c3
@@@ -123,7 -123,10 +123,10 @@@ static noinline int dirty_and_release_p
                    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
  
        end_of_last_block = start_pos + num_bytes - 1;
-       btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+       err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+       if (err)
+               return err;
        for (i = 0; i < num_pages; i++) {
                struct page *p = pages[i];
                SetPageUptodate(p);
@@@ -917,21 -920,35 +920,35 @@@ static ssize_t btrfs_file_write(struct 
        start_pos = pos;
  
        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+       /* do the reserve before the mutex lock in case we have to do some
+        * flushing.  We wouldn't deadlock, but this is more polite.
+        */
+       err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+       if (err)
+               goto out_nolock;
+       mutex_lock(&inode->i_mutex);
        current->backing_dev_info = inode->i_mapping->backing_dev_info;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
-               goto out_nolock;
+               goto out;
        if (count == 0)
-               goto out_nolock;
+               goto out;
  
        err = file_remove_suid(file);
        if (err)
-               goto out_nolock;
+               goto out;
        file_update_time(file);
  
        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
  
-       mutex_lock(&inode->i_mutex);
+       /* generic_write_checks can change our pos */
+       start_pos = pos;
        BTRFS_I(inode)->sequence++;
        first_index = pos >> PAGE_CACHE_SHIFT;
        last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@@ -1028,6 -1045,7 +1045,7 @@@ out
        mutex_unlock(&inode->i_mutex);
        if (ret)
                err = ret;
+       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
  
  out_nolock:
        kfree(pages);
@@@ -1184,7 -1202,7 +1202,7 @@@ out
        return ret > 0 ? EIO : ret;
  }
  
 -static struct vm_operations_struct btrfs_file_vm_ops = {
 +static const struct vm_operations_struct btrfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = btrfs_page_mkwrite,
  };
diff --combined fs/btrfs/inode.c
index e9b76bcd1c129e0d1f4664f54fefd33caba72e08,3cc5677f544029fab14bc3c8f26f913c297f63a1..5b9c8e0d6dd66dd7da7e18d1b6bbfcab6c291222
@@@ -55,13 -55,13 +55,13 @@@ struct btrfs_iget_args 
        struct btrfs_root *root;
  };
  
 -static struct inode_operations btrfs_dir_inode_operations;
 -static struct inode_operations btrfs_symlink_inode_operations;
 -static struct inode_operations btrfs_dir_ro_inode_operations;
 -static struct inode_operations btrfs_special_inode_operations;
 -static struct inode_operations btrfs_file_inode_operations;
 -static struct address_space_operations btrfs_aops;
 -static struct address_space_operations btrfs_symlink_aops;
 +static const struct inode_operations btrfs_dir_inode_operations;
 +static const struct inode_operations btrfs_symlink_inode_operations;
 +static const struct inode_operations btrfs_dir_ro_inode_operations;
 +static const struct inode_operations btrfs_special_inode_operations;
 +static const struct inode_operations btrfs_file_inode_operations;
 +static const struct address_space_operations btrfs_aops;
 +static const struct address_space_operations btrfs_symlink_aops;
  static struct file_operations btrfs_dir_file_operations;
  static struct extent_io_ops btrfs_extent_io_ops;
  
@@@ -1159,6 -1159,83 +1159,83 @@@ static int run_delalloc_range(struct in
        return ret;
  }
  
+ static int btrfs_split_extent_hook(struct inode *inode,
+                                   struct extent_state *orig, u64 split)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 size;
+       if (!(orig->state & EXTENT_DELALLOC))
+               return 0;
+       size = orig->end - orig->start + 1;
+       if (size > root->fs_info->max_extent) {
+               u64 num_extents;
+               u64 new_size;
+               new_size = orig->end - split + 1;
+               num_extents = div64_u64(size + root->fs_info->max_extent - 1,
+                                       root->fs_info->max_extent);
+               /*
+                * if we break a large extent up then leave delalloc_extents be,
+                * since we've already accounted for the large extent.
+                */
+               if (div64_u64(new_size + root->fs_info->max_extent - 1,
+                             root->fs_info->max_extent) < num_extents)
+                       return 0;
+       }
+       BTRFS_I(inode)->delalloc_extents++;
+       return 0;
+ }
+ /*
+  * extent_io.c merge_extent_hook, used to track merged delayed allocation
+  * extents so we can keep track of new extents that are just merged onto old
+  * extents, such as when we are doing sequential writes, so we can properly
+  * account for the metadata space we'll need.
+  */
+ static int btrfs_merge_extent_hook(struct inode *inode,
+                                  struct extent_state *new,
+                                  struct extent_state *other)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 new_size, old_size;
+       u64 num_extents;
+       /* not delalloc, ignore it */
+       if (!(other->state & EXTENT_DELALLOC))
+               return 0;
+       old_size = other->end - other->start + 1;
+       if (new->start < other->start)
+               new_size = other->end - new->start + 1;
+       else
+               new_size = new->end - other->start + 1;
+       /* we're not bigger than the max, unreserve the space and go */
+       if (new_size <= root->fs_info->max_extent) {
+               BTRFS_I(inode)->delalloc_extents--;
+               return 0;
+       }
+       /*
+        * If we grew by another max_extent, just return, we want to keep that
+        * reserved amount.
+        */
+       num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
+                               root->fs_info->max_extent);
+       if (div64_u64(new_size + root->fs_info->max_extent - 1,
+                     root->fs_info->max_extent) > num_extents)
+               return 0;
+       BTRFS_I(inode)->delalloc_extents--;
+       return 0;
+ }
  /*
   * extent_io.c set_bit_hook, used to track delayed allocation
   * bytes in this file, and to maintain the list of inodes that
  static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
                       unsigned long old, unsigned long bits)
  {
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testeing for the DELALLOC
         */
        if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
+               BTRFS_I(inode)->delalloc_extents++;
                btrfs_delalloc_reserve_space(root, inode, end - start + 1);
                spin_lock(&root->fs_info->delalloc_lock);
                BTRFS_I(inode)->delalloc_bytes += end - start + 1;
  /*
   * extent_io.c clear_bit_hook, see set_bit_hook for why
   */
- static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
-                        unsigned long old, unsigned long bits)
+ static int btrfs_clear_bit_hook(struct inode *inode,
+                               struct extent_state *state, unsigned long bits)
  {
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testeing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-       if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+       if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
  
+               BTRFS_I(inode)->delalloc_extents--;
+               btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
                spin_lock(&root->fs_info->delalloc_lock);
-               if (end - start + 1 > root->fs_info->delalloc_bytes) {
+               if (state->end - state->start + 1 >
+                   root->fs_info->delalloc_bytes) {
                        printk(KERN_INFO "btrfs warning: delalloc account "
                               "%llu %llu\n",
-                              (unsigned long long)end - start + 1,
+                              (unsigned long long)
+                              state->end - state->start + 1,
                               (unsigned long long)
                               root->fs_info->delalloc_bytes);
                        btrfs_delalloc_free_space(root, inode, (u64)-1);
                        BTRFS_I(inode)->delalloc_bytes = 0;
                } else {
                        btrfs_delalloc_free_space(root, inode,
-                                                 end - start + 1);
-                       root->fs_info->delalloc_bytes -= end - start + 1;
-                       BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
+                                                 state->end -
+                                                 state->start + 1);
+                       root->fs_info->delalloc_bytes -= state->end -
+                               state->start + 1;
+                       BTRFS_I(inode)->delalloc_bytes -= state->end -
+                               state->start + 1;
                }
                if (BTRFS_I(inode)->delalloc_bytes == 0 &&
                    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@@ -2950,7 -3038,12 +3038,12 @@@ again
                goto again;
        }
  
-       btrfs_set_extent_delalloc(inode, page_start, page_end);
+       ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+       if (ret) {
+               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+               goto out_unlock;
+       }
        ret = 0;
        if (offset != PAGE_CACHE_SIZE) {
                kaddr = kmap(page);
@@@ -2981,15 -3074,11 +3074,11 @@@ int btrfs_cont_expand(struct inode *ino
        u64 last_byte;
        u64 cur_offset;
        u64 hole_size;
-       int err;
+       int err = 0;
  
        if (size <= hole_start)
                return 0;
  
-       err = btrfs_check_metadata_free_space(root);
-       if (err)
-               return err;
        btrfs_truncate_page(inode->i_mapping, inode->i_size);
  
        while (1) {
                                                 cur_offset, &hint_byte, 1);
                        if (err)
                                break;
+                       err = btrfs_reserve_metadata_space(root, 1);
+                       if (err)
+                               break;
                        err = btrfs_insert_file_extent(trans, root,
                                        inode->i_ino, cur_offset, 0,
                                        0, hole_size, 0, hole_size,
                                        0, 0, 0);
                        btrfs_drop_extent_cache(inode, hole_start,
                                        last_byte - 1, 0);
+                       btrfs_unreserve_metadata_space(root, 1);
                }
                free_extent_map(em);
                cur_offset = last_byte;
@@@ -3990,11 -4085,18 +4085,18 @@@ static int btrfs_mknod(struct inode *di
        if (!new_valid_dev(rdev))
                return -EINVAL;
  
-       err = btrfs_check_metadata_free_space(root);
+       /*
+        * 2 for inode item and ref
+        * 2 for dir items
+        * 1 for xattr if selinux is on
+        */
+       err = btrfs_reserve_metadata_space(root, 5);
        if (err)
-               goto fail;
+               return err;
  
        trans = btrfs_start_transaction(root, 1);
+       if (!trans)
+               goto fail;
        btrfs_set_trans_block_group(trans, dir);
  
        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@@ -4032,6 -4134,7 +4134,7 @@@ out_unlock
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
  fail:
+       btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@@ -4052,10 -4155,18 +4155,18 @@@ static int btrfs_create(struct inode *d
        u64 objectid;
        u64 index = 0;
  
-       err = btrfs_check_metadata_free_space(root);
+       /*
+        * 2 for inode item and ref
+        * 2 for dir items
+        * 1 for xattr if selinux is on
+        */
+       err = btrfs_reserve_metadata_space(root, 5);
        if (err)
-               goto fail;
+               return err;
        trans = btrfs_start_transaction(root, 1);
+       if (!trans)
+               goto fail;
        btrfs_set_trans_block_group(trans, dir);
  
        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@@ -4096,6 -4207,7 +4207,7 @@@ out_unlock
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
  fail:
+       btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@@ -4118,10 -4230,16 +4230,16 @@@ static int btrfs_link(struct dentry *ol
        if (inode->i_nlink == 0)
                return -ENOENT;
  
-       btrfs_inc_nlink(inode);
-       err = btrfs_check_metadata_free_space(root);
+       /*
+        * 1 item for inode ref
+        * 2 items for dir items
+        */
+       err = btrfs_reserve_metadata_space(root, 3);
        if (err)
-               goto fail;
+               return err;
+       btrfs_inc_nlink(inode);
        err = btrfs_set_inode_index(dir, &index);
        if (err)
                goto fail;
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
  fail:
+       btrfs_unreserve_metadata_space(root, 3);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@@ -4164,17 -4283,21 +4283,21 @@@ static int btrfs_mkdir(struct inode *di
        u64 index = 0;
        unsigned long nr = 1;
  
-       err = btrfs_check_metadata_free_space(root);
+       /*
+        * 2 items for inode and ref
+        * 2 items for dir items
+        * 1 for xattr if selinux is on
+        */
+       err = btrfs_reserve_metadata_space(root, 5);
        if (err)
-               goto out_unlock;
+               return err;
  
        trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, dir);
-       if (IS_ERR(trans)) {
-               err = PTR_ERR(trans);
+       if (!trans) {
+               err = -ENOMEM;
                goto out_unlock;
        }
+       btrfs_set_trans_block_group(trans, dir);
  
        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
        if (err) {
@@@ -4223,6 -4346,7 +4346,7 @@@ out_fail
        btrfs_end_transaction_throttle(trans, root);
  
  out_unlock:
+       btrfs_unreserve_metadata_space(root, 5);
        if (drop_on_err)
                iput(inode);
        btrfs_btree_balance_dirty(root, nr);
@@@ -4747,6 -4871,13 +4871,13 @@@ int btrfs_page_mkwrite(struct vm_area_s
                goto out;
        }
  
+       ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+       if (ret) {
+               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+               ret = VM_FAULT_SIGBUS;
+               goto out;
+       }
        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
  again:
        lock_page(page);
                goto again;
        }
  
-       btrfs_set_extent_delalloc(inode, page_start, page_end);
+       ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+       if (ret) {
+               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+               ret = VM_FAULT_SIGBUS;
+               goto out_unlock;
+       }
        ret = 0;
  
        /* page is wholly or partially inside EOF */
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
  
  out_unlock:
+       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
        if (!ret)
                return VM_FAULT_LOCKED;
        unlock_page(page);
@@@ -4917,6 -5054,8 +5054,8 @@@ struct inode *btrfs_alloc_inode(struct 
                return NULL;
        ei->last_trans = 0;
        ei->logged_trans = 0;
+       ei->delalloc_extents = 0;
+       ei->delalloc_reserved_extents = 0;
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
        INIT_LIST_HEAD(&ei->ordered_operations);
@@@ -5070,7 -5209,12 +5209,12 @@@ static int btrfs_rename(struct inode *o
            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
  
-       ret = btrfs_check_metadata_free_space(root);
+       /*
+        * 2 items for dir items
+        * 1 item for orphan entry
+        * 1 item for ref
+        */
+       ret = btrfs_reserve_metadata_space(root, 4);
        if (ret)
                return ret;
  
@@@ -5185,6 -5329,8 +5329,8 @@@ out_fail
  
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&root->fs_info->subvol_sem);
+       btrfs_unreserve_metadata_space(root, 4);
        return ret;
  }
  
@@@ -5256,11 -5402,18 +5402,18 @@@ static int btrfs_symlink(struct inode *
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
                return -ENAMETOOLONG;
  
-       err = btrfs_check_metadata_free_space(root);
+       /*
+        * 2 items for inode item and ref
+        * 2 items for dir items
+        * 1 item for xattr if selinux is on
+        */
+       err = btrfs_reserve_metadata_space(root, 5);
        if (err)
-               goto out_fail;
+               return err;
  
        trans = btrfs_start_transaction(root, 1);
+       if (!trans)
+               goto out_fail;
        btrfs_set_trans_block_group(trans, dir);
  
        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@@ -5341,6 -5494,7 +5494,7 @@@ out_unlock
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
  out_fail:
+       btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@@ -5362,6 -5516,11 +5516,11 @@@ static int prealloc_file_range(struct b
  
        while (num_bytes > 0) {
                alloc_size = min(num_bytes, root->fs_info->max_extent);
+               ret = btrfs_reserve_metadata_space(root, 1);
+               if (ret)
+                       goto out;
                ret = btrfs_reserve_extent(trans, root, alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           (u64)-1, &ins, 1);
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
                alloc_hint = ins.objectid + ins.offset;
+               btrfs_unreserve_metadata_space(root, 1);
        }
  out:
        if (cur_offset > start) {
@@@ -5521,7 -5681,7 +5681,7 @@@ static int btrfs_permission(struct inod
        return generic_permission(inode, mask, btrfs_check_acl);
  }
  
 -static struct inode_operations btrfs_dir_inode_operations = {
 +static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
        .create         = btrfs_create,
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
  };
 -static struct inode_operations btrfs_dir_ro_inode_operations = {
 +static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
        .permission     = btrfs_permission,
  };
@@@ -5566,6 -5726,8 +5726,8 @@@ static struct extent_io_ops btrfs_exten
        .readpage_io_failed_hook = btrfs_io_failed_hook,
        .set_bit_hook = btrfs_set_bit_hook,
        .clear_bit_hook = btrfs_clear_bit_hook,
+       .merge_extent_hook = btrfs_merge_extent_hook,
+       .split_extent_hook = btrfs_split_extent_hook,
  };
  
  /*
   *
   * For now we're avoiding this by dropping bmap.
   */
 -static struct address_space_operations btrfs_aops = {
 +static const struct address_space_operations btrfs_aops = {
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
        .set_page_dirty = btrfs_set_page_dirty,
 +      .error_remove_page = generic_error_remove_page,
  };
  
 -static struct address_space_operations btrfs_symlink_aops = {
 +static const struct address_space_operations btrfs_symlink_aops = {
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
  };
  
 -static struct inode_operations btrfs_file_inode_operations = {
 +static const struct inode_operations btrfs_file_inode_operations = {
        .truncate       = btrfs_truncate,
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .fallocate      = btrfs_fallocate,
        .fiemap         = btrfs_fiemap,
  };
 -static struct inode_operations btrfs_special_inode_operations = {
 +static const struct inode_operations btrfs_special_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
  };
 -static struct inode_operations btrfs_symlink_inode_operations = {
 +static const struct inode_operations btrfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
diff --combined fs/btrfs/super.c
index 67035385444cf29c31a59951311af66702fa80a8,e0a64328080c4ad235bf876c7a1633e62fe61784..9de9b22364190de829c4479cc74b81578f41b18a
@@@ -51,7 -51,7 +51,7 @@@
  #include "export.h"
  #include "compression.h"
  
 -static struct super_operations btrfs_super_ops;
 +static const struct super_operations btrfs_super_ops;
  
  static void btrfs_put_super(struct super_block *sb)
  {
@@@ -344,7 -344,9 +344,9 @@@ static int btrfs_fill_super(struct supe
        sb->s_export_op = &btrfs_export_ops;
        sb->s_xattr = btrfs_xattr_handlers;
        sb->s_time_gran = 1;
+ #ifdef CONFIG_BTRFS_POSIX_ACL
        sb->s_flags |= MS_POSIXACL;
+ #endif
  
        tree_root = open_ctree(sb, fs_devices, (char *)data);
  
@@@ -675,7 -677,7 +677,7 @@@ static int btrfs_unfreeze(struct super_
        return 0;
  }
  
 -static struct super_operations btrfs_super_ops = {
 +static const struct super_operations btrfs_super_ops = {
        .drop_inode     = btrfs_drop_inode,
        .delete_inode   = btrfs_delete_inode,
        .put_super      = btrfs_put_super,
diff --combined fs/btrfs/volumes.c
index 23e7d36ff32554eb7e781ab7c8e9b8102fddc235,20cbd2eebd9731718956720c02c18958569234c3..7eda483d7b5aa0cd5e5bd96d149cca6fe2fc8f16
@@@ -260,7 -260,7 +260,7 @@@ loop_lock
                num_run++;
                batch_run++;
  
 -              if (bio_sync(cur))
 +              if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
                        num_sync_run++;
  
                if (need_resched()) {
@@@ -446,8 -446,10 +446,10 @@@ static struct btrfs_fs_devices *clone_f
                        goto error;
  
                device->name = kstrdup(orig_dev->name, GFP_NOFS);
-               if (!device->name)
+               if (!device->name) {
+                       kfree(device);
                        goto error;
+               }
  
                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
@@@ -2942,7 -2944,7 +2944,7 @@@ static noinline int schedule_bio(struc
        bio->bi_rw |= rw;
  
        spin_lock(&device->io_lock);
 -      if (bio_sync(bio))
 +      if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
                pending_bios = &device->pending_sync_bios;
        else
                pending_bios = &device->pending_bios;