Merge branch 'raid56-experimental' into for-linus-3.9

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5cd44e239595f701c4681689780c82ce37f92aac..b3ecca447ddf82c37beef432b44c8c8da2ab710d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
 #include "math.h"
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                *actual_bytes = discarded_bytes;
 
 
+       if (ret == -EOPNOTSUPP)
+               ret = 0;
        return ret;
 }
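
The two added lines above make btrfs_discard_extent() treat -EOPNOTSUPP as success, so a device without discard support no longer fails the whole operation. A stand-alone sketch of that errno filtering (issue_discard() below is purely illustrative, not a btrfs function):

#include <errno.h>
#include <stdio.h>

/* Stand-in for the low-level discard call; real code would issue the bio. */
static int issue_discard(int device_supports_discard)
{
	return device_supports_discard ? 0 : -EOPNOTSUPP;
}

/* "Not supported" is not an error for the caller: swallow it. */
static int discard_extent(int device_supports_discard)
{
	int ret = issue_discard(device_supports_discard);

	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

int main(void)
{
	printf("supported=%d unsupported=%d\n",
	       discard_extent(1), discard_extent(0));
	return 0;
}
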
 
@@ -2440,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+                     int count)
+{
+       int val = atomic_read(&delayed_refs->ref_seq);
+
+       if (val < seq || val >= seq + count)
+               return 1;
+       return 0;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
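
refs_newer(), added above, reports whether the shared ref_seq counter has left the window [seq, seq + count) since the caller sampled seq, i.e. whether other tasks have already run enough delayed refs on the caller's behalf. A minimal user-space sketch of the same window check using C11 atomics (the names are illustrative, not btrfs API):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ref_seq;	/* stands in for delayed_refs->ref_seq */

/* Non-zero once ref_seq has left [seq, seq + count), i.e. the counter has
 * moved by at least 'count' (or wrapped) since 'seq' was sampled. */
static int refs_newer(int seq, int count)
{
	int val = atomic_load(&ref_seq);

	return val < seq || val >= seq + count;
}

int main(void)
{
	int seq = atomic_load(&ref_seq);

	printf("before progress: %d\n", refs_newer(seq, 256));	/* 0 */
	atomic_fetch_add(&ref_seq, 300);			/* someone ran 300 refs */
	printf("after progress:  %d\n", refs_newer(seq, 256));	/* 1 */
	return 0;
}
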
@@ -2474,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+
+       if (!run_all && !run_most) {
+               int old;
+               int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+               old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+               if (old) {
+                       DEFINE_WAIT(__wait);
+                       if (delayed_refs->num_entries < 16348)
+                               return 0;
+
+                       prepare_to_wait(&delayed_refs->wait, &__wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+                       if (old) {
+                               schedule();
+                               finish_wait(&delayed_refs->wait, &__wait);
+
+                               if (!refs_newer(delayed_refs, seq, 256))
+                                       goto progress;
+                               else
+                                       return 0;
+                       } else {
+                               finish_wait(&delayed_refs->wait, &__wait);
+                               goto again;
+                       }
+               }
+
+       } else {
+               atomic_inc(&delayed_refs->procs_running_refs);
+       }
+
 again:
        loops = 0;
        spin_lock(&delayed_refs->lock);
@@ -2482,10 +2533,6 @@ again:
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
 
-       if (count == 0) {
-               count = delayed_refs->num_entries * 2;
-               run_most = 1;
-       }
        while (1) {
                if (!(run_all || run_most) &&
                    delayed_refs->num_heads_ready < 64)
@@ -2508,9 +2555,12 @@ again:
                        btrfs_release_ref_cluster(&cluster);
                        spin_unlock(&delayed_refs->lock);
                        btrfs_abort_transaction(trans, root, ret);
+                       atomic_dec(&delayed_refs->procs_running_refs);
                        return ret;
                }
 
+               atomic_add(ret, &delayed_refs->ref_seq);
+
                count -= min_t(unsigned long, ret, count);
 
                if (count == 0)
@@ -2579,6 +2629,11 @@ again:
                goto again;
        }
 out:
+       atomic_dec(&delayed_refs->procs_running_refs);
+       smp_mb();
+       if (waitqueue_active(&delayed_refs->wait))
+               wake_up(&delayed_refs->wait);
+
        spin_unlock(&delayed_refs->lock);
        assert_qgroups_uptodate(trans);
        return 0;
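
Taken together, the hunks above let only one task run delayed refs at a time: a latecomer with a small backlog returns immediately, otherwise it sleeps on delayed_refs->wait until the runner has advanced ref_seq far enough or has exited, and the out: path drops procs_running_refs and wakes the sleepers. A rough user-space sketch of that runner/waiter pattern, with C11 atomics and a pthread condition variable standing in for the kernel wait queue (all names and the backlog threshold are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Illustrative stand-ins for the delayed-ref bookkeeping; not btrfs code. */
static atomic_int procs_running_refs;	/* 1 while some thread is running refs */
static atomic_int ref_seq;		/* total refs processed so far */
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_cond = PTHREAD_COND_INITIALIZER;

static int refs_newer(int seq, int count)
{
	int val = atomic_load(&ref_seq);

	return val < seq || val >= seq + count;
}

/* Either become the single runner or wait for the current one to make progress. */
static void run_refs(int backlog)
{
	int seq = atomic_load(&ref_seq);
	int expected = 0;

	while (!atomic_compare_exchange_strong(&procs_running_refs, &expected, 1)) {
		if (backlog < 16384)		/* small backlog: let the runner do it */
			return;

		pthread_mutex_lock(&wait_lock);
		while (atomic_load(&procs_running_refs) && !refs_newer(seq, 256))
			pthread_cond_wait(&wait_cond, &wait_lock);
		pthread_mutex_unlock(&wait_lock);

		if (refs_newer(seq, 256))	/* someone else did our share */
			return;
		expected = 0;			/* runner exited: try to take over */
	}

	/* We are the runner: process refs, publish progress, wake any waiters. */
	atomic_fetch_add(&ref_seq, 512);
	pthread_mutex_lock(&wait_lock);
	atomic_store(&procs_running_refs, 0);
	pthread_cond_broadcast(&wait_cond);
	pthread_mutex_unlock(&wait_lock);
}

static void *worker(void *arg)
{
	run_refs(*(int *)arg);
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	int big = 20000, small = 100;

	pthread_create(&a, NULL, worker, &big);
	pthread_create(&b, NULL, worker, &small);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("refs processed: %d\n", atomic_load(&ref_seq));
	return 0;
}
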
@@ -3284,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
        u64 target;
+       u64 tmp;
 
        /*
         * see if restripe for this chunk_type is in progress, if so
@@ -3300,30 +3356,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        }
        spin_unlock(&root->fs_info->balance_lock);
 
+       /* First, mask out the RAID levels which aren't possible */
        if (num_devices == 1)
-               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+                          BTRFS_BLOCK_GROUP_RAID5);
+       if (num_devices < 3)
+               flags &= ~BTRFS_BLOCK_GROUP_RAID6;
        if (num_devices < 4)
                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-       if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-           (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-                     BTRFS_BLOCK_GROUP_RAID10))) {
-               flags &= ~BTRFS_BLOCK_GROUP_DUP;
-       }
+       tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+                      BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+                      BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+       flags &= ~tmp;
 
-       if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-           (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-               flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-       }
-
-       if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-           ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-            (flags & BTRFS_BLOCK_GROUP_RAID10) |
-            (flags & BTRFS_BLOCK_GROUP_DUP))) {
-               flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-       }
+       if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+               tmp = BTRFS_BLOCK_GROUP_RAID6;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+               tmp = BTRFS_BLOCK_GROUP_RAID5;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+               tmp = BTRFS_BLOCK_GROUP_RAID10;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+               tmp = BTRFS_BLOCK_GROUP_RAID1;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+               tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-       return extended_to_chunk(flags);
+       return extended_to_chunk(flags | tmp);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
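
The rewritten btrfs_reduce_alloc_profile() above first masks out the RAID levels the current device count cannot satisfy, then collapses the remaining redundancy bits to the single most capable profile (raid6 > raid5 > raid10 > raid1 > raid0). A stand-alone sketch of that reduction, using made-up bit values rather than the real BTRFS_BLOCK_GROUP_* constants:

#include <stdint.h>
#include <stdio.h>

/* Illustrative profile bits; the real BTRFS_BLOCK_GROUP_* values differ. */
#define BG_RAID0	(1ULL << 0)
#define BG_RAID1	(1ULL << 1)
#define BG_DUP		(1ULL << 2)
#define BG_RAID10	(1ULL << 3)
#define BG_RAID5	(1ULL << 4)
#define BG_RAID6	(1ULL << 5)

/* Keep only profiles the device count can support, then collapse the
 * remaining redundancy bits to the single most capable one. */
static uint64_t reduce_profile(uint64_t flags, unsigned int num_devices)
{
	uint64_t tmp;

	if (num_devices == 1)
		flags &= ~(BG_RAID1 | BG_RAID0 | BG_RAID5);
	if (num_devices < 3)
		flags &= ~BG_RAID6;
	if (num_devices < 4)
		flags &= ~BG_RAID10;

	tmp = flags & (BG_DUP | BG_RAID0 | BG_RAID1 |
		       BG_RAID5 | BG_RAID6 | BG_RAID10);
	flags &= ~tmp;

	if (tmp & BG_RAID6)
		tmp = BG_RAID6;
	else if (tmp & BG_RAID5)
		tmp = BG_RAID5;
	else if (tmp & BG_RAID10)
		tmp = BG_RAID10;
	else if (tmp & BG_RAID1)
		tmp = BG_RAID1;
	else if (tmp & BG_RAID0)
		tmp = BG_RAID0;

	return flags | tmp;
}

int main(void)
{
	/* raid6|raid1 requested on a 2-device filesystem collapses to raid1 */
	printf("%#llx\n",
	       (unsigned long long)reduce_profile(BG_RAID6 | BG_RAID1, 2));
	return 0;
}
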
@@ -3347,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
        u64 flags;
+       u64 ret;
 
        if (data)
                flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3355,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        else
                flags = BTRFS_BLOCK_GROUP_METADATA;
 
-       return get_alloc_profile(root, flags);
+       ret = get_alloc_profile(root, flags);
+       return ret;
 }
 
 /*
@@ -3530,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 {
        u64 num_dev;
 
-       if (type & BTRFS_BLOCK_GROUP_RAID10 ||
-           type & BTRFS_BLOCK_GROUP_RAID0)
+       if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+                   BTRFS_BLOCK_GROUP_RAID0 |
+                   BTRFS_BLOCK_GROUP_RAID5 |
+                   BTRFS_BLOCK_GROUP_RAID6))
                num_dev = root->fs_info->fs_devices->rw_devices;
        else if (type & BTRFS_BLOCK_GROUP_RAID1)
                num_dev = 2;
@@ -3706,7 +3768,9 @@ static int can_overcommit(struct btrfs_root *root,
 
        /*
         * If we have dup, raid1 or raid10 then only half of the free
-        * space is actually useable.
+        * space is actually useable.  For raid56, the space info used
+        * doesn't include the parity drive, so we don't have to
+        * change the math
         */
        if (profile & (BTRFS_BLOCK_GROUP_DUP |
                       BTRFS_BLOCK_GROUP_RAID1 |
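
The updated comment above notes that dup/raid1/raid10 store every byte twice, so only half of the reported free space can actually be committed, while raid5/6 space accounting already excludes the parity drive and needs no adjustment. A tiny sketch of the halving the comment describes, with made-up profile bits:

#include <stdint.h>
#include <stdio.h>

#define BG_DUP		(1ULL << 0)
#define BG_RAID1	(1ULL << 1)
#define BG_RAID10	(1ULL << 2)

/* Mirrored profiles consume two bytes of raw space per byte of data, so
 * only half of the free space is usable; raid5/6 accounting already
 * excludes parity, so it is left untouched. */
static uint64_t usable_free_space(uint64_t avail, uint64_t profile)
{
	if (profile & (BG_DUP | BG_RAID1 | BG_RAID10))
		avail >>= 1;
	return avail;
}

int main(void)
{
	/* 8 GiB of raw free space on raid1 -> 4 GiB usable */
	printf("%llu\n",
	       (unsigned long long)usable_free_space(8ULL << 30, BG_RAID1));
	return 0;
}
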
@@ -5539,10 +5603,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        return ret;
 }
 
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+                       struct btrfs_block_group_cache *cache,
+                       u64 val, u64 num_bytes)
 {
-       u64 mask = ((u64)root->stripesize - 1);
-       u64 ret = (val + mask) & ~mask;
+       u64 mask;
+       u64 ret;
+       mask = ((u64)root->stripesize - 1);
+       ret = (val + mask) & ~mask;
        return ret;
 }
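
stripe_align() above now also receives the block group and allocation size (unused in the body shown, presumably for later RAID5/6-aware alignment), but the arithmetic is still the usual power-of-two round-up with a mask. A stand-alone sketch of that rounding; the 64KiB alignment below is just an example value:

#include <stdint.h>
#include <stdio.h>

/* Round 'val' up to the next multiple of 'align'; 'align' must be a power
 * of two for the (val + mask) & ~mask trick to be valid. */
static uint64_t stripe_align(uint64_t val, uint64_t align)
{
	uint64_t mask = align - 1;

	return (val + mask) & ~mask;
}

int main(void)
{
	/* 100000 rounded up to a 64KiB boundary is 131072 */
	printf("%llu\n", (unsigned long long)stripe_align(100000, 65536));
	return 0;
}
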
 
@@ -5599,8 +5667,12 @@ int __get_raid_index(u64 flags)
                return BTRFS_RAID_DUP;
        else if (flags & BTRFS_BLOCK_GROUP_RAID0)
                return BTRFS_RAID_RAID0;
-       else
-               return BTRFS_RAID_SINGLE;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+               return BTRFS_RAID_RAID5;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+               return BTRFS_RAID_RAID6;
+
+       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5743,6 +5815,8 @@ search:
                if (!block_group_bits(block_group, data)) {
                    u64 extra = BTRFS_BLOCK_GROUP_DUP |
                                BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID5 |
+                               BTRFS_BLOCK_GROUP_RAID6 |
                                BTRFS_BLOCK_GROUP_RAID10;
 
                        /*
@@ -5771,6 +5845,7 @@ have_block_group:
                 * lets look there
                 */
                if (last_ptr) {
+                       unsigned long aligned_cluster;
                        /*
                         * the refill lock keeps out other
                         * people trying to start a new cluster
@@ -5837,11 +5912,15 @@ refill_cluster:
                                goto unclustered_alloc;
                        }
 
+                       aligned_cluster = max_t(unsigned long,
+                                               empty_cluster + empty_size,
+                                             block_group->full_stripe_len);
+
                        /* allocate a cluster in this block group */
                        ret = btrfs_find_space_cluster(trans, root,
                                               block_group, last_ptr,
                                               search_start, num_bytes,
-                                              empty_cluster + empty_size);
+                                              aligned_cluster);
                        if (ret == 0) {
                                /*
                                 * now pull our allocation out of this
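
aligned_cluster above bumps the requested cluster size up to at least one full stripe, so that allocations carved from the cluster can line up with whole RAID5/6 stripes. The sizing itself is just a max, sketched here with illustrative numbers:

#include <stdio.h>

/* Grow the requested cluster size to at least one full stripe so the
 * allocations taken from it can cover whole RAID5/6 stripes. */
static unsigned long cluster_size(unsigned long wanted,
				  unsigned long full_stripe_len)
{
	return wanted > full_stripe_len ? wanted : full_stripe_len;
}

int main(void)
{
	/* e.g. 64KiB requested on a layout with 128KiB full stripes */
	printf("%lu\n", cluster_size(64 * 1024, 128 * 1024));
	return 0;
}
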
@@ -5912,7 +5991,8 @@ unclustered_alloc:
                        goto loop;
                }
 checks:
-               search_start = stripe_align(root, offset);
+               search_start = stripe_align(root, used_block_group,
+                                           offset, num_bytes);
 
                /* move on to the next group */
                if (search_start + num_bytes >
@@ -7284,6 +7364,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
                root->fs_info->fs_devices->missing_devices;
 
        stripped = BTRFS_BLOCK_GROUP_RAID0 |
+               BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
        if (num_devices == 1) {
@@ -7837,7 +7918,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                btrfs_release_path(path);
                cache->flags = btrfs_block_group_flags(&cache->item);
                cache->sectorsize = root->sectorsize;
-
+               cache->full_stripe_len = btrfs_full_stripe_len(root,
+                                              &root->fs_info->mapping_tree,
+                                              found_key.objectid);
                btrfs_init_free_space_ctl(cache);
 
                /*
@@ -7891,6 +7974,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                if (!(get_alloc_profile(root, space_info->flags) &
                      (BTRFS_BLOCK_GROUP_RAID10 |
                       BTRFS_BLOCK_GROUP_RAID1 |
+                      BTRFS_BLOCK_GROUP_RAID5 |
+                      BTRFS_BLOCK_GROUP_RAID6 |
                       BTRFS_BLOCK_GROUP_DUP)))
                        continue;
                /*
@@ -7966,6 +8051,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        cache->sectorsize = root->sectorsize;
        cache->fs_info = root->fs_info;
+       cache->full_stripe_len = btrfs_full_stripe_len(root,
+                                              &root->fs_info->mapping_tree,
+                                              chunk_offset);
 
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);