Merge branch 'qgroup' of git://git.jan-o-sch.net/btrfs-unstable into for-linus
author Chris Mason <chris.mason@fusionio.com>
Wed, 25 Jul 2012 20:11:38 +0000 (16:11 -0400)
committer Chris Mason <chris.mason@fusionio.com>
Wed, 25 Jul 2012 20:11:38 +0000 (16:11 -0400)
Conflicts:
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/transaction.c
fs/btrfs/transaction.h

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
15 files changed:
fs/btrfs/Makefile
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/qgroup.c [new file with mode: 0644]
fs/btrfs/transaction.c
fs/btrfs/transaction.h

index 0c4fa2befae793f1a6845322d7ba71aaa5da4374..0bc4d3a10a5fe631b6a85873a6b44b72f6bce4c6 100644 (file)
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-          reada.o backref.o ulist.o
+          reada.o backref.o ulist.o qgroup.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
index a383c18e74e86eebaa847d756e3493e7ca3c9bfd..7d80ddd8f544edc36e335d0880c376414131c58d 100644 (file)
@@ -773,9 +773,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info, u64 bytenr,
-                            u64 delayed_ref_seq, u64 time_seq,
-                            struct ulist *refs, struct ulist *roots,
-                            const u64 *extent_item_pos)
+                            u64 time_seq, struct ulist *refs,
+                            struct ulist *roots, const u64 *extent_item_pos)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
@@ -837,7 +836,7 @@ again:
                                btrfs_put_delayed_ref(&head->node);
                                goto again;
                        }
-                       ret = __add_delayed_refs(head, delayed_ref_seq,
+                       ret = __add_delayed_refs(head, time_seq,
                                                 &prefs_delayed);
                        mutex_unlock(&head->mutex);
                        if (ret) {
@@ -981,8 +980,7 @@ static void free_leaf_list(struct ulist *blocks)
  */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 delayed_ref_seq, u64 time_seq,
-                               struct ulist **leafs,
+                               u64 time_seq, struct ulist **leafs,
                                const u64 *extent_item_pos)
 {
        struct ulist *tmp;
@@ -997,7 +995,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
-       ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+       ret = find_parent_nodes(trans, fs_info, bytenr,
                                time_seq, *leafs, tmp, extent_item_pos);
        ulist_free(tmp);
 
@@ -1024,8 +1022,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 delayed_ref_seq, u64 time_seq,
-                               struct ulist **roots)
+                               u64 time_seq, struct ulist **roots)
 {
        struct ulist *tmp;
        struct ulist_node *node = NULL;
@@ -1043,7 +1040,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 
        ULIST_ITER_INIT(&uiter);
        while (1) {
-               ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+               ret = find_parent_nodes(trans, fs_info, bytenr,
                                        time_seq, tmp, *roots, NULL);
                if (ret < 0 && ret != -ENOENT) {
                        ulist_free(tmp);
@@ -1376,11 +1373,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
        struct ulist *roots = NULL;
        struct ulist_node *ref_node = NULL;
        struct ulist_node *root_node = NULL;
-       struct seq_list seq_elem = {};
        struct seq_list tree_mod_seq_elem = {};
        struct ulist_iterator ref_uiter;
        struct ulist_iterator root_uiter;
-       struct btrfs_delayed_ref_root *delayed_refs = NULL;
 
        pr_debug("resolving all inodes for extent %llu\n",
                        extent_item_objectid);
@@ -1391,16 +1386,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                trans = btrfs_join_transaction(fs_info->extent_root);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
-
-               delayed_refs = &trans->transaction->delayed_refs;
-               spin_lock(&delayed_refs->lock);
-               btrfs_get_delayed_seq(delayed_refs, &seq_elem);
-               spin_unlock(&delayed_refs->lock);
                btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
        }
 
        ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-                                  seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+                                  tree_mod_seq_elem.seq, &refs,
                                   &extent_item_pos);
        if (ret)
                goto out;
@@ -1408,8 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
        ULIST_ITER_INIT(&ref_uiter);
        while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
                ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
-                                               seq_elem.seq,
-                                               tree_mod_seq_elem.seq, &roots);
+                                          tree_mod_seq_elem.seq, &roots);
                if (ret)
                        break;
                ULIST_ITER_INIT(&root_uiter);
@@ -1431,7 +1420,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 out:
        if (!search_commit_root) {
                btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-               btrfs_put_delayed_seq(delayed_refs, &seq_elem);
                btrfs_end_transaction(trans, fs_info->extent_root);
        }
 
index c18d8ac7b795da487c4a526979954e91cbddf52b..3a1ad3e2dcb05caeff5de53a56c18496e0922698 100644 (file)
@@ -58,8 +58,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 delayed_ref_seq, u64 time_seq,
-                               struct ulist **roots);
+                               u64 time_seq, struct ulist **roots);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
index 67fe46fdee6f9c4af8ee7b9473fa00b7c6d9924b..fb21431fe4e045f507c0b5f0974c7f4afbfa2797 100644 (file)
@@ -321,7 +321,7 @@ struct tree_mod_root {
 struct tree_mod_elem {
        struct rb_node node;
        u64 index;              /* shifted logical */
-       struct seq_list elem;
+       u64 seq;
        enum mod_log_op op;
 
        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
@@ -341,20 +341,50 @@ struct tree_mod_elem {
        struct tree_mod_root old_root;
 };
 
-static inline void
-__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
 {
-       elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
-       list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+       read_lock(&fs_info->tree_mod_log_lock);
 }
 
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
-                           struct seq_list *elem)
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+       read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+       write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 {
-       elem->flags = 1;
+       write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                          struct seq_list *elem)
+{
+       u64 seq;
+
+       tree_mod_log_write_lock(fs_info);
        spin_lock(&fs_info->tree_mod_seq_lock);
-       __get_tree_mod_seq(fs_info, elem);
+       if (!elem->seq) {
+               elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+               list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+       }
+       seq = btrfs_inc_tree_mod_seq(fs_info);
        spin_unlock(&fs_info->tree_mod_seq_lock);
+       tree_mod_log_write_unlock(fs_info);
+
+       return seq;
 }
 
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
        if (!seq_putting)
                return;
 
-       BUG_ON(!(elem->flags & 1));
        spin_lock(&fs_info->tree_mod_seq_lock);
        list_del(&elem->list);
+       elem->seq = 0;
 
        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
-               if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+               if (cur_elem->seq < min_seq) {
                        if (seq_putting > cur_elem->seq) {
                                /*
                                 * blocker with lower sequence number exists, we
                                 * cannot remove anything from the log
                                 */
-                               goto out;
+                               spin_unlock(&fs_info->tree_mod_seq_lock);
+                               return;
                        }
                        min_seq = cur_elem->seq;
                }
        }
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+
+       /*
+        * we removed the lowest blocker from the blocker list, so there may be
+        * more processible delayed refs.
+        */
+       wake_up(&fs_info->tree_mod_seq_wait);
 
        /*
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
-       write_lock(&fs_info->tree_mod_log_lock);
+       tree_mod_log_write_lock(fs_info);
        tm_root = &fs_info->tree_mod_log;
        for (node = rb_first(tm_root); node; node = next) {
                next = rb_next(node);
                tm = container_of(node, struct tree_mod_elem, node);
-               if (tm->elem.seq > min_seq)
+               if (tm->seq > min_seq)
                        continue;
                rb_erase(node, tm_root);
-               list_del(&tm->elem.list);
                kfree(tm);
        }
-       write_unlock(&fs_info->tree_mod_log_lock);
-out:
-       spin_unlock(&fs_info->tree_mod_seq_lock);
+       tree_mod_log_write_unlock(fs_info);
 }
 
 /*
@@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;
-       int ret = 0;
 
-       BUG_ON(!tm || !tm->elem.seq);
+       BUG_ON(!tm || !tm->seq);
 
-       write_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
@@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
                        new = &((*new)->rb_left);
                else if (cur->index > tm->index)
                        new = &((*new)->rb_right);
-               else if (cur->elem.seq < tm->elem.seq)
+               else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
-               else if (cur->elem.seq > tm->elem.seq)
+               else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else {
                        kfree(tm);
-                       ret = -EEXIST;
-                       goto unlock;
+                       return -EEXIST;
                }
        }
 
        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
-unlock:
-       write_unlock(&fs_info->tree_mod_log_lock);
-       return ret;
+       return 0;
 }
 
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb) {
        smp_mb();
        if (list_empty(&(fs_info)->tree_mod_seq_list))
                return 1;
-       if (!eb)
-               return 0;
-       if (btrfs_header_level(eb) == 0)
+       if (eb && btrfs_header_level(eb) == 0)
+               return 1;
+
+       tree_mod_log_write_lock(fs_info);
+       if (list_empty(&fs_info->tree_mod_seq_list)) {
+               /*
+                * someone emptied the list while we were waiting for the lock.
+                * we must not add to the list when no blocker exists.
+                */
+               tree_mod_log_write_unlock(fs_info);
                return 1;
+       }
+
        return 0;
 }
 
 /*
- * This allocates memory and gets a tree modification sequence number when
- * needed.
+ * This allocates memory and gets a tree modification sequence number.
  *
- * Returns 0 when no sequence number is needed, < 0 on error.
- * Returns 1 when a sequence number was added. In this case,
- * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
- * after inserting into the rb tree.
+ * Returns <0 on error.
+ * Returns >0 (the added sequence number) on success.
  */
 static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
                                 struct tree_mod_elem **tm_ret)
 {
        struct tree_mod_elem *tm;
-       int seq;
 
-       if (tree_mod_dont_log(fs_info, NULL))
-               return 0;
-
-       tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+       /*
+        * once we switch from spin locks to something different, we should
+        * honor the flags parameter here.
+        */
+       tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
        if (!tm)
                return -ENOMEM;
 
-       tm->elem.flags = 0;
-       spin_lock(&fs_info->tree_mod_seq_lock);
-       if (list_empty(&fs_info->tree_mod_seq_list)) {
-               /*
-                * someone emptied the list while we were waiting for the lock.
-                * we must not add to the list, because no blocker exists. items
-                * are removed from the list only when the existing blocker is
-                * removed from the list.
-                */
-               kfree(tm);
-               seq = 0;
-               spin_unlock(&fs_info->tree_mod_seq_lock);
-       } else {
-               __get_tree_mod_seq(fs_info, &tm->elem);
-               seq = tm->elem.seq;
-       }
-
-       return seq;
+       tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+       return tm->seq;
 }
 
-static noinline int
-tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
-                            struct extent_buffer *eb, int slot,
-                            enum mod_log_op op, gfp_t flags)
+static inline int
+__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+                         struct extent_buffer *eb, int slot,
+                         enum mod_log_op op, gfp_t flags)
 {
-       struct tree_mod_elem *tm;
        int ret;
+       struct tree_mod_elem *tm;
 
        ret = tree_mod_alloc(fs_info, flags, &tm);
-       if (ret <= 0)
+       if (ret < 0)
                return ret;
 
        tm->index = eb->start >> PAGE_CACHE_SHIFT;
@@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
 
-       ret = __tree_mod_log_insert(fs_info, tm);
-       spin_unlock(&fs_info->tree_mod_seq_lock);
+       return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+                            struct extent_buffer *eb, int slot,
+                            enum mod_log_op op, gfp_t flags)
+{
+       int ret;
+
+       if (tree_mod_dont_log(fs_info, eb))
+               return 0;
+
+       ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+
+       tree_mod_log_write_unlock(fs_info);
        return ret;
 }
 
@@ -542,6 +582,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
        return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
 }
 
+static noinline int
+tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
+                            struct extent_buffer *eb, int slot,
+                            enum mod_log_op op)
+{
+       return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
+}
+
 static noinline int
 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *eb, int dst_slot, int src_slot,
@@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
                return 0;
 
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
-               ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+               ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
                                              MOD_LOG_KEY_REMOVE_WHILE_MOVING);
                BUG_ON(ret < 0);
        }
 
        ret = tree_mod_alloc(fs_info, flags, &tm);
-       if (ret <= 0)
-               return ret;
+       if (ret < 0)
+               goto out;
 
        tm->index = eb->start >> PAGE_CACHE_SHIFT;
        tm->slot = src_slot;
@@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
        tm->op = MOD_LOG_MOVE_KEYS;
 
        ret = __tree_mod_log_insert(fs_info, tm);
-       spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+       tree_mod_log_write_unlock(fs_info);
        return ret;
 }
 
+static inline void
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+       int i;
+       u32 nritems;
+       int ret;
+
+       nritems = btrfs_header_nritems(eb);
+       for (i = nritems - 1; i >= 0; i--) {
+               ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
+                                             MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+               BUG_ON(ret < 0);
+       }
+}
+
 static noinline int
 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *old_root,
@@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
        struct tree_mod_elem *tm;
        int ret;
 
+       if (tree_mod_dont_log(fs_info, NULL))
+               return 0;
+
+       __tree_mod_log_free_eb(fs_info, old_root);
+
        ret = tree_mod_alloc(fs_info, flags, &tm);
-       if (ret <= 0)
-               return ret;
+       if (ret < 0)
+               goto out;
 
        tm->index = new_root->start >> PAGE_CACHE_SHIFT;
        tm->old_root.logical = old_root->start;
@@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
        tm->op = MOD_LOG_ROOT_REPLACE;
 
        ret = __tree_mod_log_insert(fs_info, tm);
-       spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+       tree_mod_log_write_unlock(fs_info);
        return ret;
 }
 
@@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
        struct tree_mod_elem *found = NULL;
        u64 index = start >> PAGE_CACHE_SHIFT;
 
-       read_lock(&fs_info->tree_mod_log_lock);
+       tree_mod_log_read_lock(fs_info);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
@@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                        node = node->rb_left;
                } else if (cur->index > index) {
                        node = node->rb_right;
-               } else if (cur->elem.seq < min_seq) {
+               } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* we want the node with the highest seq */
                        if (found)
-                               BUG_ON(found->elem.seq > cur->elem.seq);
+                               BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
-               } else if (cur->elem.seq > min_seq) {
+               } else if (cur->seq > min_seq) {
                        /* we want the node with the smallest seq */
                        if (found)
-                               BUG_ON(found->elem.seq < cur->elem.seq);
+                               BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
@@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                        break;
                }
        }
-       read_unlock(&fs_info->tree_mod_log_lock);
+       tree_mod_log_read_unlock(fs_info);
 
        return found;
 }
@@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
        return __tree_mod_log_search(fs_info, start, min_seq, 0);
 }
 
-static inline void
+static noinline void
 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
@@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
        if (tree_mod_dont_log(fs_info, NULL))
                return;
 
-       if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+       if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
+               tree_mod_log_write_unlock(fs_info);
                return;
+       }
 
-       /* speed this up by single seq for all operations? */
        for (i = 0; i < nr_items; i++) {
-               ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
-                                             MOD_LOG_KEY_REMOVE);
+               ret = tree_mod_log_insert_key_locked(fs_info, src,
+                                                    i + src_offset,
+                                                    MOD_LOG_KEY_REMOVE);
                BUG_ON(ret < 0);
-               ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
-                                             MOD_LOG_KEY_ADD);
+               ret = tree_mod_log_insert_key_locked(fs_info, dst,
+                                                    i + dst_offset,
+                                                    MOD_LOG_KEY_ADD);
                BUG_ON(ret < 0);
        }
+
+       tree_mod_log_write_unlock(fs_info);
 }
 
 static inline void
@@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
        BUG_ON(ret < 0);
 }
 
-static inline void
+static noinline void
 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
                          struct extent_buffer *eb,
                          struct btrfs_disk_key *disk_key, int slot, int atomic)
@@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
        BUG_ON(ret < 0);
 }
 
-static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
-                                struct extent_buffer *eb)
+static noinline void
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 {
-       int i;
-       int ret;
-       u32 nritems;
-
        if (tree_mod_dont_log(fs_info, eb))
                return;
 
-       nritems = btrfs_header_nritems(eb);
-       for (i = nritems - 1; i >= 0; i--) {
-               ret = tree_mod_log_insert_key(fs_info, eb, i,
-                                             MOD_LOG_KEY_REMOVE_WHILE_FREEING);
-               BUG_ON(ret < 0);
-       }
+       __tree_mod_log_free_eb(fs_info, eb);
+
+       tree_mod_log_write_unlock(fs_info);
 }
 
-static inline void
+static noinline void
 tree_mod_log_set_root_pointer(struct btrfs_root *root,
                              struct extent_buffer *new_root_node)
 {
        int ret;
-       tree_mod_log_free_eb(root->fs_info, root->node);
        ret = tree_mod_log_insert_root(root->fs_info, root->node,
                                       new_root_node, GFP_NOFS);
        BUG_ON(ret < 0);
@@ -1069,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
        unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
        n = btrfs_header_nritems(eb);
-       while (tm && tm->elem.seq >= time_seq) {
+       while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
@@ -2721,6 +2788,78 @@ done:
        return ret;
 }
 
+/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+                              struct btrfs_key *key, struct btrfs_path *p,
+                              int find_higher, int return_any)
+{
+       int ret;
+       struct extent_buffer *leaf;
+
+again:
+       ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+       if (ret <= 0)
+               return ret;
+       /*
+        * a return value of 1 means the path is at the position where the
+        * item should be inserted. Normally this is the next bigger item,
+        * but in case the previous item is the last in a leaf, path points
+        * to the first free slot in the previous leaf, i.e. at an invalid
+        * item.
+        */
+       leaf = p->nodes[0];
+
+       if (find_higher) {
+               if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(root, p);
+                       if (ret <= 0)
+                               return ret;
+                       if (!return_any)
+                               return 1;
+                       /*
+                        * no higher item found, return the next
+                        * lower instead
+                        */
+                       return_any = 0;
+                       find_higher = 0;
+                       btrfs_release_path(p);
+                       goto again;
+               }
+       } else {
+               if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+                       /* we're sitting on an invalid slot */
+                       if (p->slots[0] == 0) {
+                               ret = btrfs_prev_leaf(root, p);
+                               if (ret <= 0)
+                                       return ret;
+                               if (!return_any)
+                                       return 1;
+                               /*
+                                * no lower item found, return the next
+                                * higher instead
+                                */
+                               return_any = 0;
+                               find_higher = 1;
+                               btrfs_release_path(p);
+                               goto again;
+                       }
+                       --p->slots[0];
+               }
+       }
+       return 0;
+}
+
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
index a0ee2f8e0566da9ba788c72697d4c9492ed37e18..00f9a50f986d95615e6affb651bf52510519396b 100644 (file)
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -883,6 +886,72 @@ struct btrfs_block_group_item {
        __le64 flags;
 } __attribute__ ((__packed__));
 
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON            (1ULL << 0)
+/*
+ * SCANNING is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_SCANNING      (1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning quota off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT  (1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION        1
+
+struct btrfs_qgroup_status_item {
+       __le64 version;
+       /*
+        * the generation is updated during every commit. As older
+        * versions of btrfs are not aware of qgroups, it will be
+        * possible to detect inconsistencies by checking the
+        * generation on mount time
+        */
+       __le64 generation;
+
+       /* flag definitions see above */
+       __le64 flags;
+
+       /*
+        * only used during scanning to record the progress
+        * of the scan. It contains a logical address
+        */
+       __le64 scan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+       __le64 generation;
+       __le64 rfer;
+       __le64 rfer_cmpr;
+       __le64 excl;
+       __le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER    (1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL    (1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER    (1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL    (1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR   (1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR   (1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+       /*
+        * only updated when any of the other values change
+        */
+       __le64 flags;
+       __le64 max_rfer;
+       __le64 max_excl;
+       __le64 rsv_rfer;
+       __le64 rsv_excl;
+} __attribute__ ((__packed__));
+
 struct btrfs_space_info {
        u64 flags;
 
@@ -1030,6 +1099,13 @@ struct btrfs_block_group_cache {
        struct list_head cluster_list;
 };
 
+/* delayed seq elem */
+struct seq_list {
+       struct list_head list;
+       u64 seq;
+};
+
+/* fs_info */
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
@@ -1044,6 +1120,7 @@ struct btrfs_fs_info {
        struct btrfs_root *dev_root;
        struct btrfs_root *fs_root;
        struct btrfs_root *csum_root;
+       struct btrfs_root *quota_root;
 
        /* the log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;
@@ -1144,6 +1221,8 @@ struct btrfs_fs_info {
        spinlock_t tree_mod_seq_lock;
        atomic_t tree_mod_seq;
        struct list_head tree_mod_seq_list;
+       struct seq_list tree_mod_seq_elem;
+       wait_queue_head_t tree_mod_seq_wait;
 
        /* this protects tree_mod_log */
        rwlock_t tree_mod_log_lock;
@@ -1298,6 +1377,29 @@ struct btrfs_fs_info {
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        u32 check_integrity_print_mask;
 #endif
+       /*
+        * quota information
+        */
+       unsigned int quota_enabled:1;
+
+       /*
+        * quota_enabled only changes state after a commit. This holds the
+        * next state.
+        */
+       unsigned int pending_quota_state:1;
+
+       /* is qgroup tracking in a consistent state? */
+       u64 qgroup_flags;
+
+       /* holds configuration and tracking. Protected by qgroup_lock */
+       struct rb_root qgroup_tree;
+       spinlock_t qgroup_lock;
+
+       /* list of dirty qgroups to be written at next commit */
+       struct list_head dirty_qgroups;
+
+       /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+       u64 qgroup_seq;
 
        /* filesystem state */
        u64 fs_state;
@@ -1527,6 +1629,30 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_ITEM_KEY     216
 #define BTRFS_CHUNK_ITEM_KEY   228
 
+/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY         240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY           242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY          244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY       246
+
 #define BTRFS_BALANCE_ITEM_KEY 248
 
 /*
@@ -2508,6 +2634,49 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
                            sizeof(val));
 }
 
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+                  generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+                  version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+                  flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
+                  scan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+                  generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+                  rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+                  excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+                        struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+                        rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+                        struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+                        excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+                        struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+                  flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+                  max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+                  max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+                  rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+                  rsv_excl, 64);
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -2703,6 +2872,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2753,6 +2924,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      ins_len, int cow);
 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
                          struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+                              struct btrfs_key *key, struct btrfs_path *p,
+                              int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
                       int start_slot, int cache_only, u64 *last_ret,
@@ -2835,11 +3009,22 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
        kfree(fs_info->chunk_root);
        kfree(fs_info->dev_root);
        kfree(fs_info->csum_root);
+       kfree(fs_info->quota_root);
        kfree(fs_info->super_copy);
        kfree(fs_info->super_for_commit);
        kfree(fs_info);
 }
 
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                          struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+       return atomic_inc_return(&fs_info->tree_mod_seq);
+}
+
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
                        struct btrfs_path *path,
@@ -3198,17 +3383,49 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
                         u64 start, int err);
 
-/* delayed seq elem */
-struct seq_list {
+/* qgroup.c */
+struct qgroup_update {
        struct list_head list;
-       u64 seq;
-       u32 flags;
+       struct btrfs_delayed_ref_node *node;
+       struct btrfs_delayed_extent_op *extent_op;
 };
 
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
-                           struct seq_list *elem);
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
-                           struct seq_list *elem);
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info);
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid,
+                       char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info, u64 qgroupid,
+                      struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_delayed_ref_node *node,
+                           struct btrfs_delayed_extent_op *extent_op);
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info,
+                            struct btrfs_delayed_ref_node *node,
+                            struct btrfs_delayed_extent_op *extent_op);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+                     struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+                        struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
 static inline int is_fstree(u64 rootid)
 {
index 13ae7b04790eaff72e8c23fb145fca8bfae88175..da7419ed01bb7e520cfbcac0fbef44ef607a92e5 100644 (file)
@@ -233,22 +233,26 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+                           struct btrfs_delayed_ref_root *delayed_refs,
                            u64 seq)
 {
        struct seq_list *elem;
-
-       assert_spin_locked(&delayed_refs->lock);
-       if (list_empty(&delayed_refs->seq_head))
-               return 0;
-
-       elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
-       if (seq >= elem->seq) {
-               pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
-                        seq, elem->seq, delayed_refs);
-               return 1;
+       int ret = 0;
+
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       if (!list_empty(&fs_info->tree_mod_seq_list)) {
+               elem = list_first_entry(&fs_info->tree_mod_seq_list,
+                                       struct seq_list, list);
+               if (seq >= elem->seq) {
+                       pr_debug("holding back delayed_ref %llu, lowest is "
+                                "%llu (%p)\n", seq, elem->seq, delayed_refs);
+                       ret = 1;
+               }
        }
-       return 0;
+
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+       return ret;
 }
 
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
@@ -525,8 +529,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        ref->is_head = 0;
        ref->in_tree = 1;
 
-       if (is_fstree(ref_root))
-               seq = inc_delayed_seq(delayed_refs);
+       if (need_ref_seq(for_cow, ref_root))
+               seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
        ref->seq = seq;
 
        full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -584,8 +588,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        ref->is_head = 0;
        ref->in_tree = 1;
 
-       if (is_fstree(ref_root))
-               seq = inc_delayed_seq(delayed_refs);
+       if (need_ref_seq(for_cow, ref_root))
+               seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
        ref->seq = seq;
 
        full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -658,10 +662,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, level, action,
                                   for_cow);
-       if (!is_fstree(ref_root) &&
-           waitqueue_active(&delayed_refs->seq_wait))
-               wake_up(&delayed_refs->seq_wait);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&fs_info->tree_mod_seq_wait))
+               wake_up(&fs_info->tree_mod_seq_wait);
        spin_unlock(&delayed_refs->lock);
+       if (need_ref_seq(for_cow, ref_root))
+               btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
        return 0;
 }
@@ -707,10 +713,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, owner, offset,
                                   action, for_cow);
-       if (!is_fstree(ref_root) &&
-           waitqueue_active(&delayed_refs->seq_wait))
-               wake_up(&delayed_refs->seq_wait);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&fs_info->tree_mod_seq_wait))
+               wake_up(&fs_info->tree_mod_seq_wait);
        spin_unlock(&delayed_refs->lock);
+       if (need_ref_seq(for_cow, ref_root))
+               btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
        return 0;
 }
@@ -736,8 +744,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
                                   extent_op->is_data);
 
-       if (waitqueue_active(&delayed_refs->seq_wait))
-               wake_up(&delayed_refs->seq_wait);
+       if (waitqueue_active(&fs_info->tree_mod_seq_wait))
+               wake_up(&fs_info->tree_mod_seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
index 413927fb9957e41fdcfb82511e63d416b8a36c76..0d7c90c366b629152796c0c717c0e4cd5f50e9b1 100644 (file)
@@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root {
        int flushing;
 
        u64 run_delayed_start;
-
-       /*
-        * seq number of delayed refs. We need to know if a backref was being
-        * added before the currently processed ref or afterwards.
-        */
-       u64 seq;
-
-       /*
-        * seq_list holds a list of all seq numbers that are currently being
-        * added to the list. While walking backrefs (btrfs_find_all_roots,
-        * qgroups), which might take some time, no newer ref must be processed,
-        * as it might influence the outcome of the walk.
-        */
-       struct list_head seq_head;
-
-       /*
-        * when the only refs we have in the list must not be processed, we want
-        * to wait for more refs to show up or for the end of backref walking.
-        */
-       wait_queue_head_t seq_wait;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -195,34 +175,28 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 search_start);
 
-static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
-{
-       assert_spin_locked(&delayed_refs->lock);
-       ++delayed_refs->seq;
-       return delayed_refs->seq;
-}
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+                           struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq);
 
-static inline void
-btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
-                     struct seq_list *elem)
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
 {
-       assert_spin_locked(&delayed_refs->lock);
-       elem->seq = delayed_refs->seq;
-       list_add_tail(&elem->list, &delayed_refs->seq_head);
-}
+       if (for_cow)
+               return 0;
 
-static inline void
-btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
-                     struct seq_list *elem)
-{
-       spin_lock(&delayed_refs->lock);
-       list_del(&elem->list);
-       wake_up(&delayed_refs->seq_wait);
-       spin_unlock(&delayed_refs->lock);
-}
+       if (rootid == BTRFS_FS_TREE_OBJECTID)
+               return 1;
 
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
-                           u64 seq);
+       if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+               return 1;
+
+       return 0;
+}
 
 /*
  * a node might live in a head or a regular ref, this lets you
index 1a4a2a97592670b2d6f747afb240f9ee6722b19a..05f4fb6e060709c8d70bae439b1ba3cc15de7428 100644 (file)
@@ -1225,6 +1225,93 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
        return root;
 }
 
+/*
+ * Create a new tree with the given objectid: allocate its root, set up a
+ * single leaf as the root node and insert the root item into the tree root.
+ *
+ * Returns the new root on success. On failure an ERR_PTR() is returned and
+ * the root and any leaf allocated for it are released again.
+ */
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+                                    struct btrfs_fs_info *fs_info,
+                                    u64 objectid)
+{
+       struct extent_buffer *leaf;
+       struct btrfs_root *tree_root = fs_info->tree_root;
+       struct btrfs_root *root;
+       struct btrfs_key key;
+       int ret = 0;
+
+       root = btrfs_alloc_root(fs_info);
+       if (!root)
+               return ERR_PTR(-ENOMEM);
+
+       __setup_root(tree_root->nodesize, tree_root->leafsize,
+                    tree_root->sectorsize, tree_root->stripesize,
+                    root, fs_info, objectid);
+       root->root_key.objectid = objectid;
+       root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+       root->root_key.offset = 0;
+
+       leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+                                     0, objectid, NULL, 0, 0, 0);
+       if (IS_ERR(leaf)) {
+               ret = PTR_ERR(leaf);
+               /* no leaf to unlock or free on the error path */
+               leaf = NULL;
+               goto fail;
+       }
+
+       memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+       btrfs_set_header_bytenr(leaf, leaf->start);
+       btrfs_set_header_generation(leaf, trans->transid);
+       btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+       btrfs_set_header_owner(leaf, objectid);
+       root->node = leaf;
+
+       write_extent_buffer(leaf, fs_info->fsid,
+                           (unsigned long)btrfs_header_fsid(leaf),
+                           BTRFS_FSID_SIZE);
+       write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+                           (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+                           BTRFS_UUID_SIZE);
+       btrfs_mark_buffer_dirty(leaf);
+
+       root->commit_root = btrfs_root_node(root);
+       root->track_dirty = 1;
+
+       root->root_item.flags = 0;
+       root->root_item.byte_limit = 0;
+       btrfs_set_root_bytenr(&root->root_item, leaf->start);
+       btrfs_set_root_generation(&root->root_item, trans->transid);
+       btrfs_set_root_level(&root->root_item, 0);
+       btrfs_set_root_refs(&root->root_item, 1);
+       btrfs_set_root_used(&root->root_item, leaf->len);
+       btrfs_set_root_last_snapshot(&root->root_item, 0);
+       btrfs_set_root_dirid(&root->root_item, 0);
+       root->root_item.drop_level = 0;
+
+       key.objectid = objectid;
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = 0;
+       ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+       if (ret)
+               goto fail;
+
+       btrfs_tree_unlock(leaf);
+       return root;
+
+fail:
+       /* undo the allocations made above so nothing is leaked */
+       if (leaf) {
+               btrfs_tree_unlock(leaf);
+               free_extent_buffer(root->commit_root);
+               free_extent_buffer(leaf);
+       }
+       kfree(root);
+       return ERR_PTR(ret);
+}
+
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
 {
@@ -1396,6 +1472,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                return fs_info->dev_root;
        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
                return fs_info->csum_root;
+       if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+               return fs_info->quota_root ? fs_info->quota_root :
+                                            ERR_PTR(-ENOENT);
 again:
        spin_lock(&fs_info->fs_roots_radix_lock);
        root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@ -1823,6 +1902,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
        free_extent_buffer(info->extent_root->commit_root);
        free_extent_buffer(info->csum_root->node);
        free_extent_buffer(info->csum_root->commit_root);
+       if (info->quota_root) {
+               free_extent_buffer(info->quota_root->node);
+               free_extent_buffer(info->quota_root->commit_root);
+       }
 
        info->tree_root->node = NULL;
        info->tree_root->commit_root = NULL;
@@ -1832,6 +1915,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
        info->extent_root->commit_root = NULL;
        info->csum_root->node = NULL;
        info->csum_root->commit_root = NULL;
+       if (info->quota_root) {
+               info->quota_root->node = NULL;
+               info->quota_root->commit_root = NULL;
+       }
 
        if (chunk_root) {
                free_extent_buffer(info->chunk_root->node);
@@ -1862,6 +1949,7 @@ int open_ctree(struct super_block *sb,
        struct btrfs_root *csum_root;
        struct btrfs_root *chunk_root;
        struct btrfs_root *dev_root;
+       struct btrfs_root *quota_root;
        struct btrfs_root *log_tree_root;
        int ret;
        int err = -EINVAL;
@@ -1873,9 +1961,10 @@ int open_ctree(struct super_block *sb,
        csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
        chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
        dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
+       quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
 
        if (!tree_root || !extent_root || !csum_root ||
-           !chunk_root || !dev_root) {
+           !chunk_root || !dev_root || !quota_root) {
                err = -ENOMEM;
                goto fail;
        }
@@ -1944,6 +2033,8 @@ int open_ctree(struct super_block *sb,
        fs_info->free_chunk_space = 0;
        fs_info->tree_mod_log = RB_ROOT;
 
+       init_waitqueue_head(&fs_info->tree_mod_seq_wait);
+
        /* readahead state */
        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
        spin_lock_init(&fs_info->reada_lock);
@@ -2032,6 +2123,13 @@ int open_ctree(struct super_block *sb,
        init_rwsem(&fs_info->cleanup_work_sem);
        init_rwsem(&fs_info->subvol_sem);
 
+       spin_lock_init(&fs_info->qgroup_lock);
+       fs_info->qgroup_tree = RB_ROOT;
+       INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+       fs_info->qgroup_seq = 1;
+       fs_info->quota_enabled = 0;
+       fs_info->pending_quota_state = 0;
+
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
 
@@ -2356,6 +2454,17 @@ retry_root_backup:
                goto recovery_tree_root;
        csum_root->track_dirty = 1;
 
+       ret = find_and_setup_root(tree_root, fs_info,
+                                 BTRFS_QUOTA_TREE_OBJECTID, quota_root);
+       if (ret) {
+               kfree(quota_root);
+               quota_root = fs_info->quota_root = NULL;
+       } else {
+               quota_root->track_dirty = 1;
+               fs_info->quota_enabled = 1;
+               fs_info->pending_quota_state = 1;
+       }
+
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
 
@@ -2415,6 +2524,9 @@ retry_root_backup:
                               " integrity check module %s\n", sb->s_id);
        }
 #endif
+       ret = btrfs_read_qgroup_config(fs_info);
+       if (ret)
+               goto fail_trans_kthread;
 
        /* do not make disk changes in broken FS */
        if (btrfs_super_log_root(disk_super) != 0 &&
@@ -2425,7 +2537,7 @@ retry_root_backup:
                        printk(KERN_WARNING "Btrfs log replay required "
                               "on RO media\n");
                        err = -EIO;
-                       goto fail_trans_kthread;
+                       goto fail_qgroup;
                }
                blocksize =
                     btrfs_level_size(tree_root,
@@ -2434,7 +2546,7 @@ retry_root_backup:
                log_tree_root = btrfs_alloc_root(fs_info);
                if (!log_tree_root) {
                        err = -ENOMEM;
-                       goto fail_trans_kthread;
+                       goto fail_qgroup;
                }
 
                __setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2474,7 +2586,7 @@ retry_root_backup:
                        printk(KERN_WARNING
                               "btrfs: failed to recover relocation\n");
                        err = -EINVAL;
-                       goto fail_trans_kthread;
+                       goto fail_qgroup;
                }
        }
 
@@ -2484,10 +2596,10 @@ retry_root_backup:
 
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (!fs_info->fs_root)
-               goto fail_trans_kthread;
+               goto fail_qgroup;
        if (IS_ERR(fs_info->fs_root)) {
                err = PTR_ERR(fs_info->fs_root);
-               goto fail_trans_kthread;
+               goto fail_qgroup;
        }
 
        if (sb->s_flags & MS_RDONLY)
@@ -2511,6 +2623,8 @@ retry_root_backup:
 
        return 0;
 
+fail_qgroup:
+       btrfs_free_qgroup_config(fs_info);
 fail_trans_kthread:
        kthread_stop(fs_info->transaction_kthread);
 fail_cleaner:
@@ -3109,6 +3223,8 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 2;
        smp_mb();
 
+       btrfs_free_qgroup_config(root->fs_info);
+
        if (fs_info->delalloc_bytes) {
                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                       (unsigned long long)fs_info->delalloc_bytes);
@@ -3128,6 +3244,10 @@ int close_ctree(struct btrfs_root *root)
        free_extent_buffer(fs_info->dev_root->commit_root);
        free_extent_buffer(fs_info->csum_root->node);
        free_extent_buffer(fs_info->csum_root->commit_root);
+       if (fs_info->quota_root) {
+               free_extent_buffer(fs_info->quota_root->node);
+               free_extent_buffer(fs_info->quota_root->commit_root);
+       }
 
        btrfs_free_block_groups(fs_info);
 
@@ -3258,7 +3378,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
        return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-static int btree_lock_page_hook(struct page *page, void *data,
+int btree_lock_page_hook(struct page *page, void *data,
                                void (*flush_fn)(void *))
 {
        struct inode *inode = page->mapping->host;
index 05b3fab39f7e814fc8c958e125f5a14c7e39d7f9..95e147eea23952c689ee9e87cfd39e7d61fde96b 100644 (file)
@@ -89,6 +89,12 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 int btrfs_cleanup_transaction(struct btrfs_root *root);
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
                                  struct btrfs_root *root);
+void btrfs_abort_devices(struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+                                    struct btrfs_fs_info *fs_info,
+                                    u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+                               void (*flush_fn)(void *));
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
index 71b2d1c7da69948b932e31fa804d6b8c95d64db9..44f06201f376a83809b66fadcd70fd8400baa4c6 100644 (file)
@@ -34,6 +34,8 @@
 #include "locking.h"
 #include "free-space-cache.h"
 
+#undef SCRAMBLE_DELAYED_REFS
+
 /*
  * control flags for do_chunk_alloc's force field
  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@ -2217,6 +2219,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        int count = 0;
        int must_insert_reserved = 0;
@@ -2255,7 +2258,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref = select_delayed_ref(locked_ref);
 
                if (ref && ref->seq &&
-                   btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+                   btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
                        /*
                         * there are still refs with lower seq numbers in the
                         * process of being added. Don't run this ref yet.
@@ -2337,7 +2340,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
 
 next:
-               do_chunk_alloc(trans, root->fs_info->extent_root,
+               do_chunk_alloc(trans, fs_info->extent_root,
                               2 * 1024 * 1024,
                               btrfs_get_alloc_profile(root, 0),
                               CHUNK_ALLOC_NO_FORCE);
@@ -2347,21 +2350,99 @@ next:
        return count;
 }
 
-static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
+                              struct btrfs_delayed_ref_root *delayed_refs,
                               unsigned long num_refs,
                               struct list_head *first_seq)
 {
        spin_unlock(&delayed_refs->lock);
        pr_debug("waiting for more refs (num %ld, first %p)\n",
                 num_refs, first_seq);
-       wait_event(delayed_refs->seq_wait,
+       wait_event(fs_info->tree_mod_seq_wait,
                   num_refs != delayed_refs->num_entries ||
-                  delayed_refs->seq_head.next != first_seq);
+                  fs_info->tree_mod_seq_list.next != first_seq);
        pr_debug("done waiting for more refs (num %ld, first %p)\n",
-                delayed_refs->num_entries, delayed_refs->seq_head.next);
+                delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
        spin_lock(&delayed_refs->lock);
 }
 
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning.
+ *
+ * Descends from the root of @root to a leaf, alternating between the left
+ * and the right child at every level, and returns the bytenr of the last
+ * node visited.  Returns 0 when the tree is empty.
+ */
+static u64 find_middle(struct rb_root *root)
+{
+       struct rb_node *n = root->rb_node;
+       struct btrfs_delayed_ref_node *entry;
+       int alt = 1;
+       /* stays 0 for an empty tree instead of returning stack garbage */
+       u64 middle = 0;
+
+       while (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               WARN_ON(!entry->in_tree);
+
+               middle = entry->bytenr;
+
+               /* zig-zag: left, right, left, ... */
+               if (alt)
+                       n = n->rb_left;
+               else
+                       n = n->rb_right;
+
+               alt = 1 - alt;
+       }
+       return middle;
+}
+#endif
+
+/*
+ * Run the qgroup accounting for every update queued on
+ * trans->qgroup_ref_list and then release the tree mod seq element stored in
+ * trans->delayed_ref_elem.
+ *
+ * Queued updates without a seq, or a seq without queued updates, are treated
+ * as a fatal inconsistency (BUG).  After the first failing update the
+ * remaining entries are only freed, not accounted, and that first error is
+ * returned.  Returns 0 when there was nothing to do.
+ */
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info)
+{
+       struct list_head *pending = &trans->qgroup_ref_list;
+       struct qgroup_update *upd;
+       int err = 0;
+
+       if (list_empty(pending) != !trans->delayed_ref_elem.seq) {
+               /* list without seq or seq without list */
+               printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+                       list_empty(pending) ? "" : " not",
+                       trans->delayed_ref_elem.seq);
+               BUG();
+       }
+
+       /* no seq (and therefore, per the check above, no queued updates) */
+       if (trans->delayed_ref_elem.seq == 0)
+               return 0;
+
+       while (!list_empty(pending)) {
+               upd = list_first_entry(pending, struct qgroup_update, list);
+               list_del(&upd->list);
+
+               /* keep draining the list after an error, but stop accounting */
+               if (err == 0)
+                       err = btrfs_qgroup_account_ref(trans, fs_info,
+                                                      upd->node,
+                                                      upd->extent_op);
+               kfree(upd);
+       }
+
+       btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+       return err;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2398,11 +2479,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
                       CHUNK_ALLOC_NO_FORCE);
 
+       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
        consider_waiting = 0;
        spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+       delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
                run_most = 1;
@@ -2437,7 +2525,7 @@ again:
                                num_refs = delayed_refs->num_entries;
                                first_seq = root->fs_info->tree_mod_seq_list.next;
                        } else {
-                               wait_for_more_refs(delayed_refs,
+                               wait_for_more_refs(root->fs_info, delayed_refs,
                                                   num_refs, first_seq);
                                /*
                                 * after waiting, things have changed. we
@@ -2502,6 +2590,7 @@ again:
        }
 out:
        spin_unlock(&delayed_refs->lock);
+       assert_qgroups_uptodate(trans);
        return 0;
 }
 
@@ -4479,6 +4568,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
 
+       if (root->fs_info->quota_enabled) {
+               ret = btrfs_qgroup_reserve(root, num_bytes +
+                                          nr_extents * root->leafsize);
+               if (ret)
+                       return ret;
+       }
+
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (ret) {
                u64 to_free = 0;
@@ -4557,6 +4653,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
+       if (root->fs_info->quota_enabled) {
+               btrfs_qgroup_free(root, num_bytes +
+                                       dropped * root->leafsize);
+       }
+
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
 }
@@ -5193,8 +5294,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        rb_erase(&head->node.rb_node, &delayed_refs->root);
 
        delayed_refs->num_entries--;
-       if (waitqueue_active(&delayed_refs->seq_wait))
-               wake_up(&delayed_refs->seq_wait);
+       if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
+               wake_up(&root->fs_info->tree_mod_seq_wait);
 
        /*
         * we don't take a ref on the node because we're removing it from the
index 17facea6a51c8428878dffe52a9910b825fb8fd8..e54b663fd3aab34972ee5a4d3ee21a54617df289 100644 (file)
@@ -336,7 +336,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 static noinline int create_subvol(struct btrfs_root *root,
                                  struct dentry *dentry,
                                  char *name, int namelen,
-                                 u64 *async_transid)
+                                 u64 *async_transid,
+                                 struct btrfs_qgroup_inherit **inherit)
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
@@ -368,6 +369,11 @@ static noinline int create_subvol(struct btrfs_root *root,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
+       ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
+                                  inherit ? *inherit : NULL);
+       if (ret)
+               goto fail;
+
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
                                      0, objectid, NULL, 0, 0, 0);
        if (IS_ERR(leaf)) {
@@ -484,7 +490,7 @@ fail:
 
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
                           char *name, int namelen, u64 *async_transid,
-                          bool readonly)
+                          bool readonly, struct btrfs_qgroup_inherit **inherit)
 {
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
@@ -502,6 +508,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
        pending_snapshot->readonly = readonly;
+       if (inherit) {
+               pending_snapshot->inherit = *inherit;
+               *inherit = NULL;        /* take responsibility to free it */
+       }
 
        trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
        if (IS_ERR(trans)) {
@@ -635,7 +645,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 static noinline int btrfs_mksubvol(struct path *parent,
                                   char *name, int namelen,
                                   struct btrfs_root *snap_src,
-                                  u64 *async_transid, bool readonly)
+                                  u64 *async_transid, bool readonly,
+                                  struct btrfs_qgroup_inherit **inherit)
 {
        struct inode *dir  = parent->dentry->d_inode;
        struct dentry *dentry;
@@ -662,11 +673,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
                goto out_up_read;
 
        if (snap_src) {
-               error = create_snapshot(snap_src, dentry,
-                                       name, namelen, async_transid, readonly);
+               error = create_snapshot(snap_src, dentry, name, namelen,
+                                       async_transid, readonly, inherit);
        } else {
                error = create_subvol(BTRFS_I(dir)->root, dentry,
-                                     name, namelen, async_transid);
+                                     name, namelen, async_transid, inherit);
        }
        if (!error)
                fsnotify_mkdir(dir, dentry);
@@ -1375,11 +1386,9 @@ out:
 }
 
 static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
-                                                   char *name,
-                                                   unsigned long fd,
-                                                   int subvol,
-                                                   u64 *transid,
-                                                   bool readonly)
+                               char *name, unsigned long fd, int subvol,
+                               u64 *transid, bool readonly,
+                               struct btrfs_qgroup_inherit **inherit)
 {
        struct file *src_file;
        int namelen;
@@ -1403,7 +1412,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 
        if (subvol) {
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
-                                    NULL, transid, readonly);
+                                    NULL, transid, readonly, inherit);
        } else {
                struct inode *src_inode;
                src_file = fget(fd);
@@ -1422,7 +1431,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
                }
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
                                     BTRFS_I(src_inode)->root,
-                                    transid, readonly);
+                                    transid, readonly, inherit);
                fput(src_file);
        }
 out_drop_write:
@@ -1444,7 +1453,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 
        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
                                              vol_args->fd, subvol,
-                                             NULL, false);
+                                             NULL, false, NULL);
 
        kfree(vol_args);
        return ret;
@@ -1458,6 +1467,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
        u64 transid = 0;
        u64 *ptr = NULL;
        bool readonly = false;
+       struct btrfs_qgroup_inherit *inherit = NULL;
 
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
@@ -1465,7 +1475,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
        vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 
        if (vol_args->flags &
-           ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+           ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+             BTRFS_SUBVOL_QGROUP_INHERIT)) {
                ret = -EOPNOTSUPP;
                goto out;
        }
@@ -1474,10 +1485,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
                ptr = &transid;
        if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
                readonly = true;
+       if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+               if (vol_args->size > PAGE_CACHE_SIZE) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+               if (IS_ERR(inherit)) {
+                       ret = PTR_ERR(inherit);
+                       goto out;
+               }
+       }
 
        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
-                                             vol_args->fd, subvol,
-                                             ptr, readonly);
+                                             vol_args->fd, subvol, ptr,
+                                             readonly, &inherit);
 
        if (ret == 0 && ptr &&
            copy_to_user(arg +
@@ -1486,6 +1508,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
                ret = -EFAULT;
 out:
        kfree(vol_args);
+       kfree(inherit);
        return ret;
 }
 
@@ -3401,6 +3424,183 @@ out:
        return ret;
 }
 
+/*
+ * Handler for BTRFS_IOC_QUOTA_CTL: enable, disable or rescan quotas.
+ *
+ * Fails with -EPERM without CAP_SYS_ADMIN and with -EROFS when the super
+ * block is flagged MS_RDONLY.  Every command other than
+ * BTRFS_QUOTA_CTL_RESCAN runs with a transaction that is committed before
+ * returning.  The argument struct is copied back to user space afterwards;
+ * an unknown command yields -EINVAL.
+ */
+static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_quota_ctl_args *args;
+       struct btrfs_trans_handle *trans = NULL;
+       int commit_err;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       args = memdup_user(arg, sizeof(*args));
+       if (IS_ERR(args))
+               return PTR_ERR(args);
+
+       /* btrfs_quota_rescan() takes no transaction handle */
+       if (args->cmd != BTRFS_QUOTA_CTL_RESCAN) {
+               trans = btrfs_start_transaction(root, 2);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       goto free_args;
+               }
+       }
+
+       if (args->cmd == BTRFS_QUOTA_CTL_ENABLE)
+               ret = btrfs_quota_enable(trans, fs_info);
+       else if (args->cmd == BTRFS_QUOTA_CTL_DISABLE)
+               ret = btrfs_quota_disable(trans, fs_info);
+       else if (args->cmd == BTRFS_QUOTA_CTL_RESCAN)
+               ret = btrfs_quota_rescan(fs_info);
+       else
+               ret = -EINVAL;
+
+       if (copy_to_user(arg, args, sizeof(*args)))
+               ret = -EFAULT;
+
+       /* a commit error is only reported if nothing failed before it */
+       if (trans) {
+               commit_err = btrfs_commit_transaction(trans, root);
+               if (!ret && commit_err)
+                       ret = commit_err;
+       }
+
+free_args:
+       kfree(args);
+       return ret;
+}
+
+/*
+ * Handler for BTRFS_IOC_QGROUP_ASSIGN: add (assign != 0) or remove
+ * (assign == 0) the qgroup relation between src and dst.
+ *
+ * Fails with -EPERM without CAP_SYS_ADMIN and with -EROFS when the super
+ * block is flagged MS_RDONLY.  The update runs in a joined transaction; the
+ * first error from the relation update or from ending the transaction is
+ * returned.
+ */
+static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_qgroup_assign_args *args;
+       struct btrfs_trans_handle *trans;
+       int end_err;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       args = memdup_user(arg, sizeof(*args));
+       if (IS_ERR(args))
+               return PTR_ERR(args);
+
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto free_args;
+       }
+
+       /* FIXME: check if the IDs really exist */
+       if (!args->assign)
+               ret = btrfs_del_qgroup_relation(trans, fs_info,
+                                               args->src, args->dst);
+       else
+               ret = btrfs_add_qgroup_relation(trans, fs_info,
+                                               args->src, args->dst);
+
+       end_err = btrfs_end_transaction(trans, root);
+       if (!ret && end_err)
+               ret = end_err;
+
+free_args:
+       kfree(args);
+       return ret;
+}
+
+/*
+ * Handler for BTRFS_IOC_QGROUP_CREATE: create (create != 0) or remove
+ * (create == 0) the qgroup with the given qgroupid.
+ *
+ * Fails with -EPERM without CAP_SYS_ADMIN and with -EROFS when the super
+ * block is flagged MS_RDONLY.  The update runs in a joined transaction; the
+ * first error from the qgroup update or from ending the transaction is
+ * returned.
+ */
+static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_qgroup_create_args *args;
+       struct btrfs_trans_handle *trans;
+       int end_err;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       args = memdup_user(arg, sizeof(*args));
+       if (IS_ERR(args))
+               return PTR_ERR(args);
+
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto free_args;
+       }
+
+       /* FIXME: check if the IDs really exist */
+       if (!args->create)
+               ret = btrfs_remove_qgroup(trans, fs_info, args->qgroupid);
+       else
+               ret = btrfs_create_qgroup(trans, fs_info, args->qgroupid,
+                                         NULL);
+
+       end_err = btrfs_end_transaction(trans, root);
+       if (!ret && end_err)
+               ret = end_err;
+
+free_args:
+       kfree(args);
+       return ret;
+}
+
+/*
+ * Handler for BTRFS_IOC_QGROUP_LIMIT: apply the limits in the argument to a
+ * qgroup.  A qgroupid of 0 selects root->root_key.objectid, i.e. the qgroup
+ * of the subvolume @root.
+ *
+ * Fails with -EPERM without CAP_SYS_ADMIN and with -EROFS when the super
+ * block is flagged MS_RDONLY.  The update runs in a joined transaction; the
+ * first error from the limit update or from ending the transaction is
+ * returned.
+ */
+static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_qgroup_limit_args *args;
+       struct btrfs_trans_handle *trans;
+       u64 target;
+       int end_err;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       args = memdup_user(arg, sizeof(*args));
+       if (IS_ERR(args))
+               return PTR_ERR(args);
+
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto free_args;
+       }
+
+       /* take the current subvol as qgroup when none was given */
+       target = args->qgroupid ? args->qgroupid : root->root_key.objectid;
+
+       /* FIXME: check if the IDs really exist */
+       ret = btrfs_limit_qgroup(trans, fs_info, target, &args->lim);
+
+       end_err = btrfs_end_transaction(trans, root);
+       if (!ret && end_err)
+               ret = end_err;
+
+free_args:
+       kfree(args);
+       return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -3422,6 +3622,8 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_snap_create_v2(file, argp, 0);
        case BTRFS_IOC_SUBVOL_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 1);
+       case BTRFS_IOC_SUBVOL_CREATE_V2:
+               return btrfs_ioctl_snap_create_v2(file, argp, 1);
        case BTRFS_IOC_SNAP_DESTROY:
                return btrfs_ioctl_snap_destroy(file, argp);
        case BTRFS_IOC_SUBVOL_GETFLAGS:
@@ -3485,6 +3687,14 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_balance_progress(root, argp);
        case BTRFS_IOC_GET_DEV_STATS:
                return btrfs_ioctl_get_dev_stats(root, argp);
+       case BTRFS_IOC_QUOTA_CTL:
+               return btrfs_ioctl_quota_ctl(root, argp);
+       case BTRFS_IOC_QGROUP_ASSIGN:
+               return btrfs_ioctl_qgroup_assign(root, argp);
+       case BTRFS_IOC_QGROUP_CREATE:
+               return btrfs_ioctl_qgroup_create(root, argp);
+       case BTRFS_IOC_QGROUP_LIMIT:
+               return btrfs_ioctl_qgroup_limit(root, argp);
        }
 
        return -ENOTTY;
index 4e3e5d342a2b3105d3ec0947ebacd2c519efcf24..3f9701d571ea39f8251f7c3fc2139244293111a8 100644 (file)
@@ -32,15 +32,46 @@ struct btrfs_ioctl_vol_args {
 
 #define BTRFS_SUBVOL_CREATE_ASYNC      (1ULL << 0)
 #define BTRFS_SUBVOL_RDONLY            (1ULL << 1)
+#define BTRFS_SUBVOL_QGROUP_INHERIT    (1ULL << 2)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_UUID_SIZE 16
 
+#define BTRFS_QGROUP_INHERIT_SET_LIMITS        (1ULL << 0)
+
+struct btrfs_qgroup_limit {
+       __u64   flags;          /* NOTE(review): presumably which limits are set -- confirm */
+       __u64   max_rfer;       /* NOTE(review): presumably max referenced -- confirm */
+       __u64   max_excl;       /* NOTE(review): presumably max exclusive -- confirm */
+       __u64   rsv_rfer;       /* NOTE(review): semantics not visible here -- TODO document */
+       __u64   rsv_excl;       /* NOTE(review): semantics not visible here -- TODO document */
+};
+
+struct btrfs_qgroup_inherit {
+       __u64   flags;          /* NOTE(review): presumably BTRFS_QGROUP_INHERIT_* bits -- confirm */
+       __u64   num_qgroups;    /* NOTE(review): presumably entry count of qgroups[] -- confirm */
+       __u64   num_ref_copies; /* NOTE(review): semantics not visible here -- TODO document */
+       __u64   num_excl_copies; /* NOTE(review): semantics not visible here -- TODO document */
+       struct btrfs_qgroup_limit lim;
+       __u64   qgroups[0];     /* variable-length tail; the total size of the
+                                * struct is passed separately by the caller
+                                * (size in btrfs_ioctl_vol_args_v2) */
+};
+
+struct btrfs_ioctl_qgroup_limit_args {
+       __u64   qgroupid;       /* 0 selects the qgroup of the current subvolume */
+       struct btrfs_qgroup_limit lim;  /* handed to btrfs_limit_qgroup() */
+};
+
 #define BTRFS_SUBVOL_NAME_MAX 4039
 struct btrfs_ioctl_vol_args_v2 {
        __s64 fd;
        __u64 transid;
        __u64 flags;
-       __u64 unused[4];
+       union {
+               struct {
+                       __u64 size;
+                       struct btrfs_qgroup_inherit __user *qgroup_inherit;
+               };
+               __u64 unused[4];
+       };
        char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
 
@@ -299,6 +330,25 @@ struct btrfs_ioctl_get_dev_stats {
        __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
 };
 
+#define BTRFS_QUOTA_CTL_ENABLE 1
+#define BTRFS_QUOTA_CTL_DISABLE        2
+#define BTRFS_QUOTA_CTL_RESCAN 3
+struct btrfs_ioctl_quota_ctl_args {
+       __u64 cmd;      /* one of BTRFS_QUOTA_CTL_*; anything else gives -EINVAL */
+       __u64 status;   /* NOTE(review): not written by btrfs_ioctl_quota_ctl() itself */
+};
+
+struct btrfs_ioctl_qgroup_assign_args {
+       __u64 assign;   /* non-zero: add the relation, zero: remove it */
+       __u64 src;      /* passed as src to btrfs_{add,del}_qgroup_relation() */
+       __u64 dst;      /* passed as dst to btrfs_{add,del}_qgroup_relation() */
+};
+
+struct btrfs_ioctl_qgroup_create_args {
+       __u64 create;   /* non-zero: create the qgroup, zero: remove it */
+       __u64 qgroupid; /* id of the qgroup to create or remove */
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
                                   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -343,6 +393,8 @@ struct btrfs_ioctl_get_dev_stats {
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
 #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
                                   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
+                                  struct btrfs_ioctl_vol_args_v2)
 #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
 #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
 #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
@@ -365,6 +417,14 @@ struct btrfs_ioctl_get_dev_stats {
                                        struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
                                     struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
+                              struct btrfs_ioctl_quota_ctl_args)
+#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
+                              struct btrfs_ioctl_qgroup_assign_args)
+#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
+                              struct btrfs_ioctl_qgroup_create_args)
+#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
+                              struct btrfs_ioctl_qgroup_limit_args)
 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
                                      struct btrfs_ioctl_get_dev_stats)
 #endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644 (file)
index 0000000..bc424ae
--- /dev/null
@@ -0,0 +1,1571 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "ioctl.h"
+#include "backref.h"
+
+/* TODO XXX FIXME
+ *  - subvol delete -> delete when ref goes to 0? delete limits also?
+ *  - reorganize keys
+ *  - compressed
+ *  - sync
+ *  - rescan
+ *  - copy also limits on subvol creation
+ *  - limit
+ *  - caches fuer ulists
+ *  - performance benchmarks
+ *  - check all ioctl parameters
+ */
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ *
+ * Instances are allocated zeroed and inserted by add_qgroup_rb(), and are
+ * erased from the tree and freed by del_qgroup_rb().  The tree is keyed by
+ * qgroupid.
+ */
+struct btrfs_qgroup {
+       u64 qgroupid;
+
+       /*
+        * state
+        */
+       u64 rfer;       /* referenced */
+       u64 rfer_cmpr;  /* referenced compressed */
+       u64 excl;       /* exclusive */
+       u64 excl_cmpr;  /* exclusive compressed */
+
+       /*
+        * limits
+        */
+       u64 lim_flags;  /* which limits are set */
+       u64 max_rfer;
+       u64 max_excl;
+       u64 rsv_rfer;
+       u64 rsv_excl;
+
+       /*
+        * reservation tracking
+        */
+       u64 reserved;
+
+       /*
+        * lists
+        *
+        * groups and members hold struct btrfs_qgroup_list entries, linked
+        * through their next_group and next_member fields respectively.
+        */
+       struct list_head groups;  /* groups this group is member of */
+       struct list_head members; /* groups that are members of this group */
+       struct list_head dirty;   /* dirty groups */
+       struct rb_node node;      /* tree of qgroups */
+
+       /*
+        * temp variables for accounting operations
+        */
+       u64 tag;
+       u64 refcnt;
+};
+
+/*
+ * glue structure to represent the relations between qgroups.
+ *
+ * One entry exists per (member, group) pair.  add_relation_rb() links it
+ * into member->groups through next_group and into group->members through
+ * next_member; del_relation_rb() and del_qgroup_rb() unlink and free it.
+ */
+struct btrfs_qgroup_list {
+       struct list_head next_group;
+       struct list_head next_member;
+       struct btrfs_qgroup *group;
+       struct btrfs_qgroup *member;
+};
+
+/*
+ * Look up the in-memory qgroup with id @qgroupid in fs_info->qgroup_tree.
+ * Returns NULL when the tree holds no such qgroup.
+ *
+ * Descent direction: a node whose id is smaller than the one searched for
+ * sends us to its left child, a larger one to its right child.
+ *
+ * must be called with qgroup_lock held
+ */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+                                          u64 qgroupid)
+{
+       struct rb_node *cur;
+
+       for (cur = fs_info->qgroup_tree.rb_node; cur; ) {
+               struct btrfs_qgroup *qg;
+
+               qg = rb_entry(cur, struct btrfs_qgroup, node);
+               if (qg->qgroupid == qgroupid)
+                       return qg;
+
+               cur = qg->qgroupid < qgroupid ? cur->rb_left : cur->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * Return the qgroup with id @qgroupid from fs_info->qgroup_tree.  If the
+ * tree has no such entry, a zeroed one is allocated with GFP_ATOMIC,
+ * initialised and inserted first.  Returns ERR_PTR(-ENOMEM) when that
+ * allocation fails.
+ *
+ * must be called with qgroup_lock held
+ */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+                                         u64 qgroupid)
+{
+       struct rb_root *tree = &fs_info->qgroup_tree;
+       struct rb_node **link = &tree->rb_node;
+       struct rb_node *parent = NULL;
+       struct btrfs_qgroup *qg;
+
+       /* find either the existing entry or the slot for a new one */
+       while (*link) {
+               parent = *link;
+               qg = rb_entry(parent, struct btrfs_qgroup, node);
+
+               if (qg->qgroupid == qgroupid)
+                       return qg;
+
+               if (qg->qgroupid < qgroupid)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       qg = kzalloc(sizeof(*qg), GFP_ATOMIC);
+       if (!qg)
+               return ERR_PTR(-ENOMEM);
+
+       qg->qgroupid = qgroupid;
+       INIT_LIST_HEAD(&qg->groups);
+       INIT_LIST_HEAD(&qg->members);
+       INIT_LIST_HEAD(&qg->dirty);
+
+       rb_link_node(&qg->node, parent, link);
+       rb_insert_color(&qg->node, tree);
+
+       return qg;
+}
+
+/*
+ * Remove the qgroup with id @qgroupid from fs_info->qgroup_tree and from the
+ * dirty list, drop every relation it takes part in (as member or as parent)
+ * and free it.  Returns -ENOENT when no such qgroup exists, 0 otherwise.
+ *
+ * must be called with qgroup_lock held
+ */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+       struct btrfs_qgroup *qg = find_qgroup_rb(fs_info, qgroupid);
+       struct btrfs_qgroup_list *rel;
+       struct btrfs_qgroup_list *next;
+
+       if (!qg)
+               return -ENOENT;
+
+       rb_erase(&qg->node, &fs_info->qgroup_tree);
+       list_del(&qg->dirty);
+
+       /* relations in which this qgroup is the member */
+       list_for_each_entry_safe(rel, next, &qg->groups, next_group) {
+               list_del(&rel->next_group);
+               list_del(&rel->next_member);
+               kfree(rel);
+       }
+
+       /* relations in which this qgroup is the parent */
+       list_for_each_entry_safe(rel, next, &qg->members, next_member) {
+               list_del(&rel->next_group);
+               list_del(&rel->next_member);
+               kfree(rel);
+       }
+
+       kfree(qg);
+       return 0;
+}
+
+/*
+ * Record in memory that qgroup @memberid is a member of qgroup @parentid.
+ * Returns -ENOENT when either qgroup is unknown and -ENOMEM when the glue
+ * entry cannot be allocated (GFP_ATOMIC), 0 on success.
+ *
+ * must be called with qgroup_lock held
+ */
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+                          u64 memberid, u64 parentid)
+{
+       struct btrfs_qgroup *member = find_qgroup_rb(fs_info, memberid);
+       struct btrfs_qgroup *parent = find_qgroup_rb(fs_info, parentid);
+       struct btrfs_qgroup_list *link;
+
+       if (!(member && parent))
+               return -ENOENT;
+
+       link = kzalloc(sizeof(*link), GFP_ATOMIC);
+       if (!link)
+               return -ENOMEM;
+
+       link->group = parent;
+       link->member = member;
+
+       /* the entry sits on both qgroups' lists at once */
+       list_add_tail(&link->next_group, &member->groups);
+       list_add_tail(&link->next_member, &parent->members);
+
+       return 0;
+}
+
+/*
+ * Drop the in-memory relation that makes qgroup @memberid a member of
+ * qgroup @parentid.  Returns -ENOENT when either qgroup or the relation
+ * itself does not exist, 0 on success.
+ *
+ * must be called with qgroup_lock held
+ */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+                          u64 memberid, u64 parentid)
+{
+       struct btrfs_qgroup *member = find_qgroup_rb(fs_info, memberid);
+       struct btrfs_qgroup *parent = find_qgroup_rb(fs_info, parentid);
+       struct btrfs_qgroup_list *cur;
+       struct btrfs_qgroup_list *found = NULL;
+
+       if (!(member && parent))
+               return -ENOENT;
+
+       /* only the first matching entry is removed */
+       list_for_each_entry(cur, &member->groups, next_group) {
+               if (cur->group == parent) {
+                       found = cur;
+                       break;
+               }
+       }
+       if (!found)
+               return -ENOENT;
+
+       list_del(&found->next_group);
+       list_del(&found->next_member);
+       kfree(found);
+       return 0;
+}
+
+/*
+ * The full config is read in one go, only called from open_ctree()
+ * It doesn't use any locking, as at this point we're still single-threaded
+ *
+ * Returns 0 right away when quota is not enabled. Otherwise the quota tree
+ * is walked twice: the first pass picks up the status item and sets up an
+ * in-memory qgroup for every info/limit item, the second pass links the
+ * qgroups according to the relation items. Inconsistencies found on the way
+ * are collected in a local flag word and merged into fs_info->qgroup_flags
+ * at the end. If the resulting flags do not have BTRFS_QGROUP_STATUS_FLAG_ON
+ * set, quota_enabled and pending_quota_state are cleared.
+ *
+ * Returns a negative errno on failure. Positive return values from the tree
+ * search/iteration helpers are mapped to 0.
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct btrfs_path *path = NULL;
+       struct extent_buffer *l;
+       int slot;
+       int ret = 0;
+       u64 flags = 0;
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /* default this to quota off, in case no status key is found */
+       fs_info->qgroup_flags = 0;
+
+       /*
+        * pass 1: read status, all qgroup infos and limits
+        *
+        * an info item is expected to be the first item seen for a qgroup
+        * and a limit item is expected to follow an already known qgroup,
+        * anything else is flagged as inconsistent below
+        */
+       key.objectid = 0;
+       key.type = 0;
+       key.offset = 0;
+       ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+       if (ret)
+               goto out;
+
+       while (1) {
+               struct btrfs_qgroup *qgroup;
+
+               slot = path->slots[0];
+               l = path->nodes[0];
+               btrfs_item_key_to_cpu(l, &found_key, slot);
+
+               if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+                       struct btrfs_qgroup_status_item *ptr;
+
+                       ptr = btrfs_item_ptr(l, slot,
+                                            struct btrfs_qgroup_status_item);
+
+                       if (btrfs_qgroup_status_version(l, ptr) !=
+                           BTRFS_QGROUP_STATUS_VERSION) {
+                               printk(KERN_ERR
+                                "btrfs: old qgroup version, quota disabled\n");
+                               goto out;
+                       }
+                       if (btrfs_qgroup_status_generation(l, ptr) !=
+                           fs_info->generation) {
+                               flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+                               printk(KERN_ERR
+                                       "btrfs: qgroup generation mismatch, "
+                                       "marked as inconsistent\n");
+                       }
+                       fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+                                                                         ptr);
+                       /* FIXME read scan element */
+                       goto next1;
+               }
+
+               if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+                   found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+                       goto next1;
+
+               qgroup = find_qgroup_rb(fs_info, found_key.offset);
+               if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+                   (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+                       printk(KERN_ERR "btrfs: inconsitent qgroup config\n");
+                       flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               }
+               if (!qgroup) {
+                       qgroup = add_qgroup_rb(fs_info, found_key.offset);
+                       if (IS_ERR(qgroup)) {
+                               ret = PTR_ERR(qgroup);
+                               goto out;
+                       }
+               }
+               switch (found_key.type) {
+               case BTRFS_QGROUP_INFO_KEY: {
+                       struct btrfs_qgroup_info_item *ptr;
+
+                       ptr = btrfs_item_ptr(l, slot,
+                                            struct btrfs_qgroup_info_item);
+                       qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+                       qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+                       qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+                       qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+                       /* generation currently unused */
+                       break;
+               }
+               case BTRFS_QGROUP_LIMIT_KEY: {
+                       struct btrfs_qgroup_limit_item *ptr;
+
+                       ptr = btrfs_item_ptr(l, slot,
+                                            struct btrfs_qgroup_limit_item);
+                       qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+                       qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+                       qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+                       qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+                       qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+                       break;
+               }
+               }
+next1:
+               ret = btrfs_next_item(quota_root, path);
+               if (ret < 0)
+                       goto out;
+               if (ret)
+                       break;
+       }
+       btrfs_release_path(path);
+
+       /*
+        * pass 2: read all qgroup relations
+        *
+        * a failure of add_relation_rb() aborts the whole read with that
+        * error
+        */
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_RELATION_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+       if (ret)
+               goto out;
+       while (1) {
+               slot = path->slots[0];
+               l = path->nodes[0];
+               btrfs_item_key_to_cpu(l, &found_key, slot);
+
+               if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+                       goto next2;
+
+               if (found_key.objectid > found_key.offset) {
+                       /* parent <- member, not needed to build config */
+                       /* FIXME should we omit the key completely? */
+                       goto next2;
+               }
+
+               ret = add_relation_rb(fs_info, found_key.objectid,
+                                     found_key.offset);
+               if (ret)
+                       goto out;
+next2:
+               ret = btrfs_next_item(quota_root, path);
+               if (ret < 0)
+                       goto out;
+               if (ret)
+                       break;
+       }
+out:
+       fs_info->qgroup_flags |= flags;
+       if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
+               fs_info->quota_enabled = 0;
+               fs_info->pending_quota_state = 0;
+       }
+       btrfs_free_path(path);
+
+       return ret < 0 ? ret : 0;
+}
+
+/*
+ * Clean up the in-memory structures: every qgroup is taken out of the
+ * rb-tree and freed together with all relation entries it is part of.
+ * This is only called from close_ctree() or open_ctree(), both in single-
+ * threaded paths. No locking needed.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_qgroup_list *link;
+       struct btrfs_qgroup *qg;
+       struct rb_node *cur;
+
+       while (1) {
+               cur = rb_first(&fs_info->qgroup_tree);
+               if (!cur)
+                       break;
+
+               qg = rb_entry(cur, struct btrfs_qgroup, node);
+               rb_erase(cur, &fs_info->qgroup_tree);
+
+               WARN_ON(!list_empty(&qg->dirty));
+
+               /* relations in which this qgroup is the member */
+               while (!list_empty(&qg->groups)) {
+                       link = list_first_entry(&qg->groups,
+                                               struct btrfs_qgroup_list,
+                                               next_group);
+                       list_del(&link->next_group);
+                       list_del(&link->next_member);
+                       kfree(link);
+               }
+
+               /* relations in which this qgroup is the parent */
+               while (!list_empty(&qg->members)) {
+                       link = list_first_entry(&qg->members,
+                                               struct btrfs_qgroup_list,
+                                               next_member);
+                       list_del(&link->next_group);
+                       list_del(&link->next_member);
+                       kfree(link);
+               }
+
+               kfree(qg);
+       }
+}
+
+/*
+ * Insert an empty BTRFS_QGROUP_RELATION_KEY item with the key
+ * (src, RELATION, dst) into the quota tree. The item has a size of 0, the
+ * relation is expressed by the key alone.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the error
+ * returned by btrfs_insert_empty_item().
+ */
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *quota_root,
+                                   u64 src, u64 dst)
+{
+       int ret;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = src;
+       key.type = BTRFS_QGROUP_RELATION_KEY;
+       key.offset = dst;
+
+       ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+       /*
+        * only touch the leaf when the insertion succeeded, after a failure
+        * nothing was changed and the path need not point to a leaf at all
+        */
+       if (!ret)
+               btrfs_mark_buffer_dirty(path->nodes[0]);
+
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Remove the relation item with the key (src, RELATION, dst) from the quota
+ * tree. Returns -ENOENT if there is no such item.
+ */
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *quota_root,
+                                   u64 src, u64 dst)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       int err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = src;
+       key.type = BTRFS_QGROUP_RELATION_KEY;
+       key.offset = dst;
+
+       err = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+       if (err > 0)
+               err = -ENOENT;
+       if (!err)
+               err = btrfs_del_item(trans, quota_root, path);
+
+       btrfs_free_path(path);
+       return err;
+}
+
+/*
+ * Create the two on-disk items of a new qgroup: an info item stamped with
+ * trans->transid and a limit item, both with all counters set to 0.
+ */
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *quota_root, u64 qgroupid)
+{
+       struct btrfs_qgroup_limit_item *limit_item;
+       struct btrfs_qgroup_info_item *info_item;
+       struct extent_buffer *eb;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = 0;
+       key.offset = qgroupid;
+
+       /* first the info item ... */
+       key.type = BTRFS_QGROUP_INFO_KEY;
+       err = btrfs_insert_empty_item(trans, quota_root, path, &key,
+                                     sizeof(*info_item));
+       if (err)
+               goto done;
+
+       eb = path->nodes[0];
+       info_item = btrfs_item_ptr(eb, path->slots[0],
+                                  struct btrfs_qgroup_info_item);
+       btrfs_set_qgroup_info_generation(eb, info_item, trans->transid);
+       btrfs_set_qgroup_info_rfer(eb, info_item, 0);
+       btrfs_set_qgroup_info_rfer_cmpr(eb, info_item, 0);
+       btrfs_set_qgroup_info_excl(eb, info_item, 0);
+       btrfs_set_qgroup_info_excl_cmpr(eb, info_item, 0);
+       btrfs_mark_buffer_dirty(eb);
+
+       btrfs_release_path(path);
+
+       /* ... then the limit item under the same objectid/offset */
+       key.type = BTRFS_QGROUP_LIMIT_KEY;
+       err = btrfs_insert_empty_item(trans, quota_root, path, &key,
+                                     sizeof(*limit_item));
+       if (err)
+               goto done;
+
+       eb = path->nodes[0];
+       limit_item = btrfs_item_ptr(eb, path->slots[0],
+                                   struct btrfs_qgroup_limit_item);
+       btrfs_set_qgroup_limit_flags(eb, limit_item, 0);
+       btrfs_set_qgroup_limit_max_rfer(eb, limit_item, 0);
+       btrfs_set_qgroup_limit_max_excl(eb, limit_item, 0);
+       btrfs_set_qgroup_limit_rsv_rfer(eb, limit_item, 0);
+       btrfs_set_qgroup_limit_rsv_excl(eb, limit_item, 0);
+       btrfs_mark_buffer_dirty(eb);
+
+done:
+       btrfs_free_path(path);
+       return err;
+}
+
+/*
+ * Delete the info item and then the limit item of qgroup @qgroupid from the
+ * quota tree. A missing item results in -ENOENT; the limit item is only
+ * looked at after the info item was deleted successfully.
+ */
+static int del_qgroup_item(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *quota_root, u64 qgroupid)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       int err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = 0;
+       key.offset = qgroupid;
+
+       key.type = BTRFS_QGROUP_INFO_KEY;
+       err = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+       if (err > 0)
+               err = -ENOENT;
+       if (err)
+               goto done;
+
+       err = btrfs_del_item(trans, quota_root, path);
+       if (err)
+               goto done;
+
+       btrfs_release_path(path);
+
+       key.type = BTRFS_QGROUP_LIMIT_KEY;
+       err = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+       if (err > 0)
+               err = -ENOENT;
+       if (!err)
+               err = btrfs_del_item(trans, quota_root, path);
+
+done:
+       btrfs_free_path(path);
+       return err;
+}
+
+/*
+ * Write new limit values into the on-disk BTRFS_QGROUP_LIMIT_KEY item of
+ * qgroup @qgroupid.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
+ * the item does not exist, or the negative error from btrfs_search_slot().
+ */
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root, u64 qgroupid,
+                                   u64 flags, u64 max_rfer, u64 max_excl,
+                                   u64 rsv_rfer, u64 rsv_excl)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *l;
+       struct btrfs_qgroup_limit_item *qgroup_limit;
+       int ret;
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_LIMIT_KEY;
+       key.offset = qgroupid;
+
+       /* report allocation failure to the caller instead of crashing */
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret > 0)
+               ret = -ENOENT;
+
+       if (ret)
+               goto out;
+
+       l = path->nodes[0];
+       qgroup_limit = btrfs_item_ptr(l, path->slots[0],
+                                     struct btrfs_qgroup_limit_item);
+       btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
+       btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
+       btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
+       btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
+       btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+
+       btrfs_mark_buffer_dirty(l);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Write the in-memory counters of @qgroup into its on-disk
+ * BTRFS_QGROUP_INFO_KEY item and set the item's generation to
+ * trans->transid.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
+ * the item does not exist, or the negative error from btrfs_search_slot().
+ */
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct btrfs_qgroup *qgroup)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *l;
+       struct btrfs_qgroup_info_item *qgroup_info;
+       int ret;
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_INFO_KEY;
+       key.offset = qgroup->qgroupid;
+
+       /* report allocation failure to the caller instead of crashing */
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret > 0)
+               ret = -ENOENT;
+
+       if (ret)
+               goto out;
+
+       l = path->nodes[0];
+       qgroup_info = btrfs_item_ptr(l, path->slots[0],
+                                struct btrfs_qgroup_info_item);
+       btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+       btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+       btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+       btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+       btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+       btrfs_mark_buffer_dirty(l);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Store fs_info->qgroup_flags and trans->transid (as generation) in the
+ * on-disk BTRFS_QGROUP_STATUS_KEY item.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
+ * the item does not exist, or the negative error from btrfs_search_slot().
+ */
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
+                                    struct btrfs_fs_info *fs_info,
+                                   struct btrfs_root *root)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *l;
+       struct btrfs_qgroup_status_item *ptr;
+       int ret;
+       int slot;
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_STATUS_KEY;
+       key.offset = 0;
+
+       /* report allocation failure to the caller instead of crashing */
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret > 0)
+               ret = -ENOENT;
+
+       if (ret)
+               goto out;
+
+       l = path->nodes[0];
+       slot = path->slots[0];
+       ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+       btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+       btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+       /* XXX scan */
+
+       btrfs_mark_buffer_dirty(l);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Remove every item from the quota tree @root.
+ *
+ * The tree is emptied leaf by leaf: each iteration searches for the smallest
+ * possible key, which positions the path on the first leaf, and then deletes
+ * all items of that leaf in one go. The loop ends once the search lands on a
+ * leaf without items.
+ *
+ * The caller in this file, btrfs_quota_disable(), invokes this function
+ * after it has dropped qgroup_lock.
+ *
+ * Returns 0 on success, -EINVAL if @root is NULL, -ENOMEM if no path could
+ * be allocated, or the negative error of the failing tree operation.
+ * pending_quota_state is cleared whenever the deletion loop was entered.
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *leaf;
+       int nr;
+       int ret;
+
+       if (!root)
+               return -EINVAL;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       path->leave_spinning = 1;
+
+       key.objectid = 0;
+       key.offset = 0;
+       key.type = 0;
+
+       while (1) {
+               ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+               if (ret < 0)
+                       goto out;
+
+               leaf = path->nodes[0];
+               nr = btrfs_header_nritems(leaf);
+               if (!nr)
+                       break;
+
+               /*
+                * a lookup of the minimal key reports "not found" at slot 0
+                * for any non-empty tree, so the slot must not be used as an
+                * "empty" indicator; wipe the whole leaf instead
+                */
+               path->slots[0] = 0;
+               ret = btrfs_del_items(trans, root, path, 0, nr);
+               if (ret)
+                       goto out;
+
+               btrfs_release_path(path);
+       }
+       ret = 0;
+out:
+       root->fs_info->pending_quota_state = 0;
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Enable quota. If a quota root already exists, only pending_quota_state is
+ * set. Otherwise the quota tree is created and a status item is inserted
+ * with the flags ON | INCONSISTENT, then the new root is published in
+ * fs_info under qgroup_lock together with pending_quota_state = 1.
+ *
+ * Returns 0 on success, the error from btrfs_create_tree(), -ENOMEM if no
+ * path could be allocated, or the error from btrfs_insert_empty_item(). On
+ * failure after the tree was created, the in-memory root is released again.
+ */
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_path *path = NULL;
+       struct btrfs_qgroup_status_item *ptr;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret = 0;
+
+       spin_lock(&fs_info->qgroup_lock);
+       if (fs_info->quota_root) {
+               fs_info->pending_quota_state = 1;
+               spin_unlock(&fs_info->qgroup_lock);
+               goto out;
+       }
+       spin_unlock(&fs_info->qgroup_lock);
+
+       /*
+        * initially create the quota tree
+        */
+       quota_root = btrfs_create_tree(trans, fs_info,
+                                      BTRFS_QUOTA_TREE_OBJECTID);
+       if (IS_ERR(quota_root)) {
+               ret =  PTR_ERR(quota_root);
+               goto out;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out_free_root;
+       }
+
+       key.objectid = 0;
+       key.type = BTRFS_QGROUP_STATUS_KEY;
+       key.offset = 0;
+
+       ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+                                     sizeof(*ptr));
+       if (ret)
+               goto out_free_path;
+
+       leaf = path->nodes[0];
+       ptr = btrfs_item_ptr(leaf, path->slots[0],
+                                struct btrfs_qgroup_status_item);
+       btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+       btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+       fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+                               BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+       btrfs_set_qgroup_status_scan(leaf, ptr, 0);
+
+       btrfs_mark_buffer_dirty(leaf);
+
+       spin_lock(&fs_info->qgroup_lock);
+       fs_info->quota_root = quota_root;
+       fs_info->pending_quota_state = 1;
+       spin_unlock(&fs_info->qgroup_lock);
+out_free_path:
+       btrfs_free_path(path);
+out_free_root:
+       /*
+        * the root was never published in fs_info if we failed, so nobody
+        * else can release it
+        */
+       if (ret) {
+               free_extent_buffer(quota_root->node);
+               free_extent_buffer(quota_root->commit_root);
+               kfree(quota_root);
+       }
+out:
+       return ret;
+}
+
+/*
+ * Disable quota: under qgroup_lock the enabled/pending state is cleared, the
+ * quota root is detached from fs_info and the in-memory config is freed.
+ * Afterwards the quota tree is emptied, its root item is deleted and the
+ * root's buffers and the root itself are released.
+ */
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *tree_root = fs_info->tree_root;
+       struct btrfs_root *old_root;
+       int err;
+
+       spin_lock(&fs_info->qgroup_lock);
+       fs_info->quota_enabled = 0;
+       fs_info->pending_quota_state = 0;
+       old_root = fs_info->quota_root;
+       fs_info->quota_root = NULL;
+       btrfs_free_qgroup_config(fs_info);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       if (!old_root)
+               return -EINVAL;
+
+       err = btrfs_clean_quota_tree(trans, old_root);
+       if (err)
+               return err;
+
+       err = btrfs_del_root(trans, tree_root, &old_root->root_key);
+       if (err)
+               return err;
+
+       list_del(&old_root->dirty_list);
+
+       btrfs_tree_lock(old_root->node);
+       clean_tree_block(trans, tree_root, old_root->node);
+       btrfs_tree_unlock(old_root->node);
+       btrfs_free_tree_block(trans, old_root, old_root->node, 0, 1);
+
+       free_extent_buffer(old_root->node);
+       free_extent_buffer(old_root->commit_root);
+       kfree(old_root);
+
+       return 0;
+}
+
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
+{
+       /* FIXME: placeholder, nothing is rescanned here; always returns 0 */
+       return 0;
+}
+
+/*
+ * Make qgroup @src a member of qgroup @dst. Two relation items, one per
+ * direction, are written to the quota tree before the in-memory relation is
+ * added under qgroup_lock. Returns -EINVAL if there is no quota root.
+ */
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       int err;
+
+       if (!quota_root)
+               return -EINVAL;
+
+       err = add_qgroup_relation_item(trans, quota_root, src, dst);
+       if (err)
+               return err;
+
+       err = add_qgroup_relation_item(trans, quota_root, dst, src);
+       if (err) {
+               /* take back the first direction, its result is not checked */
+               del_qgroup_relation_item(trans, quota_root, src, dst);
+               return err;
+       }
+
+       spin_lock(&fs_info->qgroup_lock);
+       err = add_relation_rb(quota_root->fs_info, src, dst);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return err;
+}
+
+/*
+ * Remove the relation between @src and @dst. Both on-disk items are deleted
+ * and the in-memory relation is dropped regardless of whether the item
+ * deletions succeeded. The first error of the two item deletions is
+ * returned; the result of the in-memory removal is not reported.
+ */
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       int fwd_err;
+       int rev_err;
+
+       if (!quota_root)
+               return -EINVAL;
+
+       fwd_err = del_qgroup_relation_item(trans, quota_root, src, dst);
+       rev_err = del_qgroup_relation_item(trans, quota_root, dst, src);
+
+       spin_lock(&fs_info->qgroup_lock);
+       del_relation_rb(fs_info, src, dst);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return fwd_err ? fwd_err : rev_err;
+}
+
+/*
+ * Create qgroup @qgroupid: write its items to the quota tree and, only if
+ * that worked, add it to the in-memory rb-tree under qgroup_lock. @name is
+ * not used.
+ *
+ * Returns 0 on success, -EINVAL if there is no quota root, the error from
+ * add_qgroup_item(), or the error encoded in the result of add_qgroup_rb().
+ */
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_qgroup *qgroup;
+       int ret = 0;
+
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               return -EINVAL;
+
+       /*
+        * do not set up an in-memory qgroup that has no counterpart on disk,
+        * the two would disagree from the start
+        */
+       ret = add_qgroup_item(trans, quota_root, qgroupid);
+       if (ret)
+               return ret;
+
+       spin_lock(&fs_info->qgroup_lock);
+       qgroup = add_qgroup_rb(fs_info, qgroupid);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       if (IS_ERR(qgroup))
+               ret = PTR_ERR(qgroup);
+
+       return ret;
+}
+
+/*
+ * Remove qgroup @qgroupid: its on-disk items are deleted, then the in-memory
+ * qgroup is removed under qgroup_lock. Only the result of the on-disk
+ * deletion is returned.
+ */
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       int err;
+
+       if (!quota_root)
+               return -EINVAL;
+
+       err = del_qgroup_item(trans, quota_root, qgroupid);
+
+       spin_lock(&fs_info->qgroup_lock);
+       del_qgroup_rb(quota_root->fs_info, qgroupid);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return err;
+}
+
+/*
+ * Apply @limit to qgroup @qgroupid, first on disk and then in memory. If the
+ * on-disk update fails, the qgroup state is flagged inconsistent and the
+ * in-memory values are updated nevertheless. Returns -ENOENT if the qgroup
+ * is not known in memory, otherwise the result of the on-disk update.
+ */
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info, u64 qgroupid,
+                      struct btrfs_qgroup_limit *limit)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct btrfs_qgroup *qg;
+       int err;
+
+       if (!quota_root)
+               return -EINVAL;
+
+       err = update_qgroup_limit_item(trans, quota_root, qgroupid,
+                                      limit->flags, limit->max_rfer,
+                                      limit->max_excl, limit->rsv_rfer,
+                                      limit->rsv_excl);
+       if (err) {
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               printk(KERN_INFO "unable to update quota limit for %llu\n",
+                      (unsigned long long)qgroupid);
+       }
+
+       spin_lock(&fs_info->qgroup_lock);
+       qg = find_qgroup_rb(fs_info, qgroupid);
+       if (qg) {
+               qg->lim_flags = limit->flags;
+               qg->max_rfer = limit->max_rfer;
+               qg->max_excl = limit->max_excl;
+               qg->rsv_rfer = limit->rsv_rfer;
+               qg->rsv_excl = limit->rsv_excl;
+       } else {
+               err = -ENOENT;
+       }
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return err;
+}
+
+/*
+ * Queue @qgroup on fs_info->dirty_qgroups unless its dirty list entry is
+ * already linked somewhere.
+ */
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+                        struct btrfs_qgroup *qgroup)
+{
+       if (!list_empty(&qgroup->dirty))
+               return;
+
+       list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+/*
+ * btrfs_qgroup_record_ref is called when the ref is added or deleted. The
+ * (node, extent_op) pair is remembered in a freshly allocated entry at the
+ * tail of trans->qgroup_ref_list, which is later used by
+ * btrfs_end_transaction to pass the recorded modifications on to
+ * btrfs_qgroup_account_ref. Returns -ENOMEM if the entry cannot be allocated.
+ */
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_delayed_ref_node *node,
+                           struct btrfs_delayed_extent_op *extent_op)
+{
+       struct qgroup_update *rec;
+
+       BUG_ON(!trans->delayed_ref_elem.seq);
+
+       rec = kmalloc(sizeof(struct qgroup_update), GFP_NOFS);
+       if (rec == NULL)
+               return -ENOMEM;
+
+       rec->extent_op = extent_op;
+       rec->node = node;
+       list_add_tail(&rec->list, &trans->qgroup_ref_list);
+
+       return 0;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ *
+ * No accounting is done when quota is not enabled, when the ref does not
+ * belong to an fs tree, for BTRFS_UPDATE_DELAYED_HEAD actions, or when no
+ * qgroup is known for the ref's root. Otherwise the counters of the affected
+ * qgroups are changed by sgn * node->num_bytes, with sgn being 1 for the add
+ * actions and -1 for the drop action, and every changed qgroup is queued on
+ * the dirty list.
+ *
+ * A negative result of btrfs_find_all_roots() is passed on, -ENOMEM is
+ * returned if the temporary ulist cannot be allocated, and 0 is returned
+ * after the three steps have run. Return values of ulist_add() are not
+ * checked.
+ */
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info,
+                            struct btrfs_delayed_ref_node *node,
+                            struct btrfs_delayed_extent_op *extent_op)
+{
+       struct btrfs_key ins;
+       struct btrfs_root *quota_root;
+       u64 ref_root;
+       struct btrfs_qgroup *qgroup;
+       struct ulist_node *unode;
+       struct ulist *roots = NULL;
+       struct ulist *tmp = NULL;
+       struct ulist_iterator uiter;
+       u64 seq;
+       int ret = 0;
+       int sgn;
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       BUG_ON(!fs_info->quota_root);
+
+       ins.objectid = node->bytenr;
+       ins.offset = node->num_bytes;
+       ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+       if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+           node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+               struct btrfs_delayed_tree_ref *ref;
+               ref = btrfs_delayed_node_to_tree_ref(node);
+               ref_root = ref->root;
+       } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+                  node->type == BTRFS_SHARED_DATA_REF_KEY) {
+               struct btrfs_delayed_data_ref *ref;
+               ref = btrfs_delayed_node_to_data_ref(node);
+               ref_root = ref->root;
+       } else {
+               BUG();
+       }
+
+       if (!is_fstree(ref_root)) {
+               /*
+                * non-fs-trees are not being accounted
+                */
+               return 0;
+       }
+
+       switch (node->action) {
+       case BTRFS_ADD_DELAYED_REF:
+       case BTRFS_ADD_DELAYED_EXTENT:
+               sgn = 1;
+               break;
+       case BTRFS_DROP_DELAYED_REF:
+               sgn = -1;
+               break;
+       case BTRFS_UPDATE_DELAYED_HEAD:
+               return 0;
+       default:
+               BUG();
+       }
+
+       /*
+        * the delayed ref sequence number we pass depends on the direction of
+        * the operation. for add operations, we pass (node->seq - 1) to skip
+        * the delayed ref's current sequence number, because we need the state
+        * of the tree before the add operation. for delete operations, we pass
+        * (node->seq) to include the delayed ref's current sequence number,
+        * because we need the state of the tree after the delete operation.
+        */
+       ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
+                                  sgn > 0 ? node->seq - 1 : node->seq, &roots);
+       if (ret < 0)
+               goto out;
+
+       spin_lock(&fs_info->qgroup_lock);
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               goto unlock;
+
+       qgroup = find_qgroup_rb(fs_info, ref_root);
+       if (!qgroup)
+               goto unlock;
+
+       /*
+        * step 1: for each old ref, visit all nodes once and inc refcnt
+        *
+        * refcnt is interpreted relative to seq: a value below seq is taken
+        * as "not yet counted in this run" and restarts at seq + 1, otherwise
+        * it is incremented. qgroup_seq is moved forward by nnodes + 1,
+        * presumably so that the next run starts above any refcnt that can
+        * be reached in this one.
+        */
+       tmp = ulist_alloc(GFP_ATOMIC);
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+       seq = fs_info->qgroup_seq;
+       fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(roots, &uiter))) {
+               struct ulist_node *tmp_unode;
+               struct ulist_iterator tmp_uiter;
+               struct btrfs_qgroup *qg;
+
+               qg = find_qgroup_rb(fs_info, unode->val);
+               if (!qg)
+                       continue;
+
+               ulist_reinit(tmp);
+                                               /* XXX id not needed */
+               ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+               ULIST_ITER_INIT(&tmp_uiter);
+               while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+                       struct btrfs_qgroup_list *glist;
+
+                       qg = (struct btrfs_qgroup *)tmp_unode->aux;
+                       if (qg->refcnt < seq)
+                               qg->refcnt = seq + 1;
+                       else
+                               ++qg->refcnt;
+
+                       list_for_each_entry(glist, &qg->groups, next_group) {
+                               ulist_add(tmp, glist->group->qgroupid,
+                                         (unsigned long)glist->group,
+                                         GFP_ATOMIC);
+                       }
+               }
+       }
+
+       /*
+        * step 2: walk from the new root
+        *
+        * starting at the qgroup of the ref's root and following the parent
+        * links, every qgroup whose refcnt was not raised in step 1 gets
+        * rfer changed by sgn * num_bytes; excl is changed as well only if
+        * the roots list is empty. all visited qgroups are tagged with seq.
+        */
+       ulist_reinit(tmp);
+       ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(tmp, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct btrfs_qgroup_list *glist;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+               if (qg->refcnt < seq) {
+                       /* not visited by step 1 */
+                       qg->rfer += sgn * node->num_bytes;
+                       qg->rfer_cmpr += sgn * node->num_bytes;
+                       if (roots->nnodes == 0) {
+                               qg->excl += sgn * node->num_bytes;
+                               qg->excl_cmpr += sgn * node->num_bytes;
+                       }
+                       qgroup_dirty(fs_info, qg);
+               }
+               WARN_ON(qg->tag >= seq);
+               qg->tag = seq;
+
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       ulist_add(tmp, glist->group->qgroupid,
+                                 (unsigned long)glist->group, GFP_ATOMIC);
+               }
+       }
+
+       /*
+        * step 3: walk again from old refs
+        *
+        * qgroups tagged in step 2 are skipped, and their parents are not
+        * queued from there either. a qgroup whose refcnt equals
+        * seq + nnodes gets excl changed by -sgn * num_bytes.
+        */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(roots, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct ulist_node *tmp_unode;
+               struct ulist_iterator tmp_uiter;
+
+               qg = find_qgroup_rb(fs_info, unode->val);
+               if (!qg)
+                       continue;
+
+               ulist_reinit(tmp);
+               ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+               ULIST_ITER_INIT(&tmp_uiter);
+               while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+                       struct btrfs_qgroup_list *glist;
+
+                       qg = (struct btrfs_qgroup *)tmp_unode->aux;
+                       if (qg->tag == seq)
+                               continue;
+
+                       if (qg->refcnt - seq == roots->nnodes) {
+                               qg->excl -= sgn * node->num_bytes;
+                               qg->excl_cmpr -= sgn * node->num_bytes;
+                               qgroup_dirty(fs_info, qg);
+                       }
+
+                       list_for_each_entry(glist, &qg->groups, next_group) {
+                               ulist_add(tmp, glist->group->qgroupid,
+                                         (unsigned long)glist->group,
+                                         GFP_ATOMIC);
+                       }
+               }
+       }
+       ret = 0;
+unlock:
+       spin_unlock(&fs_info->qgroup_lock);
+out:
+       ulist_free(roots);
+       ulist_free(tmp);
+
+       return ret;
+}
+
+/*
+ * Write every qgroup queued on fs_info->dirty_qgroups to the quota tree and
+ * then update the qgroup status item. Runs on the transaction commit path.
+ *
+ * The pending quota state becomes the active one here, and the ON status flag
+ * is set or cleared to match it. A failure to update an info item or the
+ * status item sets BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT in the in-memory
+ * flags.
+ *
+ * Returns 0 if there is no quota root, otherwise the result of updating the
+ * status item.
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+                     struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct list_head *dirty_list = &fs_info->dirty_qgroups;
+       int ret;
+
+       if (!quota_root)
+               return 0;
+
+       fs_info->quota_enabled = fs_info->pending_quota_state;
+
+       spin_lock(&fs_info->qgroup_lock);
+       for (;;) {
+               struct btrfs_qgroup *qg;
+
+               if (list_empty(dirty_list))
+                       break;
+
+               qg = list_first_entry(dirty_list, struct btrfs_qgroup, dirty);
+               list_del_init(&qg->dirty);
+
+               /* the item update is done with the spinlock dropped */
+               spin_unlock(&fs_info->qgroup_lock);
+               if (update_qgroup_info_item(trans, quota_root, qg))
+                       fs_info->qgroup_flags |=
+                                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               spin_lock(&fs_info->qgroup_lock);
+       }
+
+       /* mirror the (possibly just switched) quota state into the flags */
+       if (!fs_info->quota_enabled)
+               fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+       else
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+       spin_unlock(&fs_info->qgroup_lock);
+
+       ret = update_qgroup_status_item(trans, fs_info, quota_root);
+       if (ret)
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+
+       return ret;
+}
+
+/*
+ * copy the accounting information between qgroups. This is necessary when a
+ * snapshot or a subvolume is created
+ *
+ * srcid is the id of the source subvolume (0 if there is none), objectid the
+ * id of the new subvolume. inherit may be NULL; when given, it is followed in
+ * memory by an array of u64 ids: first num_qgroups qgroup ids, then
+ * num_ref_copies (src, dst) pairs, then num_excl_copies (src, dst) pairs.
+ *
+ * Returns 0 without doing anything if quota is not enabled, -EINVAL if there
+ * is no quota root or a copy pair names an unknown qgroup, and otherwise 0 or
+ * the error of the first helper that failed. The function also returns with
+ * ret still 0 when add_qgroup_rb() yields NULL or the source qgroup is not
+ * found.
+ *
+ * NOTE(review): add_qgroup_rb() is defined outside this block; confirm that
+ * it really reports failure as NULL (and not as an ERR_PTR), otherwise the
+ * check below does not catch it.
+ * NOTE(review): the dst qgroups touched by the two copy loops are not passed
+ * to qgroup_dirty() here - confirm this is intended.
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+                        struct btrfs_qgroup_inherit *inherit)
+{
+       int ret = 0;
+       int i;
+       u64 *i_qgroups;
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct btrfs_qgroup *srcgroup;
+       struct btrfs_qgroup *dstgroup;
+       u32 level_size = 0; /* stays 0 when there is no source subvolume */
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       if (!quota_root)
+               return -EINVAL;
+
+       /*
+        * create a tracking group for the subvol itself
+        * (this adds the item via add_qgroup_item(); add_qgroup_rb() is only
+        * called further down, under qgroup_lock)
+        */
+       ret = add_qgroup_item(trans, quota_root, objectid);
+       if (ret)
+               goto out;
+
+       if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+               ret = update_qgroup_limit_item(trans, quota_root, objectid,
+                                              inherit->lim.flags,
+                                              inherit->lim.max_rfer,
+                                              inherit->lim.max_excl,
+                                              inherit->lim.rsv_rfer,
+                                              inherit->lim.rsv_excl);
+               if (ret)
+                       goto out;
+       }
+
+       if (srcid) {
+               struct btrfs_root *srcroot;
+               struct btrfs_key srckey;
+               int srcroot_level;
+
+               srckey.objectid = srcid;
+               srckey.type = BTRFS_ROOT_ITEM_KEY;
+               srckey.offset = (u64)-1;
+               srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
+               if (IS_ERR(srcroot)) {
+                       ret = PTR_ERR(srcroot);
+                       goto out;
+               }
+
+               rcu_read_lock();
+               srcroot_level = btrfs_header_level(srcroot->node);
+               level_size = btrfs_level_size(srcroot, srcroot_level);
+               rcu_read_unlock();
+       }
+
+       /*
+        * add qgroup to all inherited groups
+        * (one relation item per direction for each of the num_qgroups ids)
+        */
+       if (inherit) {
+               i_qgroups = (u64 *)(inherit + 1); /* ids follow the struct */
+               for (i = 0; i < inherit->num_qgroups; ++i) {
+                       ret = add_qgroup_relation_item(trans, quota_root,
+                                                      objectid, *i_qgroups);
+                       if (ret)
+                               goto out;
+                       ret = add_qgroup_relation_item(trans, quota_root,
+                                                      *i_qgroups, objectid);
+                       if (ret)
+                               goto out;
+                       ++i_qgroups;
+               }
+       }
+
+
+       spin_lock(&fs_info->qgroup_lock);
+
+       dstgroup = add_qgroup_rb(fs_info, objectid);
+       if (!dstgroup)
+               goto unlock; /* ret is still 0 here */
+
+       if (srcid) {
+               srcgroup = find_qgroup_rb(fs_info, srcid);
+               if (!srcgroup)
+                       goto unlock; /* ret is still 0 here */
+               dstgroup->rfer = srcgroup->rfer - level_size;
+               dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+               srcgroup->excl = level_size; /* overwritten, not adjusted */
+               srcgroup->excl_cmpr = level_size;
+               qgroup_dirty(fs_info, dstgroup);
+               qgroup_dirty(fs_info, srcgroup);
+       }
+
+       if (!inherit)
+               goto unlock;
+
+       i_qgroups = (u64 *)(inherit + 1); /* rewind to the first id */
+       for (i = 0; i < inherit->num_qgroups; ++i) {
+               ret = add_relation_rb(quota_root->fs_info, objectid,
+                                     *i_qgroups);
+               if (ret)
+                       goto unlock;
+               ++i_qgroups;
+       }
+
+       for (i = 0; i <  inherit->num_ref_copies; ++i) {
+               struct btrfs_qgroup *src;
+               struct btrfs_qgroup *dst;
+
+               src = find_qgroup_rb(fs_info, i_qgroups[0]);
+               dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+               if (!src || !dst) {
+                       ret = -EINVAL;
+                       goto unlock;
+               }
+
+               dst->rfer = src->rfer - level_size;
+               dst->rfer_cmpr = src->rfer_cmpr - level_size;
+               i_qgroups += 2; /* next (src, dst) pair */
+       }
+       for (i = 0; i <  inherit->num_excl_copies; ++i) {
+               struct btrfs_qgroup *src;
+               struct btrfs_qgroup *dst;
+
+               src = find_qgroup_rb(fs_info, i_qgroups[0]);
+               dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+               if (!src || !dst) {
+                       ret = -EINVAL;
+                       goto unlock;
+               }
+
+               dst->excl = src->excl + level_size;
+               dst->excl_cmpr = src->excl_cmpr + level_size;
+               i_qgroups += 2; /* next (src, dst) pair */
+       }
+
+unlock:
+       spin_unlock(&fs_info->qgroup_lock);
+out:
+       return ret;
+}
+
+/*
+ * reserve some space for a qgroup and all its parents. The reservation takes
+ * place with start_transaction or dealloc_reserve, similar to ENOSPC
+ * accounting. We assume that the requested space is new for all qgroups.
+ *
+ * Nothing is reserved and 0 is returned when root is not an fs tree,
+ * num_bytes is 0, there is no quota root, or root has no qgroup.
+ *
+ * Returns 0 on success, -EDQUOT if the request would push any affected
+ * qgroup over its referenced or exclusive limit, or a negative errno if the
+ * temporary ulist cannot be allocated (-ENOMEM) or extended. In every error
+ * case no qgroup's reserved counter has been modified.
+ */
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_qgroup *qgroup;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u64 ref_root = root->root_key.objectid;
+       int ret = 0;
+       struct ulist *ulist = NULL;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+
+       if (!is_fstree(ref_root))
+               return 0;
+
+       if (num_bytes == 0)
+               return 0;
+
+       spin_lock(&fs_info->qgroup_lock);
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               goto out;
+
+       qgroup = find_qgroup_rb(fs_info, ref_root);
+       if (!qgroup)
+               goto out;
+
+       /*
+        * in a first step, we check all affected qgroups if any limits would
+        * be exceeded
+        */
+       ulist = ulist_alloc(GFP_ATOMIC);
+       if (!ulist) {
+               /* atomic allocations can fail; don't dereference NULL below */
+               ret = -ENOMEM;
+               goto out;
+       }
+       ret = ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup,
+                       GFP_ATOMIC);
+       if (ret < 0)
+               goto out;
+
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(ulist, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct btrfs_qgroup_list *glist;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+
+               if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+                   qg->reserved + qg->rfer + num_bytes >
+                   qg->max_rfer) {
+                       ret = -EDQUOT;
+                       goto out;
+               }
+
+               if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+                   qg->reserved + qg->excl + num_bytes >
+                   qg->max_excl) {
+                       ret = -EDQUOT;
+                       goto out;
+               }
+
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       /*
+                        * a parent that cannot be queued would neither be
+                        * checked nor charged, so give up instead
+                        */
+                       ret = ulist_add(ulist, glist->group->qgroupid,
+                                       (unsigned long)glist->group,
+                                       GFP_ATOMIC);
+                       if (ret < 0)
+                               goto out;
+               }
+       }
+       /* ulist_add may have left a positive "inserted" value in ret */
+       ret = 0;
+
+       /*
+        * no limits exceeded, now record the reservation into all qgroups
+        */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(ulist, &uiter))) {
+               struct btrfs_qgroup *qg;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+
+               qg->reserved += num_bytes;
+       }
+
+out:
+       spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(ulist);
+
+       return ret;
+}
+
+/*
+ * Release a reservation of num_bytes from the qgroup of root and from all of
+ * its parent qgroups, i.e. subtract num_bytes from their reserved counters.
+ *
+ * Does nothing when root is not an fs tree, num_bytes is 0, there is no quota
+ * root, or root has no qgroup. If the temporary ulist cannot be allocated or
+ * extended, an error is logged and the remaining qgroups are left untouched.
+ */
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+       struct btrfs_root *quota_root;
+       struct btrfs_qgroup *qgroup;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct ulist *ulist = NULL;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       u64 ref_root = root->root_key.objectid;
+
+       if (!is_fstree(ref_root))
+               return;
+
+       if (num_bytes == 0)
+               return;
+
+       spin_lock(&fs_info->qgroup_lock);
+
+       quota_root = fs_info->quota_root;
+       if (!quota_root)
+               goto out;
+
+       qgroup = find_qgroup_rb(fs_info, ref_root);
+       if (!qgroup)
+               goto out;
+
+       ulist = ulist_alloc(GFP_ATOMIC);
+       if (!ulist) /* atomic allocations can fail; don't dereference NULL */
+               goto nomem;
+       if (ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup,
+                     GFP_ATOMIC) < 0)
+               goto nomem;
+
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(ulist, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct btrfs_qgroup_list *glist;
+
+               qg = (struct btrfs_qgroup *)unode->aux;
+
+               qg->reserved -= num_bytes;
+
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       if (ulist_add(ulist, glist->group->qgroupid,
+                                     (unsigned long)glist->group,
+                                     GFP_ATOMIC) < 0)
+                               goto nomem;
+               }
+       }
+       goto out;
+
+nomem:
+       printk(KERN_ERR "btrfs: out of memory, qgroup reservation of root %llu not fully released\n",
+               (unsigned long long)ref_root);
+out:
+       spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(ulist);
+}
+
+/*
+ * Sanity check for a transaction handle: unless its qgroup_ref_list is empty
+ * and its delayed_ref_elem.seq is 0, log the offending state and BUG().
+ */
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
+{
+       int refs_left = !list_empty(&trans->qgroup_ref_list);
+
+       if (!refs_left && !trans->delayed_ref_elem.seq)
+               return;
+
+       printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
+               trans, refs_left ? " not" : "",
+               trans->delayed_ref_elem.seq);
+       BUG();
+}
index 328b95f67660af115fd0ac109ce1b6b5501260a7..cc20e95ea2898ce11af1c100d781d1795a1fb187 100644 (file)
@@ -38,7 +38,6 @@ void put_transaction(struct btrfs_transaction *transaction)
        if (atomic_dec_and_test(&transaction->use_count)) {
                BUG_ON(!list_empty(&transaction->list));
                WARN_ON(transaction->delayed_refs.root.rb_node);
-               WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@ -126,7 +125,6 @@ loop:
        cur_trans->delayed_refs.num_heads = 0;
        cur_trans->delayed_refs.flushing = 0;
        cur_trans->delayed_refs.run_delayed_start = 0;
-       cur_trans->delayed_refs.seq = 1;
 
        /*
         * although the tree mod log is per file system and not per transaction,
@@ -145,10 +143,8 @@ loop:
        }
        atomic_set(&fs_info->tree_mod_seq, 0);
 
-       init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
        spin_lock_init(&cur_trans->commit_lock);
        spin_lock_init(&cur_trans->delayed_refs.lock);
-       INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -299,6 +295,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        struct btrfs_transaction *cur_trans;
        u64 num_bytes = 0;
        int ret;
+       u64 qgroup_reserved = 0;
 
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
                return ERR_PTR(-EROFS);
@@ -317,6 +314,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
+               if (root->fs_info->quota_enabled &&
+                   is_fstree(root->root_key.objectid)) {
+                       qgroup_reserved = num_items * root->leafsize;
+                       ret = btrfs_qgroup_reserve(root, qgroup_reserved);
+                       if (ret)
+                               return ERR_PTR(ret);
+               }
+
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                ret = btrfs_block_rsv_add(root,
                                          &root->fs_info->trans_block_rsv,
@@ -349,12 +354,16 @@ again:
        h->transaction = cur_trans;
        h->blocks_used = 0;
        h->bytes_reserved = 0;
+       h->root = root;
        h->delayed_ref_updates = 0;
        h->use_count = 1;
        h->adding_csums = 0;
        h->block_rsv = NULL;
        h->orig_rsv = NULL;
        h->aborted = 0;
+       h->qgroup_reserved = qgroup_reserved;
+       h->delayed_ref_elem.seq = 0;
+       INIT_LIST_HEAD(&h->qgroup_ref_list);
 
        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -505,6 +514,24 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                return 0;
        }
 
+       /*
+        * do the qgroup accounting as early as possible
+        */
+       err = btrfs_delayed_refs_qgroup_accounting(trans, info);
+
+       btrfs_trans_release_metadata(trans, root);
+       trans->block_rsv = NULL;
+       /*
+        * the same root has to be passed to start_transaction and
+        * end_transaction. Subvolume quota depends on this.
+        */
+       WARN_ON(trans->root != root);
+
+       if (trans->qgroup_reserved) {
+               btrfs_qgroup_free(root, trans->qgroup_reserved);
+               trans->qgroup_reserved = 0;
+       }
+
        while (count < 2) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
@@ -559,6 +586,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
            root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
                err = -EIO;
        }
+       assert_qgroups_uptodate(trans);
 
        memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -777,6 +805,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
        ret = btrfs_run_dev_stats(trans, root->fs_info);
        BUG_ON(ret);
 
+       ret = btrfs_run_qgroups(trans, root->fs_info);
+       BUG_ON(ret);
+
+       /* run_qgroups might have added some more refs */
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       BUG_ON(ret);
+
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
@@ -949,6 +984,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                }
        }
 
+       ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
+                                  objectid, pending->inherit);
+       kfree(pending->inherit);
+       if (ret) {
+               pending->error = ret;
+               goto fail;
+       }
+
        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1344,6 +1387,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                if (ret)
                        goto cleanup_transaction;
 
+               /*
+                * running the delayed items may have added new refs. account
+                * them now so that they hinder processing of more delayed refs
+                * as little as possible.
+                */
+               btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
                /*
                 * rename don't use btrfs_join_transaction, so, once we
                 * set the transaction to blocked above, we aren't going
@@ -1456,6 +1506,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                            root->fs_info->chunk_root->node);
        switch_commit_root(root->fs_info->chunk_root);
 
+       assert_qgroups_uptodate(trans);
        update_super_roots(root);
 
        if (!root->fs_info->log_root_recovering) {
index d314a74b4968e28ab66b0033cab637e860983d3d..e8b8416c688b2a7b5d6c4f79c844bb07953cb3b5 100644 (file)
@@ -20,6 +20,7 @@
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
 #include "delayed-ref.h"
+#include "ctree.h"
 
 struct btrfs_transaction {
        u64 transid;
@@ -49,6 +50,7 @@ struct btrfs_transaction {
 struct btrfs_trans_handle {
        u64 transid;
        u64 bytes_reserved;
+       u64 qgroup_reserved;
        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
@@ -58,12 +60,21 @@ struct btrfs_trans_handle {
        struct btrfs_block_rsv *orig_rsv;
        int aborted;
        int adding_csums;
+       /*
+        * this root is only needed to validate that the root passed to
+        * start_transaction is the same as the one passed to end_transaction.
+        * Subvolume quota depends on this
+        */
+       struct btrfs_root *root;
+       struct seq_list delayed_ref_elem;
+       struct list_head qgroup_ref_list;
 };
 
 struct btrfs_pending_snapshot {
        struct dentry *dentry;
        struct btrfs_root *root;
        struct btrfs_root *snap;
+       struct btrfs_qgroup_inherit *inherit;
        /* block reservation for the operation */
        struct btrfs_block_rsv block_rsv;
        /* extra metadata reseration for relocation */