Merge branch 'allocator-fixes' into for-linus-4.4
author    Chris Mason <clm@fb.com>
          Thu, 22 Oct 2015 02:00:38 +0000 (19:00 -0700)
committer Chris Mason <clm@fb.com>
          Thu, 22 Oct 2015 02:00:38 +0000 (19:00 -0700)
Signed-off-by: Chris Mason <clm@fb.com>
17 files changed:
fs/btrfs/ctree.h
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/relocation.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
include/trace/events/btrfs.h

index ced08c9608e2a6d21c663f7c8fc588b23a9b4959..bc3c711e82f27cb0b1d05e217d434e8b0535986f 100644
@@ -1950,6 +1950,9 @@ struct btrfs_root {
        int send_in_progress;
        struct btrfs_subvolume_writers *subv_writers;
        atomic_t will_be_snapshoted;
+
+       /* For qgroup metadata space reserve */
+       atomic_t qgroup_meta_rsv;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3470,8 +3473,11 @@ enum btrfs_reserve_flush_enum {
        BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+                                           u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3487,8 +3493,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
                                      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
                                              unsigned short type);
index ac3e81da6d4edc8e33856840349cc88750771546..bd9b63b1ddb29b31e7748ca8f4bb89c546951cd4 100644
@@ -476,6 +476,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        INIT_LIST_HEAD(&head_ref->ref_list);
        head_ref->processing = 0;
        head_ref->total_ref_mod = count_mod;
+       head_ref->qgroup_reserved = 0;
+       head_ref->qgroup_ref_root = 0;
 
        /* Record qgroup extent info if provided */
        if (qrecord) {
@@ -746,6 +748,33 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+                                    struct btrfs_trans_handle *trans,
+                                    u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_head *ref_head;
+       int ret = 0;
+
+       if (!fs_info->quota_enabled || !is_fstree(ref_root))
+               return 0;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+
+       spin_lock(&delayed_refs->lock);
+       ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+       if (!ref_head) {
+               ret = -ENOENT;
+               goto out;
+       }
+       WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+       ref_head->qgroup_ref_root = ref_root;
+       ref_head->qgroup_reserved = num_bytes;
+out:
+       spin_unlock(&delayed_refs->lock);
+       return ret;
+}
+
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
index 13fb5e6090fe0efc7f55dc9b09e9faac57a3c91b..d4c41e26101c0096588d328bbaba72dc045bd3a7 100644
@@ -112,6 +112,17 @@ struct btrfs_delayed_ref_head {
         */
        int total_ref_mod;
 
+       /*
+        * For qgroup reserved space freeing.
+        *
+        * ref_root and reserved are recorded after
+        * BTRFS_ADD_DELAYED_EXTENT is called, and are
+        * used to free the reserved qgroup space at
+        * run_delayed_refs() time.
+        */
+       u64 qgroup_ref_root;
+       u64 qgroup_reserved;
+
        /*
         * when a new extent is allocated, it is just reserved in memory
         * The actual extent isn't inserted into the extent allocation tree
@@ -242,6 +253,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                               u64 owner, u64 offset, int action,
                               struct btrfs_delayed_extent_op *extent_op,
                               int no_quota);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+                                    struct btrfs_trans_handle *trans,
+                                    u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
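The two fields added above turn the delayed ref head into a hand-off point: btrfs_add_delayed_qgroup_reserve() attaches the reservation exactly once after BTRFS_ADD_DELAYED_EXTENT, and run_one_delayed_ref() frees it when the head is run. A minimal userspace model of that attach-once contract (toy types, not the kernel structures):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for btrfs_delayed_ref_head: only the qgroup hand-off fields. */
struct toy_ref_head {
	uint64_t qgroup_ref_root;
	uint64_t qgroup_reserved;
};

/* Mirror btrfs_add_delayed_qgroup_reserve(): attach exactly once. */
static void attach_reserve(struct toy_ref_head *head, uint64_t root,
			   uint64_t bytes)
{
	assert(!head->qgroup_ref_root && !head->qgroup_reserved);
	head->qgroup_ref_root = root;
	head->qgroup_reserved = bytes;
}

/* Mirror run_one_delayed_ref(): consume and free the reservation. */
static uint64_t run_head(struct toy_ref_head *head)
{
	uint64_t freed = head->qgroup_reserved;

	head->qgroup_ref_root = 0;
	head->qgroup_reserved = 0;
	return freed;
}

int main(void)
{
	struct toy_ref_head head = { 0, 0 };

	attach_reserve(&head, 257, 16384);
	printf("freed %llu bytes at run time\n",
	       (unsigned long long)run_head(&head));
	return 0;
}

In the kernel the double-attach case is only a WARN_ON taken under delayed_refs->lock, not a hard assert.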
index dceabb13ddc40de8ca0d9a7aa553ea037e382e08..86a11a902fcf00dd369084af9c3820fe637d265d 100644
@@ -1267,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshoted, 0);
+       atomic_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
index 475764502d42019edf3a152c45b3700169af3d09..92fdbc6b89e76e63725da41a4985713320136c80 100644
@@ -2409,6 +2409,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                                      node->num_bytes);
                        }
                }
+
+               /* Also free its reserved qgroup space */
+               btrfs_qgroup_free_delayed_ref(root->fs_info,
+                                             head->qgroup_ref_root,
+                                             head->qgroup_reserved);
                return ret;
        }
 
@@ -3424,7 +3429,7 @@ again:
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
 
-       ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+       ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
 
@@ -3443,7 +3448,7 @@ again:
                dcs = BTRFS_DC_SETUP;
        else if (ret == -ENOSPC)
                set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
-       btrfs_free_reserved_data_space(inode, num_pages);
+       btrfs_free_reserved_data_space(inode, 0, num_pages);
 
 out_put:
        iput(inode);
@@ -3983,11 +3988,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        return ret;
 }
 
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4109,38 +4110,86 @@ commit_trans:
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
-       ret = btrfs_qgroup_reserve(root, write_bytes);
-       if (ret)
-               goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
-out:
        spin_unlock(&data_sinfo->lock);
 
        return ret;
 }
 
 /*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with the ability to do precise data
+ * reservation. It will replace the old btrfs_check_data_free_space(),
+ * but to keep the patches split cleanly, add the new function first.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       /* align the range */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
+
+       ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * Use the new btrfs_qgroup_reserve_data() to reserve precise data space
+        *
+        * TODO: Find a good way to avoid reserving data space for NOCOW
+        * ranges without hurting performance when quota is disabled.
+        */
+       ret = btrfs_qgroup_reserve_data(inode, start, len);
+       return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one does *NOT* use the accurate qgroup reserved space API; it is
+ * only for callers that can't sleep and are sure the call won't affect
+ * the qgroup reserved space, like clear_bit_hook().
  */
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+                                           u64 len)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
 
-       /* make sure bytes are sectorsize aligned */
-       bytes = ALIGN(bytes, root->sectorsize);
+       /* Make sure the range is aligned to sectorsize */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
 
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
-       WARN_ON(data_sinfo->bytes_may_use < bytes);
-       data_sinfo->bytes_may_use -= bytes;
+       if (WARN_ON(data_sinfo->bytes_may_use < len))
+               data_sinfo->bytes_may_use = 0;
+       else
+               data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
-                                     data_sinfo->flags, bytes, 0);
+                                     data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one handles the per-inode data rsv map for the accurate
+ * reserved space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+       btrfs_free_reserved_data_space_noquota(inode, start, len);
+       btrfs_qgroup_free_data(inode, start, len);
+}
+
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
        struct list_head *head = &info->space_info;
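Both new free helpers, like btrfs_check_data_free_space() above, first widen [start, start + len) outward to sector boundaries so the accounting always covers whole sectors. A runnable sketch of just that arithmetic (assuming a 4 KiB sectorsize; round_up/round_down are written out since the kernel macros aren't available in userspace):

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE 4096ULL /* assumption: a typical btrfs sectorsize */

static uint64_t round_down_u64(uint64_t x, uint64_t align)
{
	return x - (x % align);
}

static uint64_t round_up_u64(uint64_t x, uint64_t align)
{
	return round_down_u64(x + align - 1, align);
}

int main(void)
{
	uint64_t start = 5000, len = 100;

	/* Same expansion as btrfs_check_data_free_space(): the aligned
	 * length covers every sector the unaligned range touches. */
	uint64_t alen = round_up_u64(start + len, SECTORSIZE) -
			round_down_u64(start, SECTORSIZE);
	uint64_t astart = round_down_u64(start, SECTORSIZE);

	printf("[%llu, %llu) -> [%llu, +%llu)\n",
	       (unsigned long long)start, (unsigned long long)(start + len),
	       (unsigned long long)astart, (unsigned long long)alen);
	/* prints: [5000, 5100) -> [4096, +4096) */
	return 0;
}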
@@ -5417,7 +5466,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
-               ret = btrfs_qgroup_reserve(root, num_bytes);
+               ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
@@ -5435,10 +5484,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
 
-       if (ret) {
-               if (*qgroup_reserved)
-                       btrfs_qgroup_free(root, *qgroup_reserved);
-       }
+       if (ret && *qgroup_reserved)
+               btrfs_qgroup_free_meta(root, *qgroup_reserved);
 
        return ret;
 }
@@ -5599,15 +5646,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        spin_unlock(&BTRFS_I(inode)->lock);
 
        if (root->fs_info->quota_enabled) {
-               ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+               ret = btrfs_qgroup_reserve_meta(root,
+                               nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
 
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
-               if (root->fs_info->quota_enabled)
-                       btrfs_qgroup_free(root, nr_extents * root->nodesize);
+               btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
 
@@ -5730,41 +5777,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
  * @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start of the range we are writing to
+ * @len: length of the range we are writing
+ *
+ * TODO: This function will eventually replace the old btrfs_delalloc_reserve_space()
  *
  * This will do the following things
  *
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in the data space info for num bytes, and reserve
+ *   the corresponding precise qgroup space
+ *   (done in check_data_free_space)
+ *
+ * o reserve metadata space, based on the number of outstanding
  *   extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ *   also reserving metadata space with a per-root over-reserve method
+ * o add to the inode's ->delalloc_bytes
  * o add it to the fs_info's delalloc inodes list.
+ *   (the above three are all done in delalloc_reserve_metadata)
  *
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error (-ENOSPC or -EDQUOT)
  */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
 {
        int ret;
 
-       ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
-       if (ret)
-               return ret;
-
-       ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
-       if (ret) {
-               btrfs_free_reserved_data_space(inode, num_bytes);
+       ret = btrfs_check_data_free_space(inode, start, len);
+       if (ret < 0)
                return ret;
-       }
-
-       return 0;
+       ret = btrfs_delalloc_reserve_metadata(inode, len);
+       if (ret < 0)
+               btrfs_free_reserved_data_space(inode, start, len);
+       return ret;
 }
 
 /**
  * btrfs_delalloc_release_space - release data and metadata space for delalloc
  * @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: length of the space already reserved
  *
  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
  * called in the case that we don't need the metadata AND data reservations
@@ -5773,11 +5827,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
  * This function will release the metadata space that was not used and will
  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
  * list if there are no delalloc bytes left.
+ * It will also handle the qgroup reserved space.
  */
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
 {
-       btrfs_delalloc_release_metadata(inode, num_bytes);
-       btrfs_free_reserved_data_space(inode, num_bytes);
+       btrfs_delalloc_release_metadata(inode, len);
+       btrfs_free_reserved_data_space(inode, start, len);
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
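After this change btrfs_delalloc_reserve_space() is a thin composition: reserve data for the exact range, then metadata for the length, and unwind the data reservation if the metadata step fails. A hedged userspace sketch of that error-unwinding shape (a counter and stubs stand in for the real space_info and qgroup calls):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t bytes_may_use; /* stand-in for data_sinfo->bytes_may_use */

static int check_data_free_space(uint64_t start, uint64_t len)
{
	(void)start;
	bytes_may_use += len; /* the real helper also reserves qgroup space */
	return 0;
}

static void free_reserved_data_space(uint64_t start, uint64_t len)
{
	(void)start;
	bytes_may_use -= len;
}

/* Force the failure path so the unwind is visible. */
static int delalloc_reserve_metadata(uint64_t len)
{
	(void)len;
	return -ENOSPC;
}

/* Same shape as the new btrfs_delalloc_reserve_space(inode, start, len). */
static int delalloc_reserve_space(uint64_t start, uint64_t len)
{
	int ret = check_data_free_space(start, len);

	if (ret < 0)
		return ret;
	ret = delalloc_reserve_metadata(len);
	if (ret < 0)
		free_reserved_data_space(start, len); /* unwind data reserve */
	return ret;
}

int main(void)
{
	int ret = delalloc_reserve_space(4096, 8192);

	/* typically prints: ret=-28 bytes_may_use=0 */
	printf("ret=%d bytes_may_use=%llu\n", ret,
	       (unsigned long long)bytes_may_use);
	return 0;
}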
index 6e6df34d74f051df17e9a3b6079cadf2af49d8bb..33a01ea414651d217349940ca6c0e2ed7bea6f4c 100644
@@ -131,6 +131,25 @@ struct extent_page_data {
        unsigned int sync_io:1;
 };
 
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+                                struct extent_changeset *changeset,
+                                int set)
+{
+       int ret;
+
+       if (!changeset)
+               return;
+       if (set && (state->state & bits) == bits)
+               return;
+       if (!set && (state->state & bits) == 0)
+               return;
+       changeset->bytes_changed += state->end - state->start + 1;
+       ret = ulist_add(changeset->range_changed, state->start, state->end,
+                       GFP_ATOMIC);
+       /* ENOMEM */
+       BUG_ON(ret < 0);
+}
+
 static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
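add_extent_changeset() above only records real transitions: when setting, states that already carry all the requested bits are skipped; when clearing, states that carry none of them are skipped; everything else contributes end - start + 1 bytes. A small runnable model of that filter (a plain struct replaces extent_state and a byte counter replaces the ulist):

#include <stdint.h>
#include <stdio.h>

struct toy_state {
	uint64_t start, end; /* inclusive range, like extent_state */
	unsigned state;
};

/* Mirror add_extent_changeset()'s skip rules, minus the ulist. */
static void account_change(const struct toy_state *s, unsigned bits,
			   uint64_t *bytes_changed, int set)
{
	if (set && (s->state & bits) == bits)
		return; /* already fully set: no transition */
	if (!set && (s->state & bits) == 0)
		return; /* already fully clear: no transition */
	*bytes_changed += s->end - s->start + 1;
}

int main(void)
{
	struct toy_state a = { 0, 4095, 0x1 };    /* bit already set */
	struct toy_state b = { 4096, 8191, 0x0 }; /* bit clear */
	uint64_t changed = 0;

	account_change(&a, 0x1, &changed, 1); /* skipped */
	account_change(&b, 0x1, &changed, 1); /* counted: 4096 bytes */
	printf("bytes_changed = %llu\n", (unsigned long long)changed);
	return 0;
}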
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-                          struct extent_state *state, unsigned *bits);
+                          struct extent_state *state, unsigned *bits,
+                          struct extent_changeset *changeset);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
                        struct extent_state *state, u64 start, u64 end,
                        struct rb_node ***p,
                        struct rb_node **parent,
-                       unsigned *bits)
+                       unsigned *bits, struct extent_changeset *changeset)
 {
        struct rb_node *node;
 
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
        state->start = start;
        state->end = end;
 
-       set_state_bits(tree, state, bits);
+       set_state_bits(tree, state, bits, changeset);
 
        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
        if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
  */
 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
-                                           unsigned *bits, int wake)
+                                           unsigned *bits, int wake,
+                                           struct extent_changeset *changeset)
 {
        struct extent_state *next;
        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
+       add_extent_changeset(state, bits_to_clear, changeset, 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  *
  * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned bits, int wake, int delete,
-                    struct extent_state **cached_state,
-                    gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                             unsigned bits, int wake, int delete,
+                             struct extent_state **cached_state,
+                             gfp_t mask, struct extent_changeset *changeset)
 {
        struct extent_state *state;
        struct extent_state *cached;
@@ -671,7 +693,8 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       state = clear_state_bit(tree, state, &bits, wake);
+                       state = clear_state_bit(tree, state, &bits, wake,
+                                               changeset);
                        goto next;
                }
                goto search_again;
@@ -692,13 +715,13 @@ hit_next:
                if (wake)
                        wake_up(&state->wq);
 
-               clear_state_bit(tree, prealloc, &bits, wake);
+               clear_state_bit(tree, prealloc, &bits, wake, changeset);
 
                prealloc = NULL;
                goto out;
        }
 
-       state = clear_state_bit(tree, state, &bits, wake);
+       state = clear_state_bit(tree, state, &bits, wake, changeset);
 next:
        if (last_end == (u64)-1)
                goto out;
@@ -789,7 +812,7 @@ out:
 
 static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
-                          unsigned *bits)
+                          unsigned *bits, struct extent_changeset *changeset)
 {
        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
+       add_extent_changeset(state, bits_to_set, changeset, 1);
        state->state |= bits_to_set;
 }
 
@@ -835,7 +859,7 @@ static int __must_check
 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                 unsigned bits, unsigned exclusive_bits,
                 u64 *failed_start, struct extent_state **cached_state,
-                gfp_t mask)
+                gfp_t mask, struct extent_changeset *changeset)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -873,7 +897,7 @@ again:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end,
-                                  &p, &parent, &bits);
+                                  &p, &parent, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
 
@@ -899,7 +923,7 @@ hit_next:
                        goto out;
                }
 
-               set_state_bits(tree, state, &bits);
+               set_state_bits(tree, state, &bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set_state_bits(tree, state, &bits);
+                       set_state_bits(tree, state, &bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
-                                  NULL, NULL, &bits);
+                                  NULL, NULL, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
 
@@ -1008,7 +1032,7 @@ hit_next:
                if (err)
                        extent_io_tree_panic(tree, err);
 
-               set_state_bits(tree, prealloc, &bits);
+               set_state_bits(tree, prealloc, &bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   struct extent_state **cached_state, gfp_t mask)
 {
        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
-                               cached_state, mask);
+                               cached_state, mask, NULL);
 }
 
 
@@ -1111,7 +1135,7 @@ again:
                        goto out;
                }
                err = insert_state(tree, prealloc, start, end,
-                                  &p, &parent, &bits);
+                                  &p, &parent, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               set_state_bits(tree, state, &bits);
+               set_state_bits(tree, state, &bits, NULL);
                cache_state(state, cached_state);
-               state = clear_state_bit(tree, state, &clear_bits, 0);
+               state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set_state_bits(tree, state, &bits);
+                       set_state_bits(tree, state, &bits, NULL);
                        cache_state(state, cached_state);
-                       state = clear_state_bit(tree, state, &clear_bits, 0);
+                       state = clear_state_bit(tree, state, &clear_bits, 0,
+                                               NULL);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
-                                  NULL, NULL, &bits);
+                                  NULL, NULL, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
                if (err)
                        extent_io_tree_panic(tree, err);
 
-               set_state_bits(tree, prealloc, &bits);
+               set_state_bits(tree, prealloc, &bits, NULL);
                cache_state(prealloc, cached_state);
-               clear_state_bit(tree, prealloc, &clear_bits, 0);
+               clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
                prealloc = NULL;
                goto out;
        }
@@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                              NULL, mask);
 }
 
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                          unsigned bits, gfp_t mask,
+                          struct extent_changeset *changeset)
+{
+       /*
+        * We don't support EXTENT_LOCKED yet, as the current changeset
+        * records any bits changed; in the EXTENT_LOCKED case the call
+        * would either fail with -EEXIST or the changeset would record
+        * the whole range.
+        */
+       BUG_ON(bits & EXTENT_LOCKED);
+
+       return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+                               changeset);
+}
+
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                    unsigned bits, int wake, int delete,
+                    struct extent_state **cached, gfp_t mask)
+{
+       return __clear_extent_bit(tree, start, end, bits, wake, delete,
+                                 cached, mask, NULL);
+}
+
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask)
 {
@@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
        return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
 }
 
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                            unsigned bits, gfp_t mask,
+                            struct extent_changeset *changeset)
+{
+       /*
+        * The EXTENT_LOCKED case is not supported, for the same reason
+        * as in set_record_extent_bits().
+        */
+       BUG_ON(bits & EXTENT_LOCKED);
+
+       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+                                 changeset);
+}
+
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask)
 {
@@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
        while (1) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
                                       EXTENT_LOCKED, &failed_start,
-                                      cached_state, GFP_NOFS);
+                                      cached_state, GFP_NOFS, NULL);
                if (err == -EEXIST) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
@@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
        u64 failed_start;
 
        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-                              &failed_start, NULL, GFP_NOFS);
+                              &failed_start, NULL, GFP_NOFS, NULL);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
index c668f36898d3a4c2dd2e0e18456dd2cfa6c55747..f4c1ae11855f0b613894ea44026faf143021b613 100644
@@ -2,6 +2,7 @@
 #define __EXTENTIO__
 
 #include <linux/rbtree.h>
+#include "ulist.h"
 
 /* bits for the extent state */
 #define EXTENT_DIRTY           (1U << 0)
@@ -18,6 +19,7 @@
 #define EXTENT_NEED_WAIT       (1U << 13)
 #define EXTENT_DAMAGED         (1U << 14)
 #define EXTENT_NORESERVE       (1U << 15)
+#define EXTENT_QGROUP_RESERVED (1U << 16)
 #define EXTENT_IOBITS          (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
@@ -161,6 +163,17 @@ struct extent_buffer {
 #endif
 };
 
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+       /* How many bytes are set/cleared in this operation */
+       u64 bytes_changed;
+
+       /* Changed ranges */
+       struct ulist *range_changed;
+};
+
 static inline void extent_set_compress_type(unsigned long *bio_flags,
                                            int compress_type)
 {
@@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                            unsigned bits, gfp_t mask,
+                            struct extent_changeset *changeset);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     unsigned bits, int wake, int delete,
                     struct extent_state **cached, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    unsigned bits, gfp_t mask);
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                          unsigned bits, gfp_t mask,
+                          struct extent_changeset *changeset);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   unsigned bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask);
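A caller of set_record_extent_bits() typically zeroes bytes_changed, allocates range_changed, and on failure walks the recorded ranges to undo exactly the bits this call set; this is the pattern btrfs_qgroup_reserve_data() follows later in this merge. A runnable toy of that record-then-roll-back discipline (a fixed array stands in for the ulist; entries hold (start, end) like the ulist's val/aux):

#include <stdint.h>
#include <stdio.h>

struct toy_range { uint64_t start, end; }; /* ulist val/aux stand-in */

static struct toy_range changed[16]; /* changeset->range_changed */
static int nr_changed;
static uint64_t bytes_changed;       /* changeset->bytes_changed */

/* What set_record_extent_bits() does per newly flagged range. */
static void record_set(uint64_t start, uint64_t end)
{
	changed[nr_changed].start = start;
	changed[nr_changed].end = end;
	nr_changed++;
	bytes_changed += end - start + 1;
}

int main(void)
{
	/* Two sub-ranges were newly flagged by this call... */
	record_set(0, 4095);
	record_set(8192, 12287);
	printf("reserve %llu bytes\n", (unsigned long long)bytes_changed);

	/* ...but the qgroup reserve failed: undo only what we just set. */
	for (int i = 0; i < nr_changed; i++)
		printf("clear range [%llu, %llu]\n",
		       (unsigned long long)changed[i].start,
		       (unsigned long long)changed[i].end);
	return 0;
}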
index 7031e9631519523d3c933c2190e724e85723c8b7..12432057a12ed73496f84223ff6284cabc8c57e0 100644
@@ -1507,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
 
                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-               ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-               if (ret == -ENOSPC &&
-                   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-                                             BTRFS_INODE_PREALLOC))) {
+
+               if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+                                            BTRFS_INODE_PREALLOC)) {
                        ret = check_can_nocow(inode, pos, &write_bytes);
+                       if (ret < 0)
+                               break;
                        if (ret > 0) {
+                               /*
+                                * For the nodatacow case, there is no
+                                * need to reserve data space.
+                                */
                                only_release_metadata = true;
                                /*
                                 * our prealloc extent may be smaller than
@@ -1521,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                num_pages = DIV_ROUND_UP(write_bytes + offset,
                                                         PAGE_CACHE_SIZE);
                                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-                               ret = 0;
-                       } else {
-                               ret = -ENOSPC;
+                               goto reserve_metadata;
                        }
                }
-
-               if (ret)
+               ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+               if (ret < 0)
                        break;
 
+reserve_metadata:
                ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
                if (ret) {
                        if (!only_release_metadata)
-                               btrfs_free_reserved_data_space(inode,
-                                                              reserve_bytes);
+                               btrfs_free_reserved_data_space(inode, pos,
+                                                              write_bytes);
                        else
                                btrfs_end_write_no_snapshoting(root);
                        break;
@@ -1604,7 +1608,7 @@ again:
                                btrfs_delalloc_release_metadata(inode,
                                                                release_bytes);
                        else
-                               btrfs_delalloc_release_space(inode,
+                               btrfs_delalloc_release_space(inode, pos,
                                                             release_bytes);
                }
 
@@ -1657,7 +1661,7 @@ again:
                        btrfs_end_write_no_snapshoting(root);
                        btrfs_delalloc_release_metadata(inode, release_bytes);
                } else {
-                       btrfs_delalloc_release_space(inode, release_bytes);
+                       btrfs_delalloc_release_space(inode, pos, release_bytes);
                }
        }
 
@@ -2538,17 +2542,61 @@ out_only_mutex:
        return err;
 }
 
+/* Helper structure to record which ranges are already reserved */
+struct falloc_range {
+       struct list_head list;
+       u64 start;
+       u64 len;
+};
+
+/*
+ * Helper function to add a falloc range
+ *
+ * The caller should have locked the larger extent range containing
+ * [start, start + len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+       struct falloc_range *prev = NULL;
+       struct falloc_range *range = NULL;
+
+       if (list_empty(head))
+               goto insert;
+
+       /*
+        * As fallocate iterates in bytenr order, we only need to check
+        * the last range.
+        */
+       prev = list_entry(head->prev, struct falloc_range, list);
+       if (prev->start + prev->len == start) {
+               prev->len += len;
+               return 0;
+       }
+insert:
+       range = kmalloc(sizeof(*range), GFP_NOFS);
+       if (!range)
+               return -ENOMEM;
+       range->start = start;
+       range->len = len;
+       list_add_tail(&range->list, head);
+       return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(file);
        struct extent_state *cached_state = NULL;
+       struct falloc_range *range;
+       struct falloc_range *tmp;
+       struct list_head reserve_list;
        u64 cur_offset;
        u64 last_byte;
        u64 alloc_start;
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
+       u64 actual_end = 0;
        struct extent_map *em;
        int blocksize = BTRFS_I(inode)->root->sectorsize;
        int ret;
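add_falloc_range() above relies on btrfs_fallocate() walking extents in increasing bytenr order, so a new range either extends the list tail or starts a fresh entry; nothing earlier ever needs rechecking. A runnable userspace model of that tail-merge (a fixed array instead of a list_head):

#include <stdint.h>
#include <stdio.h>

struct toy_falloc_range { uint64_t start, len; };

static struct toy_falloc_range ranges[16];
static int nr_ranges;

/* Same decision as add_falloc_range(): contiguous with the tail? merge. */
static void add_range(uint64_t start, uint64_t len)
{
	if (nr_ranges &&
	    ranges[nr_ranges - 1].start + ranges[nr_ranges - 1].len == start) {
		ranges[nr_ranges - 1].len += len;
		return;
	}
	ranges[nr_ranges].start = start;
	ranges[nr_ranges].len = len;
	nr_ranges++;
}

int main(void)
{
	add_range(0, 4096);     /* new entry */
	add_range(4096, 4096);  /* contiguous: merges into [0, +8192) */
	add_range(16384, 4096); /* hole before it: new entry */

	for (int i = 0; i < nr_ranges; i++)
		printf("[%llu, +%llu)\n",
		       (unsigned long long)ranges[i].start,
		       (unsigned long long)ranges[i].len);
	return 0;
}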
@@ -2564,11 +2612,12 @@ static long btrfs_fallocate(struct file *file, int mode,
                return btrfs_punch_hole(inode, offset, len);
 
        /*
-        * Make sure we have enough space before we do the
-        * allocation.
+        * Only trigger the disk allocation here; don't trigger the
+        * qgroup reserve yet.
+        * Qgroup space will be checked later.
         */
-       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
-       if (ret)
+       ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+       if (ret < 0)
                return ret;
 
        mutex_lock(&inode->i_mutex);
@@ -2576,6 +2625,13 @@ static long btrfs_fallocate(struct file *file, int mode,
        if (ret)
                goto out;
 
+       /*
+        * TODO: Move these two operations to after the accurate reserved
+        * space check, or fallocate can still fail with the page already
+        * truncated or the size already expanded.
+        *
+        * That is only a minor problem, though, and won't do much harm.
+        */
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, i_size_read(inode),
                                        alloc_start);
@@ -2634,10 +2690,10 @@ static long btrfs_fallocate(struct file *file, int mode,
                }
        }
 
+       /* First, check if we exceed the qgroup limit */
+       INIT_LIST_HEAD(&reserve_list);
        cur_offset = alloc_start;
        while (1) {
-               u64 actual_end;
-
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
                if (IS_ERR_OR_NULL(em)) {
@@ -2650,57 +2706,82 @@ static long btrfs_fallocate(struct file *file, int mode,
                last_byte = min(extent_map_end(em), alloc_end);
                actual_end = min_t(u64, extent_map_end(em), offset + len);
                last_byte = ALIGN(last_byte, blocksize);
-
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                       ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-                                                       last_byte - cur_offset,
-                                                       1 << inode->i_blkbits,
-                                                       offset + len,
-                                                       &alloc_hint);
-               } else if (actual_end > inode->i_size &&
-                          !(mode & FALLOC_FL_KEEP_SIZE)) {
-                       struct btrfs_trans_handle *trans;
-                       struct btrfs_root *root = BTRFS_I(inode)->root;
-
-                       /*
-                        * We didn't need to allocate any more space, but we
-                        * still extended the size of the file so we need to
-                        * update i_size and the inode item.
-                        */
-                       trans = btrfs_start_transaction(root, 1);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                       } else {
-                               inode->i_ctime = CURRENT_TIME;
-                               i_size_write(inode, actual_end);
-                               btrfs_ordered_update_i_size(inode, actual_end,
-                                                           NULL);
-                               ret = btrfs_update_inode(trans, root, inode);
-                               if (ret)
-                                       btrfs_end_transaction(trans, root);
-                               else
-                                       ret = btrfs_end_transaction(trans,
-                                                                   root);
+                       ret = add_falloc_range(&reserve_list, cur_offset,
+                                              last_byte - cur_offset);
+                       if (ret < 0) {
+                               free_extent_map(em);
+                               break;
                        }
+                       ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+                                       last_byte - cur_offset);
+                       if (ret < 0)
+                               break;
                }
                free_extent_map(em);
-               if (ret < 0)
-                       break;
-
                cur_offset = last_byte;
-               if (cur_offset >= alloc_end) {
-                       ret = 0;
+               if (cur_offset >= alloc_end)
                        break;
+       }
+
+       /*
+        * If ret is still 0, we're OK to fallocate.
+        * Otherwise, just clean up the list and exit.
+        */
+       list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+               if (!ret)
+                       ret = btrfs_prealloc_file_range(inode, mode,
+                                       range->start,
+                                       range->len, 1 << inode->i_blkbits,
+                                       offset + len, &alloc_hint);
+               list_del(&range->list);
+               kfree(range);
+       }
+       if (ret < 0)
+               goto out_unlock;
+
+       if (actual_end > inode->i_size &&
+           !(mode & FALLOC_FL_KEEP_SIZE)) {
+               struct btrfs_trans_handle *trans;
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+
+               /*
+                * We didn't need to allocate any more space, but we
+                * still extended the size of the file so we need to
+                * update i_size and the inode item.
+                */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+               } else {
+                       inode->i_ctime = CURRENT_TIME;
+                       i_size_write(inode, actual_end);
+                       btrfs_ordered_update_i_size(inode, actual_end, NULL);
+                       ret = btrfs_update_inode(trans, root, inode);
+                       if (ret)
+                               btrfs_end_transaction(trans, root);
+                       else
+                               ret = btrfs_end_transaction(trans, root);
                }
        }
+out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_NOFS);
 out:
+       /*
+        * As we have waited on the extent range, the data_rsv_map must
+        * be empty in the range: any written data range will have been
+        * released from it, and a preallocated extent is released once
+        * its metadata is written.
+        * So this call is purely a cleanup.
+        */
+       btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+       btrfs_free_reserved_data_space(inode, alloc_start,
+                                      alloc_end - alloc_start);
        return ret;
 }
 
index d4a582ac3f730f82299e997d0866e3caecacac7b..767a6056ac45afce29844e469f582baec8ef28cd 100644
@@ -488,17 +488,17 @@ again:
        /* Just to make sure we have enough space */
        prealloc += 8 * PAGE_CACHE_SIZE;
 
-       ret = btrfs_delalloc_reserve_space(inode, prealloc);
+       ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
        if (ret)
                goto out_put;
 
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
                                              prealloc, prealloc, &alloc_hint);
        if (ret) {
-               btrfs_delalloc_release_space(inode, prealloc);
+               btrfs_delalloc_release_space(inode, 0, prealloc);
                goto out_put;
        }
-       btrfs_free_reserved_data_space(inode, prealloc);
+       btrfs_free_reserved_data_space(inode, 0, prealloc);
 
        ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
index a3e078365de537f9e12d96c8f140fc33da52b8b5..a018e4707dac577c79070f8279d1ce2d78e4ce7c 100644
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+       /*
+        * Don't forget to free the reserved space: an inlined extent
+        * won't be counted as a data extent, so free the space directly.
+        * At reserve time the range is always page aligned, so just
+        * free one page here.
+        */
+       btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
@@ -1769,7 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
-                       btrfs_free_reserved_data_space(inode, len);
+                       btrfs_free_reserved_data_space_noquota(inode,
+                                       state->start, len);
 
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
                                     root->fs_info->delalloc_batch);
@@ -1992,7 +2000,8 @@ again:
                goto again;
        }
 
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
@@ -2119,6 +2128,16 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_alloc_reserved_file_extent(trans, root,
                                        root->root_key.objectid,
                                        btrfs_ino(inode), file_pos, &ins);
+       if (ret < 0)
+               goto out;
+       /*
+        * Release the reserved range from the inode's dirty range map and
+        * move it over to the delayed ref code, as accounting now only
+        * happens at commit_transaction() time.
+        */
+       btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+       ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
+                       root->objectid, disk_bytenr, ram_bytes);
 out:
        btrfs_free_path(path);
 
@@ -2826,6 +2845,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+               /*
+                * For the mwrite (mmap + memset to write) case, we still
+                * reserve space for the NOCOW range.
+                * As NOCOW won't create a new delayed ref, just free the space
+                */
+               btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+                                      ordered_extent->len);
                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (nolock)
                        trans = btrfs_join_transaction_nolock(root);
@@ -4628,14 +4655,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
        if ((offset & (blocksize - 1)) == 0 &&
            (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode,
+                       round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
        if (ret)
                goto out;
 
 again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode,
+                               round_down(from, PAGE_CACHE_SIZE),
+                               PAGE_CACHE_SIZE);
                ret = -ENOMEM;
                goto out;
        }
@@ -4703,7 +4733,8 @@ again:
 
 out_unlock:
        if (ret)
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode, page_start,
+                                            PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
 out:
@@ -5101,6 +5132,18 @@ static void evict_inode_truncate_pages(struct inode *inode)
                spin_unlock(&io_tree->lock);
 
                lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+               /*
+                * If the extent still has the DELALLOC flag, it never
+                * reached disk and its reserved space won't be freed by
+                * a delayed ref, so free its reserved space here.
+                * (Refer to the comment in btrfs_invalidatepage, case 2.)
+                *
+                * Note: end is the bytenr of the last byte, hence the + 1.
+                */
+               if (state->state & EXTENT_DELALLOC)
+                       btrfs_qgroup_free_data(inode, start, end - start + 1);
+
                clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -7634,7 +7677,7 @@ unlock:
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
-               btrfs_free_reserved_data_space(inode, len);
+               btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
                current->journal_info = dio_data;
@@ -8424,7 +8467,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                        mutex_unlock(&inode->i_mutex);
                        relock = true;
                }
-               ret = btrfs_delalloc_reserve_space(inode, count);
+               ret = btrfs_delalloc_reserve_space(inode, offset, count);
                if (ret)
                        goto out;
                dio_data.outstanding_extents = div64_u64(count +
@@ -8453,10 +8496,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED) {
                        if (dio_data.reserve)
-                               btrfs_delalloc_release_space(inode,
-                                                       dio_data.reserve);
+                               btrfs_delalloc_release_space(inode, offset,
+                                                            dio_data.reserve);
                } else if (ret >= 0 && (size_t)ret < count)
-                       btrfs_delalloc_release_space(inode,
+                       btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
        }
 out:
@@ -8615,6 +8658,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                }
        }
 
+       /*
+        * Qgroup reserved space handling.
+        * The page here will be in one of two states:
+        * 1) Already written to disk
+        *    Its reserved space has been released from the data rsv map
+        *    and will eventually be freed by the delayed ref handler, so
+        *    even though we call qgroup_free_data() it won't decrease
+        *    the reserved space.
+        * 2) Not written to disk
+        *    The reserved space must be freed here.
+        */
+       btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8665,7 +8720,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_end;
 
        sb_start_pagefault(inode->i_sb);
-       ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       page_start = page_offset(page);
+       page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (!ret) {
                ret = file_update_time(vma->vm_file);
                reserved = 1;
@@ -8684,8 +8743,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 again:
        lock_page(page);
        size = i_size_read(inode);
-       page_start = page_offset(page);
-       page_end = page_start + PAGE_CACHE_SIZE - 1;
 
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
@@ -8762,7 +8819,7 @@ out_unlock:
        }
        unlock_page(page);
 out:
-       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+       btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
 out_noreserve:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@ -9051,6 +9108,7 @@ void btrfs_destroy_inode(struct inode *inode)
                        btrfs_put_ordered_extent(ordered);
                }
        }
+       btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
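Most of the inode.c hunks rest on one distinction: btrfs_qgroup_release_data() detaches a range from the per-inode reserved map while the bytes stay accounted (ownership moves to the delayed ref), whereas btrfs_qgroup_free_data() detaches the range and gives the bytes back. A toy counter model of that split (single pool, no io tree, hypothetical helper names):

#include <stdint.h>
#include <stdio.h>

static uint64_t reserved;  /* bytes still owned by the inode's rsv map */
static uint64_t in_flight; /* bytes handed to delayed refs for later free */

static void reserve(uint64_t len) { reserved += len; }

/* release: ownership moves to the delayed ref, reserved total unchanged. */
static void release(uint64_t len) { reserved -= len; in_flight += len; }

/* free: the range never reached disk, give the bytes back immediately. */
static void free_data(uint64_t len) { reserved -= len; }

int main(void)
{
	reserve(8192);
	release(4096);   /* first half was written: delayed ref frees it */
	free_data(4096); /* second half invalidated before writeback */
	printf("reserved=%llu in_flight=%llu\n",
	       (unsigned long long)reserved,
	       (unsigned long long)in_flight);
	return 0;
}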
index 685df7e1b24e531ae9a6f15c81c0475f8e25251c..7ed033a842121531c93f8b18c3e23f3eb8707817 100644
@@ -1120,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
        page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
        ret = btrfs_delalloc_reserve_space(inode,
-                                          page_cnt << PAGE_CACHE_SHIFT);
+                       start_index << PAGE_CACHE_SHIFT,
+                       page_cnt << PAGE_CACHE_SHIFT);
        if (ret)
                return ret;
        i_done = 0;
@@ -1210,7 +1211,8 @@ again:
                BTRFS_I(inode)->outstanding_extents++;
                spin_unlock(&BTRFS_I(inode)->lock);
                btrfs_delalloc_release_space(inode,
-                                    (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+                               start_index << PAGE_CACHE_SHIFT,
+                               (page_cnt - i_done) << PAGE_CACHE_SHIFT);
        }
 
 
@@ -1235,7 +1237,9 @@ out:
                unlock_page(pages[i]);
                page_cache_release(pages[i]);
        }
-       btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+       btrfs_delalloc_release_space(inode,
+                       start_index << PAGE_CACHE_SHIFT,
+                       page_cnt << PAGE_CACHE_SHIFT);
        return ret;
 
 }
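With the range-based API, cluster_pages_for_defrag() has to pass byte offsets, so the page index and page count are shifted by PAGE_CACHE_SHIFT. The conversion, runnable (assuming 4 KiB pages, i.e. PAGE_CACHE_SHIFT == 12):

#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12 /* assumption: 4 KiB pages */

int main(void)
{
	uint64_t start_index = 3, page_cnt = 4;

	/* page index -> byte offset, page count -> byte length */
	printf("start=%llu len=%llu\n",
	       (unsigned long long)(start_index << PAGE_CACHE_SHIFT),
	       (unsigned long long)(page_cnt << PAGE_CACHE_SHIFT));
	/* prints: start=12288 len=16384 */
	return 0;
}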
index d904ee1c53497034ed6685b8e12f40d0880aaaca..158633c9bbd9cb151fc3390b5a66684f40f175b3 100644
@@ -2035,7 +2035,7 @@ out:
        return ret;
 }
 
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
@@ -2116,14 +2116,13 @@ out:
        return ret;
 }
 
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+                              u64 ref_root, u64 num_bytes)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
-       struct btrfs_fs_info *fs_info = root->fs_info;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
-       u64 ref_root = root->root_key.objectid;
        int ret = 0;
 
        if (!is_fstree(ref_root))
@@ -2169,6 +2168,11 @@ out:
        spin_unlock(&fs_info->qgroup_lock);
 }
 
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+       return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+                                        num_bytes);
+}
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
 {
        if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2486,3 +2490,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
 }
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for a successful reserve.
+ * Return <0 for error (including -EDQUOT).
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_changeset changeset;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       int ret;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+           len == 0)
+               return 0;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (!changeset.range_changed)
+               return -ENOMEM;
+       ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+                       start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+                       &changeset);
+       trace_btrfs_qgroup_reserve_data(inode, start, len,
+                                       changeset.bytes_changed,
+                                       QGROUP_RESERVE);
+       if (ret < 0)
+               goto cleanup;
+       ret = qgroup_reserve(root, changeset.bytes_changed);
+       if (ret < 0)
+               goto cleanup;
+
+       ulist_free(changeset.range_changed);
+       return ret;
+
+cleanup:
+       /* cleanup already reserved ranges */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(changeset.range_changed, &uiter)))
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+                                unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+                                GFP_NOFS);
+       ulist_free(changeset.range_changed);
+       return ret;
+}
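
A hedged note on the "do nothing if already reserved" behavior: set_record_extent_bits() records into the changeset only the bytes whose EXTENT_QGROUP_RESERVED bit actually flipped, and only changeset.bytes_changed is passed to qgroup_reserve(), so a second reserve of the same range charges nothing. Illustration (offsets arbitrary):

	/* Reserving the same 4KiB twice charges the qgroup only once. */
	ret = btrfs_qgroup_reserve_data(inode, 0, 4096); /* bytes_changed == 4096 */
	ret = btrfs_qgroup_reserve_data(inode, 0, 4096); /* bytes_changed == 0, no new charge */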
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+                                      int free)
+{
+       struct extent_changeset changeset;
+       int trace_op = QGROUP_RELEASE;
+       int ret;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (!changeset.range_changed)
+               return -ENOMEM;
+
+       ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+                       start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+                       &changeset);
+       if (ret < 0)
+               goto out;
+
+       if (free) {
+               qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+               trace_op = QGROUP_FREE;
+       }
+       trace_btrfs_qgroup_release_data(inode, start, len,
+                                       changeset.bytes_changed, trace_op);
+out:
+       ulist_free(changeset.range_changed);
+       return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages gets invalidated before reaching
+ * disk, or in the error cleanup case.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+       return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages gets written to disk and the
+ * corresponding FILE_EXTENT item is inserted into the corresponding root.
+ *
+ * Since the new qgroup accounting framework only updates qgroup numbers at
+ * commit_transaction() time, the reserved space shouldn't be freed from the
+ * related qgroups here.
+ *
+ * But we should still release the range from the io_tree, to allow further
+ * writes to be COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+       return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
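
To make the release/free split above concrete, a hedged sketch of the two kinds of call sites (variable names hypothetical; the real hooks in this series are the ordered-extent finish path and the error/invalidate paths):

	/* Data reached disk: untag the range but keep the qgroup charge until
	 * delayed refs are accounted at commit time. */
	btrfs_qgroup_release_data(inode, ordered_start, ordered_len);

	/* Data never reached disk (invalidate or error): untag and return bytes. */
	btrfs_qgroup_free_data(inode, failed_start, failed_len);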
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+       int ret;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+           num_bytes == 0)
+               return 0;
+
+       BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+       ret = qgroup_reserve(root, num_bytes);
+       if (ret < 0)
+               return ret;
+       atomic_add(num_bytes, &root->qgroup_meta_rsv);
+       return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+       int reserved;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+               return;
+
+       reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+       if (reserved == 0)
+               return;
+       qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+               return;
+
+       BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+       WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+       atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+       qgroup_free(root, num_bytes);
+}
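
Taken together, the three helpers above give each root a running metadata counter: grown per reservation, shrunk on early free, and drained wholesale at commit. A hedged sketch of the lifecycle (the real callers in this series are start_transaction() and commit_fs_roots()):

	/* Start of a handle: charge nodesize-aligned bytes, remembered per root. */
	ret = btrfs_qgroup_reserve_meta(root, num_items * root->nodesize);

	/* Early failure: hand back exactly what this caller reserved. */
	btrfs_qgroup_free_meta(root, num_items * root->nodesize);

	/* Commit: atomic_xchg() the counter to 0 and free whatever accumulated. */
	btrfs_qgroup_free_meta_all(root);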
+
+/*
+ * Check for leaked qgroup reserved space, normally done at inode
+ * destroy time.
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+       struct extent_changeset changeset;
+       struct ulist_node *unode;
+       struct ulist_iterator iter;
+       int ret;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (WARN_ON(!changeset.range_changed))
+               return;
+
+       ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+                       EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+       WARN_ON(ret < 0);
+       if (WARN_ON(changeset.bytes_changed)) {
+               ULIST_ITER_INIT(&iter);
+               while ((unode = ulist_next(changeset.range_changed, &iter))) {
+                       btrfs_warn(BTRFS_I(inode)->root->fs_info,
+                               "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+                               inode->i_ino, unode->val, unode->aux);
+               }
+               qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+       }
+       ulist_free(changeset.range_changed);
+}
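
Each ulist node recorded by clear_record_extent_bits() carries the range start in val and the range end in aux, which is exactly what the warning above prints. For a single leaked 4KiB reservation the message would read along these lines (values illustrative, format taken from the btrfs_warn() call):

	leaking qgroup reserved space, ino: 257, start: 0, end: 4095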
index 6387dcfa354c6ecac672a543bf1d6b8e587e911b..ecb2c143ef756bd0356e3968b3f9f13fe5ac21cd 100644 (file)
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
        struct ulist *old_roots;
 };
 
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE         (1<<0)
+#define QGROUP_RELEASE         (1<<1)
+#define QGROUP_FREE            (1<<2)
+
 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
                       struct btrfs_fs_info *fs_info);
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
                         struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
                         struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+                              u64 ref_root, u64 num_bytes);
+/*
+ * TODO: Add a proper trace point for it; btrfs_qgroup_free_refroot() is
+ * called from everywhere, so it can't provide a good trace for the
+ * delayed ref case by itself.
+ */
+static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+                                                u64 ref_root, u64 num_bytes)
+{
+       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+       trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+}
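
A hedged sketch of where the wrapper above is intended to sit: the delayed-ref head cleanup, using the qgroup_ref_root/qgroup_reserved members this series adds to the delayed ref head (surrounding logic paraphrased, not quoted from the patch):

	/* When dropping a head that still holds a data reservation: */
	if (head_ref->qgroup_reserved)
		btrfs_qgroup_free_delayed_ref(fs_info,
					      head_ref->qgroup_ref_root,
					      head_ref->qgroup_reserved);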
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
                               u64 rfer, u64 excl);
 #endif
 
+/* New io_tree-based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_check_reserved_leak(struct inode *inode);
 #endif /* __BTRFS_QGROUP__ */
index 58ede0a564569ed082f09f4015229ab2cfd89f85..a7dc45622e90e96f196b50717bef8c9b1de779c7 100644 (file)
@@ -3034,8 +3034,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
        BUG_ON(cluster->start != cluster->boundary[0]);
        mutex_lock(&inode->i_mutex);
 
-       ret = btrfs_check_data_free_space(inode, cluster->end +
-                                         1 - cluster->start, 0);
+       ret = btrfs_check_data_free_space(inode, cluster->start,
+                                         cluster->end + 1 - cluster->start);
        if (ret)
                goto out;
 
@@ -3056,8 +3056,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
                        break;
                nr++;
        }
-       btrfs_free_reserved_data_space(inode, cluster->end +
-                                      1 - cluster->start);
+       btrfs_free_reserved_data_space(inode, cluster->start,
+                                      cluster->end + 1 - cluster->start);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
index 222f9a99a3ceee6f7b84038503bbf24fb75612a6..418c6a2ad7d88658f8624d99a1ba0e9e84c13d45 100644 (file)
@@ -480,13 +480,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
-               if (root->fs_info->quota_enabled &&
-                   is_fstree(root->root_key.objectid)) {
-                       qgroup_reserved = num_items * root->nodesize;
-                       ret = btrfs_qgroup_reserve(root, qgroup_reserved);
-                       if (ret)
-                               return ERR_PTR(ret);
-               }
+               qgroup_reserved = num_items * root->nodesize;
+               ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+               if (ret)
+                       return ERR_PTR(ret);
 
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                /*
@@ -547,6 +544,7 @@ again:
        h->transaction = cur_trans;
        h->root = root;
        h->use_count = 1;
+
        h->type = type;
        h->can_flush_pending_bgs = true;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
@@ -567,7 +565,6 @@ again:
                h->bytes_reserved = num_bytes;
                h->reloc_reserved = reloc_reserved;
        }
-       h->qgroup_reserved = qgroup_reserved;
 
 got_it:
        btrfs_record_root_in_trans(h, root);
@@ -585,8 +582,7 @@ alloc_fail:
                btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                        num_bytes);
 reserve_fail:
-       if (qgroup_reserved)
-               btrfs_qgroup_free(root, qgroup_reserved);
+       btrfs_qgroup_free_meta(root, qgroup_reserved);
        return ERR_PTR(ret);
 }
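
The hunks above also change where the metadata reservation lives: it comes off the transaction handle (trans->qgroup_reserved, deleted below in transaction.h) and moves onto the root, so it is no longer freed per handle at end_transaction() but once per commit. A hedged distillation of the new lifetime:

	btrfs_qgroup_reserve_meta(root, qgroup_reserved); /* each start_transaction() */
	/* ... any number of handles run against this root ... */
	btrfs_qgroup_free_meta_all(root);                 /* once, in commit_fs_roots() */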
 
@@ -798,15 +794,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                        must_run_delayed_refs = 2;
        }
 
-       if (trans->qgroup_reserved) {
-               /*
-                * the same root has to be passed here between start_transaction
-                * and end_transaction. Subvolume quota depends on this.
-                */
-               btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
-
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 
@@ -1224,6 +1211,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
+                       btrfs_qgroup_free_meta_all(root);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1813,10 +1801,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
 
        cur_trans = trans->transaction;
 
@@ -2167,10 +2151,6 @@ cleanup_transaction:
        btrfs_trans_release_metadata(trans, root);
        btrfs_trans_release_chunk_metadata(trans);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
        btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
        if (current->journal_info == trans)
                current->journal_info = NULL;
index 30ae75074ca4740657d5de3063a9b14fc7d10b84..b05b2f64d9133313f1af231c4a4254ff16972f00 100644 (file)
@@ -108,7 +108,6 @@ struct btrfs_trans_handle {
        u64 transid;
        u64 bytes_reserved;
        u64 chunk_bytes_reserved;
-       u64 qgroup_reserved;
        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
index 0b73af9be12f467d8b838c278297e30e273dbc49..b4473dab39d613e58e4d4e58e0049d72522047cd 100644 (file)
@@ -1117,6 +1117,119 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
        TP_ARGS(wq)
 );
 
+DECLARE_EVENT_CLASS(btrfs__qgroup_data_map,
+
+       TP_PROTO(struct inode *inode, u64 free_reserved),
+
+       TP_ARGS(inode, free_reserved),
+
+       TP_STRUCT__entry(
+               __field(        u64,            rootid          )
+               __field(        unsigned long,  ino             )
+               __field(        u64,            free_reserved   )
+       ),
+
+       TP_fast_assign(
+               __entry->rootid         =       BTRFS_I(inode)->root->objectid;
+               __entry->ino            =       inode->i_ino;
+               __entry->free_reserved  =       free_reserved;
+       ),
+
+       TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu",
+                 __entry->rootid, __entry->ino, __entry->free_reserved)
+);
+
+DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map,
+
+       TP_PROTO(struct inode *inode, u64 free_reserved),
+
+       TP_ARGS(inode, free_reserved)
+);
+
+DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map,
+
+       TP_PROTO(struct inode *inode, u64 free_reserved),
+
+       TP_ARGS(inode, free_reserved)
+);
+
+#define BTRFS_QGROUP_OPERATIONS                                \
+       { QGROUP_RESERVE,       "reserve"       },      \
+       { QGROUP_RELEASE,       "release"       },      \
+       { QGROUP_FREE,          "free"          }
+
+DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
+
+       TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+
+       TP_ARGS(inode, start, len, reserved, op),
+
+       TP_STRUCT__entry(
+               __field(        u64,            rootid          )
+               __field(        unsigned long,  ino             )
+               __field(        u64,            start           )
+               __field(        u64,            len             )
+               __field(        u64,            reserved        )
+               __field(        int,            op              )
+       ),
+
+       TP_fast_assign(
+               __entry->rootid         = BTRFS_I(inode)->root->objectid;
+               __entry->ino            = inode->i_ino;
+               __entry->start          = start;
+               __entry->len            = len;
+               __entry->reserved       = reserved;
+               __entry->op             = op;
+       ),
+
+       TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s",
+                 __entry->rootid, __entry->ino, __entry->start, __entry->len,
+                 __entry->reserved,
+                 __print_flags((unsigned long)__entry->op, "",
+                               BTRFS_QGROUP_OPERATIONS)
+       )
+);
+
+DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data,
+
+       TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+
+       TP_ARGS(inode, start, len, reserved, op)
+);
+
+DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
+
+       TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+
+       TP_ARGS(inode, start, len, reserved, op)
+);
+
+DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
+
+       TP_PROTO(u64 ref_root, u64 reserved),
+
+       TP_ARGS(ref_root, reserved),
+
+       TP_STRUCT__entry(
+               __field(        u64,            ref_root        )
+               __field(        u64,            reserved        )
+       ),
+
+       TP_fast_assign(
+               __entry->ref_root       = ref_root;
+               __entry->reserved       = reserved;
+       ),
+
+       TP_printk("root=%llu, reserved=%llu, op=free",
+                 __entry->ref_root, __entry->reserved)
+);
+
+DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
+
+       TP_PROTO(u64 ref_root, u64 reserved),
+
+       TP_ARGS(ref_root, reserved)
+);
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
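
For reference, the btrfs__qgroup_rsv_data class renders op through __print_flags() with the BTRFS_QGROUP_OPERATIONS table, so a reserve event would print along these lines (field values illustrative, format taken from the TP_printk above):

	root=5, ino=257, start=0, len=4096, reserved=4096, op=reserve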