Btrfs: make metadata chunks smaller
fs/btrfs/extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include "compat.h"
26 #include "hash.h"
27 #include "ctree.h"
28 #include "disk-io.h"
29 #include "print-tree.h"
30 #include "transaction.h"
31 #include "volumes.h"
32 #include "locking.h"
33 #include "free-space-cache.h"
34
35 static int update_block_group(struct btrfs_trans_handle *trans,
36                               struct btrfs_root *root,
37                               u64 bytenr, u64 num_bytes, int alloc,
38                               int mark_free);
39 static int update_reserved_extents(struct btrfs_block_group_cache *cache,
40                                    u64 num_bytes, int reserve);
41 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42                                 struct btrfs_root *root,
43                                 u64 bytenr, u64 num_bytes, u64 parent,
44                                 u64 root_objectid, u64 owner_objectid,
45                                 u64 owner_offset, int refs_to_drop,
46                                 struct btrfs_delayed_extent_op *extra_op);
47 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
48                                     struct extent_buffer *leaf,
49                                     struct btrfs_extent_item *ei);
50 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
51                                       struct btrfs_root *root,
52                                       u64 parent, u64 root_objectid,
53                                       u64 flags, u64 owner, u64 offset,
54                                       struct btrfs_key *ins, int ref_mod);
55 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
56                                      struct btrfs_root *root,
57                                      u64 parent, u64 root_objectid,
58                                      u64 flags, struct btrfs_disk_key *key,
59                                      int level, struct btrfs_key *ins);
60 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61                           struct btrfs_root *extent_root, u64 alloc_bytes,
62                           u64 flags, int force);
63 static int pin_down_bytes(struct btrfs_trans_handle *trans,
64                           struct btrfs_root *root,
65                           struct btrfs_path *path,
66                           u64 bytenr, u64 num_bytes,
67                           int is_data, int reserved,
68                           struct extent_buffer **must_clean);
69 static int find_next_key(struct btrfs_path *path, int level,
70                          struct btrfs_key *key);
71 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72                             int dump_block_groups);
73
74 static noinline int
75 block_group_cache_done(struct btrfs_block_group_cache *cache)
76 {
77         smp_mb();
78         return cache->cached == BTRFS_CACHE_FINISHED;
79 }
80
81 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
82 {
83         return (cache->flags & bits) == bits;
84 }
85
86 /*
87  * this adds the block group to the fs_info rb tree for the block group
88  * cache
89  */
90 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
91                                 struct btrfs_block_group_cache *block_group)
92 {
93         struct rb_node **p;
94         struct rb_node *parent = NULL;
95         struct btrfs_block_group_cache *cache;
96
97         spin_lock(&info->block_group_cache_lock);
98         p = &info->block_group_cache_tree.rb_node;
99
100         while (*p) {
101                 parent = *p;
102                 cache = rb_entry(parent, struct btrfs_block_group_cache,
103                                  cache_node);
104                 if (block_group->key.objectid < cache->key.objectid) {
105                         p = &(*p)->rb_left;
106                 } else if (block_group->key.objectid > cache->key.objectid) {
107                         p = &(*p)->rb_right;
108                 } else {
109                         spin_unlock(&info->block_group_cache_lock);
110                         return -EEXIST;
111                 }
112         }
113
114         rb_link_node(&block_group->cache_node, parent, p);
115         rb_insert_color(&block_group->cache_node,
116                         &info->block_group_cache_tree);
117         spin_unlock(&info->block_group_cache_lock);
118
119         return 0;
120 }
121
122 /*
123  * This will return the block group at or after bytenr if contains is 0, else
124  * it will return the block group that contains the given bytenr.
125  */
126 static struct btrfs_block_group_cache *
127 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
128                               int contains)
129 {
130         struct btrfs_block_group_cache *cache, *ret = NULL;
131         struct rb_node *n;
132         u64 end, start;
133
134         spin_lock(&info->block_group_cache_lock);
135         n = info->block_group_cache_tree.rb_node;
136
137         while (n) {
138                 cache = rb_entry(n, struct btrfs_block_group_cache,
139                                  cache_node);
140                 end = cache->key.objectid + cache->key.offset - 1;
141                 start = cache->key.objectid;
142
143                 if (bytenr < start) {
144                         if (!contains && (!ret || start < ret->key.objectid))
145                                 ret = cache;
146                         n = n->rb_left;
147                 } else if (bytenr > start) {
148                         if (contains && bytenr <= end) {
149                                 ret = cache;
150                                 break;
151                         }
152                         n = n->rb_right;
153                 } else {
154                         ret = cache;
155                         break;
156                 }
157         }
158         if (ret)
159                 atomic_inc(&ret->count);
160         spin_unlock(&info->block_group_cache_lock);
161
162         return ret;
163 }
164
165 static int add_excluded_extent(struct btrfs_root *root,
166                                u64 start, u64 num_bytes)
167 {
168         u64 end = start + num_bytes - 1;
169         set_extent_bits(&root->fs_info->freed_extents[0],
170                         start, end, EXTENT_UPTODATE, GFP_NOFS);
171         set_extent_bits(&root->fs_info->freed_extents[1],
172                         start, end, EXTENT_UPTODATE, GFP_NOFS);
173         return 0;
174 }
175
176 static void free_excluded_extents(struct btrfs_root *root,
177                                   struct btrfs_block_group_cache *cache)
178 {
179         u64 start, end;
180
181         start = cache->key.objectid;
182         end = start + cache->key.offset - 1;
183
184         clear_extent_bits(&root->fs_info->freed_extents[0],
185                           start, end, EXTENT_UPTODATE, GFP_NOFS);
186         clear_extent_bits(&root->fs_info->freed_extents[1],
187                           start, end, EXTENT_UPTODATE, GFP_NOFS);
188 }
189
190 static int exclude_super_stripes(struct btrfs_root *root,
191                                  struct btrfs_block_group_cache *cache)
192 {
193         u64 bytenr;
194         u64 *logical;
195         int stripe_len;
196         int i, nr, ret;
197
198         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
199                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
200                 cache->bytes_super += stripe_len;
201                 ret = add_excluded_extent(root, cache->key.objectid,
202                                           stripe_len);
203                 BUG_ON(ret);
204         }
205
206         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
207                 bytenr = btrfs_sb_offset(i);
208                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
209                                        cache->key.objectid, bytenr,
210                                        0, &logical, &nr, &stripe_len);
211                 BUG_ON(ret);
212
213                 while (nr--) {
214                         cache->bytes_super += stripe_len;
215                         ret = add_excluded_extent(root, logical[nr],
216                                                   stripe_len);
217                         BUG_ON(ret);
218                 }
219
220                 kfree(logical);
221         }
222         return 0;
223 }
224
225 static struct btrfs_caching_control *
226 get_caching_control(struct btrfs_block_group_cache *cache)
227 {
228         struct btrfs_caching_control *ctl;
229
230         spin_lock(&cache->lock);
231         if (cache->cached != BTRFS_CACHE_STARTED) {
232                 spin_unlock(&cache->lock);
233                 return NULL;
234         }
235
236         ctl = cache->caching_ctl;
237         atomic_inc(&ctl->count);
238         spin_unlock(&cache->lock);
239         return ctl;
240 }
241
242 static void put_caching_control(struct btrfs_caching_control *ctl)
243 {
244         if (atomic_dec_and_test(&ctl->count))
245                 kfree(ctl);
246 }
247
248 /*
249  * This is only called by cache_block_group.  Since we could have freed
250  * extents, we need to check pinned_extents for any extents that can't be
251  * used yet: their free space is only released once the transaction commits.
252  */
253 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
254                               struct btrfs_fs_info *info, u64 start, u64 end)
255 {
256         u64 extent_start, extent_end, size, total_added = 0;
257         int ret;
258
259         while (start < end) {
260                 ret = find_first_extent_bit(info->pinned_extents, start,
261                                             &extent_start, &extent_end,
262                                             EXTENT_DIRTY | EXTENT_UPTODATE);
263                 if (ret)
264                         break;
265
266                 if (extent_start <= start) {
267                         start = extent_end + 1;
268                 } else if (extent_start > start && extent_start < end) {
269                         size = extent_start - start;
270                         total_added += size;
271                         ret = btrfs_add_free_space(block_group, start,
272                                                    size);
273                         BUG_ON(ret);
274                         start = extent_end + 1;
275                 } else {
276                         break;
277                 }
278         }
279
280         if (start < end) {
281                 size = end - start;
282                 total_added += size;
283                 ret = btrfs_add_free_space(block_group, start, size);
284                 BUG_ON(ret);
285         }
286
287         return total_added;
288 }
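/*
 * Illustrative sketch: a worked example of the carve-out above.  Suppose
 * the block group spans [0, 100) and pinned_extents holds one pinned
 * range covering bytes 30-40 inclusive.  The loop adds [0, 30) as free
 * space (30 bytes), advances start past the pinned range to 41, finds
 * nothing further, and the tail adds [41, 100) (59 bytes).  total_added
 * is 89, and the 11 pinned bytes only become free space after the
 * transaction commits.
 */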
289
290 static int caching_kthread(void *data)
291 {
292         struct btrfs_block_group_cache *block_group = data;
293         struct btrfs_fs_info *fs_info = block_group->fs_info;
294         struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
295         struct btrfs_root *extent_root = fs_info->extent_root;
296         struct btrfs_path *path;
297         struct extent_buffer *leaf;
298         struct btrfs_key key;
299         u64 total_found = 0;
300         u64 last = 0;
301         u32 nritems;
302         int ret = 0;
303
304         path = btrfs_alloc_path();
305         if (!path)
306                 return -ENOMEM;
307
308         exclude_super_stripes(extent_root, block_group);
309         spin_lock(&block_group->space_info->lock);
310         block_group->space_info->bytes_super += block_group->bytes_super;
311         spin_unlock(&block_group->space_info->lock);
312
313         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
314
315         /*
316          * We don't want to deadlock with somebody trying to allocate a new
317          * extent for the extent root while also trying to search the extent
318          * root to add free space.  So we skip locking and search the commit
319          * root, since it's read-only.
320          */
321         path->skip_locking = 1;
322         path->search_commit_root = 1;
323         path->reada = 2;
324
325         key.objectid = last;
326         key.offset = 0;
327         key.type = BTRFS_EXTENT_ITEM_KEY;
328 again:
329         mutex_lock(&caching_ctl->mutex);
330         /* need to make sure the commit_root doesn't disappear */
331         down_read(&fs_info->extent_commit_sem);
332
333         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
334         if (ret < 0)
335                 goto err;
336
337         leaf = path->nodes[0];
338         nritems = btrfs_header_nritems(leaf);
339
340         while (1) {
341                 smp_mb();
342                 if (fs_info->closing > 1) {
343                         last = (u64)-1;
344                         break;
345                 }
346
347                 if (path->slots[0] < nritems) {
348                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
349                 } else {
350                         ret = find_next_key(path, 0, &key);
351                         if (ret)
352                                 break;
353
354                         caching_ctl->progress = last;
355                         btrfs_release_path(extent_root, path);
356                         up_read(&fs_info->extent_commit_sem);
357                         mutex_unlock(&caching_ctl->mutex);
358                         if (btrfs_transaction_in_commit(fs_info))
359                                 schedule_timeout(1);
360                         else
361                                 cond_resched();
362                         goto again;
363                 }
364
365                 if (key.objectid < block_group->key.objectid) {
366                         path->slots[0]++;
367                         continue;
368                 }
369
370                 if (key.objectid >= block_group->key.objectid +
371                     block_group->key.offset)
372                         break;
373
374                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
375                         total_found += add_new_free_space(block_group,
376                                                           fs_info, last,
377                                                           key.objectid);
378                         last = key.objectid + key.offset;
379
380                         if (total_found > (1024 * 1024 * 2)) {
381                                 total_found = 0;
382                                 wake_up(&caching_ctl->wait);
383                         }
384                 }
385                 path->slots[0]++;
386         }
387         ret = 0;
388
389         total_found += add_new_free_space(block_group, fs_info, last,
390                                           block_group->key.objectid +
391                                           block_group->key.offset);
392         caching_ctl->progress = (u64)-1;
393
394         spin_lock(&block_group->lock);
395         block_group->caching_ctl = NULL;
396         block_group->cached = BTRFS_CACHE_FINISHED;
397         spin_unlock(&block_group->lock);
398
399 err:
400         btrfs_free_path(path);
401         up_read(&fs_info->extent_commit_sem);
402
403         free_excluded_extents(extent_root, block_group);
404
405         mutex_unlock(&caching_ctl->mutex);
406         wake_up(&caching_ctl->wait);
407
408         put_caching_control(caching_ctl);
409         atomic_dec(&block_group->space_info->caching_threads);
410         return 0;
411 }
412
413 static int cache_block_group(struct btrfs_block_group_cache *cache)
414 {
415         struct btrfs_fs_info *fs_info = cache->fs_info;
416         struct btrfs_caching_control *caching_ctl;
417         struct task_struct *tsk;
418         int ret = 0;
419
420         smp_mb();
421         if (cache->cached != BTRFS_CACHE_NO)
422                 return 0;
423
424         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
425         BUG_ON(!caching_ctl);
426
427         INIT_LIST_HEAD(&caching_ctl->list);
428         mutex_init(&caching_ctl->mutex);
429         init_waitqueue_head(&caching_ctl->wait);
430         caching_ctl->block_group = cache;
431         caching_ctl->progress = cache->key.objectid;
432         /* one for caching kthread, one for caching block group list */
433         atomic_set(&caching_ctl->count, 2);
434
435         spin_lock(&cache->lock);
436         if (cache->cached != BTRFS_CACHE_NO) {
437                 spin_unlock(&cache->lock);
438                 kfree(caching_ctl);
439                 return 0;
440         }
441         cache->caching_ctl = caching_ctl;
442         cache->cached = BTRFS_CACHE_STARTED;
443         spin_unlock(&cache->lock);
444
445         down_write(&fs_info->extent_commit_sem);
446         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
447         up_write(&fs_info->extent_commit_sem);
448
449         atomic_inc(&cache->space_info->caching_threads);
450
451         tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
452                           cache->key.objectid);
453         if (IS_ERR(tsk)) {
454                 ret = PTR_ERR(tsk);
455                 printk(KERN_ERR "error running thread %d\n", ret);
456                 BUG();
457         }
458
459         return ret;
460 }
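/*
 * Illustrative sketch (not compiled): once cache_block_group() has kicked
 * off the kthread, a caller that needs the free space index built up to a
 * given offset typically sleeps on the control's waitqueue, which
 * caching_kthread() pokes after every ~2MB of progress and once more when
 * it finishes.  "target_offset" below is a hypothetical stand-in for
 * whatever byte offset the caller is waiting on.
 */
#if 0
	caching_ctl = get_caching_control(cache);
	if (caching_ctl) {
		wait_event(caching_ctl->wait,
			   block_group_cache_done(cache) ||
			   caching_ctl->progress > target_offset);
		put_caching_control(caching_ctl);
	}
#endif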
461
462 /*
463  * return the block group that starts at or after bytenr
464  */
465 static struct btrfs_block_group_cache *
466 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
467 {
468         struct btrfs_block_group_cache *cache;
469
470         cache = block_group_cache_tree_search(info, bytenr, 0);
471
472         return cache;
473 }
474
475 /*
476  * return the block group that contains the given bytenr
477  */
478 struct btrfs_block_group_cache *btrfs_lookup_block_group(
479                                                  struct btrfs_fs_info *info,
480                                                  u64 bytenr)
481 {
482         struct btrfs_block_group_cache *cache;
483
484         cache = block_group_cache_tree_search(info, bytenr, 1);
485
486         return cache;
487 }
488
489 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
490 {
491         if (atomic_dec_and_test(&cache->count))
492                 kfree(cache);
493 }
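/*
 * Illustrative sketch (not compiled): both lookup helpers above return a
 * block group with its reference count elevated by
 * block_group_cache_tree_search(), so every successful lookup must be
 * paired with btrfs_put_block_group().  A hypothetical caller:
 */
#if 0
static u64 example_block_group_len(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;
	u64 len;

	cache = btrfs_lookup_block_group(info, bytenr);
	if (!cache)
		return 0;
	len = cache->key.offset;
	/* drop the reference taken for us by the tree search */
	btrfs_put_block_group(cache);
	return len;
}
#endif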
494
495 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
496                                                   u64 flags)
497 {
498         struct list_head *head = &info->space_info;
499         struct btrfs_space_info *found;
500
501         rcu_read_lock();
502         list_for_each_entry_rcu(found, head, list) {
503                 if (found->flags == flags) {
504                         rcu_read_unlock();
505                         return found;
506                 }
507         }
508         rcu_read_unlock();
509         return NULL;
510 }
511
512 /*
513  * after adding space to the filesystem, we need to clear the full flags
514  * on all the space infos.
515  */
516 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
517 {
518         struct list_head *head = &info->space_info;
519         struct btrfs_space_info *found;
520
521         rcu_read_lock();
522         list_for_each_entry_rcu(found, head, list)
523                 found->full = 0;
524         rcu_read_unlock();
525 }
526
527 static u64 div_factor(u64 num, int factor)
528 {
529         if (factor == 10)
530                 return num;
531         num *= factor;
532         do_div(num, 10);
533         return num;
534 }
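/*
 * Illustrative sketch: div_factor() scales a byte count by factor/10, so
 * factor == 9 yields 90% of the input.  For example, with a 1GB block
 * group:
 *
 *	div_factor(1073741824ULL, 9)  == 966367641	(~90% of 1GB)
 *	div_factor(1073741824ULL, 10) == 1073741824	(returned as-is)
 *
 * btrfs_find_block_group() below uses this to treat a group as a
 * candidate only while it is under 90% full, relaxing to 100% on the
 * final full_search pass.
 */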
535
536 u64 btrfs_find_block_group(struct btrfs_root *root,
537                            u64 search_start, u64 search_hint, int owner)
538 {
539         struct btrfs_block_group_cache *cache;
540         u64 used;
541         u64 last = max(search_hint, search_start);
542         u64 group_start = 0;
543         int full_search = 0;
544         int factor = 9;
545         int wrapped = 0;
546 again:
547         while (1) {
548                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
549                 if (!cache)
550                         break;
551
552                 spin_lock(&cache->lock);
553                 last = cache->key.objectid + cache->key.offset;
554                 used = btrfs_block_group_used(&cache->item);
555
556                 if ((full_search || !cache->ro) &&
557                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
558                         if (used + cache->pinned + cache->reserved <
559                             div_factor(cache->key.offset, factor)) {
560                                 group_start = cache->key.objectid;
561                                 spin_unlock(&cache->lock);
562                                 btrfs_put_block_group(cache);
563                                 goto found;
564                         }
565                 }
566                 spin_unlock(&cache->lock);
567                 btrfs_put_block_group(cache);
568                 cond_resched();
569         }
570         if (!wrapped) {
571                 last = search_start;
572                 wrapped = 1;
573                 goto again;
574         }
575         if (!full_search && factor < 10) {
576                 last = search_start;
577                 full_search = 1;
578                 factor = 10;
579                 goto again;
580         }
581 found:
582         return group_start;
583 }
584
585 /* simple helper to search for an existing extent at a given offset */
586 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
587 {
588         int ret;
589         struct btrfs_key key;
590         struct btrfs_path *path;
591
592         path = btrfs_alloc_path();
593         BUG_ON(!path);
594         key.objectid = start;
595         key.offset = len;
596         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
597         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
598                                 0, 0);
599         btrfs_free_path(path);
600         return ret;
601 }
602
603 /*
604  * Back reference rules.  Back refs have three main goals:
605  *
606  * 1) differentiate between all holders of references to an extent so that
607  *    when a reference is dropped we can make sure it was a valid reference
608  *    before freeing the extent.
609  *
610  * 2) Provide enough information to quickly find the holders of an extent
611  *    if we notice a given block is corrupted or bad.
612  *
613  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
614  *    maintenance.  This is actually the same as #2, but with a slightly
615  *    different use case.
616  *
617  * There are two kinds of back refs. Implicit back refs are optimized
618  * for pointers in non-shared tree blocks. For a given pointer in a block,
619  * back refs of this kind provide information about the block's owner tree
620  * and the pointer's key. This information allows us to find the block by
621  * b-tree searching. Full back refs are for pointers in tree blocks not
622  * referenced by their owner trees; the location of the tree block is
623  * recorded in the back ref. Full back refs are actually generic and can be
624  * used in all cases where implicit back refs are used. The major shortcoming
625  * of full back refs is their overhead: every time a tree block gets
626  * COWed, we have to update the back ref entries for all pointers in it.
627  *
628  * For a newly allocated tree block, we use implicit back refs for
629  * pointers in it. This means most tree related operations only involve
630  * implicit back refs. For a tree block created in an old transaction, the
631  * only way to drop a reference to it is to COW it. So we can detect the
632  * moment a tree block loses its owner tree's reference and do the
633  * back ref conversion then.
634  *
635  * When a tree block is COW'd through a tree, there are four cases:
636  *
637  * The reference count of the block is one and the tree is the block's
638  * owner tree. Nothing to do in this case.
639  *
640  * The reference count of the block is one and the tree is not the
641  * block's owner tree. In this case, full back refs are used for pointers
642  * in the block. Remove these full back refs and add implicit back refs for
643  * every pointer in the new block.
644  *
645  * The reference count of the block is greater than one and the tree is
646  * the block's owner tree. In this case, implicit back refs are used for
647  * pointers in the block. Add full back refs for every pointer in the
648  * block and increase the lower level extents' reference counts. The
649  * original implicit back refs carry over to the new block.
650  *
651  * The reference count of the block is greater than one and the tree is
652  * not the block's owner tree. Add implicit back refs for every pointer in
653  * the new block and increase the lower level extents' reference counts.
654  *
655  * Back Reference Key composing:
656  *
657  * The key objectid corresponds to the first byte in the extent,
658  * The key type is used to differentiate between types of back refs.
659  * There are different meanings of the key offset for different types
660  * of back refs.
661  *
662  * File extents can be referenced by:
663  *
664  * - multiple snapshots, subvolumes, or different generations in one subvol
665  * - different files inside a single subvolume
666  * - different offsets inside a file (bookend extents in file.c)
667  *
668  * The extent ref structure for the implicit back refs has fields for:
669  *
670  * - Objectid of the subvolume root
671  * - objectid of the file holding the reference
672  * - original offset in the file
673  * - how many bookend extents
674  *
675  * The key offset for the implicit back refs is a hash of the first
676  * three fields.
677  *
678  * The extent ref structure for the full back refs has a field for:
679  *
680  * - number of pointers in the tree leaf
681  *
682  * The key offset for the full back refs is the first byte of
683  * the tree leaf.
684  *
685  * When a file extent is allocated, the implicit back ref is used
686  * and its fields are filled in:
687  *
688  *     (root_key.objectid, inode objectid, offset in file, 1)
689  *
690  * When a file extent is removed during file truncation, we find the
691  * corresponding implicit back ref and check the following fields:
692  *
693  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
694  *
695  * Btree extents can be referenced by:
696  *
697  * - Different subvolumes
698  *
699  * Both the implicit back refs and the full back refs for tree blocks
700  * only consist of a key. The key offset for the implicit back refs is the
701  * objectid of the block's owner tree. The key offset for the full back refs
702  * is the first byte of the parent block.
703  *
704  * When implicit back refs are used, information about the lowest key and
705  * the level of the tree block is required. This information is stored in
706  * the tree block info structure.
707  */
708
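/*
 * Illustrative sketch: the keys a data extent at byte 136708096, 8192
 * bytes long, would carry under the scheme above, assuming it is
 * referenced once from inode 257 at file offset 0 in the default fs
 * tree (objectid 5):
 *
 *	extent item:	(136708096, BTRFS_EXTENT_ITEM_KEY, 8192)
 *	implicit ref:	(136708096, BTRFS_EXTENT_DATA_REF_KEY,
 *			 hash_extent_data_ref(5, 257, 0))
 *
 * Were the leaf holding the file extent item shared rather than owned,
 * the reference would instead be a full back ref keyed on that leaf:
 *
 *	shared ref:	(136708096, BTRFS_SHARED_DATA_REF_KEY, leaf bytenr)
 */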
709 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
710 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
711                                   struct btrfs_root *root,
712                                   struct btrfs_path *path,
713                                   u64 owner, u32 extra_size)
714 {
715         struct btrfs_extent_item *item;
716         struct btrfs_extent_item_v0 *ei0;
717         struct btrfs_extent_ref_v0 *ref0;
718         struct btrfs_tree_block_info *bi;
719         struct extent_buffer *leaf;
720         struct btrfs_key key;
721         struct btrfs_key found_key;
722         u32 new_size = sizeof(*item);
723         u64 refs;
724         int ret;
725
726         leaf = path->nodes[0];
727         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
728
729         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
730         ei0 = btrfs_item_ptr(leaf, path->slots[0],
731                              struct btrfs_extent_item_v0);
732         refs = btrfs_extent_refs_v0(leaf, ei0);
733
734         if (owner == (u64)-1) {
735                 while (1) {
736                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
737                                 ret = btrfs_next_leaf(root, path);
738                                 if (ret < 0)
739                                         return ret;
740                                 BUG_ON(ret > 0);
741                                 leaf = path->nodes[0];
742                         }
743                         btrfs_item_key_to_cpu(leaf, &found_key,
744                                               path->slots[0]);
745                         BUG_ON(key.objectid != found_key.objectid);
746                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
747                                 path->slots[0]++;
748                                 continue;
749                         }
750                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
751                                               struct btrfs_extent_ref_v0);
752                         owner = btrfs_ref_objectid_v0(leaf, ref0);
753                         break;
754                 }
755         }
756         btrfs_release_path(root, path);
757
758         if (owner < BTRFS_FIRST_FREE_OBJECTID)
759                 new_size += sizeof(*bi);
760
761         new_size -= sizeof(*ei0);
762         ret = btrfs_search_slot(trans, root, &key, path,
763                                 new_size + extra_size, 1);
764         if (ret < 0)
765                 return ret;
766         BUG_ON(ret);
767
768         ret = btrfs_extend_item(trans, root, path, new_size);
769         BUG_ON(ret);
770
771         leaf = path->nodes[0];
772         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
773         btrfs_set_extent_refs(leaf, item, refs);
774         /* FIXME: get real generation */
775         btrfs_set_extent_generation(leaf, item, 0);
776         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
777                 btrfs_set_extent_flags(leaf, item,
778                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
779                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
780                 bi = (struct btrfs_tree_block_info *)(item + 1);
781                 /* FIXME: get first key of the block */
782                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
783                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
784         } else {
785                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
786         }
787         btrfs_mark_buffer_dirty(leaf);
788         return 0;
789 }
790 #endif
791
792 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
793 {
794         u32 high_crc = ~(u32)0;
795         u32 low_crc = ~(u32)0;
796         __le64 lenum;
797
798         lenum = cpu_to_le64(root_objectid);
799         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
800         lenum = cpu_to_le64(owner);
801         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
802         lenum = cpu_to_le64(offset);
803         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
804
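        /*
         * Note: the high half is shifted by 31, not 32.  The result is
         * persisted as the key offset of EXTENT_DATA_REF items, so the
         * quirky 31-bit shift must be kept as-is for on-disk
         * compatibility.
         */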
805         return ((u64)high_crc << 31) ^ (u64)low_crc;
806 }
807
808 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
809                                      struct btrfs_extent_data_ref *ref)
810 {
811         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
812                                     btrfs_extent_data_ref_objectid(leaf, ref),
813                                     btrfs_extent_data_ref_offset(leaf, ref));
814 }
815
816 static int match_extent_data_ref(struct extent_buffer *leaf,
817                                  struct btrfs_extent_data_ref *ref,
818                                  u64 root_objectid, u64 owner, u64 offset)
819 {
820         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
821             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
822             btrfs_extent_data_ref_offset(leaf, ref) != offset)
823                 return 0;
824         return 1;
825 }
826
827 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
828                                            struct btrfs_root *root,
829                                            struct btrfs_path *path,
830                                            u64 bytenr, u64 parent,
831                                            u64 root_objectid,
832                                            u64 owner, u64 offset)
833 {
834         struct btrfs_key key;
835         struct btrfs_extent_data_ref *ref;
836         struct extent_buffer *leaf;
837         u32 nritems;
838         int ret;
839         int recow;
840         int err = -ENOENT;
841
842         key.objectid = bytenr;
843         if (parent) {
844                 key.type = BTRFS_SHARED_DATA_REF_KEY;
845                 key.offset = parent;
846         } else {
847                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
848                 key.offset = hash_extent_data_ref(root_objectid,
849                                                   owner, offset);
850         }
851 again:
852         recow = 0;
853         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
854         if (ret < 0) {
855                 err = ret;
856                 goto fail;
857         }
858
859         if (parent) {
860                 if (!ret)
861                         return 0;
862 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
863                 key.type = BTRFS_EXTENT_REF_V0_KEY;
864                 btrfs_release_path(root, path);
865                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
866                 if (ret < 0) {
867                         err = ret;
868                         goto fail;
869                 }
870                 if (!ret)
871                         return 0;
872 #endif
873                 goto fail;
874         }
875
876         leaf = path->nodes[0];
877         nritems = btrfs_header_nritems(leaf);
878         while (1) {
879                 if (path->slots[0] >= nritems) {
880                         ret = btrfs_next_leaf(root, path);
881                         if (ret < 0)
882                                 err = ret;
883                         if (ret)
884                                 goto fail;
885
886                         leaf = path->nodes[0];
887                         nritems = btrfs_header_nritems(leaf);
888                         recow = 1;
889                 }
890
891                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
892                 if (key.objectid != bytenr ||
893                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
894                         goto fail;
895
896                 ref = btrfs_item_ptr(leaf, path->slots[0],
897                                      struct btrfs_extent_data_ref);
898
899                 if (match_extent_data_ref(leaf, ref, root_objectid,
900                                           owner, offset)) {
901                         if (recow) {
902                                 btrfs_release_path(root, path);
903                                 goto again;
904                         }
905                         err = 0;
906                         break;
907                 }
908                 path->slots[0]++;
909         }
910 fail:
911         return err;
912 }
913
914 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
915                                            struct btrfs_root *root,
916                                            struct btrfs_path *path,
917                                            u64 bytenr, u64 parent,
918                                            u64 root_objectid, u64 owner,
919                                            u64 offset, int refs_to_add)
920 {
921         struct btrfs_key key;
922         struct extent_buffer *leaf;
923         u32 size;
924         u32 num_refs;
925         int ret;
926
927         key.objectid = bytenr;
928         if (parent) {
929                 key.type = BTRFS_SHARED_DATA_REF_KEY;
930                 key.offset = parent;
931                 size = sizeof(struct btrfs_shared_data_ref);
932         } else {
933                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
934                 key.offset = hash_extent_data_ref(root_objectid,
935                                                   owner, offset);
936                 size = sizeof(struct btrfs_extent_data_ref);
937         }
938
939         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
940         if (ret && ret != -EEXIST)
941                 goto fail;
942
943         leaf = path->nodes[0];
944         if (parent) {
945                 struct btrfs_shared_data_ref *ref;
946                 ref = btrfs_item_ptr(leaf, path->slots[0],
947                                      struct btrfs_shared_data_ref);
948                 if (ret == 0) {
949                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
950                 } else {
951                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
952                         num_refs += refs_to_add;
953                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
954                 }
955         } else {
956                 struct btrfs_extent_data_ref *ref;
957                 while (ret == -EEXIST) {
958                         ref = btrfs_item_ptr(leaf, path->slots[0],
959                                              struct btrfs_extent_data_ref);
960                         if (match_extent_data_ref(leaf, ref, root_objectid,
961                                                   owner, offset))
962                                 break;
963                         btrfs_release_path(root, path);
964                         key.offset++;
965                         ret = btrfs_insert_empty_item(trans, root, path, &key,
966                                                       size);
967                         if (ret && ret != -EEXIST)
968                                 goto fail;
969
970                         leaf = path->nodes[0];
971                 }
972                 ref = btrfs_item_ptr(leaf, path->slots[0],
973                                      struct btrfs_extent_data_ref);
974                 if (ret == 0) {
975                         btrfs_set_extent_data_ref_root(leaf, ref,
976                                                        root_objectid);
977                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
978                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
979                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
980                 } else {
981                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
982                         num_refs += refs_to_add;
983                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
984                 }
985         }
986         btrfs_mark_buffer_dirty(leaf);
987         ret = 0;
988 fail:
989         btrfs_release_path(root, path);
990         return ret;
991 }
992
993 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
994                                            struct btrfs_root *root,
995                                            struct btrfs_path *path,
996                                            int refs_to_drop)
997 {
998         struct btrfs_key key;
999         struct btrfs_extent_data_ref *ref1 = NULL;
1000         struct btrfs_shared_data_ref *ref2 = NULL;
1001         struct extent_buffer *leaf;
1002         u32 num_refs = 0;
1003         int ret = 0;
1004
1005         leaf = path->nodes[0];
1006         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1007
1008         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1009                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1010                                       struct btrfs_extent_data_ref);
1011                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1012         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1013                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1014                                       struct btrfs_shared_data_ref);
1015                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1016 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1017         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1018                 struct btrfs_extent_ref_v0 *ref0;
1019                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1020                                       struct btrfs_extent_ref_v0);
1021                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1022 #endif
1023         } else {
1024                 BUG();
1025         }
1026
1027         BUG_ON(num_refs < refs_to_drop);
1028         num_refs -= refs_to_drop;
1029
1030         if (num_refs == 0) {
1031                 ret = btrfs_del_item(trans, root, path);
1032         } else {
1033                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1034                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1035                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1036                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1037 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1038                 else {
1039                         struct btrfs_extent_ref_v0 *ref0;
1040                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1041                                         struct btrfs_extent_ref_v0);
1042                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1043                 }
1044 #endif
1045                 btrfs_mark_buffer_dirty(leaf);
1046         }
1047         return ret;
1048 }
1049
1050 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1051                                           struct btrfs_path *path,
1052                                           struct btrfs_extent_inline_ref *iref)
1053 {
1054         struct btrfs_key key;
1055         struct extent_buffer *leaf;
1056         struct btrfs_extent_data_ref *ref1;
1057         struct btrfs_shared_data_ref *ref2;
1058         u32 num_refs = 0;
1059
1060         leaf = path->nodes[0];
1061         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1062         if (iref) {
1063                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1064                     BTRFS_EXTENT_DATA_REF_KEY) {
1065                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1066                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1067                 } else {
1068                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1069                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1070                 }
1071         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1072                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1073                                       struct btrfs_extent_data_ref);
1074                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1075         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1076                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1077                                       struct btrfs_shared_data_ref);
1078                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1079 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1080         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1081                 struct btrfs_extent_ref_v0 *ref0;
1082                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1083                                       struct btrfs_extent_ref_v0);
1084                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1085 #endif
1086         } else {
1087                 WARN_ON(1);
1088         }
1089         return num_refs;
1090 }
1091
1092 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1093                                           struct btrfs_root *root,
1094                                           struct btrfs_path *path,
1095                                           u64 bytenr, u64 parent,
1096                                           u64 root_objectid)
1097 {
1098         struct btrfs_key key;
1099         int ret;
1100
1101         key.objectid = bytenr;
1102         if (parent) {
1103                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1104                 key.offset = parent;
1105         } else {
1106                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1107                 key.offset = root_objectid;
1108         }
1109
1110         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1111         if (ret > 0)
1112                 ret = -ENOENT;
1113 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1114         if (ret == -ENOENT && parent) {
1115                 btrfs_release_path(root, path);
1116                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1117                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1118                 if (ret > 0)
1119                         ret = -ENOENT;
1120         }
1121 #endif
1122         return ret;
1123 }
1124
1125 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1126                                           struct btrfs_root *root,
1127                                           struct btrfs_path *path,
1128                                           u64 bytenr, u64 parent,
1129                                           u64 root_objectid)
1130 {
1131         struct btrfs_key key;
1132         int ret;
1133
1134         key.objectid = bytenr;
1135         if (parent) {
1136                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1137                 key.offset = parent;
1138         } else {
1139                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1140                 key.offset = root_objectid;
1141         }
1142
1143         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1144         btrfs_release_path(root, path);
1145         return ret;
1146 }
1147
1148 static inline int extent_ref_type(u64 parent, u64 owner)
1149 {
1150         int type;
1151         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1152                 if (parent > 0)
1153                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1154                 else
1155                         type = BTRFS_TREE_BLOCK_REF_KEY;
1156         } else {
1157                 if (parent > 0)
1158                         type = BTRFS_SHARED_DATA_REF_KEY;
1159                 else
1160                         type = BTRFS_EXTENT_DATA_REF_KEY;
1161         }
1162         return type;
1163 }
1164
1165 static int find_next_key(struct btrfs_path *path, int level,
1166                          struct btrfs_key *key)
1167
1168 {
1169         for (; level < BTRFS_MAX_LEVEL; level++) {
1170                 if (!path->nodes[level])
1171                         break;
1172                 if (path->slots[level] + 1 >=
1173                     btrfs_header_nritems(path->nodes[level]))
1174                         continue;
1175                 if (level == 0)
1176                         btrfs_item_key_to_cpu(path->nodes[level], key,
1177                                               path->slots[level] + 1);
1178                 else
1179                         btrfs_node_key_to_cpu(path->nodes[level], key,
1180                                               path->slots[level] + 1);
1181                 return 0;
1182         }
1183         return 1;
1184 }
1185
1186 /*
1187  * Look for an inline back ref. If the back ref is found, *ref_ret is set
1188  * to the address of the inline back ref, and 0 is returned.
1189  *
1190  * If the back ref isn't found, *ref_ret is set to the address where it
1191  * should be inserted, and -ENOENT is returned.
1192  *
1193  * If insert is true and there are too many inline back refs, the path
1194  * points to the extent item, and -EAGAIN is returned.
1195  *
1196  * NOTE: inline back refs are ordered in the same way that back ref
1197  *       items in the tree are ordered.
1198  */
1199 static noinline_for_stack
1200 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1201                                  struct btrfs_root *root,
1202                                  struct btrfs_path *path,
1203                                  struct btrfs_extent_inline_ref **ref_ret,
1204                                  u64 bytenr, u64 num_bytes,
1205                                  u64 parent, u64 root_objectid,
1206                                  u64 owner, u64 offset, int insert)
1207 {
1208         struct btrfs_key key;
1209         struct extent_buffer *leaf;
1210         struct btrfs_extent_item *ei;
1211         struct btrfs_extent_inline_ref *iref;
1212         u64 flags;
1213         u64 item_size;
1214         unsigned long ptr;
1215         unsigned long end;
1216         int extra_size;
1217         int type;
1218         int want;
1219         int ret;
1220         int err = 0;
1221
1222         key.objectid = bytenr;
1223         key.type = BTRFS_EXTENT_ITEM_KEY;
1224         key.offset = num_bytes;
1225
1226         want = extent_ref_type(parent, owner);
1227         if (insert) {
1228                 extra_size = btrfs_extent_inline_ref_size(want);
1229                 path->keep_locks = 1;
1230         } else
1231                 extra_size = -1;
1232         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1233         if (ret < 0) {
1234                 err = ret;
1235                 goto out;
1236         }
1237         BUG_ON(ret);
1238
1239         leaf = path->nodes[0];
1240         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1241 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1242         if (item_size < sizeof(*ei)) {
1243                 if (!insert) {
1244                         err = -ENOENT;
1245                         goto out;
1246                 }
1247                 ret = convert_extent_item_v0(trans, root, path, owner,
1248                                              extra_size);
1249                 if (ret < 0) {
1250                         err = ret;
1251                         goto out;
1252                 }
1253                 leaf = path->nodes[0];
1254                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1255         }
1256 #endif
1257         BUG_ON(item_size < sizeof(*ei));
1258
1259         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1260         flags = btrfs_extent_flags(leaf, ei);
1261
1262         ptr = (unsigned long)(ei + 1);
1263         end = (unsigned long)ei + item_size;
1264
1265         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1266                 ptr += sizeof(struct btrfs_tree_block_info);
1267                 BUG_ON(ptr > end);
1268         } else {
1269                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1270         }
1271
1272         err = -ENOENT;
1273         while (1) {
1274                 if (ptr >= end) {
1275                         WARN_ON(ptr > end);
1276                         break;
1277                 }
1278                 iref = (struct btrfs_extent_inline_ref *)ptr;
1279                 type = btrfs_extent_inline_ref_type(leaf, iref);
1280                 if (want < type)
1281                         break;
1282                 if (want > type) {
1283                         ptr += btrfs_extent_inline_ref_size(type);
1284                         continue;
1285                 }
1286
1287                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1288                         struct btrfs_extent_data_ref *dref;
1289                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1290                         if (match_extent_data_ref(leaf, dref, root_objectid,
1291                                                   owner, offset)) {
1292                                 err = 0;
1293                                 break;
1294                         }
1295                         if (hash_extent_data_ref_item(leaf, dref) <
1296                             hash_extent_data_ref(root_objectid, owner, offset))
1297                                 break;
1298                 } else {
1299                         u64 ref_offset;
1300                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1301                         if (parent > 0) {
1302                                 if (parent == ref_offset) {
1303                                         err = 0;
1304                                         break;
1305                                 }
1306                                 if (ref_offset < parent)
1307                                         break;
1308                         } else {
1309                                 if (root_objectid == ref_offset) {
1310                                         err = 0;
1311                                         break;
1312                                 }
1313                                 if (ref_offset < root_objectid)
1314                                         break;
1315                         }
1316                 }
1317                 ptr += btrfs_extent_inline_ref_size(type);
1318         }
1319         if (err == -ENOENT && insert) {
1320                 if (item_size + extra_size >=
1321                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1322                         err = -EAGAIN;
1323                         goto out;
1324                 }
1325                 /*
1326                  * To add a new inline back ref, we have to make sure
1327                  * there is no corresponding back ref item.
1328                  * For simplicity, we just do not add a new inline back
1329                  * ref if there is any kind of item for this block.
1330                  */
1331                 if (find_next_key(path, 0, &key) == 0 &&
1332                     key.objectid == bytenr &&
1333                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1334                         err = -EAGAIN;
1335                         goto out;
1336                 }
1337         }
1338         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1339 out:
1340         if (insert) {
1341                 path->keep_locks = 0;
1342                 btrfs_unlock_up_safe(path, 1);
1343         }
1344         return err;
1345 }
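/*
 * Illustrative sketch (not compiled): how a caller is expected to react to
 * the three outcomes documented above.  This is a fragment, assuming the
 * usual trans/root/path/ref variables are in scope.
 */
#if 0
	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		/* found: update the ref count stored at *iref in place */
	} else if (ret == -ENOENT) {
		/* *iref is the insertion point for a new inline ref;
		 * see setup_inline_extent_backref() below */
	} else if (ret == -EAGAIN) {
		/* extent item is full: fall back to a separate keyed
		 * back ref item instead of an inline one */
	}
#endif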
1346
1347 /*
1348  * helper to add a new inline back ref
1349  */
1350 static noinline_for_stack
1351 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1352                                 struct btrfs_root *root,
1353                                 struct btrfs_path *path,
1354                                 struct btrfs_extent_inline_ref *iref,
1355                                 u64 parent, u64 root_objectid,
1356                                 u64 owner, u64 offset, int refs_to_add,
1357                                 struct btrfs_delayed_extent_op *extent_op)
1358 {
1359         struct extent_buffer *leaf;
1360         struct btrfs_extent_item *ei;
1361         unsigned long ptr;
1362         unsigned long end;
1363         unsigned long item_offset;
1364         u64 refs;
1365         int size;
1366         int type;
1367         int ret;
1368
1369         leaf = path->nodes[0];
1370         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1371         item_offset = (unsigned long)iref - (unsigned long)ei;
1372
1373         type = extent_ref_type(parent, owner);
1374         size = btrfs_extent_inline_ref_size(type);
1375
1376         ret = btrfs_extend_item(trans, root, path, size);
1377         BUG_ON(ret);
1378
1379         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1380         refs = btrfs_extent_refs(leaf, ei);
1381         refs += refs_to_add;
1382         btrfs_set_extent_refs(leaf, ei, refs);
1383         if (extent_op)
1384                 __run_delayed_extent_op(extent_op, leaf, ei);
1385
1386         ptr = (unsigned long)ei + item_offset;
1387         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1388         if (ptr < end - size)
1389                 memmove_extent_buffer(leaf, ptr + size, ptr,
1390                                       end - size - ptr);
1391
1392         iref = (struct btrfs_extent_inline_ref *)ptr;
1393         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1394         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1395                 struct btrfs_extent_data_ref *dref;
1396                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1397                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1398                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1399                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1400                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1401         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1402                 struct btrfs_shared_data_ref *sref;
1403                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1404                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1405                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1406         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1407                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1408         } else {
1409                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1410         }
1411         btrfs_mark_buffer_dirty(leaf);
1412         return 0;
1413 }
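
/*
 * For illustration only: a minimal userspace model of the gap-opening step
 * above.  After btrfs_extend_item() has grown the item, the tail of the
 * inline ref area is memmove()d up by `size` bytes so the new ref can be
 * written in place.  The buffer layout and names here are hypothetical;
 * only the memmove() mirrors the real code.
 */
#include <string.h>

/* open a `size`-byte gap at `offset` in a buffer currently holding `used`
 * bytes; the buffer must already have room for `used + size` bytes */
static void open_gap(unsigned char *buf, size_t used, size_t offset,
                     size_t size)
{
        if (offset < used)
                memmove(buf + offset + size, buf + offset, used - offset);
        /* the caller now writes the new inline ref into the gap */
}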
1414
1415 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1416                                  struct btrfs_root *root,
1417                                  struct btrfs_path *path,
1418                                  struct btrfs_extent_inline_ref **ref_ret,
1419                                  u64 bytenr, u64 num_bytes, u64 parent,
1420                                  u64 root_objectid, u64 owner, u64 offset)
1421 {
1422         int ret;
1423
1424         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1425                                            bytenr, num_bytes, parent,
1426                                            root_objectid, owner, offset, 0);
1427         if (ret != -ENOENT)
1428                 return ret;
1429
1430         btrfs_release_path(root, path);
1431         *ref_ret = NULL;
1432
1433         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1434                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1435                                             root_objectid);
1436         } else {
1437                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1438                                              root_objectid, owner, offset);
1439         }
1440         return ret;
1441 }
1442
1443 /*
1444  * helper to update/remove inline back ref
1445  */
1446 static noinline_for_stack
1447 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1448                                  struct btrfs_root *root,
1449                                  struct btrfs_path *path,
1450                                  struct btrfs_extent_inline_ref *iref,
1451                                  int refs_to_mod,
1452                                  struct btrfs_delayed_extent_op *extent_op)
1453 {
1454         struct extent_buffer *leaf;
1455         struct btrfs_extent_item *ei;
1456         struct btrfs_extent_data_ref *dref = NULL;
1457         struct btrfs_shared_data_ref *sref = NULL;
1458         unsigned long ptr;
1459         unsigned long end;
1460         u32 item_size;
1461         int size;
1462         int type;
1463         int ret;
1464         u64 refs;
1465
1466         leaf = path->nodes[0];
1467         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1468         refs = btrfs_extent_refs(leaf, ei);
1469         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1470         refs += refs_to_mod;
1471         btrfs_set_extent_refs(leaf, ei, refs);
1472         if (extent_op)
1473                 __run_delayed_extent_op(extent_op, leaf, ei);
1474
1475         type = btrfs_extent_inline_ref_type(leaf, iref);
1476
1477         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1478                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1479                 refs = btrfs_extent_data_ref_count(leaf, dref);
1480         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1481                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1482                 refs = btrfs_shared_data_ref_count(leaf, sref);
1483         } else {
1484                 refs = 1;
1485                 BUG_ON(refs_to_mod != -1);
1486         }
1487
1488         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1489         refs += refs_to_mod;
1490
1491         if (refs > 0) {
1492                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1493                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1494                 else
1495                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1496         } else {
1497                 size = btrfs_extent_inline_ref_size(type);
1498                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1499                 ptr = (unsigned long)iref;
1500                 end = (unsigned long)ei + item_size;
1501                 if (ptr + size < end)
1502                         memmove_extent_buffer(leaf, ptr, ptr + size,
1503                                               end - ptr - size);
1504                 item_size -= size;
1505                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1506                 BUG_ON(ret);
1507         }
1508         btrfs_mark_buffer_dirty(leaf);
1509         return 0;
1510 }
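
/*
 * And the inverse, for the removal path above: when an inline ref's count
 * hits zero, the hole is closed with a memmove() and the item is shrunk
 * via btrfs_truncate_item().  A userspace model (layout hypothetical):
 */
#include <string.h>

/* drop `size` bytes at `offset` from a buffer holding `used` bytes and
 * return the new length -- the analogue of truncating the item */
static size_t close_gap(unsigned char *buf, size_t used, size_t offset,
                        size_t size)
{
        if (offset + size < used)
                memmove(buf + offset, buf + offset + size,
                        used - offset - size);
        return used - size;
}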
1511
1512 static noinline_for_stack
1513 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1514                                  struct btrfs_root *root,
1515                                  struct btrfs_path *path,
1516                                  u64 bytenr, u64 num_bytes, u64 parent,
1517                                  u64 root_objectid, u64 owner,
1518                                  u64 offset, int refs_to_add,
1519                                  struct btrfs_delayed_extent_op *extent_op)
1520 {
1521         struct btrfs_extent_inline_ref *iref;
1522         int ret;
1523
1524         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1525                                            bytenr, num_bytes, parent,
1526                                            root_objectid, owner, offset, 1);
1527         if (ret == 0) {
1528                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1529                 ret = update_inline_extent_backref(trans, root, path, iref,
1530                                                    refs_to_add, extent_op);
1531         } else if (ret == -ENOENT) {
1532                 ret = setup_inline_extent_backref(trans, root, path, iref,
1533                                                   parent, root_objectid,
1534                                                   owner, offset, refs_to_add,
1535                                                   extent_op);
1536         }
1537         return ret;
1538 }
1539
1540 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1541                                  struct btrfs_root *root,
1542                                  struct btrfs_path *path,
1543                                  u64 bytenr, u64 parent, u64 root_objectid,
1544                                  u64 owner, u64 offset, int refs_to_add)
1545 {
1546         int ret;
1547         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1548                 BUG_ON(refs_to_add != 1);
1549                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1550                                             parent, root_objectid);
1551         } else {
1552                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1553                                              parent, root_objectid,
1554                                              owner, offset, refs_to_add);
1555         }
1556         return ret;
1557 }
1558
1559 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1560                                  struct btrfs_root *root,
1561                                  struct btrfs_path *path,
1562                                  struct btrfs_extent_inline_ref *iref,
1563                                  int refs_to_drop, int is_data)
1564 {
1565         int ret;
1566
1567         BUG_ON(!is_data && refs_to_drop != 1);
1568         if (iref) {
1569                 ret = update_inline_extent_backref(trans, root, path, iref,
1570                                                    -refs_to_drop, NULL);
1571         } else if (is_data) {
1572                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1573         } else {
1574                 ret = btrfs_del_item(trans, root, path);
1575         }
1576         return ret;
1577 }
1578
1579 static void btrfs_issue_discard(struct block_device *bdev,
1580                                 u64 start, u64 len)
1581 {
1582         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1583                              DISCARD_FL_BARRIER);
1584 }
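
/*
 * The >> 9 above converts byte offsets and lengths into 512-byte sector
 * units, which is what blkdev_issue_discard() works in.  A self-contained
 * restatement of the conversion:
 */
static inline unsigned long long bytes_to_sectors(unsigned long long bytes)
{
        return bytes >> 9;      /* 2^9 == 512, the sector size assumed here */
}
/* e.g. bytes_to_sectors(1ULL << 20) == 2048: a 1 MiB extent is 2048 sectors */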
1585
1586 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1587                                 u64 num_bytes)
1588 {
1589         int ret;
1590         u64 map_length = num_bytes;
1591         struct btrfs_multi_bio *multi = NULL;
1592
1593         if (!btrfs_test_opt(root, DISCARD))
1594                 return 0;
1595
1596         /* Tell the block device(s) that the sectors can be discarded */
1597         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1598                               bytenr, &map_length, &multi, 0);
1599         if (!ret) {
1600                 struct btrfs_bio_stripe *stripe = multi->stripes;
1601                 int i;
1602
1603                 if (map_length > num_bytes)
1604                         map_length = num_bytes;
1605
1606                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1607                         btrfs_issue_discard(stripe->dev->bdev,
1608                                             stripe->physical,
1609                                             map_length);
1610                 }
1611                 kfree(multi);
1612         }
1613
1614         return ret;
1615 }
1616
1617 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1618                          struct btrfs_root *root,
1619                          u64 bytenr, u64 num_bytes, u64 parent,
1620                          u64 root_objectid, u64 owner, u64 offset)
1621 {
1622         int ret;
1623         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1624                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1625
1626         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1627                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1628                                         parent, root_objectid, (int)owner,
1629                                         BTRFS_ADD_DELAYED_REF, NULL);
1630         } else {
1631                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1632                                         parent, root_objectid, owner, offset,
1633                                         BTRFS_ADD_DELAYED_REF, NULL);
1634         }
1635         return ret;
1636 }
1637
1638 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1639                                   struct btrfs_root *root,
1640                                   u64 bytenr, u64 num_bytes,
1641                                   u64 parent, u64 root_objectid,
1642                                   u64 owner, u64 offset, int refs_to_add,
1643                                   struct btrfs_delayed_extent_op *extent_op)
1644 {
1645         struct btrfs_path *path;
1646         struct extent_buffer *leaf;
1647         struct btrfs_extent_item *item;
1648         u64 refs;
1649         int ret;
1650         int err = 0;
1651
1652         path = btrfs_alloc_path();
1653         if (!path)
1654                 return -ENOMEM;
1655
1656         path->reada = 1;
1657         path->leave_spinning = 1;
1658         /* this will set up the path even if it fails to insert the back ref */
1659         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1660                                            path, bytenr, num_bytes, parent,
1661                                            root_objectid, owner, offset,
1662                                            refs_to_add, extent_op);
1663         if (ret == 0)
1664                 goto out;
1665
1666         if (ret != -EAGAIN) {
1667                 err = ret;
1668                 goto out;
1669         }
1670
1671         leaf = path->nodes[0];
1672         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1673         refs = btrfs_extent_refs(leaf, item);
1674         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1675         if (extent_op)
1676                 __run_delayed_extent_op(extent_op, leaf, item);
1677
1678         btrfs_mark_buffer_dirty(leaf);
1679         btrfs_release_path(root->fs_info->extent_root, path);
1680
1681         path->reada = 1;
1682         path->leave_spinning = 1;
1683
1684         /* now insert the actual backref */
1685         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1686                                     path, bytenr, parent, root_objectid,
1687                                     owner, offset, refs_to_add);
1688         BUG_ON(ret);
1689 out:
1690         btrfs_free_path(path);
1691         return err;
1692 }
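
/*
 * The control flow above reduces to "try the compact inline form first,
 * spill to a separate keyed item on -EAGAIN".  A sketch of that shape with
 * hypothetical helpers (the stubs below just stand in for the two insert
 * paths):
 */
#define TOY_EAGAIN (-11)        /* symbolic stand-in for -EAGAIN */

static int toy_insert_inline(void) { return TOY_EAGAIN; }  /* stub: item full */
static int toy_insert_keyed(void)  { return 0; }           /* stub: always fits */

static int toy_add_backref(void)
{
        int ret = toy_insert_inline();

        if (ret != TOY_EAGAIN)
                return ret;             /* success, or a hard error */
        return toy_insert_keyed();      /* no inline room: own item instead */
}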
1693
1694 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1695                                 struct btrfs_root *root,
1696                                 struct btrfs_delayed_ref_node *node,
1697                                 struct btrfs_delayed_extent_op *extent_op,
1698                                 int insert_reserved)
1699 {
1700         int ret = 0;
1701         struct btrfs_delayed_data_ref *ref;
1702         struct btrfs_key ins;
1703         u64 parent = 0;
1704         u64 ref_root = 0;
1705         u64 flags = 0;
1706
1707         ins.objectid = node->bytenr;
1708         ins.offset = node->num_bytes;
1709         ins.type = BTRFS_EXTENT_ITEM_KEY;
1710
1711         ref = btrfs_delayed_node_to_data_ref(node);
1712         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1713                 parent = ref->parent;
1714         else
1715                 ref_root = ref->root;
1716
1717         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1718                 if (extent_op) {
1719                         BUG_ON(extent_op->update_key);
1720                         flags |= extent_op->flags_to_set;
1721                 }
1722                 ret = alloc_reserved_file_extent(trans, root,
1723                                                  parent, ref_root, flags,
1724                                                  ref->objectid, ref->offset,
1725                                                  &ins, node->ref_mod);
1726         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1727                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1728                                              node->num_bytes, parent,
1729                                              ref_root, ref->objectid,
1730                                              ref->offset, node->ref_mod,
1731                                              extent_op);
1732         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1733                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1734                                           node->num_bytes, parent,
1735                                           ref_root, ref->objectid,
1736                                           ref->offset, node->ref_mod,
1737                                           extent_op);
1738         } else {
1739                 BUG();
1740         }
1741         return ret;
1742 }
1743
1744 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1745                                     struct extent_buffer *leaf,
1746                                     struct btrfs_extent_item *ei)
1747 {
1748         u64 flags = btrfs_extent_flags(leaf, ei);
1749         if (extent_op->update_flags) {
1750                 flags |= extent_op->flags_to_set;
1751                 btrfs_set_extent_flags(leaf, ei, flags);
1752         }
1753
1754         if (extent_op->update_key) {
1755                 struct btrfs_tree_block_info *bi;
1756                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1757                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1758                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1759         }
1760 }
1761
1762 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1763                                  struct btrfs_root *root,
1764                                  struct btrfs_delayed_ref_node *node,
1765                                  struct btrfs_delayed_extent_op *extent_op)
1766 {
1767         struct btrfs_key key;
1768         struct btrfs_path *path;
1769         struct btrfs_extent_item *ei;
1770         struct extent_buffer *leaf;
1771         u32 item_size;
1772         int ret;
1773         int err = 0;
1774
1775         path = btrfs_alloc_path();
1776         if (!path)
1777                 return -ENOMEM;
1778
1779         key.objectid = node->bytenr;
1780         key.type = BTRFS_EXTENT_ITEM_KEY;
1781         key.offset = node->num_bytes;
1782
1783         path->reada = 1;
1784         path->leave_spinning = 1;
1785         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1786                                 path, 0, 1);
1787         if (ret < 0) {
1788                 err = ret;
1789                 goto out;
1790         }
1791         if (ret > 0) {
1792                 err = -EIO;
1793                 goto out;
1794         }
1795
1796         leaf = path->nodes[0];
1797         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1798 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1799         if (item_size < sizeof(*ei)) {
1800                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1801                                              path, (u64)-1, 0);
1802                 if (ret < 0) {
1803                         err = ret;
1804                         goto out;
1805                 }
1806                 leaf = path->nodes[0];
1807                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1808         }
1809 #endif
1810         BUG_ON(item_size < sizeof(*ei));
1811         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1812         __run_delayed_extent_op(extent_op, leaf, ei);
1813
1814         btrfs_mark_buffer_dirty(leaf);
1815 out:
1816         btrfs_free_path(path);
1817         return err;
1818 }
1819
1820 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1821                                 struct btrfs_root *root,
1822                                 struct btrfs_delayed_ref_node *node,
1823                                 struct btrfs_delayed_extent_op *extent_op,
1824                                 int insert_reserved)
1825 {
1826         int ret = 0;
1827         struct btrfs_delayed_tree_ref *ref;
1828         struct btrfs_key ins;
1829         u64 parent = 0;
1830         u64 ref_root = 0;
1831
1832         ins.objectid = node->bytenr;
1833         ins.offset = node->num_bytes;
1834         ins.type = BTRFS_EXTENT_ITEM_KEY;
1835
1836         ref = btrfs_delayed_node_to_tree_ref(node);
1837         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1838                 parent = ref->parent;
1839         else
1840                 ref_root = ref->root;
1841
1842         BUG_ON(node->ref_mod != 1);
1843         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1844                 BUG_ON(!extent_op || !extent_op->update_flags ||
1845                        !extent_op->update_key);
1846                 ret = alloc_reserved_tree_block(trans, root,
1847                                                 parent, ref_root,
1848                                                 extent_op->flags_to_set,
1849                                                 &extent_op->key,
1850                                                 ref->level, &ins);
1851         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1852                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1853                                              node->num_bytes, parent, ref_root,
1854                                              ref->level, 0, 1, extent_op);
1855         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1856                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1857                                           node->num_bytes, parent, ref_root,
1858                                           ref->level, 0, 1, extent_op);
1859         } else {
1860                 BUG();
1861         }
1862         return ret;
1863 }
1864
1865
1866 /* helper function to actually process a single delayed ref entry */
1867 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1868                                struct btrfs_root *root,
1869                                struct btrfs_delayed_ref_node *node,
1870                                struct btrfs_delayed_extent_op *extent_op,
1871                                int insert_reserved)
1872 {
1873         int ret;
1874         if (btrfs_delayed_ref_is_head(node)) {
1875                 struct btrfs_delayed_ref_head *head;
1876                 /*
1877                  * we've hit the end of the chain and we were supposed
1878                  * to insert this extent into the tree.  But it got
1879                  * deleted before we ever needed to insert it, so all
1880                  * we have to do is clean up the accounting.
1881                  */
1882                 BUG_ON(extent_op);
1883                 head = btrfs_delayed_node_to_head(node);
1884                 if (insert_reserved) {
1885                         int mark_free = 0;
1886                         struct extent_buffer *must_clean = NULL;
1887
1888                         ret = pin_down_bytes(trans, root, NULL,
1889                                              node->bytenr, node->num_bytes,
1890                                              head->is_data, 1, &must_clean);
1891                         if (ret > 0)
1892                                 mark_free = 1;
1893
1894                         if (must_clean) {
1895                                 clean_tree_block(NULL, root, must_clean);
1896                                 btrfs_tree_unlock(must_clean);
1897                                 free_extent_buffer(must_clean);
1898                         }
1899                         if (head->is_data) {
1900                                 ret = btrfs_del_csums(trans, root,
1901                                                       node->bytenr,
1902                                                       node->num_bytes);
1903                                 BUG_ON(ret);
1904                         }
1905                         if (mark_free) {
1906                                 ret = btrfs_free_reserved_extent(root,
1907                                                         node->bytenr,
1908                                                         node->num_bytes);
1909                                 BUG_ON(ret);
1910                         }
1911                 }
1912                 mutex_unlock(&head->mutex);
1913                 return 0;
1914         }
1915
1916         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1917             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1918                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
1919                                            insert_reserved);
1920         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1921                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1922                 ret = run_delayed_data_ref(trans, root, node, extent_op,
1923                                            insert_reserved);
1924         else
1925                 BUG();
1926         return ret;
1927 }
1928
1929 static noinline struct btrfs_delayed_ref_node *
1930 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1931 {
1932         struct rb_node *node;
1933         struct btrfs_delayed_ref_node *ref;
1934         int action = BTRFS_ADD_DELAYED_REF;
1935 again:
1936         /*
1937          * select delayed refs of type BTRFS_ADD_DELAYED_REF first.
1938          * this prevents the ref count from going down to zero while
1939          * there are still pending delayed refs.
1940          */
1941         node = rb_prev(&head->node.rb_node);
1942         while (1) {
1943                 if (!node)
1944                         break;
1945                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1946                                 rb_node);
1947                 if (ref->bytenr != head->node.bytenr)
1948                         break;
1949                 if (ref->action == action)
1950                         return ref;
1951                 node = rb_prev(node);
1952         }
1953         if (action == BTRFS_ADD_DELAYED_REF) {
1954                 action = BTRFS_DROP_DELAYED_REF;
1955                 goto again;
1956         }
1957         return NULL;
1958 }
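
/*
 * A compact model of the selection policy above, over a flat array instead
 * of the rbtree (types and encoding hypothetical): take ADD entries first,
 * and only once none remain fall back to DROP entries, so the ref count
 * never touches zero while adds are still queued.
 */
#include <stddef.h>

struct toy_ref { int action; };         /* 1 == add, 2 == drop, say */

static struct toy_ref *toy_select(struct toy_ref *refs, int nr)
{
        int action = 1;                 /* prefer adds */
        int i;
again:
        for (i = 0; i < nr; i++)
                if (refs[i].action == action)
                        return &refs[i];
        if (action == 1) {
                action = 2;             /* no adds left: allow drops */
                goto again;
        }
        return NULL;                    /* nothing left to run */
}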
1959
1960 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
1961                                        struct btrfs_root *root,
1962                                        struct list_head *cluster)
1963 {
1964         struct btrfs_delayed_ref_root *delayed_refs;
1965         struct btrfs_delayed_ref_node *ref;
1966         struct btrfs_delayed_ref_head *locked_ref = NULL;
1967         struct btrfs_delayed_extent_op *extent_op;
1968         int ret;
1969         int count = 0;
1970         int must_insert_reserved = 0;
1971
1972         delayed_refs = &trans->transaction->delayed_refs;
1973         while (1) {
1974                 if (!locked_ref) {
1975                         /* pick a new head ref from the cluster list */
1976                         if (list_empty(cluster))
1977                                 break;
1978
1979                         locked_ref = list_entry(cluster->next,
1980                                      struct btrfs_delayed_ref_head, cluster);
1981
1982                         /* grab the lock that says we are going to process
1983                          * all the refs for this head */
1984                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
1985
1986                         /*
1987                          * we may have dropped the spin lock to get the head
1988                          * mutex lock, and that might have given someone else
1989                          * time to free the head.  If that's true, it has been
1990                          * removed from our list and we can move on.
1991                          */
1992                         if (ret == -EAGAIN) {
1993                                 locked_ref = NULL;
1994                                 count++;
1995                                 continue;
1996                         }
1997                 }
1998
1999                 /*
2000                  * record the must_insert_reserved flag before we
2001                  * drop the spin lock.
2002                  */
2003                 must_insert_reserved = locked_ref->must_insert_reserved;
2004                 locked_ref->must_insert_reserved = 0;
2005
2006                 extent_op = locked_ref->extent_op;
2007                 locked_ref->extent_op = NULL;
2008
2009                 /*
2010                  * locked_ref is the head node, so we have to go one
2011                  * node back for any delayed ref updates
2012                  */
2013                 ref = select_delayed_ref(locked_ref);
2014                 if (!ref) {
2015                         /* All delayed refs have been processed.  Go ahead
2016                          * and send the head node to run_one_delayed_ref,
2017                          * so that any accounting fixes can happen.
2018                          */
2019                         ref = &locked_ref->node;
2020
2021                         if (extent_op && must_insert_reserved) {
2022                                 kfree(extent_op);
2023                                 extent_op = NULL;
2024                         }
2025
2026                         if (extent_op) {
2027                                 spin_unlock(&delayed_refs->lock);
2028
2029                                 ret = run_delayed_extent_op(trans, root,
2030                                                             ref, extent_op);
2031                                 BUG_ON(ret);
2032                                 kfree(extent_op);
2033
2034                                 cond_resched();
2035                                 spin_lock(&delayed_refs->lock);
2036                                 continue;
2037                         }
2038
2039                         list_del_init(&locked_ref->cluster);
2040                         locked_ref = NULL;
2041                 }
2042
2043                 ref->in_tree = 0;
2044                 rb_erase(&ref->rb_node, &delayed_refs->root);
2045                 delayed_refs->num_entries--;
2046
2047                 spin_unlock(&delayed_refs->lock);
2048
2049                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2050                                           must_insert_reserved);
2051                 BUG_ON(ret);
2052
2053                 btrfs_put_delayed_ref(ref);
2054                 kfree(extent_op);
2055                 count++;
2056
2057                 cond_resched();
2058                 spin_lock(&delayed_refs->lock);
2059         }
2060         return count;
2061 }
2062
2063 /*
2064  * this starts processing the delayed reference count updates and
2065  * extent insertions we have queued up so far.  count can be
2066  * 0, which means to process everything in the tree at the start
2067  * of the run (but not newly added entries), or it can be some target
2068  * number you'd like to process.
2069  */
2070 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2071                            struct btrfs_root *root, unsigned long count)
2072 {
2073         struct rb_node *node;
2074         struct btrfs_delayed_ref_root *delayed_refs;
2075         struct btrfs_delayed_ref_node *ref;
2076         struct list_head cluster;
2077         int ret;
2078         int run_all = count == (unsigned long)-1;
2079         int run_most = 0;
2080
2081         if (root == root->fs_info->extent_root)
2082                 root = root->fs_info->tree_root;
2083
2084         delayed_refs = &trans->transaction->delayed_refs;
2085         INIT_LIST_HEAD(&cluster);
2086 again:
2087         spin_lock(&delayed_refs->lock);
2088         if (count == 0) {
2089                 count = delayed_refs->num_entries * 2;
2090                 run_most = 1;
2091         }
2092         while (1) {
2093                 if (!(run_all || run_most) &&
2094                     delayed_refs->num_heads_ready < 64)
2095                         break;
2096
2097                 /*
2098                  * go find something we can process in the rbtree.  We start at
2099                  * the beginning of the tree, and then build a cluster
2100                  * of refs to process starting at the first one we are able to
2101                  * lock
2102                  */
2103                 ret = btrfs_find_ref_cluster(trans, &cluster,
2104                                              delayed_refs->run_delayed_start);
2105                 if (ret)
2106                         break;
2107
2108                 ret = run_clustered_refs(trans, root, &cluster);
2109                 BUG_ON(ret < 0);
2110
2111                 count -= min_t(unsigned long, ret, count);
2112
2113                 if (count == 0)
2114                         break;
2115         }
2116
2117         if (run_all) {
2118                 node = rb_first(&delayed_refs->root);
2119                 if (!node)
2120                         goto out;
2121                 count = (unsigned long)-1;
2122
2123                 while (node) {
2124                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2125                                        rb_node);
2126                         if (btrfs_delayed_ref_is_head(ref)) {
2127                                 struct btrfs_delayed_ref_head *head;
2128
2129                                 head = btrfs_delayed_node_to_head(ref);
2130                                 atomic_inc(&ref->refs);
2131
2132                                 spin_unlock(&delayed_refs->lock);
2133                                 mutex_lock(&head->mutex);
2134                                 mutex_unlock(&head->mutex);
2135
2136                                 btrfs_put_delayed_ref(ref);
2137                                 cond_resched();
2138                                 goto again;
2139                         }
2140                         node = rb_next(node);
2141                 }
2142                 spin_unlock(&delayed_refs->lock);
2143                 schedule_timeout(1);
2144                 goto again;
2145         }
2146 out:
2147         spin_unlock(&delayed_refs->lock);
2148         return 0;
2149 }
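
/*
 * The `count` handling above is subtle enough to restate in isolation
 * (helper hypothetical): (unsigned long)-1 drains everything, including
 * refs queued while we run; 0 is converted into roughly the work queued
 * right now (num_entries * 2), so new arrivals are not chased forever;
 * anything else is a plain target.
 */
static unsigned long toy_pick_target(unsigned long count,
                                     unsigned long num_entries)
{
        if (count == (unsigned long)-1)
                return count;                   /* run absolutely everything */
        if (count == 0)
                return num_entries * 2;         /* snapshot of current work */
        return count;                           /* caller-chosen target */
}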
2150
2151 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2152                                 struct btrfs_root *root,
2153                                 u64 bytenr, u64 num_bytes, u64 flags,
2154                                 int is_data)
2155 {
2156         struct btrfs_delayed_extent_op *extent_op;
2157         int ret;
2158
2159         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2160         if (!extent_op)
2161                 return -ENOMEM;
2162
2163         extent_op->flags_to_set = flags;
2164         extent_op->update_flags = 1;
2165         extent_op->update_key = 0;
2166         extent_op->is_data = is_data ? 1 : 0;
2167
2168         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2169         if (ret)
2170                 kfree(extent_op);
2171         return ret;
2172 }
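
/*
 * The allocation pattern above deserves a note: on success the extent_op
 * is owned by the delayed-ref machinery and freed when the op runs, so it
 * is only kfree()d here on failure.  A userspace model of that ownership
 * handoff (queue stub hypothetical):
 */
#include <stdlib.h>

struct toy_op { unsigned long flags; };

static int toy_queue_op(struct toy_op *op)
{
        free(op);       /* stub: pretend the consumer ran and freed it */
        return 0;
}

static int toy_set_flags(unsigned long flags)
{
        struct toy_op *op = malloc(sizeof(*op));
        int ret;

        if (!op)
                return -1;              /* the -ENOMEM analogue */
        op->flags = flags;
        ret = toy_queue_op(op);         /* ownership passes on success */
        if (ret)
                free(op);               /* we still own it on failure */
        return ret;
}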
2173
2174 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2175                                       struct btrfs_root *root,
2176                                       struct btrfs_path *path,
2177                                       u64 objectid, u64 offset, u64 bytenr)
2178 {
2179         struct btrfs_delayed_ref_head *head;
2180         struct btrfs_delayed_ref_node *ref;
2181         struct btrfs_delayed_data_ref *data_ref;
2182         struct btrfs_delayed_ref_root *delayed_refs;
2183         struct rb_node *node;
2184         int ret = 0;
2185
2186         ret = -ENOENT;
2187         delayed_refs = &trans->transaction->delayed_refs;
2188         spin_lock(&delayed_refs->lock);
2189         head = btrfs_find_delayed_ref_head(trans, bytenr);
2190         if (!head)
2191                 goto out;
2192
2193         if (!mutex_trylock(&head->mutex)) {
2194                 atomic_inc(&head->node.refs);
2195                 spin_unlock(&delayed_refs->lock);
2196
2197                 btrfs_release_path(root->fs_info->extent_root, path);
2198
2199                 mutex_lock(&head->mutex);
2200                 mutex_unlock(&head->mutex);
2201                 btrfs_put_delayed_ref(&head->node);
2202                 return -EAGAIN;
2203         }
2204
2205         node = rb_prev(&head->node.rb_node);
2206         if (!node)
2207                 goto out_unlock;
2208
2209         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2210
2211         if (ref->bytenr != bytenr)
2212                 goto out_unlock;
2213
2214         ret = 1;
2215         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2216                 goto out_unlock;
2217
2218         data_ref = btrfs_delayed_node_to_data_ref(ref);
2219
2220         node = rb_prev(node);
2221         if (node) {
2222                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2223                 if (ref->bytenr == bytenr)
2224                         goto out_unlock;
2225         }
2226
2227         if (data_ref->root != root->root_key.objectid ||
2228             data_ref->objectid != objectid || data_ref->offset != offset)
2229                 goto out_unlock;
2230
2231         ret = 0;
2232 out_unlock:
2233         mutex_unlock(&head->mutex);
2234 out:
2235         spin_unlock(&delayed_refs->lock);
2236         return ret;
2237 }
2238
2239 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2240                                         struct btrfs_root *root,
2241                                         struct btrfs_path *path,
2242                                         u64 objectid, u64 offset, u64 bytenr)
2243 {
2244         struct btrfs_root *extent_root = root->fs_info->extent_root;
2245         struct extent_buffer *leaf;
2246         struct btrfs_extent_data_ref *ref;
2247         struct btrfs_extent_inline_ref *iref;
2248         struct btrfs_extent_item *ei;
2249         struct btrfs_key key;
2250         u32 item_size;
2251         int ret;
2252
2253         key.objectid = bytenr;
2254         key.offset = (u64)-1;
2255         key.type = BTRFS_EXTENT_ITEM_KEY;
2256
2257         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2258         if (ret < 0)
2259                 goto out;
2260         BUG_ON(ret == 0);
2261
2262         ret = -ENOENT;
2263         if (path->slots[0] == 0)
2264                 goto out;
2265
2266         path->slots[0]--;
2267         leaf = path->nodes[0];
2268         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2269
2270         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2271                 goto out;
2272
2273         ret = 1;
2274         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2275 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2276         if (item_size < sizeof(*ei)) {
2277                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2278                 goto out;
2279         }
2280 #endif
2281         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2282
2283         if (item_size != sizeof(*ei) +
2284             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2285                 goto out;
2286
2287         if (btrfs_extent_generation(leaf, ei) <=
2288             btrfs_root_last_snapshot(&root->root_item))
2289                 goto out;
2290
2291         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2292         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2293             BTRFS_EXTENT_DATA_REF_KEY)
2294                 goto out;
2295
2296         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2297         if (btrfs_extent_refs(leaf, ei) !=
2298             btrfs_extent_data_ref_count(leaf, ref) ||
2299             btrfs_extent_data_ref_root(leaf, ref) !=
2300             root->root_key.objectid ||
2301             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2302             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2303                 goto out;
2304
2305         ret = 0;
2306 out:
2307         return ret;
2308 }
2309
2310 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2311                           struct btrfs_root *root,
2312                           u64 objectid, u64 offset, u64 bytenr)
2313 {
2314         struct btrfs_path *path;
2315         int ret;
2316         int ret2;
2317
2318         path = btrfs_alloc_path();
2319         if (!path)
2320                 return -ENOENT;
2321
2322         do {
2323                 ret = check_committed_ref(trans, root, path, objectid,
2324                                           offset, bytenr);
2325                 if (ret && ret != -ENOENT)
2326                         goto out;
2327
2328                 ret2 = check_delayed_ref(trans, root, path, objectid,
2329                                          offset, bytenr);
2330         } while (ret2 == -EAGAIN);
2331
2332         if (ret2 && ret2 != -ENOENT) {
2333                 ret = ret2;
2334                 goto out;
2335         }
2336
2337         if (ret != -ENOENT || ret2 != -ENOENT)
2338                 ret = 0;
2339 out:
2340         btrfs_free_path(path);
2341         return ret;
2342 }
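
/*
 * Reduced to its shape, the function above combines an on-disk check with
 * an in-memory check and restarts both whenever the delayed check loses a
 * lock race (-EAGAIN).  A sketch with symbolic error values and stub
 * helpers (both hypothetical):
 */
#define TOY_ENOENT (-2)         /* stand-in for -ENOENT */
#define TOY_RETRY  (-11)        /* stand-in for -EAGAIN */

static int toy_check_committed(void) { return TOY_ENOENT; }  /* stub */
static int toy_check_delayed(void)   { return TOY_ENOENT; }  /* stub */

static int toy_cross_ref_exist(void)
{
        int ret, ret2;

        do {
                ret = toy_check_committed();
                if (ret && ret != TOY_ENOENT)
                        return ret;     /* hard error, or a ref was found */
                ret2 = toy_check_delayed();
        } while (ret2 == TOY_RETRY);    /* mutex was contended: redo both */

        if (ret2 && ret2 != TOY_ENOENT)
                return ret2;
        /* mirror the original: collapse to 0 unless both sides said -ENOENT */
        return (ret != TOY_ENOENT || ret2 != TOY_ENOENT) ? 0 : TOY_ENOENT;
}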
2343
2344 #if 0
2345 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2346                     struct extent_buffer *buf, u32 nr_extents)
2347 {
2348         struct btrfs_key key;
2349         struct btrfs_file_extent_item *fi;
2350         u64 root_gen;
2351         u32 nritems;
2352         int i;
2353         int level;
2354         int ret = 0;
2355         int shared = 0;
2356
2357         if (!root->ref_cows)
2358                 return 0;
2359
2360         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2361                 shared = 0;
2362                 root_gen = root->root_key.offset;
2363         } else {
2364                 shared = 1;
2365                 root_gen = trans->transid - 1;
2366         }
2367
2368         level = btrfs_header_level(buf);
2369         nritems = btrfs_header_nritems(buf);
2370
2371         if (level == 0) {
2372                 struct btrfs_leaf_ref *ref;
2373                 struct btrfs_extent_info *info;
2374
2375                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2376                 if (!ref) {
2377                         ret = -ENOMEM;
2378                         goto out;
2379                 }
2380
2381                 ref->root_gen = root_gen;
2382                 ref->bytenr = buf->start;
2383                 ref->owner = btrfs_header_owner(buf);
2384                 ref->generation = btrfs_header_generation(buf);
2385                 ref->nritems = nr_extents;
2386                 info = ref->extents;
2387
2388                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2389                         u64 disk_bytenr;
2390                         btrfs_item_key_to_cpu(buf, &key, i);
2391                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2392                                 continue;
2393                         fi = btrfs_item_ptr(buf, i,
2394                                             struct btrfs_file_extent_item);
2395                         if (btrfs_file_extent_type(buf, fi) ==
2396                             BTRFS_FILE_EXTENT_INLINE)
2397                                 continue;
2398                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2399                         if (disk_bytenr == 0)
2400                                 continue;
2401
2402                         info->bytenr = disk_bytenr;
2403                         info->num_bytes =
2404                                 btrfs_file_extent_disk_num_bytes(buf, fi);
2405                         info->objectid = key.objectid;
2406                         info->offset = key.offset;
2407                         info++;
2408                 }
2409
2410                 ret = btrfs_add_leaf_ref(root, ref, shared);
2411                 if (ret == -EEXIST && shared) {
2412                         struct btrfs_leaf_ref *old;
2413                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2414                         BUG_ON(!old);
2415                         btrfs_remove_leaf_ref(root, old);
2416                         btrfs_free_leaf_ref(root, old);
2417                         ret = btrfs_add_leaf_ref(root, ref, shared);
2418                 }
2419                 WARN_ON(ret);
2420                 btrfs_free_leaf_ref(root, ref);
2421         }
2422 out:
2423         return ret;
2424 }
2425
2426 /* when a block goes through cow, we update the reference counts of
2427  * everything that block points to.  The internal pointers of the block
2428  * can be in just about any order, and it is likely to have clusters of
2429  * things that are close together and clusters of things that are not.
2430  *
2431  * To help reduce the seeks that come with updating all of these reference
2432  * counts, sort them by byte number before actual updates are done.
2433  *
2434  * struct refsort is used to match byte number to slot in the btree block.
2435  * we sort based on the byte number and then use the slot to actually
2436  * find the item.
2437  *
2438  * struct refsort is smaller than struct btrfs_item and smaller than
2439  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2440  * for a btree block, there's no way for a kmalloc of refsorts for a
2441  * single node to be bigger than a page.
2442  */
2443 struct refsort {
2444         u64 bytenr;
2445         u32 slot;
2446 };
2447
2448 /*
2449  * for passing into sort()
2450  */
2451 static int refsort_cmp(const void *a_void, const void *b_void)
2452 {
2453         const struct refsort *a = a_void;
2454         const struct refsort *b = b_void;
2455
2456         if (a->bytenr < b->bytenr)
2457                 return -1;
2458         if (a->bytenr > b->bytenr)
2459                 return 1;
2460         return 0;
2461 }
2462 #endif
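
/*
 * Although the refsort code above is compiled out, the idea is easy to
 * exercise in userspace with qsort() (data hypothetical): sort the
 * (bytenr, slot) pairs by byte number, then walk the slots in that order
 * so the reference updates only ever seek forward on disk.
 */
#include <stdlib.h>

struct toy_refsort {
        unsigned long long bytenr;
        unsigned int slot;
};

static int toy_refsort_cmp(const void *a_void, const void *b_void)
{
        const struct toy_refsort *a = a_void;
        const struct toy_refsort *b = b_void;

        if (a->bytenr < b->bytenr)
                return -1;
        return a->bytenr > b->bytenr;
}

static void toy_update_in_disk_order(struct toy_refsort *refs, int nr)
{
        int i;

        qsort(refs, nr, sizeof(*refs), toy_refsort_cmp);
        for (i = 0; i < nr; i++) {
                /* process slot refs[i].slot; bytenr now ascends */
        }
}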
2463
2464 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2465                            struct btrfs_root *root,
2466                            struct extent_buffer *buf,
2467                            int full_backref, int inc)
2468 {
2469         u64 bytenr;
2470         u64 num_bytes;
2471         u64 parent;
2472         u64 ref_root;
2473         u32 nritems;
2474         struct btrfs_key key;
2475         struct btrfs_file_extent_item *fi;
2476         int i;
2477         int level;
2478         int ret = 0;
2479         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2480                             u64, u64, u64, u64, u64, u64);
2481
2482         ref_root = btrfs_header_owner(buf);
2483         nritems = btrfs_header_nritems(buf);
2484         level = btrfs_header_level(buf);
2485
2486         if (!root->ref_cows && level == 0)
2487                 return 0;
2488
2489         if (inc)
2490                 process_func = btrfs_inc_extent_ref;
2491         else
2492                 process_func = btrfs_free_extent;
2493
2494         if (full_backref)
2495                 parent = buf->start;
2496         else
2497                 parent = 0;
2498
2499         for (i = 0; i < nritems; i++) {
2500                 if (level == 0) {
2501                         btrfs_item_key_to_cpu(buf, &key, i);
2502                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2503                                 continue;
2504                         fi = btrfs_item_ptr(buf, i,
2505                                             struct btrfs_file_extent_item);
2506                         if (btrfs_file_extent_type(buf, fi) ==
2507                             BTRFS_FILE_EXTENT_INLINE)
2508                                 continue;
2509                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2510                         if (bytenr == 0)
2511                                 continue;
2512
2513                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2514                         key.offset -= btrfs_file_extent_offset(buf, fi);
2515                         ret = process_func(trans, root, bytenr, num_bytes,
2516                                            parent, ref_root, key.objectid,
2517                                            key.offset);
2518                         if (ret)
2519                                 goto fail;
2520                 } else {
2521                         bytenr = btrfs_node_blockptr(buf, i);
2522                         num_bytes = btrfs_level_size(root, level - 1);
2523                         ret = process_func(trans, root, bytenr, num_bytes,
2524                                            parent, ref_root, level - 1, 0);
2525                         if (ret)
2526                                 goto fail;
2527                 }
2528         }
2529         return 0;
2530 fail:
2531         BUG();
2532         return ret;
2533 }
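
/*
 * Note the dispatch trick above: one tree walk serves both the increment
 * and decrement paths by binding process_func to btrfs_inc_extent_ref or
 * btrfs_free_extent up front.  The shape, reduced to a toy (names
 * hypothetical):
 */
static int toy_inc_ref(unsigned long long bytenr) { (void)bytenr; return 0; }
static int toy_dec_ref(unsigned long long bytenr) { (void)bytenr; return 0; }

static int toy_mod_refs(const unsigned long long *bytenrs, int nr, int inc)
{
        int (*process)(unsigned long long) = inc ? toy_inc_ref : toy_dec_ref;
        int i, ret;

        for (i = 0; i < nr; i++) {
                ret = process(bytenrs[i]);      /* same walk, either action */
                if (ret)
                        return ret;
        }
        return 0;
}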
2534
2535 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2536                   struct extent_buffer *buf, int full_backref)
2537 {
2538         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2539 }
2540
2541 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2542                   struct extent_buffer *buf, int full_backref)
2543 {
2544         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2545 }
2546
2547 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2548                                  struct btrfs_root *root,
2549                                  struct btrfs_path *path,
2550                                  struct btrfs_block_group_cache *cache)
2551 {
2552         int ret;
2553         struct btrfs_root *extent_root = root->fs_info->extent_root;
2554         unsigned long bi;
2555         struct extent_buffer *leaf;
2556
2557         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2558         if (ret < 0)
2559                 goto fail;
2560         BUG_ON(ret);
2561
2562         leaf = path->nodes[0];
2563         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2564         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2565         btrfs_mark_buffer_dirty(leaf);
2566         btrfs_release_path(extent_root, path);
2567 fail:
2568         if (ret)
2569                 return ret;
2570         return 0;
2571
2572 }
2573
2574 static struct btrfs_block_group_cache *
2575 next_block_group(struct btrfs_root *root,
2576                  struct btrfs_block_group_cache *cache)
2577 {
2578         struct rb_node *node;
2579         spin_lock(&root->fs_info->block_group_cache_lock);
2580         node = rb_next(&cache->cache_node);
2581         btrfs_put_block_group(cache);
2582         if (node) {
2583                 cache = rb_entry(node, struct btrfs_block_group_cache,
2584                                  cache_node);
2585                 atomic_inc(&cache->count);
2586         } else
2587                 cache = NULL;
2588         spin_unlock(&root->fs_info->block_group_cache_lock);
2589         return cache;
2590 }
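
/*
 * next_block_group() is the classic refcounted-iterator handoff: under the
 * cache lock it drops the reference on the current group and pins the
 * successor, so callers can advance with `cache = next_block_group(root,
 * cache)` and never hold a stale pointer.  A generic model (locking and
 * refcounting hypothetical):
 */
struct toy_node {
        struct toy_node *next;
        int refs;
};

static void toy_lock(void)   { /* stands in for block_group_cache_lock */ }
static void toy_unlock(void) { }

static struct toy_node *toy_next(struct toy_node *cur)
{
        struct toy_node *nxt;

        toy_lock();
        nxt = cur->next;
        cur->refs--;            /* drop our pin on the old node */
        if (nxt)
                nxt->refs++;    /* pin the new one before unlocking */
        toy_unlock();
        return nxt;
}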
2591
2592 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2593                                    struct btrfs_root *root)
2594 {
2595         struct btrfs_block_group_cache *cache;
2596         int err = 0;
2597         struct btrfs_path *path;
2598         u64 last = 0;
2599
2600         path = btrfs_alloc_path();
2601         if (!path)
2602                 return -ENOMEM;
2603
2604         while (1) {
2605                 if (last == 0) {
2606                         err = btrfs_run_delayed_refs(trans, root,
2607                                                      (unsigned long)-1);
2608                         BUG_ON(err);
2609                 }
2610
2611                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2612                 while (cache) {
2613                         if (cache->dirty)
2614                                 break;
2615                         cache = next_block_group(root, cache);
2616                 }
2617                 if (!cache) {
2618                         if (last == 0)
2619                                 break;
2620                         last = 0;
2621                         continue;
2622                 }
2623
2624                 cache->dirty = 0;
2625                 last = cache->key.objectid + cache->key.offset;
2626
2627                 err = write_one_cache_group(trans, root, path, cache);
2628                 BUG_ON(err);
2629                 btrfs_put_block_group(cache);
2630         }
2631
2632         btrfs_free_path(path);
2633         return 0;
2634 }
2635
2636 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2637 {
2638         struct btrfs_block_group_cache *block_group;
2639         int readonly = 0;
2640
2641         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2642         if (!block_group || block_group->ro)
2643                 readonly = 1;
2644         if (block_group)
2645                 btrfs_put_block_group(block_group);
2646         return readonly;
2647 }
2648
2649 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2650                              u64 total_bytes, u64 bytes_used,
2651                              struct btrfs_space_info **space_info)
2652 {
2653         struct btrfs_space_info *found;
2654
2655         found = __find_space_info(info, flags);
2656         if (found) {
2657                 spin_lock(&found->lock);
2658                 found->total_bytes += total_bytes;
2659                 found->bytes_used += bytes_used;
2660                 found->full = 0;
2661                 spin_unlock(&found->lock);
2662                 *space_info = found;
2663                 return 0;
2664         }
2665         found = kzalloc(sizeof(*found), GFP_NOFS);
2666         if (!found)
2667                 return -ENOMEM;
2668
2669         INIT_LIST_HEAD(&found->block_groups);
2670         init_rwsem(&found->groups_sem);
2671         spin_lock_init(&found->lock);
2672         found->flags = flags;
2673         found->total_bytes = total_bytes;
2674         found->bytes_used = bytes_used;
2675         found->bytes_pinned = 0;
2676         found->bytes_reserved = 0;
2677         found->bytes_readonly = 0;
2678         found->bytes_delalloc = 0;
2679         found->full = 0;
2680         found->force_alloc = 0;
2681         *space_info = found;
2682         list_add_rcu(&found->list, &info->space_info);
2683         atomic_set(&found->caching_threads, 0);
2684         return 0;
2685 }
2686
2687 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2688 {
2689         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2690                                    BTRFS_BLOCK_GROUP_RAID1 |
2691                                    BTRFS_BLOCK_GROUP_RAID10 |
2692                                    BTRFS_BLOCK_GROUP_DUP);
2693         if (extra_flags) {
2694                 if (flags & BTRFS_BLOCK_GROUP_DATA)
2695                         fs_info->avail_data_alloc_bits |= extra_flags;
2696                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
2697                         fs_info->avail_metadata_alloc_bits |= extra_flags;
2698                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2699                         fs_info->avail_system_alloc_bits |= extra_flags;
2700         }
2701 }
2702
2703 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2704 {
2705         spin_lock(&cache->space_info->lock);
2706         spin_lock(&cache->lock);
2707         if (!cache->ro) {
2708                 cache->space_info->bytes_readonly += cache->key.offset -
2709                                         btrfs_block_group_used(&cache->item);
2710                 cache->ro = 1;
2711         }
2712         spin_unlock(&cache->lock);
2713         spin_unlock(&cache->space_info->lock);
2714 }
2715
2716 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2717 {
2718         u64 num_devices = root->fs_info->fs_devices->rw_devices;
2719
2720         if (num_devices == 1)
2721                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2722         if (num_devices < 4)
2723                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2724
2725         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2726             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2727                       BTRFS_BLOCK_GROUP_RAID10))) {
2728                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
2729         }
2730
2731         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2732             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2733                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2734         }
2735
2736         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2737             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2738                       BTRFS_BLOCK_GROUP_RAID10 |
2739                       BTRFS_BLOCK_GROUP_DUP)))
2740                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2741         return flags;
2742 }
2743
2744 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2745 {
2746         struct btrfs_fs_info *info = root->fs_info;
2747         u64 alloc_profile;
2748
2749         if (data) {
2750                 alloc_profile = info->avail_data_alloc_bits &
2751                         info->data_alloc_profile;
2752                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2753         } else if (root == root->fs_info->chunk_root) {
2754                 alloc_profile = info->avail_system_alloc_bits &
2755                         info->system_alloc_profile;
2756                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2757         } else {
2758                 alloc_profile = info->avail_metadata_alloc_bits &
2759                         info->metadata_alloc_profile;
2760                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2761         }
2762
2763         return btrfs_reduce_alloc_profile(root, data);
2764 }
2765
2766 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2767 {
2768         u64 alloc_target;
2769
2770         alloc_target = btrfs_get_alloc_profile(root, 1);
2771         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2772                                                        alloc_target);
2773 }
2774
2775 static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2776 {
2777         u64 num_bytes;
2778         int level;
2779
2780         level = BTRFS_MAX_LEVEL - 2;
2781         /*
2782          * NOTE: these calculations are absolutely the worst possible case.
2783          * This assumes that _every_ item we insert will require a new leaf, and
2784          * that the tree has grown to its maximum level size.
2785          */
2786
2787         /*
2788          * for every item we insert we could insert both an extent item and an
2789          * extent ref item.  Then for every item we insert, we will need to cow
2790          * the original leaf, plus the leaves to the left and right of it.
2791          *
2792          * Unless we are talking about the extent root, then we just want the
2793          * number of items * 2, since we just need the extent item plus its ref.
2794          */
2795         if (root == root->fs_info->extent_root)
2796                 num_bytes = num_items * 2;
2797         else
2798                 num_bytes = (num_items + (2 * num_items)) * 3;
2799
2800         /*
2801          * num_bytes is total number of leaves we could need times the leaf
2802          * size, and then for every leaf we could end up cow'ing 2 nodes per
2803          * level, down to the leaf level.
2804          */
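        /*
         * worked example (assuming 4K leaf and node sizes): for one item
         * on a non-extent root we get (1 + 2) * 3 = 9 leaves, and with
         * level = BTRFS_MAX_LEVEL - 2 = 6 that comes to
         * 9 * 4096 + 9 * (6 * 2) * 4096 = 479232 bytes reserved.
         */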
2805         num_bytes = (num_bytes * root->leafsize) +
2806                 (num_bytes * (level * 2)) * root->nodesize;
2807
2808         return num_bytes;
2809 }
2810
2811 /*
2812  * Unreserve metadata space for delalloc.  If the inode has no more reserved
2813  * extents than outstanding extents, this function does nothing.
2814  */
2815 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2816                                           struct inode *inode, int num_items)
2817 {
2818         struct btrfs_fs_info *info = root->fs_info;
2819         struct btrfs_space_info *meta_sinfo;
2820         u64 num_bytes;
2821         u64 alloc_target;
2822         bool bug = false;
2823
2824         /* get the space info for where the metadata will live */
2825         alloc_target = btrfs_get_alloc_profile(root, 0);
2826         meta_sinfo = __find_space_info(info, alloc_target);
2827
2828         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2829                                            num_items);
2830
2831         spin_lock(&meta_sinfo->lock);
2832         spin_lock(&BTRFS_I(inode)->accounting_lock);
2833         if (BTRFS_I(inode)->reserved_extents <=
2834             BTRFS_I(inode)->outstanding_extents) {
2835                 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2836                 spin_unlock(&meta_sinfo->lock);
2837                 return 0;
2838         }
2839         spin_unlock(&BTRFS_I(inode)->accounting_lock);
2840
2841         BTRFS_I(inode)->reserved_extents--;
2842         BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2843
2844         if (meta_sinfo->bytes_delalloc < num_bytes) {
2845                 bug = true;
2846                 meta_sinfo->bytes_delalloc = 0;
2847         } else {
2848                 meta_sinfo->bytes_delalloc -= num_bytes;
2849         }
2850         spin_unlock(&meta_sinfo->lock);
2851
2852         BUG_ON(bug);
2853
2854         return 0;
2855 }
2856
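/*
 * decide whether delalloc reservations should force writeback first.
 * Once delalloc reservations consume 80% or more of the metadata space
 * not otherwise accounted for, set force_delalloc so that callers flush
 * dirty pages before taking new reservations.
 */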
2857 static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2858 {
2859         u64 thresh;
2860
2861         thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2862                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2863                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2864                 meta_sinfo->bytes_may_use;
2865
2866         thresh = meta_sinfo->total_bytes - thresh;
2867         thresh *= 80;
2868         do_div(thresh, 100);
2869         if (thresh <= meta_sinfo->bytes_delalloc)
2870                 meta_sinfo->force_delalloc = 1;
2871         else
2872                 meta_sinfo->force_delalloc = 0;
2873 }
2874
2875 struct async_flush {
2876         struct btrfs_root *root;
2877         struct btrfs_space_info *info;
2878         struct btrfs_work work;
2879 };
2880
2881 static noinline void flush_delalloc_async(struct btrfs_work *work)
2882 {
2883         struct async_flush *async;
2884         struct btrfs_root *root;
2885         struct btrfs_space_info *info;
2886
2887         async = container_of(work, struct async_flush, work);
2888         root = async->root;
2889         info = async->info;
2890
2891         btrfs_start_delalloc_inodes(root, 0);
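        /*
         * wake wait_on_flush() waiters now so they can re-check the space
         * counters while we wait for the ordered extents to finish
         */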
2892         wake_up(&info->flush_wait);
2893         btrfs_wait_ordered_extents(root, 0, 0);
2894
2895         spin_lock(&info->lock);
2896         info->flushing = 0;
2897         spin_unlock(&info->lock);
2898         wake_up(&info->flush_wait);
2899
2900         kfree(async);
2901 }
2902
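/*
 * sleep until the async flusher finishes, or until enough space has been
 * freed that this space_info is no longer over-committed
 */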
2903 static void wait_on_flush(struct btrfs_space_info *info)
2904 {
2905         DEFINE_WAIT(wait);
2906         u64 used;
2907
2908         while (1) {
2909                 prepare_to_wait(&info->flush_wait, &wait,
2910                                 TASK_UNINTERRUPTIBLE);
2911                 spin_lock(&info->lock);
2912                 if (!info->flushing) {
2913                         spin_unlock(&info->lock);
2914                         break;
2915                 }
2916
2917                 used = info->bytes_used + info->bytes_reserved +
2918                         info->bytes_pinned + info->bytes_readonly +
2919                         info->bytes_super + info->bytes_root +
2920                         info->bytes_may_use + info->bytes_delalloc;
2921                 if (used < info->total_bytes) {
2922                         spin_unlock(&info->lock);
2923                         break;
2924                 }
2925                 spin_unlock(&info->lock);
2926                 schedule();
2927         }
2928         finish_wait(&info->flush_wait, &wait);
2929 }
2930
2931 static void flush_delalloc(struct btrfs_root *root,
2932                                  struct btrfs_space_info *info)
2933 {
2934         struct async_flush *async;
2935         bool wait = false;
2936
2937         spin_lock(&info->lock);
2938
2939         if (!info->flushing) {
2940                 info->flushing = 1;
2941                 init_waitqueue_head(&info->flush_wait);
2942         } else {
2943                 wait = true;
2944         }
2945
2946         spin_unlock(&info->lock);
2947
2948         if (wait) {
2949                 wait_on_flush(info);
2950                 return;
2951         }
2952
2953         async = kzalloc(sizeof(*async), GFP_NOFS);
2954         if (!async)
2955                 goto flush;
2956
2957         async->root = root;
2958         async->info = info;
2959         async->work.func = flush_delalloc_async;
2960
2961         btrfs_queue_worker(&root->fs_info->enospc_workers,
2962                            &async->work);
2963         wait_on_flush(info);
2964         return;
2965
2966 flush:
2967         btrfs_start_delalloc_inodes(root, 0);
2968         btrfs_wait_ordered_extents(root, 0, 0);
2969
2970         spin_lock(&info->lock);
2971         info->flushing = 0;
2972         spin_unlock(&info->lock);
2973         wake_up(&info->flush_wait);
2974 }
2975
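/*
 * called with info->lock held; the lock is always dropped before this
 * returns.  Returns 1 if more space may now be available (we allocated a
 * new chunk or waited for someone else to), in which case the caller
 * should retry its reservation, and 0 otherwise.
 */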
2976 static int maybe_allocate_chunk(struct btrfs_root *root,
2977                                  struct btrfs_space_info *info)
2978 {
2979         struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2980         struct btrfs_trans_handle *trans;
2981         bool wait = false;
2982         int ret = 0;
2983         u64 min_metadata;
2984         u64 free_space;
2985
2986         free_space = btrfs_super_total_bytes(disk_super);
2987         /*
2988          * we allow the metadata to grow to at most the smaller of 10GB or
2989          * 5% of the space in the volume.
2990          */
2991         min_metadata = min((u64)10 * 1024 * 1024 * 1024,
2992                              div64_u64(free_space * 5, 100));
2993         if (info->total_bytes >= min_metadata) {
2994                 spin_unlock(&info->lock);
2995                 return 0;
2996         }
2997
2998         if (info->full) {
2999                 spin_unlock(&info->lock);
3000                 return 0;
3001         }
3002
3003         if (!info->allocating_chunk) {
3004                 info->force_alloc = 1;
3005                 info->allocating_chunk = 1;
3006                 init_waitqueue_head(&info->allocate_wait);
3007         } else {
3008                 wait = true;
3009         }
3010
3011         spin_unlock(&info->lock);
3012
3013         if (wait) {
3014                 wait_event(info->allocate_wait,
3015                            !info->allocating_chunk);
3016                 return 1;
3017         }
3018
3019         trans = btrfs_start_transaction(root, 1);
3020         if (!trans) {
3021                 ret = -ENOMEM;
3022                 goto out;
3023         }
3024
3025         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3026                              4096 + 2 * 1024 * 1024,
3027                              info->flags, 0);
3028         btrfs_end_transaction(trans, root);
3031 out:
3032         spin_lock(&info->lock);
3033         info->allocating_chunk = 0;
3034         spin_unlock(&info->lock);
3035         wake_up(&info->allocate_wait);
3036
3037         if (ret)
3038                 return 0;
3039         return 1;
3040 }
3041
3042 /*
3043  * Reserve metadata space for delalloc.
3044  */
3045 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3046                                         struct inode *inode, int num_items)
3047 {
3048         struct btrfs_fs_info *info = root->fs_info;
3049         struct btrfs_space_info *meta_sinfo;
3050         u64 num_bytes;
3051         u64 used;
3052         u64 alloc_target;
3053         int flushed = 0;
3054         int force_delalloc;
3055
3056         /* get the space info for where the metadata will live */
3057         alloc_target = btrfs_get_alloc_profile(root, 0);
3058         meta_sinfo = __find_space_info(info, alloc_target);
3059
3060         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3061                                            num_items);
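        /*
         * if the reservation doesn't fit, we escalate one step per retry:
         * allocate a new metadata chunk, then flush this inode's dirty
         * pages, then flush all delalloc, and finally fail with -ENOSPC
         */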
3062 again:
3063         spin_lock(&meta_sinfo->lock);
3064
3065         force_delalloc = meta_sinfo->force_delalloc;
3066
3067         if (unlikely(!meta_sinfo->bytes_root))
3068                 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3069
3070         if (!flushed)
3071                 meta_sinfo->bytes_delalloc += num_bytes;
3072
3073         used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3074                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3075                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3076                 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3077
3078         if (used > meta_sinfo->total_bytes) {
3079                 flushed++;
3080
3081                 if (flushed == 1) {
3082                         if (maybe_allocate_chunk(root, meta_sinfo))
3083                                 goto again;
3084                         flushed++;
3085                 } else {
3086                         spin_unlock(&meta_sinfo->lock);
3087                 }
3088
3089                 if (flushed == 2) {
3090                         filemap_flush(inode->i_mapping);
3091                         goto again;
3092                 } else if (flushed == 3) {
3093                         flush_delalloc(root, meta_sinfo);
3094                         goto again;
3095                 }
3096                 spin_lock(&meta_sinfo->lock);
3097                 meta_sinfo->bytes_delalloc -= num_bytes;
3098                 spin_unlock(&meta_sinfo->lock);
3099                 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3100                        BTRFS_I(inode)->outstanding_extents,
3101                        BTRFS_I(inode)->reserved_extents);
3102                 dump_space_info(meta_sinfo, 0, 0);
3103                 return -ENOSPC;
3104         }
3105
3106         BTRFS_I(inode)->reserved_extents++;
3107         check_force_delalloc(meta_sinfo);
3108         spin_unlock(&meta_sinfo->lock);
3109
3110         if (!flushed && force_delalloc)
3111                 filemap_flush(inode->i_mapping);
3112
3113         return 0;
3114 }
3115
3116 /*
3117  * unreserve num_items number of items worth of metadata space.  This needs to
3118  * be paired with btrfs_reserve_metadata_space.
3119  *
3120  * NOTE: if you have the option, run this _AFTER_ you do a
3121  * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3122  * operations which will result in more used metadata, so we want to make sure we
3123  * can do that without issue.
3124  */
3125 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3126 {
3127         struct btrfs_fs_info *info = root->fs_info;
3128         struct btrfs_space_info *meta_sinfo;
3129         u64 num_bytes;
3130         u64 alloc_target;
3131         bool bug = false;
3132
3133         /* get the space info for where the metadata will live */
3134         alloc_target = btrfs_get_alloc_profile(root, 0);
3135         meta_sinfo = __find_space_info(info, alloc_target);
3136
3137         num_bytes = calculate_bytes_needed(root, num_items);
3138
3139         spin_lock(&meta_sinfo->lock);
3140         if (meta_sinfo->bytes_may_use < num_bytes) {
3141                 bug = true;
3142                 meta_sinfo->bytes_may_use = 0;
3143         } else {
3144                 meta_sinfo->bytes_may_use -= num_bytes;
3145         }
3146         spin_unlock(&meta_sinfo->lock);
3147
3148         BUG_ON(bug);
3149
3150         return 0;
3151 }
3152
3153 /*
3154  * Reserve some metadata space for use.  We'll calculate the worst case number
3155  * of bytes that would be needed to modify num_items number of items.  If we
3156  * have space, fantastic, if not, you get -ENOSPC.  Please call
3157  * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3158  * items you reserved, since whatever metadata you needed should have already
3159  * been allocated.
3160  *
3161  * This will try to allocate a new chunk and flush delalloc to make more space
3162  * if we don't have enough metadata space.  The only time we don't do this is if
3163  * we're reserving space inside of a transaction, then we will just return
3164  * -ENOSPC and it is the caller's responsibility to handle it properly.
3165  */
3166 int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3167 {
3168         struct btrfs_fs_info *info = root->fs_info;
3169         struct btrfs_space_info *meta_sinfo;
3170         u64 num_bytes;
3171         u64 used;
3172         u64 alloc_target;
3173         int retries = 0;
3174
3175         /* get the space info for where the metadata will live */
3176         alloc_target = btrfs_get_alloc_profile(root, 0);
3177         meta_sinfo = __find_space_info(info, alloc_target);
3178
3179         num_bytes = calculate_bytes_needed(root, num_items);
3180 again:
3181         spin_lock(&meta_sinfo->lock);
3182
3183         if (unlikely(!meta_sinfo->bytes_root))
3184                 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3185
3186         if (!retries)
3187                 meta_sinfo->bytes_may_use += num_bytes;
3188
3189         used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3190                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3191                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3192                 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3193
3194         if (used > meta_sinfo->total_bytes) {
3195                 retries++;
3196                 if (retries == 1) {
3197                         if (maybe_allocate_chunk(root, meta_sinfo))
3198                                 goto again;
3199                         retries++;
3200                 } else {
3201                         spin_unlock(&meta_sinfo->lock);
3202                 }
3203
3204                 if (retries == 2) {
3205                         flush_delalloc(root, meta_sinfo);
3206                         goto again;
3207                 }
3208                 spin_lock(&meta_sinfo->lock);
3209                 meta_sinfo->bytes_may_use -= num_bytes;
3210                 spin_unlock(&meta_sinfo->lock);
3211
3212                 dump_space_info(meta_sinfo, 0, 0);
3213                 return -ENOSPC;
3214         }
3215
3216         check_force_delalloc(meta_sinfo);
3217         spin_unlock(&meta_sinfo->lock);
3218
3219         return 0;
3220 }
3221
3222 /*
3223  * This will check the space that the inode allocates from to make sure we have
3224  * enough space for the requested number of bytes.
3225  */
3226 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3227                                 u64 bytes)
3228 {
3229         struct btrfs_space_info *data_sinfo;
3230         int ret = 0, committed = 0;
3231
3232         /* make sure bytes are sectorsize aligned */
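        /* e.g. with a 4K sectorsize, a 6000 byte request rounds up to 8192 */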
3233         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3234
3235         data_sinfo = BTRFS_I(inode)->space_info;
3236         if (!data_sinfo)
3237                 goto alloc;
3238
3239 again:
3240         /* make sure we have enough space to handle the data first */
3241         spin_lock(&data_sinfo->lock);
3242         if (data_sinfo->total_bytes - data_sinfo->bytes_used -
3243             data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
3244             data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
3245             data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
3246                 struct btrfs_trans_handle *trans;
3247
3248                 /*
3249                  * if we don't have enough free bytes in this space then we need
3250                  * to alloc a new chunk.
3251                  */
3252                 if (!data_sinfo->full) {
3253                         u64 alloc_target;
3254
3255                         data_sinfo->force_alloc = 1;
3256                         spin_unlock(&data_sinfo->lock);
3257 alloc:
3258                         alloc_target = btrfs_get_alloc_profile(root, 1);
3259                         trans = btrfs_start_transaction(root, 1);
3260                         if (!trans)
3261                                 return -ENOMEM;
3262
3263                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3264                                              bytes + 2 * 1024 * 1024,
3265                                              alloc_target, 0);
3266                         btrfs_end_transaction(trans, root);
3267                         if (ret)
3268                                 return ret;
3269
3270                         if (!data_sinfo) {
3271                                 btrfs_set_inode_space_info(root, inode);
3272                                 data_sinfo = BTRFS_I(inode)->space_info;
3273                         }
3274                         goto again;
3275                 }
3276                 spin_unlock(&data_sinfo->lock);
3277
3278                 /* commit the current transaction and try again */
3279                 if (!committed && !root->fs_info->open_ioctl_trans) {
3280                         committed = 1;
3281                         trans = btrfs_join_transaction(root, 1);
3282                         if (!trans)
3283                                 return -ENOMEM;
3284                         ret = btrfs_commit_transaction(trans, root);
3285                         if (ret)
3286                                 return ret;
3287                         goto again;
3288                 }
3289
3290                 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
3291                        ", %llu bytes_used, %llu bytes_reserved, "
3292                        "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
3293                        "%llu total\n", (unsigned long long)bytes,
3294                        (unsigned long long)data_sinfo->bytes_delalloc,
3295                        (unsigned long long)data_sinfo->bytes_used,
3296                        (unsigned long long)data_sinfo->bytes_reserved,
3297                        (unsigned long long)data_sinfo->bytes_pinned,
3298                        (unsigned long long)data_sinfo->bytes_readonly,
3299                        (unsigned long long)data_sinfo->bytes_may_use,
3300                        (unsigned long long)data_sinfo->total_bytes);
3301                 return -ENOSPC;
3302         }
3303         data_sinfo->bytes_may_use += bytes;
3304         BTRFS_I(inode)->reserved_bytes += bytes;
3305         spin_unlock(&data_sinfo->lock);
3306
3307         return 0;
3308 }
3309
3310 /*
3311  * if there was an error for whatever reason after calling
3312  * btrfs_check_data_free_space, call this so we can clean up the counters.
3313  */
3314 void btrfs_free_reserved_data_space(struct btrfs_root *root,
3315                                     struct inode *inode, u64 bytes)
3316 {
3317         struct btrfs_space_info *data_sinfo;
3318
3319         /* make sure bytes are sectorsize aligned */
3320         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3321
3322         data_sinfo = BTRFS_I(inode)->space_info;
3323         spin_lock(&data_sinfo->lock);
3324         data_sinfo->bytes_may_use -= bytes;
3325         BTRFS_I(inode)->reserved_bytes -= bytes;
3326         spin_unlock(&data_sinfo->lock);
3327 }
3328
3329 /* called when we are adding a delalloc extent to the inode's io_tree */
3330 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3331                                   u64 bytes)
3332 {
3333         struct btrfs_space_info *data_sinfo;
3334
3335         /* get the space info for where this inode will be storing its data */
3336         data_sinfo = BTRFS_I(inode)->space_info;
3337
3338         /* make sure we have enough space to handle the data first */
3339         spin_lock(&data_sinfo->lock);
3340         data_sinfo->bytes_delalloc += bytes;
3341
3342         /*
3343          * we are adding a delalloc extent without calling
3344          * btrfs_check_data_free_space first.  This happens on a weird
3345          * writepage condition, but shouldn't hurt our accounting
3346          */
3347         if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3348                 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3349                 BTRFS_I(inode)->reserved_bytes = 0;
3350         } else {
3351                 data_sinfo->bytes_may_use -= bytes;
3352                 BTRFS_I(inode)->reserved_bytes -= bytes;
3353         }
3354
3355         spin_unlock(&data_sinfo->lock);
3356 }
3357
3358 /* called when we are clearing a delalloc extent from the inode's io_tree */
3359 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3360                               u64 bytes)
3361 {
3362         struct btrfs_space_info *info;
3363
3364         info = BTRFS_I(inode)->space_info;
3365
3366         spin_lock(&info->lock);
3367         info->bytes_delalloc -= bytes;
3368         spin_unlock(&info->lock);
3369 }
3370
3371 static void force_metadata_allocation(struct btrfs_fs_info *info)
3372 {
3373         struct list_head *head = &info->space_info;
3374         struct btrfs_space_info *found;
3375
3376         rcu_read_lock();
3377         list_for_each_entry_rcu(found, head, list) {
3378                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3379                         found->force_alloc = 1;
3380         }
3381         rcu_read_unlock();
3382 }
3383
3384 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3385                           struct btrfs_root *extent_root, u64 alloc_bytes,
3386                           u64 flags, int force)
3387 {
3388         struct btrfs_space_info *space_info;
3389         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3390         u64 thresh;
3391         int ret = 0;
3392
3393         mutex_lock(&fs_info->chunk_mutex);
3394
3395         flags = btrfs_reduce_alloc_profile(extent_root, flags);
3396
3397         space_info = __find_space_info(extent_root->fs_info, flags);
3398         if (!space_info) {
3399                 ret = update_space_info(extent_root->fs_info, flags,
3400                                         0, 0, &space_info);
3401                 BUG_ON(ret);
3402         }
3403         BUG_ON(!space_info);
3404
3405         spin_lock(&space_info->lock);
3406         if (space_info->force_alloc)
3407                 force = 1;
3408         if (space_info->full) {
3409                 spin_unlock(&space_info->lock);
3410                 goto out;
3411         }
3412
3413         thresh = space_info->total_bytes - space_info->bytes_readonly;
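        /* div_factor(x, 8) is x * 8 / 10, so only allocate once ~80% is used */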
3414         thresh = div_factor(thresh, 8);
3415         if (!force &&
3416            (space_info->bytes_used + space_info->bytes_pinned +
3417             space_info->bytes_reserved + alloc_bytes) < thresh) {
3418                 spin_unlock(&space_info->lock);
3419                 goto out;
3420         }
3421         spin_unlock(&space_info->lock);
3422
3423         /*
3424          * if we're doing a data chunk, go ahead and make sure that
3425          * we keep a reasonable number of metadata chunks allocated in the
3426          * FS as well.
3427          */
3428         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3429                 fs_info->data_chunk_allocations++;
3430                 if (!(fs_info->data_chunk_allocations %
3431                       fs_info->metadata_ratio))
3432                         force_metadata_allocation(fs_info);
3433         }
3434
3435         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3436         spin_lock(&space_info->lock);
3437         if (ret)
3438                 space_info->full = 1;
3439         space_info->force_alloc = 0;
3440         spin_unlock(&space_info->lock);
3441 out:
3442         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3443         return ret;
3444 }
3445
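/*
 * adjust the accounting for num_bytes starting at bytenr being allocated
 * (alloc != 0) or freed.  The range may span block group boundaries;
 * freed space is handed back to the free space cache when mark_free is
 * set.
 */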
3446 static int update_block_group(struct btrfs_trans_handle *trans,
3447                               struct btrfs_root *root,
3448                               u64 bytenr, u64 num_bytes, int alloc,
3449                               int mark_free)
3450 {
3451         struct btrfs_block_group_cache *cache;
3452         struct btrfs_fs_info *info = root->fs_info;
3453         u64 total = num_bytes;
3454         u64 old_val;
3455         u64 byte_in_group;
3456
3457         /* block accounting for super block */
3458         spin_lock(&info->delalloc_lock);
3459         old_val = btrfs_super_bytes_used(&info->super_copy);
3460         if (alloc)
3461                 old_val += num_bytes;
3462         else
3463                 old_val -= num_bytes;
3464         btrfs_set_super_bytes_used(&info->super_copy, old_val);
3465         spin_unlock(&info->delalloc_lock);
3466
3467         while (total) {
3468                 cache = btrfs_lookup_block_group(info, bytenr);
3469                 if (!cache)
3470                         return -1;
3471                 byte_in_group = bytenr - cache->key.objectid;
3472                 WARN_ON(byte_in_group > cache->key.offset);
3473
3474                 spin_lock(&cache->space_info->lock);
3475                 spin_lock(&cache->lock);
3476                 cache->dirty = 1;
3477                 old_val = btrfs_block_group_used(&cache->item);
3478                 num_bytes = min(total, cache->key.offset - byte_in_group);
3479                 if (alloc) {
3480                         old_val += num_bytes;
3481                         btrfs_set_block_group_used(&cache->item, old_val);
3482                         cache->reserved -= num_bytes;
3483                         cache->space_info->bytes_used += num_bytes;
3484                         cache->space_info->bytes_reserved -= num_bytes;
3485                         if (cache->ro)
3486                                 cache->space_info->bytes_readonly -= num_bytes;
3487                         spin_unlock(&cache->lock);
3488                         spin_unlock(&cache->space_info->lock);
3489                 } else {
3490                         old_val -= num_bytes;
3491                         cache->space_info->bytes_used -= num_bytes;
3492                         if (cache->ro)
3493                                 cache->space_info->bytes_readonly += num_bytes;
3494                         btrfs_set_block_group_used(&cache->item, old_val);
3495                         spin_unlock(&cache->lock);
3496                         spin_unlock(&cache->space_info->lock);
3497                         if (mark_free) {
3498                                 int ret;
3499
3500                                 ret = btrfs_discard_extent(root, bytenr,
3501                                                            num_bytes);
3502                                 WARN_ON(ret);
3503
3504                                 ret = btrfs_add_free_space(cache, bytenr,
3505                                                            num_bytes);
3506                                 WARN_ON(ret);
3507                         }
3508                 }
3509                 btrfs_put_block_group(cache);
3510                 total -= num_bytes;
3511                 bytenr += num_bytes;
3512         }
3513         return 0;
3514 }
3515
3516 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3517 {
3518         struct btrfs_block_group_cache *cache;
3519         u64 bytenr;
3520
3521         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
3522         if (!cache)
3523                 return 0;
3524
3525         bytenr = cache->key.objectid;
3526         btrfs_put_block_group(cache);
3527
3528         return bytenr;
3529 }
3530
3531 /*
3532  * this function must be called within a transaction
3533  */
3534 int btrfs_pin_extent(struct btrfs_root *root,
3535                      u64 bytenr, u64 num_bytes, int reserved)
3536 {
3537         struct btrfs_fs_info *fs_info = root->fs_info;
3538         struct btrfs_block_group_cache *cache;
3539
3540         cache = btrfs_lookup_block_group(fs_info, bytenr);
3541         BUG_ON(!cache);
3542
3543         spin_lock(&cache->space_info->lock);
3544         spin_lock(&cache->lock);
3545         cache->pinned += num_bytes;
3546         cache->space_info->bytes_pinned += num_bytes;
3547         if (reserved) {
3548                 cache->reserved -= num_bytes;
3549                 cache->space_info->bytes_reserved -= num_bytes;
3550         }
3551         spin_unlock(&cache->lock);
3552         spin_unlock(&cache->space_info->lock);
3553
3554         btrfs_put_block_group(cache);
3555
3556         set_extent_dirty(fs_info->pinned_extents,
3557                          bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3558         return 0;
3559 }
3560
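/*
 * move num_bytes into (reserve != 0) or out of the reserved counters of a
 * block group and its space_info
 */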
3561 static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3562                                    u64 num_bytes, int reserve)
3563 {
3564         spin_lock(&cache->space_info->lock);
3565         spin_lock(&cache->lock);
3566         if (reserve) {
3567                 cache->reserved += num_bytes;
3568                 cache->space_info->bytes_reserved += num_bytes;
3569         } else {
3570                 cache->reserved -= num_bytes;
3571                 cache->space_info->bytes_reserved -= num_bytes;
3572         }
3573         spin_unlock(&cache->lock);
3574         spin_unlock(&cache->space_info->lock);
3575         return 0;
3576 }
3577
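
/*
 * about to commit: remember how far each caching block group has
 * progressed and flip fs_info->pinned_extents to the other freed_extents
 * tree, so the tree we filled during this transaction can be unpinned
 * once the commit is done
 */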
3578 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3579                                 struct btrfs_root *root)
3580 {
3581         struct btrfs_fs_info *fs_info = root->fs_info;
3582         struct btrfs_caching_control *next;
3583         struct btrfs_caching_control *caching_ctl;
3584         struct btrfs_block_group_cache *cache;
3585
3586         down_write(&fs_info->extent_commit_sem);
3587
3588         list_for_each_entry_safe(caching_ctl, next,
3589                                  &fs_info->caching_block_groups, list) {
3590                 cache = caching_ctl->block_group;
3591                 if (block_group_cache_done(cache)) {
3592                         cache->last_byte_to_unpin = (u64)-1;
3593                         list_del_init(&caching_ctl->list);
3594                         put_caching_control(caching_ctl);
3595                 } else {
3596                         cache->last_byte_to_unpin = caching_ctl->progress;
3597                 }
3598         }
3599
3600         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3601                 fs_info->pinned_extents = &fs_info->freed_extents[1];
3602         else
3603                 fs_info->pinned_extents = &fs_info->freed_extents[0];
3604
3605         up_write(&fs_info->extent_commit_sem);
3606         return 0;
3607 }
3608
3609 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3610 {
3611         struct btrfs_fs_info *fs_info = root->fs_info;
3612         struct btrfs_block_group_cache *cache = NULL;
3613         u64 len;
3614
3615         while (start <= end) {
3616                 if (!cache ||
3617                     start >= cache->key.objectid + cache->key.offset) {
3618                         if (cache)
3619                                 btrfs_put_block_group(cache);
3620                         cache = btrfs_lookup_block_group(fs_info, start);
3621                         BUG_ON(!cache);
3622                 }
3623
3624                 len = cache->key.objectid + cache->key.offset - start;
3625                 len = min(len, end + 1 - start);
3626
3627                 if (start < cache->last_byte_to_unpin) {
3628                         len = min(len, cache->last_byte_to_unpin - start);
3629                         btrfs_add_free_space(cache, start, len);
3630                 }
3631
3632                 spin_lock(&cache->space_info->lock);
3633                 spin_lock(&cache->lock);
3634                 cache->pinned -= len;
3635                 cache->space_info->bytes_pinned -= len;
3636                 spin_unlock(&cache->lock);
3637                 spin_unlock(&cache->space_info->lock);
3638
3639                 start += len;
3640         }
3641
3642         if (cache)
3643                 btrfs_put_block_group(cache);
3644         return 0;
3645 }
3646
3647 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3648                                struct btrfs_root *root)
3649 {
3650         struct btrfs_fs_info *fs_info = root->fs_info;
3651         struct extent_io_tree *unpin;
3652         u64 start;
3653         u64 end;
3654         int ret;
3655
3656         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3657                 unpin = &fs_info->freed_extents[1];
3658         else
3659                 unpin = &fs_info->freed_extents[0];
3660
3661         while (1) {
3662                 ret = find_first_extent_bit(unpin, 0, &start, &end,
3663                                             EXTENT_DIRTY);
3664                 if (ret)
3665                         break;
3666
3667                 ret = btrfs_discard_extent(root, start, end + 1 - start);
3668
3669                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3670                 unpin_extent_range(root, start, end);
3671                 cond_resched();
3672         }
3673
3674         return ret;
3675 }
3676
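/*
 * returns 1 and sets *must_clean if the buffer can simply be reused in
 * this transaction, or 0 after pinning the byte range instead
 */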
3677 static int pin_down_bytes(struct btrfs_trans_handle *trans,
3678                           struct btrfs_root *root,
3679                           struct btrfs_path *path,
3680                           u64 bytenr, u64 num_bytes,
3681                           int is_data, int reserved,
3682                           struct extent_buffer **must_clean)
3683 {
3685         struct extent_buffer *buf;
3686
3687         if (is_data)
3688                 goto pinit;
3689
3690         /*
3691          * discard is sloooow, and so triggering discards on
3692          * individual btree blocks isn't a good plan.  Just
3693          * pin everything in discard mode.
3694          */
3695         if (btrfs_test_opt(root, DISCARD))
3696                 goto pinit;
3697
3698         buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3699         if (!buf)
3700                 goto pinit;
3701
3702         /* we can reuse a block if it hasn't been written
3703          * and it is from this transaction.  We can't
3704          * reuse anything from the tree log root because
3705          * it has tiny sub-transactions.
3706          */
3707         if (btrfs_buffer_uptodate(buf, 0) &&
3708             btrfs_try_tree_lock(buf)) {
3709                 u64 header_owner = btrfs_header_owner(buf);
3710                 u64 header_transid = btrfs_header_generation(buf);
3711                 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3712                     header_transid == trans->transid &&
3713                     !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3714                         *must_clean = buf;
3715                         return 1;
3716                 }
3717                 btrfs_tree_unlock(buf);
3718         }
3719         free_extent_buffer(buf);
3720 pinit:
3721         if (path)
3722                 btrfs_set_path_blocking(path);
3723         /* unlocks the pinned mutex */
3724         btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3725
3727         return 0;
3728 }
3729
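/*
 * drop refs_to_drop references to the extent at bytenr.  When the last
 * reference goes away the extent item is removed from the extent tree,
 * the bytes are pinned (or handed back for reuse) and the block group
 * accounting is updated.
 */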
3730 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3731                                 struct btrfs_root *root,
3732                                 u64 bytenr, u64 num_bytes, u64 parent,
3733                                 u64 root_objectid, u64 owner_objectid,
3734                                 u64 owner_offset, int refs_to_drop,
3735                                 struct btrfs_delayed_extent_op *extent_op)
3736 {
3737         struct btrfs_key key;
3738         struct btrfs_path *path;
3739         struct btrfs_fs_info *info = root->fs_info;
3740         struct btrfs_root *extent_root = info->extent_root;
3741         struct extent_buffer *leaf;
3742         struct btrfs_extent_item *ei;
3743         struct btrfs_extent_inline_ref *iref;
3744         int ret;
3745         int is_data;
3746         int extent_slot = 0;
3747         int found_extent = 0;
3748         int num_to_del = 1;
3749         u32 item_size;
3750         u64 refs;
3751
3752         path = btrfs_alloc_path();
3753         if (!path)
3754                 return -ENOMEM;
3755
3756         path->reada = 1;
3757         path->leave_spinning = 1;
3758
3759         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3760         BUG_ON(!is_data && refs_to_drop != 1);
3761
3762         ret = lookup_extent_backref(trans, extent_root, path, &iref,
3763                                     bytenr, num_bytes, parent,
3764                                     root_objectid, owner_objectid,
3765                                     owner_offset);
3766         if (ret == 0) {
3767                 extent_slot = path->slots[0];
3768                 while (extent_slot >= 0) {
3769                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3770                                               extent_slot);
3771                         if (key.objectid != bytenr)
3772                                 break;
3773                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3774                             key.offset == num_bytes) {
3775                                 found_extent = 1;
3776                                 break;
3777                         }
3778                         if (path->slots[0] - extent_slot > 5)
3779                                 break;
3780                         extent_slot--;
3781                 }
3782 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3783                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3784                 if (found_extent && item_size < sizeof(*ei))
3785                         found_extent = 0;
3786 #endif
3787                 if (!found_extent) {
3788                         BUG_ON(iref);
3789                         ret = remove_extent_backref(trans, extent_root, path,
3790                                                     NULL, refs_to_drop,
3791                                                     is_data);
3792                         BUG_ON(ret);
3793                         btrfs_release_path(extent_root, path);
3794                         path->leave_spinning = 1;
3795
3796                         key.objectid = bytenr;
3797                         key.type = BTRFS_EXTENT_ITEM_KEY;
3798                         key.offset = num_bytes;
3799
3800                         ret = btrfs_search_slot(trans, extent_root,
3801                                                 &key, path, -1, 1);
3802                         if (ret) {
3803                                 printk(KERN_ERR "umm, got %d back from search"
3804                                        ", was looking for %llu\n", ret,
3805                                        (unsigned long long)bytenr);
3806                                 btrfs_print_leaf(extent_root, path->nodes[0]);
3807                         }
3808                         BUG_ON(ret);
3809                         extent_slot = path->slots[0];
3810                 }
3811         } else {
3812                 btrfs_print_leaf(extent_root, path->nodes[0]);
3813                 WARN_ON(1);
3814                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
3815                        "parent %llu root %llu  owner %llu offset %llu\n",
3816                        (unsigned long long)bytenr,
3817                        (unsigned long long)parent,
3818                        (unsigned long long)root_objectid,
3819                        (unsigned long long)owner_objectid,
3820                        (unsigned long long)owner_offset);
3821         }
3822
3823         leaf = path->nodes[0];
3824         item_size = btrfs_item_size_nr(leaf, extent_slot);
3825 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3826         if (item_size < sizeof(*ei)) {
3827                 BUG_ON(found_extent || extent_slot != path->slots[0]);
3828                 ret = convert_extent_item_v0(trans, extent_root, path,
3829                                              owner_objectid, 0);
3830                 BUG_ON(ret < 0);
3831
3832                 btrfs_release_path(extent_root, path);
3833                 path->leave_spinning = 1;
3834
3835                 key.objectid = bytenr;
3836                 key.type = BTRFS_EXTENT_ITEM_KEY;
3837                 key.offset = num_bytes;
3838
3839                 ret = btrfs_search_slot(trans, extent_root, &key, path,
3840                                         -1, 1);
3841                 if (ret) {
3842                         printk(KERN_ERR "umm, got %d back from search"
3843                                ", was looking for %llu\n", ret,
3844                                (unsigned long long)bytenr);
3845                         btrfs_print_leaf(extent_root, path->nodes[0]);
3846                 }
3847                 BUG_ON(ret);
3848                 extent_slot = path->slots[0];
3849                 leaf = path->nodes[0];
3850                 item_size = btrfs_item_size_nr(leaf, extent_slot);
3851         }
3852 #endif
3853         BUG_ON(item_size < sizeof(*ei));
3854         ei = btrfs_item_ptr(leaf, extent_slot,
3855                             struct btrfs_extent_item);
3856         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3857                 struct btrfs_tree_block_info *bi;
3858                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3859                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3860                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3861         }
3862
3863         refs = btrfs_extent_refs(leaf, ei);
3864         BUG_ON(refs < refs_to_drop);
3865         refs -= refs_to_drop;
3866
3867         if (refs > 0) {
3868                 if (extent_op)
3869                         __run_delayed_extent_op(extent_op, leaf, ei);
3870                 /*
3871                  * In the case of an inline back ref, the reference count
3872                  * will be updated by remove_extent_backref
3873                  */
3874                 if (iref) {
3875                         BUG_ON(!found_extent);
3876                 } else {
3877                         btrfs_set_extent_refs(leaf, ei, refs);
3878                         btrfs_mark_buffer_dirty(leaf);
3879                 }
3880                 if (found_extent) {
3881                         ret = remove_extent_backref(trans, extent_root, path,
3882                                                     iref, refs_to_drop,
3883                                                     is_data);
3884                         BUG_ON(ret);
3885                 }
3886         } else {
3887                 int mark_free = 0;
3888                 struct extent_buffer *must_clean = NULL;
3889
3890                 if (found_extent) {
3891                         BUG_ON(is_data && refs_to_drop !=
3892                                extent_data_ref_count(root, path, iref));
3893                         if (iref) {
3894                                 BUG_ON(path->slots[0] != extent_slot);
3895                         } else {
3896                                 BUG_ON(path->slots[0] != extent_slot + 1);
3897                                 path->slots[0] = extent_slot;
3898                                 num_to_del = 2;
3899                         }
3900                 }
3901
3902                 ret = pin_down_bytes(trans, root, path, bytenr,
3903                                      num_bytes, is_data, 0, &must_clean);
3904                 if (ret > 0)
3905                         mark_free = 1;
3906                 BUG_ON(ret < 0);
3907                 /*
3908                  * it is going to be very rare for someone to be waiting
3909                  * on the block we're freeing.  del_items might need to
3910                  * schedule, so rather than get fancy, just force it
3911                  * to blocking here
3912                  */
3913                 if (must_clean)
3914                         btrfs_set_lock_blocking(must_clean);
3915
3916                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3917                                       num_to_del);
3918                 BUG_ON(ret);
3919                 btrfs_release_path(extent_root, path);
3920
3921                 if (must_clean) {
3922                         clean_tree_block(NULL, root, must_clean);
3923                         btrfs_tree_unlock(must_clean);
3924                         free_extent_buffer(must_clean);
3925                 }
3926
3927                 if (is_data) {
3928                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3929                         BUG_ON(ret);
3930                 } else {
3931                         invalidate_mapping_pages(info->btree_inode->i_mapping,
3932                              bytenr >> PAGE_CACHE_SHIFT,
3933                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3934                 }
3935
3936                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
3937                                          mark_free);
3938                 BUG_ON(ret);
3939         }
3940         btrfs_free_path(path);
3941         return ret;
3942 }
3943
3944 /*
3945  * when we free an extent, it is possible (and likely) that we free the last
3946  * delayed ref for that extent as well.  This searches the delayed ref tree for
3947  * a given extent, and if there are no other delayed refs to be processed, it
3948  * removes it from the tree.
3949  */
3950 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3951                                       struct btrfs_root *root, u64 bytenr)
3952 {
3953         struct btrfs_delayed_ref_head *head;
3954         struct btrfs_delayed_ref_root *delayed_refs;
3955         struct btrfs_delayed_ref_node *ref;
3956         struct rb_node *node;
3957         int ret;
3958
3959         delayed_refs = &trans->transaction->delayed_refs;
3960         spin_lock(&delayed_refs->lock);
3961         head = btrfs_find_delayed_ref_head(trans, bytenr);
3962         if (!head)
3963                 goto out;
3964
3965         node = rb_prev(&head->node.rb_node);
3966         if (!node)
3967                 goto out;
3968
3969         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3970
3971         /* there are still entries for this ref, we can't drop it */
3972         if (ref->bytenr == bytenr)
3973                 goto out;
3974
3975         if (head->extent_op) {
3976                 if (!head->must_insert_reserved)
3977                         goto out;
3978                 kfree(head->extent_op);
3979                 head->extent_op = NULL;
3980         }
3981
3982         /*
3983          * waiting for the lock here would deadlock.  If someone else has it
3984          * locked they are already in the process of dropping it anyway
3985          */
3986         if (!mutex_trylock(&head->mutex))
3987                 goto out;
3988
3989         /*
3990          * at this point we have a head with no other entries.  Go
3991          * ahead and process it.
3992          */
3993         head->node.in_tree = 0;
3994         rb_erase(&head->node.rb_node, &delayed_refs->root);
3995
3996         delayed_refs->num_entries--;
3997
3998         /*
3999          * we don't take a ref on the node because we're removing it from the
4000          * tree, so we just steal the ref the tree was holding.
4001          */
4002         delayed_refs->num_heads--;
4003         if (list_empty(&head->cluster))
4004                 delayed_refs->num_heads_ready--;
4005
4006         list_del_init(&head->cluster);
4007         spin_unlock(&delayed_refs->lock);
4008
4009         ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
4010                                   &head->node, head->extent_op,
4011                                   head->must_insert_reserved);
4012         BUG_ON(ret);
4013         btrfs_put_delayed_ref(&head->node);
4014         return 0;
4015 out:
4016         spin_unlock(&delayed_refs->lock);
4017         return 0;
4018 }
4019
4020 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4021                       struct btrfs_root *root,
4022                       u64 bytenr, u64 num_bytes, u64 parent,
4023                       u64 root_objectid, u64 owner, u64 offset)
4024 {
4025         int ret;
4026
4027         /*
4028          * tree log blocks never actually go into the extent allocation
4029          * tree, just update pinning info and exit early.
4030          */
4031         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4032                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4033                 /* unlocks the pinned mutex */
4034                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
4035                 ret = 0;
4036         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4037                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4038                                         parent, root_objectid, (int)owner,
4039                                         BTRFS_DROP_DELAYED_REF, NULL);
4040                 BUG_ON(ret);
4041                 ret = check_ref_cleanup(trans, root, bytenr);
4042                 BUG_ON(ret);
4043         } else {
4044                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4045                                         parent, root_objectid, owner,
4046                                         offset, BTRFS_DROP_DELAYED_REF, NULL);
4047                 BUG_ON(ret);
4048         }
4049         return ret;
4050 }
4051
4052 int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4053                           struct btrfs_root *root,
4054                           u64 bytenr, u32 blocksize,
4055                           u64 parent, u64 root_objectid, int level)
4056 {
4057         u64 used;
4058         spin_lock(&root->node_lock);
4059         used = btrfs_root_used(&root->root_item) - blocksize;
4060         btrfs_set_root_used(&root->root_item, used);
4061         spin_unlock(&root->node_lock);
4062
4063         return btrfs_free_extent(trans, root, bytenr, blocksize,
4064                                  parent, root_objectid, level, 0);
4065 }
4066
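/*
 * round val up to a multiple of the stripe size; for example, with a 64K
 * stripesize a val of 70000 becomes 131072
 */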
4067 static u64 stripe_align(struct btrfs_root *root, u64 val)
4068 {
4069         u64 mask = ((u64)root->stripesize - 1);
4070         u64 ret = (val + mask) & ~mask;
4071         return ret;
4072 }
4073
4074 /*
4075  * when we wait for progress in the block group caching, it's because
4076  * our allocation attempt failed at least once.  So, we must sleep
4077  * and let some progress happen before we try again.
4078  *
4079  * This function will sleep at least once waiting for new free space to
4080  * show up, and then it will check the block group free space numbers
4081  * for our min num_bytes.  Another option is to have it go ahead
4082  * and look in the rbtree for a free extent of a given size, but this
4083  * is a good start.
4084  */
4085 static noinline int
4086 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4087                                 u64 num_bytes)
4088 {
4089         struct btrfs_caching_control *caching_ctl;
4090         DEFINE_WAIT(wait);
4091
4092         caching_ctl = get_caching_control(cache);
4093         if (!caching_ctl)
4094                 return 0;
4095
4096         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4097                    (cache->free_space >= num_bytes));
4098
4099         put_caching_control(caching_ctl);
4100         return 0;
4101 }
4102
4103 static noinline int
4104 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4105 {
4106         struct btrfs_caching_control *caching_ctl;
4107         DEFINE_WAIT(wait);
4108
4109         caching_ctl = get_caching_control(cache);
4110         if (!caching_ctl)
4111                 return 0;
4112
4113         wait_event(caching_ctl->wait, block_group_cache_done(cache));
4114
4115         put_caching_control(caching_ctl);
4116         return 0;
4117 }
4118
4119 enum btrfs_loop_type {
4120         LOOP_FIND_IDEAL = 0,
4121         LOOP_CACHING_NOWAIT = 1,
4122         LOOP_CACHING_WAIT = 2,
4123         LOOP_ALLOC_CHUNK = 3,
4124         LOOP_NO_EMPTY_SIZE = 4,
4125 };
4126
4127 /*
4128  * walks the btree of allocated extents and finds a hole of a given size.
4129  * The key ins is changed to record the hole:
4130  * ins->objectid == block start
4131  * ins->flags == BTRFS_EXTENT_ITEM_KEY
4132  * ins->offset == number of blocks
4133  * Any available blocks before search_start are skipped.
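 *
 * Illustrative caller sketch (argument values are assumptions, not taken
 * from the original source):
 *
 *	struct btrfs_key ins;
 *	ret = find_free_extent(trans, root, 4096, 0, 0, (u64)-1, 0, &ins,
 *			       0, 0, BTRFS_BLOCK_GROUP_DATA);
 *
 * on success ins.objectid holds the start of the hole and ins.offset its
 * length.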
4134  */
4135 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4136                                      struct btrfs_root *orig_root,
4137                                      u64 num_bytes, u64 empty_size,
4138                                      u64 search_start, u64 search_end,
4139                                      u64 hint_byte, struct btrfs_key *ins,
4140                                      u64 exclude_start, u64 exclude_nr,
4141                                      int data)
4142 {
4143         int ret = 0;
4144         struct btrfs_root *root = orig_root->fs_info->extent_root;
4145         struct btrfs_free_cluster *last_ptr = NULL;
4146         struct btrfs_block_group_cache *block_group = NULL;
4147         int empty_cluster = 2 * 1024 * 1024;
4148         int allowed_chunk_alloc = 0;
4149         int done_chunk_alloc = 0;
4150         struct btrfs_space_info *space_info;
4151         int last_ptr_loop = 0;
4152         int loop = 0;
4153         bool found_uncached_bg = false;
4154         bool failed_cluster_refill = false;
4155         bool failed_alloc = false;
4156         u64 ideal_cache_percent = 0;
4157         u64 ideal_cache_offset = 0;
4158
4159         WARN_ON(num_bytes < root->sectorsize);
4160         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4161         ins->objectid = 0;
4162         ins->offset = 0;
4163
4164         space_info = __find_space_info(root->fs_info, data);
4165
4166         if (orig_root->ref_cows || empty_size)
4167                 allowed_chunk_alloc = 1;
4168
4169         if (data & BTRFS_BLOCK_GROUP_METADATA) {
4170                 last_ptr = &root->fs_info->meta_alloc_cluster;
4171                 if (!btrfs_test_opt(root, SSD))
4172                         empty_cluster = 64 * 1024;
4173         }
4174
4175         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
4176                 last_ptr = &root->fs_info->data_alloc_cluster;
4177         }
4178
4179         if (last_ptr) {
4180                 spin_lock(&last_ptr->lock);
4181                 if (last_ptr->block_group)
4182                         hint_byte = last_ptr->window_start;
4183                 spin_unlock(&last_ptr->lock);
4184         }
4185
4186         search_start = max(search_start, first_logical_byte(root, 0));
4187         search_start = max(search_start, hint_byte);
4188
4189         if (!last_ptr)
4190                 empty_cluster = 0;
4191
4192         if (search_start == hint_byte) {
4193 ideal_cache:
4194                 block_group = btrfs_lookup_block_group(root->fs_info,
4195                                                        search_start);
4196                 /*
4197                  * we don't want to use the block group if it doesn't match our
4198                  * allocation bits, or if it's not cached.
4199                  *
4200                  * However, if we are re-searching with an ideal block group
4201                  * picked out, then we don't care whether the block group is cached.
4202                  */
4203                 if (block_group && block_group_bits(block_group, data) &&
4204                     (block_group->cached != BTRFS_CACHE_NO ||
4205                      search_start == ideal_cache_offset)) {
4206                         down_read(&space_info->groups_sem);
4207                         if (list_empty(&block_group->list) ||
4208                             block_group->ro) {
4209                                 /*
4210                                  * someone is removing this block group,
4211                                  * we can't jump into the have_block_group
4212                                  * target because our list pointers are not
4213                                  * valid
4214                                  */
4215                                 btrfs_put_block_group(block_group);
4216                                 up_read(&space_info->groups_sem);
4217                         } else {
4218                                 goto have_block_group;
4219                         }
4220                 } else if (block_group) {
4221                         btrfs_put_block_group(block_group);
4222                 }
4223         }
4224 search:
4225         down_read(&space_info->groups_sem);
4226         list_for_each_entry(block_group, &space_info->block_groups, list) {
4227                 u64 offset;
4228                 int cached;
4229
4230                 atomic_inc(&block_group->count);
4231                 search_start = block_group->key.objectid;
4232
4233 have_block_group:
4234                 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4235                         u64 free_percent;
4236
4237                         free_percent = btrfs_block_group_used(&block_group->item);
4238                         free_percent *= 100;
4239                         free_percent = div64_u64(free_percent,
4240                                                  block_group->key.offset);
4241                         free_percent = 100 - free_percent;
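                        /*
                         * Worked example (illustrative): a 1GiB block group
                         * (key.offset) with 256MiB used gives
                         * free_percent = 100 - (256 * 100 / 1024) = 75.
                         */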
4242                         if (free_percent > ideal_cache_percent &&
4243                             likely(!block_group->ro)) {
4244                                 ideal_cache_offset = block_group->key.objectid;
4245                                 ideal_cache_percent = free_percent;
4246                         }
4247
4248                         /*
4249                          * We only want to start kthread caching if we are at
4250                          * the point where we will wait for caching to make
4251                          * progress, or if our ideal search is over and we've
4252                          * found somebody to start caching.
4253                          */
4254                         if (loop > LOOP_CACHING_NOWAIT ||
4255                             (loop > LOOP_FIND_IDEAL &&
4256                              atomic_read(&space_info->caching_threads) < 2)) {
4257                                 ret = cache_block_group(block_group);
4258                                 BUG_ON(ret);
4259                         }
4260                         found_uncached_bg = true;
4261
4262                         /*
4263                          * If loop is set for cached only, try the next block
4264                          * group.
4265                          */
4266                         if (loop == LOOP_FIND_IDEAL)
4267                                 goto loop;
4268                 }
4269
4270                 cached = block_group_cache_done(block_group);
4271                 if (unlikely(!cached))
4272                         found_uncached_bg = true;
4273
4274                 if (unlikely(block_group->ro))
4275                         goto loop;
4276
4277                 /*
4278                  * OK, we want to try the cluster allocator, so look there
4279                  * first, unless we are on LOOP_NO_EMPTY_SIZE.  By that point
4280                  * we will have tried the cluster allocator plenty of times
4281                  * without finding anything, so we are likely far too
4282                  * fragmented for the clustering code to find anything; just
4283                  * skip it and let the allocator find whatever block it can
4284                  * find.
4285                  */
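                /*
                 * Sketch of the cluster path below (summary added for
                 * clarity): first try to allocate from the current cluster;
                 * if the cluster points at a different block group, retry
                 * there; otherwise refill a fresh cluster in this block
                 * group and retry once; on failure fall through to the
                 * plain free-space search.
                 */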
4286                 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
4287                         /*
4288                          * the refill lock keeps out other
4289                          * people trying to start a new cluster
4290                          */
4291                         spin_lock(&last_ptr->refill_lock);
4292                         if (last_ptr->block_group &&
4293                             (last_ptr->block_group->ro ||
4294                             !block_group_bits(last_ptr->block_group, data))) {
4295                                 offset = 0;
4296                                 goto refill_cluster;
4297                         }
4298
4299                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
4300                                                  num_bytes, search_start);
4301                         if (offset) {
4302                                 /* we have a block, we're done */
4303                                 spin_unlock(&last_ptr->refill_lock);
4304                                 goto checks;
4305                         }
4306
4307                         spin_lock(&last_ptr->lock);
4308                         /*
4309                          * whoops, this cluster doesn't actually point to
4310                          * this block group.  Get a ref on the block
4311                          * group it does point to and try again.
4312                          */
4313                         if (!last_ptr_loop && last_ptr->block_group &&
4314                             last_ptr->block_group != block_group) {
4315
4316                                 btrfs_put_block_group(block_group);
4317                                 block_group = last_ptr->block_group;
4318                                 atomic_inc(&block_group->count);
4319                                 spin_unlock(&last_ptr->lock);
4320                                 spin_unlock(&last_ptr->refill_lock);
4321
4322                                 last_ptr_loop = 1;
4323                                 search_start = block_group->key.objectid;
4324                                 /*
4325                                  * we know this block group is properly
4326                                  * in the list because
4327                                  * btrfs_remove_block_group drops the
4328                                  * cluster before it removes the block
4329                                  * group from the list
4330                                  */
4331                                 goto have_block_group;
4332                         }
4333                         spin_unlock(&last_ptr->lock);
4334 refill_cluster:
4335                         /*
4336                          * this cluster didn't work out, free it and
4337                          * start over
4338                          */
4339                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4340
4341                         last_ptr_loop = 0;
4342
4343                         /* allocate a cluster in this block group */
4344                         ret = btrfs_find_space_cluster(trans, root,
4345                                                block_group, last_ptr,
4346                                                offset, num_bytes,
4347                                                empty_cluster + empty_size);
4348                         if (ret == 0) {
4349                                 /*
4350                                  * now pull our allocation out of this
4351                                  * cluster
4352                                  */
4353                                 offset = btrfs_alloc_from_cluster(block_group,
4354                                                   last_ptr, num_bytes,
4355                                                   search_start);
4356                                 if (offset) {
4357                                         /* we found one, proceed */
4358                                         spin_unlock(&last_ptr->refill_lock);
4359                                         goto checks;
4360                                 }
4361                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
4362                                    && !failed_cluster_refill) {
4363                                 spin_unlock(&last_ptr->refill_lock);
4364
4365                                 failed_cluster_refill = true;
4366                                 wait_block_group_cache_progress(block_group,
4367                                        num_bytes + empty_cluster + empty_size);
4368                                 goto have_block_group;
4369                         }
4370
4371                         /*
4372                          * at this point we either didn't find a cluster
4373                          * or we weren't able to allocate a block from our
4374                          * cluster.  Free the cluster we've been trying
4375                          * to use, and go to the next block group
4376                          */
4377                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4378                         spin_unlock(&last_ptr->refill_lock);
4379                         goto loop;
4380                 }
4381
4382                 offset = btrfs_find_space_for_alloc(block_group, search_start,
4383                                                     num_bytes, empty_size);
4384                 /*
4385                  * If we didn't find a chunk, and we haven't failed on this
4386                  * block group before, and this block group is in the middle of
4387                  * caching and we are ok with waiting, then go ahead and wait
4388                  * for progress to be made, and set failed_alloc to true.
4389                  *
4390                  * If failed_alloc is true then we've already waited on this
4391                  * block group once and should move on to the next block group.
4392                  */
4393                 if (!offset && !failed_alloc && !cached &&
4394                     loop > LOOP_CACHING_NOWAIT) {
4395                         wait_block_group_cache_progress(block_group,
4396                                                 num_bytes + empty_size);
4397                         failed_alloc = true;
4398                         goto have_block_group;
4399                 } else if (!offset) {
4400                         goto loop;
4401                 }
4402 checks:
4403                 search_start = stripe_align(root, offset);
4404                 /* move on to the next group */
4405                 if (search_start + num_bytes >= search_end) {
4406                         btrfs_add_free_space(block_group, offset, num_bytes);
4407                         goto loop;
4408                 }
4409
4410                 /* move on to the next group */
4411                 if (search_start + num_bytes >
4412                     block_group->key.objectid + block_group->key.offset) {
4413                         btrfs_add_free_space(block_group, offset, num_bytes);
4414                         goto loop;
4415                 }
4416
4417                 if (exclude_nr > 0 &&
4418                     (search_start + num_bytes > exclude_start &&
4419                      search_start < exclude_start + exclude_nr)) {
4420                         search_start = exclude_start + exclude_nr;
4421
4422                         btrfs_add_free_space(block_group, offset, num_bytes);
4423                         /*
4424                          * if search_start is still in this block group
4425                          * then we just re-search this block group
4426                          */
4427                         if (search_start >= block_group->key.objectid &&
4428                             search_start < (block_group->key.objectid +
4429                                             block_group->key.offset))
4430                                 goto have_block_group;
4431                         goto loop;
4432                 }
4433
4434                 ins->objectid = search_start;
4435                 ins->offset = num_bytes;
4436
4437                 if (offset < search_start)
4438                         btrfs_add_free_space(block_group, offset,
4439                                              search_start - offset);
4440                 BUG_ON(offset > search_start);
4441
4442                 update_reserved_extents(block_group, num_bytes, 1);
4443
4444                 /* we are all good, let's return */
4445                 break;
4446 loop:
4447                 failed_cluster_refill = false;
4448                 failed_alloc = false;
4449                 btrfs_put_block_group(block_group);
4450         }
4451         up_read(&space_info->groups_sem);
4452
4453         /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
4454          *                      for them to make caching progress.  Also
4455          *                      determine the best possible bg to cache
4456          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4457          *                      caching kthreads as we move along
4458          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4459          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4460          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4461          *                      again
4462          */
4463         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4464             (found_uncached_bg || empty_size || empty_cluster ||
4465              allowed_chunk_alloc)) {
4466                 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4467                         found_uncached_bg = false;
4468                         loop++;
4469                         if (!ideal_cache_percent &&
4470                             atomic_read(&space_info->caching_threads))
4471                                 goto search;
4472
4473                         /*
4474                          * 1 of the following 2 things has happened so far
4475                          *
4476                          * 1) We found an ideal block group for caching that
4477                          * is mostly full and will cache quickly, so we might
4478                          * as well wait for it.
4479                          *
4480                          * 2) We searched for cached only and we didn't find
4481                          * anything, and we didn't start any caching kthreads
4482                          * either, so chances are we will loop through and
4483                          * start a couple caching kthreads, and then come back
4484                          * around and just wait for them.  This will be slower
4485                          * because we will have 2 caching kthreads reading at
4486                          * the same time when we could have just started one
4487                          * and waited for it to get far enough to give us an
4488                          * allocation, so go ahead and go to the wait caching
4489                          * loop.
4490                          */
4491                         loop = LOOP_CACHING_WAIT;
4492                         search_start = ideal_cache_offset;
4493                         ideal_cache_percent = 0;
4494                         goto ideal_cache;
4495                 } else if (loop == LOOP_FIND_IDEAL) {
4496                         /*
4497                          * Didn't find an uncached bg, wait on anything we find
4498                          * next.
4499                          */
4500                         loop = LOOP_CACHING_WAIT;
4501                         goto search;
4502                 }
4503
4504                 if (loop < LOOP_CACHING_WAIT) {
4505                         loop++;
4506                         goto search;
4507                 }
4508
4509                 if (loop == LOOP_ALLOC_CHUNK) {
4510                         empty_size = 0;
4511                         empty_cluster = 0;
4512                 }
4513
4514                 if (allowed_chunk_alloc) {
4515                         ret = do_chunk_alloc(trans, root, num_bytes +
4516                                              2 * 1024 * 1024, data, 1);
4517                         allowed_chunk_alloc = 0;
4518                         done_chunk_alloc = 1;
4519                 } else if (!done_chunk_alloc) {
4520                         space_info->force_alloc = 1;
4521                 }
4522
4523                 if (loop < LOOP_NO_EMPTY_SIZE) {
4524                         loop++;
4525                         goto search;
4526                 }
4527                 ret = -ENOSPC;
4528         } else if (!ins->objectid) {
4529                 ret = -ENOSPC;
4530         }
4531
4532         /* we found what we needed */
4533         if (ins->objectid) {
4534                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
4535                         trans->block_group = block_group->key.objectid;
4536
4537                 btrfs_put_block_group(block_group);
4538                 ret = 0;
4539         }
4540
4541         return ret;
4542 }
4543
4544 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4545                             int dump_block_groups)
4546 {
4547         struct btrfs_block_group_cache *cache;
4548
4549         spin_lock(&info->lock);
4550         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4551                (unsigned long long)(info->total_bytes - info->bytes_used -
4552                                     info->bytes_pinned - info->bytes_reserved -
4553                                     info->bytes_super),
4554                (info->full) ? "" : "not ");
4555         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4556                " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4557                "\n",
4558                (unsigned long long)info->total_bytes,
4559                (unsigned long long)info->bytes_pinned,
4560                (unsigned long long)info->bytes_delalloc,
4561                (unsigned long long)info->bytes_may_use,
4562                (unsigned long long)info->bytes_used,
4563                (unsigned long long)info->bytes_root,
4564                (unsigned long long)info->bytes_super,
4565                (unsigned long long)info->bytes_reserved);
4566         spin_unlock(&info->lock);
4567
4568         if (!dump_block_groups)
4569                 return;
4570
4571         down_read(&info->groups_sem);
4572         list_for_each_entry(cache, &info->block_groups, list) {
4573                 spin_lock(&cache->lock);
4574                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4575                        "%llu pinned %llu reserved\n",
4576                        (unsigned long long)cache->key.objectid,
4577                        (unsigned long long)cache->key.offset,
4578                        (unsigned long long)btrfs_block_group_used(&cache->item),
4579                        (unsigned long long)cache->pinned,
4580                        (unsigned long long)cache->reserved);
4581                 btrfs_dump_free_space(cache, bytes);
4582                 spin_unlock(&cache->lock);
4583         }
4584         up_read(&info->groups_sem);
4585 }
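/*
 * Illustrative output (all values made up, not from a real run):
 *
 *   space_info has 52428800 free, is not full
 *   space_info total=1073741824, pinned=0, delalloc=0, may_use=0,
 *   used=1021313024, root=0, super=0, reserved=0
 *   block group 12582912 has 8388608 bytes, 4194304 used 0 pinned 0 reserved
 */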
4586
4587 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4588                          struct btrfs_root *root,
4589                          u64 num_bytes, u64 min_alloc_size,
4590                          u64 empty_size, u64 hint_byte,
4591                          u64 search_end, struct btrfs_key *ins,
4592                          u64 data)
4593 {
4594         int ret;
4595         u64 search_start = 0;
4596
4597         data = btrfs_get_alloc_profile(root, data);
4598 again:
4599         /*
4600          * the only place that sets empty_size is btrfs_realloc_node, which
4601          * is not called recursively on allocations
4602          */
4603         if (empty_size || root->ref_cows)
4604                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4605                                      num_bytes + 2 * 1024 * 1024, data, 0);
4606
4607         WARN_ON(num_bytes < root->sectorsize);
4608         ret = find_free_extent(trans, root, num_bytes, empty_size,
4609                                search_start, search_end, hint_byte, ins,
4610                                trans->alloc_exclude_start,
4611                                trans->alloc_exclude_nr, data);
4612
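	/*
	 * On ENOSPC, retry below with a smaller size: halve num_bytes,
	 * round down to a sectorsize boundary and clamp to min_alloc_size.
	 * Worked example (illustrative, 4K sectors): num_bytes = 24576
	 * halves to 12288, already 4K-aligned and above a min_alloc_size
	 * of 8192, so the search is retried with 12288 bytes.
	 */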
4613         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4614                 num_bytes = num_bytes >> 1;
4615                 num_bytes = num_bytes & ~(root->sectorsize - 1);
4616                 num_bytes = max(num_bytes, min_alloc_size);
4617                 do_chunk_alloc(trans, root->fs_info->extent_root,
4618                                num_bytes, data, 1);
4619                 goto again;
4620         }
4621         if (ret == -ENOSPC) {
4622                 struct btrfs_space_info *sinfo;
4623
4624                 sinfo = __find_space_info(root->fs_info, data);
4625                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4626                        "wanted %llu\n", (unsigned long long)data,
4627                        (unsigned long long)num_bytes);
4628                 dump_space_info(sinfo, num_bytes, 1);
4629         }
4630
4631         return ret;
4632 }
4633
4634 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4635 {
4636         struct btrfs_block_group_cache *cache;
4637         int ret = 0;
4638
4639         cache = btrfs_lookup_block_group(root->fs_info, start);
4640         if (!cache) {
4641                 printk(KERN_ERR "Unable to find block group for %llu\n",
4642                        (unsigned long long)start);
4643                 return -ENOSPC;
4644         }
4645
4646         ret = btrfs_discard_extent(root, start, len);
4647
4648         btrfs_add_free_space(cache, start, len);
4649         update_reserved_extents(cache, len, 0);
4650         btrfs_put_block_group(cache);
4651
4652         return ret;
4653 }
4654
4655 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4656                                       struct btrfs_root *root,
4657                                       u64 parent, u64 root_objectid,
4658                                       u64 flags, u64 owner, u64 offset,
4659                                       struct btrfs_key *ins, int ref_mod)
4660 {
4661         int ret;
4662         struct btrfs_fs_info *fs_info = root->fs_info;
4663         struct btrfs_extent_item *extent_item;
4664         struct btrfs_extent_inline_ref *iref;
4665         struct btrfs_path *path;
4666         struct extent_buffer *leaf;
4667         int type;
4668         u32 size;
4669
4670         if (parent > 0)
4671                 type = BTRFS_SHARED_DATA_REF_KEY;
4672         else
4673                 type = BTRFS_EXTENT_DATA_REF_KEY;
4674
4675         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4676
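	/*
	 * Resulting extent item layout in the leaf (sketch):
	 *
	 *   [ btrfs_extent_item | inline ref header (type) | ref body ]
	 *
	 * the ref body is a btrfs_shared_data_ref when parent > 0 and a
	 * btrfs_extent_data_ref otherwise, matching the code below.
	 */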
4677         path = btrfs_alloc_path();
4678         BUG_ON(!path);
4679
4680         path->leave_spinning = 1;
4681         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4682                                       ins, size);
4683         BUG_ON(ret);
4684
4685         leaf = path->nodes[0];
4686         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4687                                      struct btrfs_extent_item);
4688         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4689         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4690         btrfs_set_extent_flags(leaf, extent_item,
4691                                flags | BTRFS_EXTENT_FLAG_DATA);
4692
4693         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4694         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4695         if (parent > 0) {
4696                 struct btrfs_shared_data_ref *ref;
4697                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4698                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4699                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4700         } else {
4701                 struct btrfs_extent_data_ref *ref;
4702                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4703                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4704                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4705                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4706                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4707         }
4708
4709         btrfs_mark_buffer_dirty(path->nodes[0]);
4710         btrfs_free_path(path);
4711
4712         ret = update_block_group(trans, root, ins->objectid, ins->offset,
4713                                  1, 0);
4714         if (ret) {
4715                 printk(KERN_ERR "btrfs update block group failed for %llu "
4716                        "%llu\n", (unsigned long long)ins->objectid,
4717                        (unsigned long long)ins->offset);
4718                 BUG();
4719         }
4720         return ret;
4721 }
4722
4723 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4724                                      struct btrfs_root *root,
4725                                      u64 parent, u64 root_objectid,
4726                                      u64 flags, struct btrfs_disk_key *key,
4727                                      int level, struct btrfs_key *ins)
4728 {
4729         int ret;
4730         struct btrfs_fs_info *fs_info = root->fs_info;
4731         struct btrfs_extent_item *extent_item;
4732         struct btrfs_tree_block_info *block_info;
4733         struct btrfs_extent_inline_ref *iref;
4734         struct btrfs_path *path;
4735         struct extent_buffer *leaf;
4736         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
4737
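	/*
	 * Resulting tree block extent item layout in the leaf (sketch):
	 *
	 *   [ btrfs_extent_item | btrfs_tree_block_info | inline ref ]
	 *
	 * the inline ref is a shared block ref keyed on the parent when
	 * parent > 0, otherwise a tree block ref keyed on root_objectid.
	 */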
4738         path = btrfs_alloc_path();
4739         BUG_ON(!path);
4740
4741         path->leave_spinning = 1;
4742         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4743                                       ins, size);
4744         BUG_ON(ret);
4745
4746         leaf = path->nodes[0];
4747         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4748                                      struct btrfs_extent_item);
4749         btrfs_set_extent_refs(leaf, extent_item, 1);
4750         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4751         btrfs_set_extent_flags(leaf, extent_item,
4752                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4753         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4754
4755         btrfs_set_tree_block_key(leaf, block_info, key);
4756         btrfs_set_tree_block_level(leaf, block_info, level);
4757
4758         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4759         if (parent > 0) {
4760                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4761                 btrfs_set_extent_inline_ref_type(leaf, iref,
4762                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4763                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4764         } else {
4765                 btrfs_set_extent_inline_ref_type(leaf, iref,
4766                                                  BTRFS_TREE_BLOCK_REF_KEY);
4767                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
4768         }
4769
4770         btrfs_mark_buffer_dirty(leaf);
4771         btrfs_free_path(path);
4772
4773         ret = update_block_group(trans, root, ins->objectid, ins->offset,
4774                                  1, 0);
4775         if (ret) {
4776                 printk(KERN_ERR "btrfs update block group failed for %llu "
4777                        "%llu\n", (unsigned long long)ins->objectid,
4778                        (unsigned long long)ins->offset);
4779                 BUG();
4780         }
4781         return ret;
4782 }
4783
4784 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4785                                      struct btrfs_root *root,
4786                                      u64 root_objectid, u64 owner,
4787                                      u64 offset, struct btrfs_key *ins)
4788 {
4789         int ret;
4790
4791         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
4792
4793         ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
4794                                          0, root_objectid, owner, offset,
4795                                          BTRFS_ADD_DELAYED_EXTENT, NULL);
4796         return ret;
4797 }
4798
4799 /*
4800  * this is used by the tree logging recovery code.  It records that
4801  * an extent has been allocated and makes sure to clear the free
4802  * space cache bits as well
4803  */
4804 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4805                                    struct btrfs_root *root,
4806                                    u64 root_objectid, u64 owner, u64 offset,
4807                                    struct btrfs_key *ins)
4808 {
4809         int ret;
4810         struct btrfs_block_group_cache *block_group;
4811         struct btrfs_caching_control *caching_ctl;
4812         u64 start = ins->objectid;
4813         u64 num_bytes = ins->offset;
4814
4815         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4816         cache_block_group(block_group);
4817         caching_ctl = get_caching_control(block_group);
4818
4819         if (!caching_ctl) {
4820                 BUG_ON(!block_group_cache_done(block_group));
4821                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
4822                 BUG_ON(ret);
4823         } else {
4824                 mutex_lock(&caching_ctl->mutex);
4825
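		/*
		 * Three cases relative to the caching progress pointer
		 * (offsets are illustrative, not from the original source):
		 *
		 *   progress 100M, extent [120M, 130M): not yet cached,
		 *   exclude the whole range;
		 *   progress 150M, extent [120M, 130M): fully cached,
		 *   remove it from the free space cache;
		 *   progress 125M, extent [120M, 130M): split, remove
		 *   [120M, 125M) and exclude [125M, 130M).
		 */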
4826                 if (start >= caching_ctl->progress) {
4827                         ret = add_excluded_extent(root, start, num_bytes);
4828                         BUG_ON(ret);
4829                 } else if (start + num_bytes <= caching_ctl->progress) {
4830                         ret = btrfs_remove_free_space(block_group,
4831                                                       start, num_bytes);
4832                         BUG_ON(ret);
4833                 } else {
4834                         num_bytes = caching_ctl->progress - start;
4835                         ret = btrfs_remove_free_space(block_group,
4836                                                       start, num_bytes);
4837                         BUG_ON(ret);
4838
4839                         start = caching_ctl->progress;
4840                         num_bytes = ins->objectid + ins->offset -
4841                                     caching_ctl->progress;
4842                         ret = add_excluded_extent(root, start, num_bytes);
4843                         BUG_ON(ret);
4844                 }
4845
4846                 mutex_unlock(&caching_ctl->mutex);
4847                 put_caching_control(caching_ctl);
4848         }
4849
4850         update_reserved_extents(block_group, ins->offset, 1);
4851         btrfs_put_block_group(block_group);
4852         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4853                                          0, owner, offset, ins, 1);
4854         return ret;
4855 }
4856
4857 /*
4858  * finds a free extent and does all the dirty work required for
4859  * allocation.  The key for the allocated extent is returned
4860  * through ins.
4861  *
4862  * returns 0 if everything worked, non-zero otherwise.
4863  */
4864 static int alloc_tree_block(struct btrfs_trans_handle *trans,
4865                             struct btrfs_root *root,
4866                             u64 num_bytes, u64 parent, u64 root_objectid,
4867                             struct btrfs_disk_key *key, int level,
4868                             u64 empty_size, u64 hint_byte, u64 search_end,
4869                             struct btrfs_key *ins)
4870 {
4871         int ret;
4872         u64 flags = 0;
4873
4874         ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4875                                    empty_size, hint_byte, search_end,
4876                                    ins, 0);
4877         if (ret)
4878                 return ret;
4879
4880         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4881                 if (parent == 0)
4882                         parent = ins->objectid;
4883                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4884         } else
4885                 BUG_ON(parent > 0);
4886
4887         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4888                 struct btrfs_delayed_extent_op *extent_op;
4889                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4890                 BUG_ON(!extent_op);
4891                 if (key)
4892                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4893                 else
4894                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4895                 extent_op->flags_to_set = flags;
4896                 extent_op->update_key = 1;
4897                 extent_op->update_flags = 1;
4898                 extent_op->is_data = 0;
4899
4900                 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4901                                         ins->offset, parent, root_objectid,
4902                                         level, BTRFS_ADD_DELAYED_EXTENT,
4903                                         extent_op);
4904                 BUG_ON(ret);
4905         }
4906
4907         if (root_objectid == root->root_key.objectid) {
4908                 u64 used;
4909                 spin_lock(&root->node_lock);
4910                 used = btrfs_root_used(&root->root_item) + num_bytes;
4911                 btrfs_set_root_used(&root->root_item, used);
4912                 spin_unlock(&root->node_lock);
4913         }
4914         return ret;
4915 }
4916
4917 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4918                                             struct btrfs_root *root,
4919                                             u64 bytenr, u32 blocksize,
4920                                             int level)
4921 {
4922         struct extent_buffer *buf;
4923
4924         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
4925         if (!buf)
4926                 return ERR_PTR(-ENOMEM);
4927         btrfs_set_header_generation(buf, trans->transid);
4928         btrfs_set_buffer_lockdep_class(buf, level);
4929         btrfs_tree_lock(buf);
4930         clean_tree_block(trans, root, buf);
4931
4932         btrfs_set_lock_blocking(buf);
4933         btrfs_set_buffer_uptodate(buf);
4934
4935         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4936                 /*
4937                  * we allow two log transactions at a time, so use
4938                  * different EXTENT bits to differentiate the dirty pages.
4939                  */
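		/* e.g. log_transid 6 -> EXTENT_DIRTY, log_transid 7 -> EXTENT_NEW */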
4940                 if (root->log_transid % 2 == 0)
4941                         set_extent_dirty(&root->dirty_log_pages, buf->start,
4942                                         buf->start + buf->len - 1, GFP_NOFS);
4943                 else
4944                         set_extent_new(&root->dirty_log_pages, buf->start,
4945                                         buf->start + buf->len - 1, GFP_NOFS);
4946         } else {
4947                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4948                          buf->start + buf->len - 1, GFP_NOFS);
4949         }
4950         trans->blocks_used++;
4951         /* this returns a buffer locked for blocking */
4952         return buf;
4953 }
4954
4955 /*
4956  * helper function to allocate a block for a given tree.
4957  * returns the tree buffer or an ERR_PTR on error.
4958  */
4959 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4960                                         struct btrfs_root *root, u32 blocksize,
4961                                         u64 parent, u64 root_objectid,
4962                                         struct btrfs_disk_key *key, int level,
4963                                         u64 hint, u64 empty_size)
4964 {
4965         struct btrfs_key ins;
4966         int ret;
4967         struct extent_buffer *buf;
4968
4969         ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
4970                                key, level, empty_size, hint, (u64)-1, &ins);
4971         if (ret) {
4972                 BUG_ON(ret > 0);
4973                 return ERR_PTR(ret);
4974         }
4975
4976         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4977                                     blocksize, level);
4978         return buf;
4979 }
4980
4981 struct walk_control {
4982         u64 refs[BTRFS_MAX_LEVEL];
4983         u64 flags[BTRFS_MAX_LEVEL];
4984         struct btrfs_key update_progress;
4985         int stage;
4986         int level;
4987         int shared_level;
4988         int update_ref;
4989         int keep_locks;
4990         int reada_slot;
4991         int reada_count;
4992 };
4993
4994 #define DROP_REFERENCE  1
4995 #define UPDATE_BACKREF  2
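/*
 * Note on the two stages (summary added for clarity): the walk normally
 * runs in DROP_REFERENCE.  When do_walk_down() finds a shared block whose
 * subtree still needs back ref updates, it switches wc->stage to
 * UPDATE_BACKREF; walk_up_proc() switches back to DROP_REFERENCE once the
 * shared subtree has been processed.
 */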
4996
4997 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4998                                      struct btrfs_root *root,
4999                                      struct walk_control *wc,
5000                                      struct btrfs_path *path)
5001 {
5002         u64 bytenr;
5003         u64 generation;
5004         u64 refs;
5005         u64 flags;
5006         u64 last = 0;
5007         u32 nritems;
5008         u32 blocksize;
5009         struct btrfs_key key;
5010         struct extent_buffer *eb;
5011         int ret;
5012         int slot;
5013         int nread = 0;
5014
5015         if (path->slots[wc->level] < wc->reada_slot) {
5016                 wc->reada_count = wc->reada_count * 2 / 3;
5017                 wc->reada_count = max(wc->reada_count, 2);
5018         } else {
5019                 wc->reada_count = wc->reada_count * 3 / 2;
5020                 wc->reada_count = min_t(int, wc->reada_count,
5021                                         BTRFS_NODEPTRS_PER_BLOCK(root));
5022         }
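	/*
	 * Worked example (illustrative): with reada_count at 12, moving
	 * forward grows it to 18 (capped at BTRFS_NODEPTRS_PER_BLOCK),
	 * while restarting behind the previous readahead window shrinks
	 * it to 8, with a floor of 2.
	 */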
5023
5024         eb = path->nodes[wc->level];
5025         nritems = btrfs_header_nritems(eb);
5026         blocksize = btrfs_level_size(root, wc->level - 1);
5027
5028         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5029                 if (nread >= wc->reada_count)
5030                         break;
5031
5032                 cond_resched();
5033                 bytenr = btrfs_node_blockptr(eb, slot);
5034                 generation = btrfs_node_ptr_generation(eb, slot);
5035
5036                 if (slot == path->slots[wc->level])
5037                         goto reada;
5038
5039                 if (wc->stage == UPDATE_BACKREF &&
5040                     generation <= root->root_key.offset)
5041                         continue;
5042
5043                 /* We don't lock the tree block, it's OK to be racy here */
5044                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5045                                                &refs, &flags);
5046                 BUG_ON(ret);
5047                 BUG_ON(refs == 0);
5048
5049                 if (wc->stage == DROP_REFERENCE) {
5050                         if (refs == 1)
5051                                 goto reada;
5052
5053                         if (wc->level == 1 &&
5054                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5055                                 continue;
5056                         if (!wc->update_ref ||
5057                             generation <= root->root_key.offset)
5058                                 continue;
5059                         btrfs_node_key_to_cpu(eb, &key, slot);
5060                         ret = btrfs_comp_cpu_keys(&key,
5061                                                   &wc->update_progress);
5062                         if (ret < 0)
5063                                 continue;
5064                 } else {
5065                         if (wc->level == 1 &&
5066                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5067                                 continue;
5068                 }
5069 reada:
5070                 ret = readahead_tree_block(root, bytenr, blocksize,
5071                                            generation);
5072                 if (ret)
5073                         break;
5074                 last = bytenr + blocksize;
5075                 nread++;
5076         }
5077         wc->reada_slot = slot;
5078 }
5079
5080 /*
5081  * helper to process a tree block while walking down the tree.
5082  *
5083  * when wc->stage == UPDATE_BACKREF, this function updates
5084  * back refs for pointers in the block.
5085  *
5086  * NOTE: return value 1 means we should stop walking down.
5087  */
5088 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5089                                    struct btrfs_root *root,
5090                                    struct btrfs_path *path,
5091                                    struct walk_control *wc, int lookup_info)
5092 {
5093         int level = wc->level;
5094         struct extent_buffer *eb = path->nodes[level];
5095         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5096         int ret;
5097
5098         if (wc->stage == UPDATE_BACKREF &&
5099             btrfs_header_owner(eb) != root->root_key.objectid)
5100                 return 1;
5101
5102         /*
5103          * when the reference count of a tree block is 1, it won't increase
5104          * again.  Once the full backref flag is set, we never clear it.
5105          */
5106         if (lookup_info &&
5107             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5108              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5109                 BUG_ON(!path->locks[level]);
5110                 ret = btrfs_lookup_extent_info(trans, root,
5111                                                eb->start, eb->len,
5112                                                &wc->refs[level],
5113                                                &wc->flags[level]);
5114                 BUG_ON(ret);
5115                 BUG_ON(wc->refs[level] == 0);
5116         }
5117
5118         if (wc->stage == DROP_REFERENCE) {
5119                 if (wc->refs[level] > 1)
5120                         return 1;
5121
5122                 if (path->locks[level] && !wc->keep_locks) {
5123                         btrfs_tree_unlock(eb);
5124                         path->locks[level] = 0;
5125                 }
5126                 return 0;
5127         }
5128
5129         /* wc->stage == UPDATE_BACKREF */
5130         if (!(wc->flags[level] & flag)) {
5131                 BUG_ON(!path->locks[level]);
5132                 ret = btrfs_inc_ref(trans, root, eb, 1);
5133                 BUG_ON(ret);
5134                 ret = btrfs_dec_ref(trans, root, eb, 0);
5135                 BUG_ON(ret);
5136                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5137                                                   eb->len, flag, 0);
5138                 BUG_ON(ret);
5139                 wc->flags[level] |= flag;
5140         }
5141
5142         /*
5143          * the block is shared by multiple trees, so it's not good to
5144          * keep the tree lock
5145          */
5146         if (path->locks[level] && level > 0) {
5147                 btrfs_tree_unlock(eb);
5148                 path->locks[level] = 0;
5149         }
5150         return 0;
5151 }
5152
5153 /*
5154  * helper to process a tree block pointer.
5155  *
5156  * when wc->stage == DROP_REFERENCE, this function checks the
5157  * reference count of the block pointed to.  If the block is
5158  * shared and we need to update back refs for the subtree
5159  * rooted at the block, this function changes wc->stage to
5160  * UPDATE_BACKREF.  If the block is shared and there is no
5161  * need to update back refs, this function drops the reference
5162  * to the block.
5163  *
5164  * NOTE: return value 1 means we should stop walking down.
5165  */
5166 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5167                                  struct btrfs_root *root,
5168                                  struct btrfs_path *path,
5169                                  struct walk_control *wc, int *lookup_info)
5170 {
5171         u64 bytenr;
5172         u64 generation;
5173         u64 parent;
5174         u32 blocksize;
5175         struct btrfs_key key;
5176         struct extent_buffer *next;
5177         int level = wc->level;
5178         int reada = 0;
5179         int ret = 0;
5180
5181         generation = btrfs_node_ptr_generation(path->nodes[level],
5182                                                path->slots[level]);
5183         /*
5184          * if the lower level block was created before the snapshot
5185          * was created, we know there is no need to update back refs
5186          * for the subtree
5187          */
5188         if (wc->stage == UPDATE_BACKREF &&
5189             generation <= root->root_key.offset) {
5190                 *lookup_info = 1;
5191                 return 1;
5192         }
5193
5194         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5195         blocksize = btrfs_level_size(root, level - 1);
5196
5197         next = btrfs_find_tree_block(root, bytenr, blocksize);
5198         if (!next) {
5199                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5200                 reada = 1;
5201         }
5202         btrfs_tree_lock(next);
5203         btrfs_set_lock_blocking(next);
5204
5205         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5206                                        &wc->refs[level - 1],
5207                                        &wc->flags[level - 1]);
5208         BUG_ON(ret);
5209         BUG_ON(wc->refs[level - 1] == 0);
5210         *lookup_info = 0;
5211
5212         if (wc->stage == DROP_REFERENCE) {
5213                 if (wc->refs[level - 1] > 1) {
5214                         if (level == 1 &&
5215                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5216                                 goto skip;
5217
5218                         if (!wc->update_ref ||
5219                             generation <= root->root_key.offset)
5220                                 goto skip;
5221
5222                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5223                                               path->slots[level]);
5224                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5225                         if (ret < 0)
5226                                 goto skip;
5227
5228                         wc->stage = UPDATE_BACKREF;
5229                         wc->shared_level = level - 1;
5230                 }
5231         } else {
5232                 if (level == 1 &&
5233                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5234                         goto skip;
5235         }
5236
5237         if (!btrfs_buffer_uptodate(next, generation)) {
5238                 btrfs_tree_unlock(next);
5239                 free_extent_buffer(next);
5240                 next = NULL;
5241                 *lookup_info = 1;
5242         }
5243
5244         if (!next) {
5245                 if (reada && level == 1)
5246                         reada_walk_down(trans, root, wc, path);
5247                 next = read_tree_block(root, bytenr, blocksize, generation);
5248                 btrfs_tree_lock(next);
5249                 btrfs_set_lock_blocking(next);
5250         }
5251
5252         level--;
5253         BUG_ON(level != btrfs_header_level(next));
5254         path->nodes[level] = next;
5255         path->slots[level] = 0;
5256         path->locks[level] = 1;
5257         wc->level = level;
5258         if (wc->level == 1)
5259                 wc->reada_slot = 0;
5260         return 0;
5261 skip:
5262         wc->refs[level - 1] = 0;
5263         wc->flags[level - 1] = 0;
5264         if (wc->stage == DROP_REFERENCE) {
5265                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5266                         parent = path->nodes[level]->start;
5267                 } else {
5268                         BUG_ON(root->root_key.objectid !=
5269                                btrfs_header_owner(path->nodes[level]));
5270                         parent = 0;
5271                 }
5272
5273                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5274                                         root->root_key.objectid, level - 1, 0);
5275                 BUG_ON(ret);
5276         }
5277         btrfs_tree_unlock(next);
5278         free_extent_buffer(next);
5279         *lookup_info = 1;
5280         return 1;
5281 }
5282
5283 /*
5284  * helper to process a tree block while walking up the tree.
5285  *
5286  * when wc->stage == DROP_REFERENCE, this function drops the
5287  * reference count on the block.
5288  *
5289  * when wc->stage == UPDATE_BACKREF, this function changes
5290  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5291  * to UPDATE_BACKREF previously while processing the block.
5292  *
5293  * NOTE: return value 1 means we should stop walking up.
5294  */
5295 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5296                                  struct btrfs_root *root,
5297                                  struct btrfs_path *path,
5298                                  struct walk_control *wc)
5299 {
5300         int ret = 0;
5301         int level = wc->level;
5302         struct extent_buffer *eb = path->nodes[level];
5303         u64 parent = 0;
5304
5305         if (wc->stage == UPDATE_BACKREF) {
5306                 BUG_ON(wc->shared_level < level);
5307                 if (level < wc->shared_level)
5308                         goto out;
5309
5310                 ret = find_next_key(path, level + 1, &wc->update_progress);
5311                 if (ret > 0)
5312                         wc->update_ref = 0;
5313
5314                 wc->stage = DROP_REFERENCE;
5315                 wc->shared_level = -1;
5316                 path->slots[level] = 0;
5317
5318                 /*
5319                  * check the reference count again if the block isn't locked.
5320                  * We should start walking down the tree again if the
5321                  * reference count is one.
5322                  */
5323                 if (!path->locks[level]) {
5324                         BUG_ON(level == 0);
5325                         btrfs_tree_lock(eb);
5326                         btrfs_set_lock_blocking(eb);
5327                         path->locks[level] = 1;
5328
5329                         ret = btrfs_lookup_extent_info(trans, root,
5330                                                        eb->start, eb->len,
5331                                                        &wc->refs[level],
5332                                                        &wc->flags[level]);
5333                         BUG_ON(ret);
5334                         BUG_ON(wc->refs[level] == 0);
5335                         if (wc->refs[level] == 1) {
5336                                 btrfs_tree_unlock(eb);
5337                                 path->locks[level] = 0;
5338                                 return 1;
5339                         }
5340                 }
5341         }
5342
5343         /* wc->stage == DROP_REFERENCE */
5344         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5345
5346         if (wc->refs[level] == 1) {
5347                 if (level == 0) {
5348                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5349                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5350                         else
5351                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5352                         BUG_ON(ret);
5353                 }
5354                 /* make the block-locked assertion in clean_tree_block happy */
5355                 if (!path->locks[level] &&
5356                     btrfs_header_generation(eb) == trans->transid) {
5357                         btrfs_tree_lock(eb);
5358                         btrfs_set_lock_blocking(eb);
5359                         path->locks[level] = 1;
5360                 }
5361                 clean_tree_block(trans, root, eb);
5362         }
5363
5364         if (eb == root->node) {
5365                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5366                         parent = eb->start;
5367                 else
5368                         BUG_ON(root->root_key.objectid !=
5369                                btrfs_header_owner(eb));
5370         } else {
5371                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5372                         parent = path->nodes[level + 1]->start;
5373                 else
5374                         BUG_ON(root->root_key.objectid !=
5375                                btrfs_header_owner(path->nodes[level + 1]));
5376         }
5377
5378         ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
5379                                 root->root_key.objectid, level, 0);
5380         BUG_ON(ret);
5381 out:
5382         wc->refs[level] = 0;
5383         wc->flags[level] = 0;
5384         return ret;
5385 }
5386
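/*
 * helper to walk down the tree, processing one block per iteration.
 * walk_down_proc handles the block at the current level; do_walk_down
 * reads the child block and decides whether to descend into it or to
 * skip it (in which case we advance to the next slot).  the walk stops
 * at a leaf or when the current node runs out of slots.
 */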
5387 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5388                                    struct btrfs_root *root,
5389                                    struct btrfs_path *path,
5390                                    struct walk_control *wc)
5391 {
5392         int level = wc->level;
5393         int lookup_info = 1;
5394         int ret;
5395
5396         while (level >= 0) {
5397                 if (path->slots[level] >=
5398                     btrfs_header_nritems(path->nodes[level]))
5399                         break;
5400
5401                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5402                 if (ret > 0)
5403                         break;
5404
5405                 if (level == 0)
5406                         break;
5407
5408                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5409                 if (ret > 0) {
5410                         path->slots[level]++;
5411                         continue;
5412                 }
5413                 level = wc->level;
5414         }
5415         return 0;
5416 }
5417
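/*
 * helper to walk back up the tree after walk_down_tree has finished a
 * subtree.  walk_up_proc is called for each fully visited block.
 * returns 0 when there is more to walk down (a new slot was found or
 * walk_up_proc asked to stop walking up), 1 once every level below
 * max_level has been processed.
 */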
5418 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5419                                  struct btrfs_root *root,
5420                                  struct btrfs_path *path,
5421                                  struct walk_control *wc, int max_level)
5422 {
5423         int level = wc->level;
5424         int ret;
5425
5426         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5427         while (level < max_level && path->nodes[level]) {
5428                 wc->level = level;
5429                 if (path->slots[level] + 1 <
5430                     btrfs_header_nritems(path->nodes[level])) {
5431                         path->slots[level]++;
5432                         return 0;
5433                 } else {
5434                         ret = walk_up_proc(trans, root, path, wc);
5435                         if (ret > 0)
5436                                 return 0;
5437
5438                         if (path->locks[level]) {
5439                                 btrfs_tree_unlock(path->nodes[level]);
5440                                 path->locks[level] = 0;
5441                         }
5442                         free_extent_buffer(path->nodes[level]);
5443                         path->nodes[level] = NULL;
5444                         level++;
5445                 }
5446         }
5447         return 1;
5448 }
5449
5450 /*
5451  * drop a subvolume tree.
5452  *
5453  * this function traverses the tree and frees any blocks that are only
5454  * referenced by this tree.
5455  *
5456  * when a shared tree block is found, this function decreases its
5457  * reference count by one. if update_ref is true, this function
5458  * also makes sure backrefs for the shared block and all lower level
5459  * blocks are properly updated.
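 *
 * a minimal usage sketch (hypothetical caller):
 *
 *   err = btrfs_drop_snapshot(snap_root, 0);
 *
 * passing update_ref == 1 additionally triggers the UPDATE_BACKREF
 * stage, which rewrites backrefs for shared subtrees before their
 * reference counts are dropped.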
5460  */
5461 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5462 {
5463         struct btrfs_path *path;
5464         struct btrfs_trans_handle *trans;
5465         struct btrfs_root *tree_root = root->fs_info->tree_root;
5466         struct btrfs_root_item *root_item = &root->root_item;
5467         struct walk_control *wc;
5468         struct btrfs_key key;
5469         int err = 0;
5470         int ret;
5471         int level;
5472
5473         path = btrfs_alloc_path();
5474         BUG_ON(!path);
5475
5476         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5477         BUG_ON(!wc);
5478
5479         trans = btrfs_start_transaction(tree_root, 1);
5480
5481         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5482                 level = btrfs_header_level(root->node);
5483                 path->nodes[level] = btrfs_lock_root_node(root);
5484                 btrfs_set_lock_blocking(path->nodes[level]);
5485                 path->slots[level] = 0;
5486                 path->locks[level] = 1;
5487                 memset(&wc->update_progress, 0,
5488                        sizeof(wc->update_progress));
5489         } else {
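                /*
                 * this is a resumed drop: search back down to the
                 * block recorded in drop_progress, retaking locks and
                 * reference counts along the way.
                 */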
5490                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5491                 memcpy(&wc->update_progress, &key,
5492                        sizeof(wc->update_progress));
5493
5494                 level = root_item->drop_level;
5495                 BUG_ON(level == 0);
5496                 path->lowest_level = level;
5497                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5498                 path->lowest_level = 0;
5499                 if (ret < 0) {
5500                         err = ret;
5501                         goto out;
5502                 }
5503                 WARN_ON(ret > 0);
5504
5505                 /*
5506                  * unlock our path, this is safe because only this
5507                  * function is allowed to delete this snapshot
5508                  */
5509                 btrfs_unlock_up_safe(path, 0);
5510
5511                 level = btrfs_header_level(root->node);
5512                 while (1) {
5513                         btrfs_tree_lock(path->nodes[level]);
5514                         btrfs_set_lock_blocking(path->nodes[level]);
5515
5516                         ret = btrfs_lookup_extent_info(trans, root,
5517                                                 path->nodes[level]->start,
5518                                                 path->nodes[level]->len,
5519                                                 &wc->refs[level],
5520                                                 &wc->flags[level]);
5521                         BUG_ON(ret);
5522                         BUG_ON(wc->refs[level] == 0);
5523
5524                         if (level == root_item->drop_level)
5525                                 break;
5526
5527                         btrfs_tree_unlock(path->nodes[level]);
5528                         WARN_ON(wc->refs[level] != 1);
5529                         level--;
5530                 }
5531         }
5532
5533         wc->level = level;
5534         wc->shared_level = -1;
5535         wc->stage = DROP_REFERENCE;
5536         wc->update_ref = update_ref;
5537         wc->keep_locks = 0;
5538         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5539
5540         while (1) {
5541                 ret = walk_down_tree(trans, root, path, wc);
5542                 if (ret < 0) {
5543                         err = ret;
5544                         break;
5545                 }
5546
5547                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5548                 if (ret < 0) {
5549                         err = ret;
5550                         break;
5551                 }
5552
5553                 if (ret > 0) {
5554                         BUG_ON(wc->stage != DROP_REFERENCE);
5555                         break;
5556                 }
5557
5558                 if (wc->stage == DROP_REFERENCE) {
5559                         level = wc->level;
5560                         btrfs_node_key(path->nodes[level],
5561                                        &root_item->drop_progress,
5562                                        path->slots[level]);
5563                         root_item->drop_level = level;
5564                 }
5565
5566                 BUG_ON(wc->level == 0);
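                /*
                 * drop_progress was saved above; if the transaction is
                 * about to commit, write the root item out and restart
                 * with a fresh transaction.  otherwise just run the
                 * delayed refs accumulated so far.
                 */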
5567                 if (trans->transaction->in_commit ||
5568                     trans->transaction->delayed_refs.flushing) {
5569                         ret = btrfs_update_root(trans, tree_root,
5570                                                 &root->root_key,
5571                                                 root_item);
5572                         BUG_ON(ret);
5573
5574                         btrfs_end_transaction(trans, tree_root);
5575                         trans = btrfs_start_transaction(tree_root, 1);
5576                 } else {
5577                         unsigned long update;
5578                         update = trans->delayed_ref_updates;
5579                         trans->delayed_ref_updates = 0;
5580                         if (update)
5581                                 btrfs_run_delayed_refs(trans, tree_root,
5582                                                        update);
5583                 }
5584         }
5585         btrfs_release_path(root, path);
5586         BUG_ON(err);
5587
5588         ret = btrfs_del_root(trans, tree_root, &root->root_key);
5589         BUG_ON(ret);
5590
5591         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5592                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5593                                            NULL, NULL);
5594                 BUG_ON(ret < 0);
5595                 if (ret > 0) {
5596                         ret = btrfs_del_orphan_item(trans, tree_root,
5597                                                     root->root_key.objectid);
5598                         BUG_ON(ret);
5599                 }
5600         }
5601
5602         if (root->in_radix) {
5603                 btrfs_free_fs_root(tree_root->fs_info, root);
5604         } else {
5605                 free_extent_buffer(root->node);
5606                 free_extent_buffer(root->commit_root);
5607                 kfree(root);
5608         }
5609 out:
5610         btrfs_end_transaction(trans, tree_root);
5611         kfree(wc);
5612         btrfs_free_path(path);
5613         return err;
5614 }
5615
5616 /*
5617  * drop subtree rooted at tree block 'node'.
5618  *
5619  * NOTE: this function will unlock and release tree block 'node'
5620  */
5621 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5622                         struct btrfs_root *root,
5623                         struct extent_buffer *node,
5624                         struct extent_buffer *parent)
5625 {
5626         struct btrfs_path *path;
5627         struct walk_control *wc;
5628         int level;
5629         int parent_level;
5630         int ret = 0;
5631         int wret;
5632
5633         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5634
5635         path = btrfs_alloc_path();
5636         BUG_ON(!path);
5637
5638         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5639         BUG_ON(!wc);
5640
5641         btrfs_assert_tree_locked(parent);
5642         parent_level = btrfs_header_level(parent);
5643         extent_buffer_get(parent);
5644         path->nodes[parent_level] = parent;
5645         path->slots[parent_level] = btrfs_header_nritems(parent);
5646
5647         btrfs_assert_tree_locked(node);
5648         level = btrfs_header_level(node);
5649         path->nodes[level] = node;
5650         path->slots[level] = 0;
5651         path->locks[level] = 1;
5652
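        /*
         * seed the walk as if we had just descended from 'parent':
         * the ref count and FULL_BACKREF flag at parent_level are what
         * do_walk_down and walk_up_proc expect to find for level + 1.
         */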
5653         wc->refs[parent_level] = 1;
5654         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5655         wc->level = level;
5656         wc->shared_level = -1;
5657         wc->stage = DROP_REFERENCE;
5658         wc->update_ref = 0;
5659         wc->keep_locks = 1;
5660         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5661
5662         while (1) {
5663                 wret = walk_down_tree(trans, root, path, wc);
5664                 if (wret < 0) {
5665                         ret = wret;
5666                         break;
5667                 }
5668
5669                 wret = walk_up_tree(trans, root, path, wc, parent_level);
5670                 if (wret < 0)
5671                         ret = wret;
5672                 if (wret != 0)
5673                         break;
5674         }
5675
5676         kfree(wc);
5677         btrfs_free_path(path);
5678         return ret;
5679 }
5680
5681 #if 0
5682 static unsigned long calc_ra(unsigned long start, unsigned long last,
5683                              unsigned long nr)
5684 {
5685         return min(last, start + nr - 1);
5686 }
5687
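/*
 * read the given range of the reloc inode into the page cache and mark
 * it delalloc + dirty, so that writeback copies the data out to the
 * inode's new location.
 */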
5688 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
5689                                          u64 len)
5690 {
5691         u64 page_start;
5692         u64 page_end;
5693         unsigned long first_index;
5694         unsigned long last_index;
5695         unsigned long i;
5696         struct page *page;
5697         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5698         struct file_ra_state *ra;
5699         struct btrfs_ordered_extent *ordered;
5700         unsigned int total_read = 0;
5701         unsigned int total_dirty = 0;
5702         int ret = 0;
5703
5704         ra = kzalloc(sizeof(*ra), GFP_NOFS);
        if (!ra)
                return -ENOMEM;
5705
5706         mutex_lock(&inode->i_mutex);
5707         first_index = start >> PAGE_CACHE_SHIFT;
5708         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
5709
5710         /* make sure the dirty trick played by the caller works */
5711         ret = invalidate_inode_pages2_range(inode->i_mapping,
5712                                             first_index, last_index);
5713         if (ret)
5714                 goto out_unlock;
5715
5716         file_ra_state_init(ra, inode->i_mapping);
5717
5718         for (i = first_index; i <= last_index; i++) {
5719                 if (total_read % ra->ra_pages == 0) {
5720                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
5721                                        calc_ra(i, last_index, ra->ra_pages));
5722                 }
5723                 total_read++;
5724 again:
5725                 BUG_ON(((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode));
5727                 page = grab_cache_page(inode->i_mapping, i);
5728                 if (!page) {
5729                         ret = -ENOMEM;
5730                         goto out_unlock;
5731                 }
5732                 if (!PageUptodate(page)) {
5733                         btrfs_readpage(NULL, page);
5734                         lock_page(page);
5735                         if (!PageUptodate(page)) {
5736                                 unlock_page(page);
5737                                 page_cache_release(page);
5738                                 ret = -EIO;
5739                                 goto out_unlock;
5740                         }
5741                 }
5742                 wait_on_page_writeback(page);
5743
5744                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
5745                 page_end = page_start + PAGE_CACHE_SIZE - 1;
5746                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
5747
5748                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5749                 if (ordered) {
5750                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5751                         unlock_page(page);
5752                         page_cache_release(page);
5753                         btrfs_start_ordered_extent(inode, ordered, 1);
5754                         btrfs_put_ordered_extent(ordered);
5755                         goto again;
5756                 }
5757                 set_page_extent_mapped(page);
5758
5759                 if (i == first_index)
5760                         set_extent_bits(io_tree, page_start, page_end,
5761                                         EXTENT_BOUNDARY, GFP_NOFS);
5762                 btrfs_set_extent_delalloc(inode, page_start, page_end);
5763
5764                 set_page_dirty(page);
5765                 total_dirty++;
5766
5767                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5768                 unlock_page(page);
5769                 page_cache_release(page);
5770         }
5771
5772 out_unlock:
5773         kfree(ra);
5774         mutex_unlock(&inode->i_mutex);
5775         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
5776         return ret;
5777 }
5778
5779 static noinline int relocate_data_extent(struct inode *reloc_inode,
5780                                          struct btrfs_key *extent_key,
5781                                          u64 offset)
5782 {
5783         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
5784         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
5785         struct extent_map *em;
5786         u64 start = extent_key->objectid - offset;
5787         u64 end = start + extent_key->offset - 1;
5788
5789         em = alloc_extent_map(GFP_NOFS);
5790         BUG_ON(!em || IS_ERR(em));
5791
5792         em->start = start;
5793         em->len = extent_key->offset;
5794         em->block_len = extent_key->offset;
5795         em->block_start = extent_key->objectid;
5796         em->bdev = root->fs_info->fs_devices->latest_bdev;
5797         set_bit(EXTENT_FLAG_PINNED, &em->flags);
5798
5799         /* setup extent map to cheat btrfs_readpage */
5800         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5801         while (1) {
5802                 int ret;
5803                 write_lock(&em_tree->lock);
5804                 ret = add_extent_mapping(em_tree, em);
5805                 write_unlock(&em_tree->lock);
5806                 if (ret != -EEXIST) {
5807                         free_extent_map(em);
5808                         break;
5809                 }
5810                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
5811         }
5812         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5813
5814         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
5815 }
5816
5817 struct btrfs_ref_path {
5818         u64 extent_start;
5819         u64 nodes[BTRFS_MAX_LEVEL];
5820         u64 root_objectid;
5821         u64 root_generation;
5822         u64 owner_objectid;
5823         u32 num_refs;
5824         int lowest_level;
5825         int current_level;
5826         int shared_level;
5827
5828         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
5829         u64 new_nodes[BTRFS_MAX_LEVEL];
5830 };
5831
5832 struct disk_extent {
5833         u64 ram_bytes;
5834         u64 disk_bytenr;
5835         u64 disk_num_bytes;
5836         u64 offset;
5837         u64 num_bytes;
5838         u8 compression;
5839         u8 encryption;
5840         u16 other_encoding;
5841 };
5842
5843 static int is_cowonly_root(u64 root_objectid)
5844 {
5845         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
5846             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
5847             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
5848             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
5849             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5850             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
5851                 return 1;
5852         return 0;
5853 }
5854
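/*
 * walk one step along the back reference chain of an extent.
 *
 * a sketch of the intended iteration (see btrfs_first_ref_path and
 * btrfs_next_ref_path below):
 *
 *   ret = btrfs_first_ref_path(trans, extent_root, &ref_path, bytenr);
 *   while (ret == 0) {
 *           ... process ref_path.root_objectid ...
 *           ret = btrfs_next_ref_path(trans, extent_root, &ref_path);
 *   }
 *
 * returns 0 when a referencing root was found, 1 when all references
 * have been visited, negative on error.
 */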
5855 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
5856                                     struct btrfs_root *extent_root,
5857                                     struct btrfs_ref_path *ref_path,
5858                                     int first_time)
5859 {
5860         struct extent_buffer *leaf;
5861         struct btrfs_path *path;
5862         struct btrfs_extent_ref *ref;
5863         struct btrfs_key key;
5864         struct btrfs_key found_key;
5865         u64 bytenr;
5866         u32 nritems;
5867         int level;
5868         int ret = 1;
5869
5870         path = btrfs_alloc_path();
5871         if (!path)
5872                 return -ENOMEM;
5873
5874         if (first_time) {
5875                 ref_path->lowest_level = -1;
5876                 ref_path->current_level = -1;
5877                 ref_path->shared_level = -1;
5878                 goto walk_up;
5879         }
5880 walk_down:
5881         level = ref_path->current_level - 1;
5882         while (level >= -1) {
5883                 u64 parent;
5884                 if (level < ref_path->lowest_level)
5885                         break;
5886
5887                 if (level >= 0)
5888                         bytenr = ref_path->nodes[level];
5889                 else
5890                         bytenr = ref_path->extent_start;
5891                 BUG_ON(bytenr == 0);
5892
5893                 parent = ref_path->nodes[level + 1];
5894                 ref_path->nodes[level + 1] = 0;
5895                 ref_path->current_level = level;
5896                 BUG_ON(parent == 0);
5897
5898                 key.objectid = bytenr;
5899                 key.offset = parent + 1;
5900                 key.type = BTRFS_EXTENT_REF_KEY;
5901
5902                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5903                 if (ret < 0)
5904                         goto out;
5905                 BUG_ON(ret == 0);
5906
5907                 leaf = path->nodes[0];
5908                 nritems = btrfs_header_nritems(leaf);
5909                 if (path->slots[0] >= nritems) {
5910                         ret = btrfs_next_leaf(extent_root, path);
5911                         if (ret < 0)
5912                                 goto out;
5913                         if (ret > 0)
5914                                 goto next;
5915                         leaf = path->nodes[0];
5916                 }
5917
5918                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5919                 if (found_key.objectid == bytenr &&
5920                     found_key.type == BTRFS_EXTENT_REF_KEY) {
5921                         if (level < ref_path->shared_level)
5922                                 ref_path->shared_level = level;
5923                         goto found;
5924                 }
5925 next:
5926                 level--;
5927                 btrfs_release_path(extent_root, path);
5928                 cond_resched();
5929         }
5930         /* reached lowest level */
5931         ret = 1;
5932         goto out;
5933 walk_up:
5934         level = ref_path->current_level;
5935         while (level < BTRFS_MAX_LEVEL - 1) {
5936                 u64 ref_objectid;
5937
5938                 if (level >= 0)
5939                         bytenr = ref_path->nodes[level];
5940                 else
5941                         bytenr = ref_path->extent_start;
5942
5943                 BUG_ON(bytenr == 0);
5944
5945                 key.objectid = bytenr;
5946                 key.offset = 0;
5947                 key.type = BTRFS_EXTENT_REF_KEY;
5948
5949                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5950                 if (ret < 0)
5951                         goto out;
5952
5953                 leaf = path->nodes[0];
5954                 nritems = btrfs_header_nritems(leaf);
5955                 if (path->slots[0] >= nritems) {
5956                         ret = btrfs_next_leaf(extent_root, path);
5957                         if (ret < 0)
5958                                 goto out;
5959                         if (ret > 0) {
5960                                 /* the extent was freed by someone */
5961                                 if (ref_path->lowest_level == level)
5962                                         goto out;
5963                                 btrfs_release_path(extent_root, path);
5964                                 goto walk_down;
5965                         }
5966                         leaf = path->nodes[0];
5967                 }
5968
5969                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5970                 if (found_key.objectid != bytenr ||
5971                     found_key.type != BTRFS_EXTENT_REF_KEY) {
5972                         /* the extent was freed by someone */
5973                         if (ref_path->lowest_level == level) {
5974                                 ret = 1;
5975                                 goto out;
5976                         }
5977                         btrfs_release_path(extent_root, path);
5978                         goto walk_down;
5979                 }
5980 found:
5981                 ref = btrfs_item_ptr(leaf, path->slots[0],
5982                                 struct btrfs_extent_ref);
5983                 ref_objectid = btrfs_ref_objectid(leaf, ref);
5984                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5985                         if (first_time) {
5986                                 level = (int)ref_objectid;
5987                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
5988                                 ref_path->lowest_level = level;
5989                                 ref_path->current_level = level;
5990                                 ref_path->nodes[level] = bytenr;
5991                         } else {
5992                                 WARN_ON(ref_objectid != level);
5993                         }
5994                 } else {
5995                         WARN_ON(level != -1);
5996                 }
5997                 first_time = 0;
5998
5999                 if (ref_path->lowest_level == level) {
6000                         ref_path->owner_objectid = ref_objectid;
6001                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6002                 }
6003
6004                 /*
6005                  * the block is a tree root or the block isn't in a
6006                  * reference counted tree.
6007                  */
6008                 if (found_key.objectid == found_key.offset ||
6009                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6010                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6011                         ref_path->root_generation =
6012                                 btrfs_ref_generation(leaf, ref);
6013                         if (level < 0) {
6014                                 /* special reference from the tree log */
6015                                 ref_path->nodes[0] = found_key.offset;
6016                                 ref_path->current_level = 0;
6017                         }
6018                         ret = 0;
6019                         goto out;
6020                 }
6021
6022                 level++;
6023                 BUG_ON(ref_path->nodes[level] != 0);
6024                 ref_path->nodes[level] = found_key.offset;
6025                 ref_path->current_level = level;
6026
6027                 /*
6028                  * the reference was created in the running transaction,
6029                  * no need to continue walking up.
6030                  */
6031                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6032                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6033                         ref_path->root_generation =
6034                                 btrfs_ref_generation(leaf, ref);
6035                         ret = 0;
6036                         goto out;
6037                 }
6038
6039                 btrfs_release_path(extent_root, path);
6040                 cond_resched();
6041         }
6042         /* reached max tree level, but no tree root found. */
6043         BUG();
6044 out:
6045         btrfs_free_path(path);
6046         return ret;
6047 }
6048
6049 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6050                                 struct btrfs_root *extent_root,
6051                                 struct btrfs_ref_path *ref_path,
6052                                 u64 extent_start)
6053 {
6054         memset(ref_path, 0, sizeof(*ref_path));
6055         ref_path->extent_start = extent_start;
6056
6057         return __next_ref_path(trans, extent_root, ref_path, 1);
6058 }
6059
6060 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6061                                struct btrfs_root *extent_root,
6062                                struct btrfs_ref_path *ref_path)
6063 {
6064         return __next_ref_path(trans, extent_root, ref_path, 0);
6065 }
6066
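/*
 * look up the file extents in the reloc inode that now hold the data
 * of 'extent_key'.  with no_fragment set, the relocated data must be
 * covered by a single extent (otherwise 1 is returned); without it,
 * *extents is grown as needed and *nr_extents is set to the count.
 */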
6067 static noinline int get_new_locations(struct inode *reloc_inode,
6068                                       struct btrfs_key *extent_key,
6069                                       u64 offset, int no_fragment,
6070                                       struct disk_extent **extents,
6071                                       int *nr_extents)
6072 {
6073         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6074         struct btrfs_path *path;
6075         struct btrfs_file_extent_item *fi;
6076         struct extent_buffer *leaf;
6077         struct disk_extent *exts = *extents;
6078         struct btrfs_key found_key;
6079         u64 cur_pos;
6080         u64 last_byte;
6081         u32 nritems;
6082         int nr = 0;
6083         int max = *nr_extents;
6084         int ret;
6085
6086         WARN_ON(!no_fragment && *extents);
6087         if (!exts) {
6088                 max = 1;
6089                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6090                 if (!exts)
6091                         return -ENOMEM;
6092         }
6093
6094         path = btrfs_alloc_path();
6095         BUG_ON(!path);
6096
6097         cur_pos = extent_key->objectid - offset;
6098         last_byte = extent_key->objectid + extent_key->offset;
6099         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6100                                        cur_pos, 0);
6101         if (ret < 0)
6102                 goto out;
6103         if (ret > 0) {
6104                 ret = -ENOENT;
6105                 goto out;
6106         }
6107
6108         while (1) {
6109                 leaf = path->nodes[0];
6110                 nritems = btrfs_header_nritems(leaf);
6111                 if (path->slots[0] >= nritems) {
6112                         ret = btrfs_next_leaf(root, path);
6113                         if (ret < 0)
6114                                 goto out;
6115                         if (ret > 0)
6116                                 break;
6117                         leaf = path->nodes[0];
6118                 }
6119
6120                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6121                 if (found_key.offset != cur_pos ||
6122                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
6123                     found_key.objectid != reloc_inode->i_ino)
6124                         break;
6125
6126                 fi = btrfs_item_ptr(leaf, path->slots[0],
6127                                     struct btrfs_file_extent_item);
6128                 if (btrfs_file_extent_type(leaf, fi) !=
6129                     BTRFS_FILE_EXTENT_REG ||
6130                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6131                         break;
6132
6133                 if (nr == max) {
6134                         struct disk_extent *old = exts;
6135                         max *= 2;
6136                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
                        if (!exts) {
                                exts = old;
                                ret = -ENOMEM;
                                goto out;
                        }
6137                         memcpy(exts, old, sizeof(*exts) * nr);
6138                         if (old != *extents)
6139                                 kfree(old);
6140                 }
6141
6142                 exts[nr].disk_bytenr =
6143                         btrfs_file_extent_disk_bytenr(leaf, fi);
6144                 exts[nr].disk_num_bytes =
6145                         btrfs_file_extent_disk_num_bytes(leaf, fi);
6146                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6147                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6148                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6149                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6150                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6151                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6152                                                                            fi);
6153                 BUG_ON(exts[nr].offset > 0);
6154                 BUG_ON(exts[nr].compression || exts[nr].encryption);
6155                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6156
6157                 cur_pos += exts[nr].num_bytes;
6158                 nr++;
6159
6160                 if (cur_pos + offset >= last_byte)
6161                         break;
6162
6163                 if (no_fragment) {
6164                         ret = 1;
6165                         goto out;
6166                 }
6167                 path->slots[0]++;
6168         }
6169
6170         BUG_ON(cur_pos + offset > last_byte);
6171         if (cur_pos + offset < last_byte) {
6172                 ret = -ENOENT;
6173                 goto out;
6174         }
6175         ret = 0;
6176 out:
6177         btrfs_free_path(path);
6178         if (ret) {
6179                 if (exts != *extents)
6180                         kfree(exts);
6181         } else {
6182                 *extents = exts;
6183                 *nr_extents = nr;
6184         }
6185         return ret;
6186 }
6187
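/*
 * rewrite the file extent items that point at the extent being
 * relocated so they point at its new copy.  the affected file range
 * is locked in the io tree (and ordered extents are waited on) while
 * the pointers are switched.
 */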
6188 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6189                                         struct btrfs_root *root,
6190                                         struct btrfs_path *path,
6191                                         struct btrfs_key *extent_key,
6192                                         struct btrfs_key *leaf_key,
6193                                         struct btrfs_ref_path *ref_path,
6194                                         struct disk_extent *new_extents,
6195                                         int nr_extents)
6196 {
6197         struct extent_buffer *leaf;
6198         struct btrfs_file_extent_item *fi;
6199         struct inode *inode = NULL;
6200         struct btrfs_key key;
6201         u64 lock_start = 0;
6202         u64 lock_end = 0;
6203         u64 num_bytes;
6204         u64 ext_offset;
6205         u64 search_end = (u64)-1;
6206         u32 nritems;
6207         int nr_scanned = 0;
6208         int extent_locked = 0;
6209         int extent_type;
6210         int ret;
6211
6212         memcpy(&key, leaf_key, sizeof(key));
6213         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6214                 if (key.objectid < ref_path->owner_objectid ||
6215                     (key.objectid == ref_path->owner_objectid &&
6216                      key.type < BTRFS_EXTENT_DATA_KEY)) {
6217                         key.objectid = ref_path->owner_objectid;
6218                         key.type = BTRFS_EXTENT_DATA_KEY;
6219                         key.offset = 0;
6220                 }
6221         }
6222
6223         while (1) {
6224                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6225                 if (ret < 0)
6226                         goto out;
6227
6228                 leaf = path->nodes[0];
6229                 nritems = btrfs_header_nritems(leaf);
6230 next:
6231                 if (extent_locked && ret > 0) {
6232                         /*
6233                          * the file extent item was modified by someone
6234                          * before the extent got locked.
6235                          */
6236                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6237                                       lock_end, GFP_NOFS);
6238                         extent_locked = 0;
6239                 }
6240
6241                 if (path->slots[0] >= nritems) {
6242                         if (++nr_scanned > 2)
6243                                 break;
6244
6245                         BUG_ON(extent_locked);
6246                         ret = btrfs_next_leaf(root, path);
6247                         if (ret < 0)
6248                                 goto out;
6249                         if (ret > 0)
6250                                 break;
6251                         leaf = path->nodes[0];
6252                         nritems = btrfs_header_nritems(leaf);
6253                 }
6254
6255                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6256
6257                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6258                         if ((key.objectid > ref_path->owner_objectid) ||
6259                             (key.objectid == ref_path->owner_objectid &&
6260                              key.type > BTRFS_EXTENT_DATA_KEY) ||
6261                             key.offset >= search_end)
6262                                 break;
6263                 }
6264
6265                 if (inode && key.objectid != inode->i_ino) {
6266                         BUG_ON(extent_locked);
6267                         btrfs_release_path(root, path);
6268                         mutex_unlock(&inode->i_mutex);
6269                         iput(inode);
6270                         inode = NULL;
6271                         continue;
6272                 }
6273
6274                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6275                         path->slots[0]++;
6276                         ret = 1;
6277                         goto next;
6278                 }
6279                 fi = btrfs_item_ptr(leaf, path->slots[0],
6280                                     struct btrfs_file_extent_item);
6281                 extent_type = btrfs_file_extent_type(leaf, fi);
6282                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6283                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
6284                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6285                      extent_key->objectid)) {
6286                         path->slots[0]++;
6287                         ret = 1;
6288                         goto next;
6289                 }
6290
6291                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6292                 ext_offset = btrfs_file_extent_offset(leaf, fi);
6293
6294                 if (search_end == (u64)-1) {
6295                         search_end = key.offset - ext_offset +
6296                                 btrfs_file_extent_ram_bytes(leaf, fi);
6297                 }
6298
6299                 if (!extent_locked) {
6300                         lock_start = key.offset;
6301                         lock_end = lock_start + num_bytes - 1;
6302                 } else {
6303                         if (lock_start > key.offset ||
6304                             lock_end + 1 < key.offset + num_bytes) {
6305                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6306                                               lock_start, lock_end, GFP_NOFS);
6307                                 extent_locked = 0;
6308                         }
6309                 }
6310
6311                 if (!inode) {
6312                         btrfs_release_path(root, path);
6313
6314                         inode = btrfs_iget_locked(root->fs_info->sb,
6315                                                   key.objectid, root);
                        if (!inode) {
                                key.offset = (u64)-1;
                                goto skip;
                        }
6316                         if (inode->i_state & I_NEW) {
6317                                 BTRFS_I(inode)->root = root;
6318                                 BTRFS_I(inode)->location.objectid =
6319                                         key.objectid;
6320                                 BTRFS_I(inode)->location.type =
6321                                         BTRFS_INODE_ITEM_KEY;
6322                                 BTRFS_I(inode)->location.offset = 0;
6323                                 btrfs_read_locked_inode(inode);
6324                                 unlock_new_inode(inode);
6325                         }
6326                         /*
6327                          * some code calls btrfs_commit_transaction while
6328                          * holding the i_mutex, so we can't use mutex_lock
6329                          * here.
6330                          */
6331                         if (is_bad_inode(inode) ||
6332                             !mutex_trylock(&inode->i_mutex)) {
6333                                 iput(inode);
6334                                 inode = NULL;
6335                                 key.offset = (u64)-1;
6336                                 goto skip;
6337                         }
6338                 }
6339
6340                 if (!extent_locked) {
6341                         struct btrfs_ordered_extent *ordered;
6342
6343                         btrfs_release_path(root, path);
6344
6345                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6346                                     lock_end, GFP_NOFS);
6347                         ordered = btrfs_lookup_first_ordered_extent(inode,
6348                                                                     lock_end);
6349                         if (ordered &&
6350                             ordered->file_offset <= lock_end &&
6351                             ordered->file_offset + ordered->len > lock_start) {
6352                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6353                                               lock_start, lock_end, GFP_NOFS);
6354                                 btrfs_start_ordered_extent(inode, ordered, 1);
6355                                 btrfs_put_ordered_extent(ordered);
6356                                 key.offset += num_bytes;
6357                                 goto skip;
6358                         }
6359                         if (ordered)
6360                                 btrfs_put_ordered_extent(ordered);
6361
6362                         extent_locked = 1;
6363                         continue;
6364                 }
6365
6366                 if (nr_extents == 1) {
6367                         /* update extent pointer in place */
6368                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
6369                                                 new_extents[0].disk_bytenr);
6370                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6371                                                 new_extents[0].disk_num_bytes);
6372                         btrfs_mark_buffer_dirty(leaf);
6373
6374                         btrfs_drop_extent_cache(inode, key.offset,
6375                                                 key.offset + num_bytes - 1, 0);
6376
6377                         ret = btrfs_inc_extent_ref(trans, root,
6378                                                 new_extents[0].disk_bytenr,
6379                                                 new_extents[0].disk_num_bytes,
6380                                                 leaf->start,
6381                                                 root->root_key.objectid,
6382                                                 trans->transid,
6383                                                 key.objectid);
6384                         BUG_ON(ret);
6385
6386                         ret = btrfs_free_extent(trans, root,
6387                                                 extent_key->objectid,
6388                                                 extent_key->offset,
6389                                                 leaf->start,
6390                                                 btrfs_header_owner(leaf),
6391                                                 btrfs_header_generation(leaf),
6392                                                 key.objectid, 0);
6393                         BUG_ON(ret);
6394
6395                         btrfs_release_path(root, path);
6396                         key.offset += num_bytes;
6397                 } else {
6398                         BUG_ON(1);
6399 #if 0
6400                         u64 alloc_hint;
6401                         u64 extent_len;
6402                         int i;
6403                         /*
6404                          * drop the old extent pointer first, then insert the
6405                          * new pointers one by one
6406                          */
6407                         btrfs_release_path(root, path);
6408                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
6409                                                  key.offset + num_bytes,
6410                                                  key.offset, &alloc_hint);
6411                         BUG_ON(ret);
6412
6413                         for (i = 0; i < nr_extents; i++) {
6414                                 if (ext_offset >= new_extents[i].num_bytes) {
6415                                         ext_offset -= new_extents[i].num_bytes;
6416                                         continue;
6417                                 }
6418                                 extent_len = min(new_extents[i].num_bytes -
6419                                                  ext_offset, num_bytes);
6420
6421                                 ret = btrfs_insert_empty_item(trans, root,
6422                                                               path, &key,
6423                                                               sizeof(*fi));
6424                                 BUG_ON(ret);
6425
6426                                 leaf = path->nodes[0];
6427                                 fi = btrfs_item_ptr(leaf, path->slots[0],
6428                                                 struct btrfs_file_extent_item);
6429                                 btrfs_set_file_extent_generation(leaf, fi,
6430                                                         trans->transid);
6431                                 btrfs_set_file_extent_type(leaf, fi,
6432                                                         BTRFS_FILE_EXTENT_REG);
6433                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6434                                                 new_extents[i].disk_bytenr);
6435                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6436                                                 new_extents[i].disk_num_bytes);
6437                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
6438                                                 new_extents[i].ram_bytes);
6439
6440                                 btrfs_set_file_extent_compression(leaf, fi,
6441                                                 new_extents[i].compression);
6442                                 btrfs_set_file_extent_encryption(leaf, fi,
6443                                                 new_extents[i].encryption);
6444                                 btrfs_set_file_extent_other_encoding(leaf, fi,
6445                                                 new_extents[i].other_encoding);
6446
6447                                 btrfs_set_file_extent_num_bytes(leaf, fi,
6448                                                         extent_len);
6449                                 ext_offset += new_extents[i].offset;
6450                                 btrfs_set_file_extent_offset(leaf, fi,
6451                                                         ext_offset);
6452                                 btrfs_mark_buffer_dirty(leaf);
6453
6454                                 btrfs_drop_extent_cache(inode, key.offset,
6455                                                 key.offset + extent_len - 1, 0);
6456
6457                                 ret = btrfs_inc_extent_ref(trans, root,
6458                                                 new_extents[i].disk_bytenr,
6459                                                 new_extents[i].disk_num_bytes,
6460                                                 leaf->start,
6461                                                 root->root_key.objectid,
6462                                                 trans->transid, key.objectid);
6463                                 BUG_ON(ret);
6464                                 btrfs_release_path(root, path);
6465
6466                                 inode_add_bytes(inode, extent_len);
6467
6468                                 ext_offset = 0;
6469                                 num_bytes -= extent_len;
6470                                 key.offset += extent_len;
6471
6472                                 if (num_bytes == 0)
6473                                         break;
6474                         }
6475                         BUG_ON(i >= nr_extents);
6476 #endif
6477                 }
6478
6479                 if (extent_locked) {
6480                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6481                                       lock_end, GFP_NOFS);
6482                         extent_locked = 0;
6483                 }
6484 skip:
6485                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
6486                     key.offset >= search_end)
6487                         break;
6488
6489                 cond_resched();
6490         }
6491         ret = 0;
6492 out:
6493         btrfs_release_path(root, path);
6494         if (inode) {
6495                 mutex_unlock(&inode->i_mutex);
6496                 if (extent_locked) {
6497                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6498                                       lock_end, GFP_NOFS);
6499                 }
6500                 iput(inode);
6501         }
6502         return ret;
6503 }
6504
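/*
 * a block in the reloc tree was just COWed from 'orig_start' to 'buf';
 * clone the cached leaf ref of the original block for the new copy
 * (only leaves carry leaf refs).
 */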
6505 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6506                                struct btrfs_root *root,
6507                                struct extent_buffer *buf, u64 orig_start)
6508 {
6509         int level;
6510         int ret;
6511
6512         BUG_ON(btrfs_header_generation(buf) != trans->transid);
6513         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6514
6515         level = btrfs_header_level(buf);
6516         if (level == 0) {
6517                 struct btrfs_leaf_ref *ref;
6518                 struct btrfs_leaf_ref *orig_ref;
6519
6520                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6521                 if (!orig_ref)
6522                         return -ENOENT;
6523
6524                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6525                 if (!ref) {
6526                         btrfs_free_leaf_ref(root, orig_ref);
6527                         return -ENOMEM;
6528                 }
6529
6530                 ref->nritems = orig_ref->nritems;
6531                 memcpy(ref->extents, orig_ref->extents,
6532                         sizeof(ref->extents[0]) * ref->nritems);
6533
6534                 btrfs_free_leaf_ref(root, orig_ref);
6535
6536                 ref->root_gen = trans->transid;
6537                 ref->bytenr = buf->start;
6538                 ref->owner = btrfs_header_owner(buf);
6539                 ref->generation = btrfs_header_generation(buf);
6540
6541                 ret = btrfs_add_leaf_ref(root, ref, 0);
6542                 WARN_ON(ret);
6543                 btrfs_free_leaf_ref(root, ref);
6544         }
6545         return 0;
6546 }
6547
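/*
 * drop the cached extent mappings for every file extent referenced by
 * 'leaf', forcing subsequent reads to look up the relocated items on
 * disk.
 */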
6548 static noinline int invalidate_extent_cache(struct btrfs_root *root,
6549                                         struct extent_buffer *leaf,
6550                                         struct btrfs_block_group_cache *group,
6551                                         struct btrfs_root *target_root)
6552 {
6553         struct btrfs_key key;
6554         struct inode *inode = NULL;
6555         struct btrfs_file_extent_item *fi;
6556         u64 num_bytes;
6557         u64 skip_objectid = 0;
6558         u32 nritems;
6559         u32 i;
6560
6561         nritems = btrfs_header_nritems(leaf);
6562         for (i = 0; i < nritems; i++) {
6563                 btrfs_item_key_to_cpu(leaf, &key, i);
6564                 if (key.objectid == skip_objectid ||
6565                     key.type != BTRFS_EXTENT_DATA_KEY)
6566                         continue;
6567                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6568                 if (btrfs_file_extent_type(leaf, fi) ==
6569                     BTRFS_FILE_EXTENT_INLINE)
6570                         continue;
6571                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6572                         continue;
6573                 if (!inode || inode->i_ino != key.objectid) {
6574                         iput(inode);
6575                         inode = btrfs_ilookup(target_root->fs_info->sb,
6576                                               key.objectid, target_root, 1);
6577                 }
6578                 if (!inode) {
6579                         skip_objectid = key.objectid;
6580                         continue;
6581                 }
6582                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6583
6584                 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
6585                             key.offset + num_bytes - 1, GFP_NOFS);
6586                 btrfs_drop_extent_cache(inode, key.offset,
6587                                         key.offset + num_bytes - 1, 1);
6588                 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
6589                               key.offset + num_bytes - 1, GFP_NOFS);
6590                 cond_resched();
6591         }
6592         iput(inode);
6593         return 0;
6594 }
6595
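/*
 * switch every file extent pointer in 'leaf' that falls inside the
 * block group being relocated over to its new location, updating the
 * cached leaf ref to match.
 */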
6596 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
6597                                         struct btrfs_root *root,
6598                                         struct extent_buffer *leaf,
6599                                         struct btrfs_block_group_cache *group,
6600                                         struct inode *reloc_inode)
6601 {
6602         struct btrfs_key key;
6603         struct btrfs_key extent_key;
6604         struct btrfs_file_extent_item *fi;
6605         struct btrfs_leaf_ref *ref;
6606         struct disk_extent *new_extent;
6607         u64 bytenr;
6608         u64 num_bytes;
6609         u32 nritems;
6610         u32 i;
6611         int ext_index;
6612         int nr_extent;
6613         int ret;
6614
6615         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
6616         BUG_ON(!new_extent);
6617
6618         ref = btrfs_lookup_leaf_ref(root, leaf->start);
6619         BUG_ON(!ref);
6620
6621         ext_index = -1;
6622         nritems = btrfs_header_nritems(leaf);
6623         for (i = 0; i < nritems; i++) {
6624                 btrfs_item_key_to_cpu(leaf, &key, i);
6625                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
6626                         continue;
6627                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6628                 if (btrfs_file_extent_type(leaf, fi) ==
6629                     BTRFS_FILE_EXTENT_INLINE)
6630                         continue;
6631                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6632                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6633                 if (bytenr == 0)
6634                         continue;
6635
6636                 ext_index++;
6637                 if (bytenr >= group->key.objectid + group->key.offset ||
6638                     bytenr + num_bytes <= group->key.objectid)
6639                         continue;
6640
6641                 extent_key.objectid = bytenr;
6642                 extent_key.offset = num_bytes;
6643                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
6644                 nr_extent = 1;
6645                 ret = get_new_locations(reloc_inode, &extent_key,
6646                                         group->key.objectid, 1,
6647                                         &new_extent, &nr_extent);
6648                 if (ret > 0)
6649                         continue;
6650                 BUG_ON(ret < 0);
6651
6652                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
6653                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
6654                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
6655                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
6656
6657                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6658                                                 new_extent->disk_bytenr);
6659                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6660                                                 new_extent->disk_num_bytes);
6661                 btrfs_mark_buffer_dirty(leaf);
6662
6663                 ret = btrfs_inc_extent_ref(trans, root,
6664                                         new_extent->disk_bytenr,
6665                                         new_extent->disk_num_bytes,
6666                                         leaf->start,
6667                                         root->root_key.objectid,
6668                                         trans->transid, key.objectid);
6669                 BUG_ON(ret);
6670
6671                 ret = btrfs_free_extent(trans, root,
6672                                         bytenr, num_bytes, leaf->start,
6673                                         btrfs_header_owner(leaf),
6674                                         btrfs_header_generation(leaf),
6675                                         key.objectid, 0);
6676                 BUG_ON(ret);
6677                 cond_resched();
6678         }
6679         kfree(new_extent);
6680         BUG_ON(ext_index + 1 != ref->nritems);
6681         btrfs_free_leaf_ref(root, ref);
6682         return 0;
6683 }
6684
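/*
 * detach the reloc tree from 'root' and queue it on the dead list.
 * the reloc root's item is updated in the tree of tree roots so the
 * tree can still be found and dropped after the transaction commits.
 */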
6685 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
6686                           struct btrfs_root *root)
6687 {
6688         struct btrfs_root *reloc_root;
6689         int ret;
6690
6691         if (root->reloc_root) {
6692                 reloc_root = root->reloc_root;
6693                 root->reloc_root = NULL;
6694                 list_add(&reloc_root->dead_list,
6695                          &root->fs_info->dead_reloc_roots);
6696
6697                 btrfs_set_root_bytenr(&reloc_root->root_item,
6698                                       reloc_root->node->start);
6699                 btrfs_set_root_level(&reloc_root->root_item,
6700                                      btrfs_header_level(reloc_root->node));
6701                 memset(&reloc_root->root_item.drop_progress, 0,
6702                         sizeof(struct btrfs_disk_key));
6703                 reloc_root->root_item.drop_level = 0;
6704
6705                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
6706                                         &reloc_root->root_key,
6707                                         &reloc_root->root_item);
6708                 BUG_ON(ret);
6709         }
6710         return 0;
6711 }
6712
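/*
 * drop every reloc tree queued on the dead list.  each snapshot is
 * dropped a chunk at a time (btrfs_drop_snapshot returning -EAGAIN
 * means more work remains), ending the transaction between chunks so
 * dirty btree pages can be balanced.
 */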
6713 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
6714 {
6715         struct btrfs_trans_handle *trans;
6716         struct btrfs_root *reloc_root;
6717         struct btrfs_root *prev_root = NULL;
6718         struct list_head dead_roots;
6719         int ret;
6720         unsigned long nr;
6721
6722         INIT_LIST_HEAD(&dead_roots);
6723         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
6724
6725         while (!list_empty(&dead_roots)) {
6726                 reloc_root = list_entry(dead_roots.prev,
6727                                         struct btrfs_root, dead_list);
6728                 list_del_init(&reloc_root->dead_list);
6729
6730                 BUG_ON(reloc_root->commit_root != NULL);
6731                 while (1) {
6732                         trans = btrfs_join_transaction(root, 1);
6733                         BUG_ON(!trans);
6734
6735                         mutex_lock(&root->fs_info->drop_mutex);
6736                         ret = btrfs_drop_snapshot(trans, reloc_root);
6737                         if (ret != -EAGAIN)
6738                                 break;
6739                         mutex_unlock(&root->fs_info->drop_mutex);
6740
6741                         nr = trans->blocks_used;
6742                         ret = btrfs_end_transaction(trans, root);
6743                         BUG_ON(ret);
6744                         btrfs_btree_balance_dirty(root, nr);
6745                 }
6746
6747                 free_extent_buffer(reloc_root->node);
6748
6749                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
6750                                      &reloc_root->root_key);
6751                 BUG_ON(ret);
6752                 mutex_unlock(&root->fs_info->drop_mutex);
6753
6754                 nr = trans->blocks_used;
6755                 ret = btrfs_end_transaction(trans, root);
6756                 BUG_ON(ret);
6757                 btrfs_btree_balance_dirty(root, nr);
6758
6759                 kfree(prev_root);
6760                 prev_root = reloc_root;
6761         }
6762         if (prev_root) {
6763                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
6764                 kfree(prev_root);
6765         }
6766         return 0;
6767 }
6768
6769 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
6770 {
6771         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
6772         return 0;
6773 }
6774
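/*
 * finish relocation work that an unclean shutdown may have left
 * behind: commit a transaction to drop any dead reloc roots that are
 * found, then run orphan cleanup on the data reloc tree.
 */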
6775 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
6776 {
6777         struct btrfs_root *reloc_root;
6778         struct btrfs_trans_handle *trans;
6779         struct btrfs_key location;
6780         int found;
6781         int ret;
6782
6783         mutex_lock(&root->fs_info->tree_reloc_mutex);
6784         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
6785         BUG_ON(ret);
6786         found = !list_empty(&root->fs_info->dead_reloc_roots);
6787         mutex_unlock(&root->fs_info->tree_reloc_mutex);
6788
6789         if (found) {
6790                 trans = btrfs_start_transaction(root, 1);
6791                 BUG_ON(!trans);
6792                 ret = btrfs_commit_transaction(trans, root);
6793                 BUG_ON(ret);
6794         }
6795
6796         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6797         location.offset = (u64)-1;
6798         location.type = BTRFS_ROOT_ITEM_KEY;
6799
6800         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
6801         BUG_ON(!reloc_root);
6802         btrfs_orphan_cleanup(reloc_root);
6803         return 0;
6804 }
6805
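/*
 * create the reloc tree for 'root' if it doesn't already exist:
 * snapshot root->commit_root under BTRFS_TREE_RELOC_OBJECTID, using
 * the subvol's objectid as the key offset, and insert a root item
 * with zero refs for it into the tree of tree roots.
 */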
6806 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
6807                                     struct btrfs_root *root)
6808 {
6809         struct btrfs_root *reloc_root;
6810         struct extent_buffer *eb;
6811         struct btrfs_root_item *root_item;
6812         struct btrfs_key root_key;
6813         int ret;
6814
6815         BUG_ON(!root->ref_cows);
6816         if (root->reloc_root)
6817                 return 0;
6818
6819         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
6820         BUG_ON(!root_item);
6821
6822         ret = btrfs_copy_root(trans, root, root->commit_root,
6823                               &eb, BTRFS_TREE_RELOC_OBJECTID);
6824         BUG_ON(ret);
6825
6826         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
6827         root_key.offset = root->root_key.objectid;
6828         root_key.type = BTRFS_ROOT_ITEM_KEY;
6829
6830         memcpy(root_item, &root->root_item, sizeof(*root_item));
6831         btrfs_set_root_refs(root_item, 0);
6832         btrfs_set_root_bytenr(root_item, eb->start);
6833         btrfs_set_root_level(root_item, btrfs_header_level(eb));
6834         btrfs_set_root_generation(root_item, trans->transid);
6835
6836         btrfs_tree_unlock(eb);
6837         free_extent_buffer(eb);
6838
6839         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
6840                                 &root_key, root_item);
6841         BUG_ON(ret);
6842         kfree(root_item);
6843
6844         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
6845                                                  &root_key);
6846         BUG_ON(!reloc_root);
6847         reloc_root->last_trans = trans->transid;
6848         reloc_root->commit_root = NULL;
6849         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
6850
6851         root->reloc_root = reloc_root;
6852         return 0;
6853 }
6854
6855 /*
6856  * Core function of space balance.
6857  *
6858  * The idea is to use reloc trees to relocate tree blocks in reference
6859  * counted roots. There is one reloc tree for each subvol, and all
6860  * reloc trees share the same root key objectid. Reloc trees are
6861  * snapshots of the latest committed roots of subvols (root->commit_root).
6862  *
6863  * To relocate a tree block referenced by a subvol, there are two steps:
6864  * COW the block through the subvol's reloc tree, then update the block
6865  * pointer in the subvol to point to the new block. Since all reloc
6866  * trees share the same root key objectid, special handling of tree
6867  * blocks owned by them is easy. Once a tree block has been COWed in
6868  * one reloc tree, we can use the resulting new block directly when the
6869  * same block needs to be COWed again through another reloc tree. This
6870  * way, relocated tree blocks are shared between reloc trees, and so
6871  * they are also shared between subvols.
6872  */
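/*
 * in outline, using the helpers defined in this file:
 *
 *   init_reloc_tree(trans, root);              snapshot root->commit_root
 *   btrfs_search_slot(trans, reloc_root, ...); COW blocks through the reloc tree
 *   btrfs_merge_path(trans, root, ...);        swap the new blocks into the subvol
 */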
6873 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
6874                                       struct btrfs_root *root,
6875                                       struct btrfs_path *path,
6876                                       struct btrfs_key *first_key,
6877                                       struct btrfs_ref_path *ref_path,
6878                                       struct btrfs_block_group_cache *group,
6879                                       struct inode *reloc_inode)
6880 {
6881         struct btrfs_root *reloc_root;
6882         struct extent_buffer *eb = NULL;
6883         struct btrfs_key *keys;
6884         u64 *nodes;
6885         int level;
6886         int shared_level;
6887         int lowest_level = 0;
6888         int ret;
6889
6890         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
6891                 lowest_level = ref_path->owner_objectid;
6892
6893         if (!root->ref_cows) {
6894                 path->lowest_level = lowest_level;
6895                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
6896                 BUG_ON(ret < 0);
6897                 path->lowest_level = 0;
6898                 btrfs_release_path(root, path);
6899                 return 0;
6900         }
6901
6902         mutex_lock(&root->fs_info->tree_reloc_mutex);
6903         ret = init_reloc_tree(trans, root);
6904         BUG_ON(ret);
6905         reloc_root = root->reloc_root;
6906
6907         shared_level = ref_path->shared_level;
6908         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
6909
6910         keys = ref_path->node_keys;
6911         nodes = ref_path->new_nodes;
6912         memset(&keys[shared_level + 1], 0,
6913                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
6914         memset(&nodes[shared_level + 1], 0,
6915                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
6916
6917         if (nodes[lowest_level] == 0) {
6918                 path->lowest_level = lowest_level;
6919                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6920                                         0, 1);
6921                 BUG_ON(ret);
6922                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
6923                         eb = path->nodes[level];
6924                         if (!eb || eb == reloc_root->node)
6925                                 break;
6926                         nodes[level] = eb->start;
6927                         if (level == 0)
6928                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
6929                         else
6930                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
6931                 }
6932                 if (nodes[0] &&
6933                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6934                         eb = path->nodes[0];
6935                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
6936                                                       group, reloc_inode);
6937                         BUG_ON(ret);
6938                 }
6939                 btrfs_release_path(reloc_root, path);
6940         } else {
6941                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
6942                                        lowest_level);
6943                 BUG_ON(ret);
6944         }
6945
6946         /*
6947          * replace tree blocks in the fs tree with tree blocks in
6948          * the reloc tree.
6949          */
6950         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
6951         BUG_ON(ret < 0);
6952
6953         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6954                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6955                                         0, 0);
6956                 BUG_ON(ret);
6957                 extent_buffer_get(path->nodes[0]);
6958                 eb = path->nodes[0];
6959                 btrfs_release_path(reloc_root, path);
6960                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
6961                 BUG_ON(ret);
6962                 free_extent_buffer(eb);
6963         }
6964
6965         mutex_unlock(&root->fs_info->tree_reloc_mutex);
6966         path->lowest_level = 0;
6967         return 0;
6968 }
6969
6970 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
6971                                         struct btrfs_root *root,
6972                                         struct btrfs_path *path,
6973                                         struct btrfs_key *first_key,
6974                                         struct btrfs_ref_path *ref_path)
6975 {
6976         int ret;
6977
6978         ret = relocate_one_path(trans, root, path, first_key,
6979                                 ref_path, NULL, NULL);
6980         BUG_ON(ret);
6981
6982         return 0;
6983 }
6984
6985 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
6986                                     struct btrfs_root *extent_root,
6987                                     struct btrfs_path *path,
6988                                     struct btrfs_key *extent_key)
6989 {
6990         int ret;
6991
6992         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
6993         if (ret)
6994                 goto out;
6995         ret = btrfs_del_item(trans, extent_root, path);
6996 out:
6997         btrfs_release_path(extent_root, path);
6998         return ret;
6999 }
7000
7001 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7002                                                 struct btrfs_ref_path *ref_path)
7003 {
7004         struct btrfs_key root_key;
7005
7006         root_key.objectid = ref_path->root_objectid;
7007         root_key.type = BTRFS_ROOT_ITEM_KEY;
7008         if (is_cowonly_root(ref_path->root_objectid))
7009                 root_key.offset = 0;
7010         else
7011                 root_key.offset = (u64)-1;
7012
7013         return btrfs_read_fs_root_no_name(fs_info, &root_key);
7014 }
7015
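/*
 * relocate all references to the extent described by 'extent_key'.
 * tree blocks are relocated through the reloc trees.  for data
 * extents, pass 0 copies the data to new locations via the reloc
 * inode, pass 1 updates references through the reloc trees where
 * possible, and any remaining references are rewritten one at a time
 * as a fallback.
 */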
7016 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7017                                         struct btrfs_path *path,
7018                                         struct btrfs_key *extent_key,
7019                                         struct btrfs_block_group_cache *group,
7020                                         struct inode *reloc_inode, int pass)
7021 {
7022         struct btrfs_trans_handle *trans;
7023         struct btrfs_root *found_root;
7024         struct btrfs_ref_path *ref_path = NULL;
7025         struct disk_extent *new_extents = NULL;
7026         int nr_extents = 0;
7027         int loops;
7028         int ret;
7029         int level;
7030         struct btrfs_key first_key;
7031         u64 prev_block = 0;
7032
7034         trans = btrfs_start_transaction(extent_root, 1);
7035         BUG_ON(!trans);
7036
7037         if (extent_key->objectid == 0) {
7038                 ret = del_extent_zero(trans, extent_root, path, extent_key);
7039                 goto out;
7040         }
7041
7042         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7043         if (!ref_path) {
7044                 ret = -ENOMEM;
7045                 goto out;
7046         }
7047
7048         for (loops = 0; ; loops++) {
7049                 if (loops == 0) {
7050                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7051                                                    extent_key->objectid);
7052                 } else {
7053                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7054                 }
7055                 if (ret < 0)
7056                         goto out;
7057                 if (ret > 0)
7058                         break;
7059
7060                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7061                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7062                         continue;
7063
7064                 found_root = read_ref_root(extent_root->fs_info, ref_path);
7065                 BUG_ON(!found_root);
7066                 /*
7067                  * for reference counted trees, only process reference paths
7068                  * rooted at the latest committed root.
7069                  */
7070                 if (found_root->ref_cows &&
7071                     ref_path->root_generation != found_root->root_key.offset)
7072                         continue;
7073
7074                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7075                         if (pass == 0) {
7076                                 /*
7077                                  * copy data extents to new locations
7078                                  */
7079                                 u64 group_start = group->key.objectid;
7080                                 ret = relocate_data_extent(reloc_inode,
7081                                                            extent_key,
7082                                                            group_start);
7083                                 if (ret < 0)
7084                                         goto out;
7085                                 break;
7086                         }
7087                         level = 0;
7088                 } else {
7089                         level = ref_path->owner_objectid;
7090                 }
7091
7092                 if (prev_block != ref_path->nodes[level]) {
7093                         struct extent_buffer *eb;
7094                         u64 block_start = ref_path->nodes[level];
7095                         u64 block_size = btrfs_level_size(found_root, level);
7096
7097                         eb = read_tree_block(found_root, block_start,
7098                                              block_size, 0);
                        BUG_ON(!eb);
7099                         btrfs_tree_lock(eb);
7100                         BUG_ON(level != btrfs_header_level(eb));
7101
7102                         if (level == 0)
7103                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
7104                         else
7105                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
7106
7107                         btrfs_tree_unlock(eb);
7108                         free_extent_buffer(eb);
7109                         prev_block = block_start;
7110                 }
7111
7112                 mutex_lock(&extent_root->fs_info->trans_mutex);
7113                 btrfs_record_root_in_trans(found_root);
7114                 mutex_unlock(&extent_root->fs_info->trans_mutex);
7115                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7116                         /*
7117                          * try to update data extent references while
7118                          * keeping metadata shared between snapshots.
7119                          */
7120                         if (pass == 1) {
7121                                 ret = relocate_one_path(trans, found_root,
7122                                                 path, &first_key, ref_path,
7123                                                 group, reloc_inode);
7124                                 if (ret < 0)
7125                                         goto out;
7126                                 continue;
7127                         }
7128                         /*
7129                          * use the fallback method to process the remaining
7130                          * references.
7131                          */
7132                         if (!new_extents) {
7133                                 u64 group_start = group->key.objectid;
7134                                 new_extents = kmalloc(sizeof(*new_extents),
7135                                                       GFP_NOFS);
                                if (!new_extents) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
7136                                 nr_extents = 1;
7137                                 ret = get_new_locations(reloc_inode,
7138                                                         extent_key,
7139                                                         group_start, 1,
7140                                                         &new_extents,
7141                                                         &nr_extents);
7142                                 if (ret)
7143                                         goto out;
7144                         }
7145                         ret = replace_one_extent(trans, found_root,
7146                                                 path, extent_key,
7147                                                 &first_key, ref_path,
7148                                                 new_extents, nr_extents);
7149                 } else {
7150                         ret = relocate_tree_block(trans, found_root, path,
7151                                                   &first_key, ref_path);
7152                 }
7153                 if (ret < 0)
7154                         goto out;
7155         }
7156         ret = 0;
7157 out:
7158         btrfs_end_transaction(trans, extent_root);
7159         kfree(new_extents);
7160         kfree(ref_path);
7161         return ret;
7162 }
7163 #endif
7164
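/*
 * pick a block group profile that makes sense for the current number
 * of rw devices.  for example, on a one-device filesystem RAID0 falls
 * back to single chunks and RAID1/RAID10 to DUP, while with multiple
 * devices DUP is upgraded to RAID1 and single chunks to RAID0.
 */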
7165 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7166 {
7167         u64 num_devices;
7168         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7169                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7170
7171         num_devices = root->fs_info->fs_devices->rw_devices;
7172         if (num_devices == 1) {
7173                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7174                 stripped = flags & ~stripped;
7175
7176                 /* turn raid0 into single device chunks */
7177                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7178                         return stripped;
7179
7180                 /* turn mirroring into duplication */
7181                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7182                              BTRFS_BLOCK_GROUP_RAID10))
7183                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7184                 return flags;
7185         } else {
7186                 /* they already had raid on here, just return */
7187                 if (flags & stripped)
7188                         return flags;
7189
7190                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7191                 stripped = flags & ~stripped;
7192
7193                 /* switch duplicated blocks with raid1 */
7194                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7195                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7196
7197                 /* turn single device chunks into raid0 */
7198                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
7199         }
7200         return flags;
7201 }
7202
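/*
 * before a block group is emptied for relocation, make sure a chunk
 * with the profile chosen by update_block_group_flags() is available
 * to receive its contents.
 */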
7203 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
7204                      struct btrfs_block_group_cache *shrink_block_group,
7205                      int force)
7206 {
7207         struct btrfs_trans_handle *trans;
7208         u64 new_alloc_flags;
7209         u64 calc;
7210
7211         spin_lock(&shrink_block_group->lock);
7212         if (btrfs_block_group_used(&shrink_block_group->item) +
7213             shrink_block_group->reserved > 0) {
7214                 spin_unlock(&shrink_block_group->lock);
7215
7216                 trans = btrfs_start_transaction(root, 1);
7217                 spin_lock(&shrink_block_group->lock);
7218
7219                 new_alloc_flags = update_block_group_flags(root,
7220                                                    shrink_block_group->flags);
7221                 if (new_alloc_flags != shrink_block_group->flags) {
7222                         calc =
7223                              btrfs_block_group_used(&shrink_block_group->item);
7224                 } else {
7225                         calc = shrink_block_group->key.offset;
7226                 }
7227                 spin_unlock(&shrink_block_group->lock);
7228
7229                 do_chunk_alloc(trans, root->fs_info->extent_root,
7230                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
7231
7232                 btrfs_end_transaction(trans, root);
7233         } else
7234                 spin_unlock(&shrink_block_group->lock);
7235         return 0;
7236 }
7237
7238
7239 int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
7240                                          struct btrfs_block_group_cache *group)
7242 {
7243         __alloc_chunk_for_shrink(root, group, 1);
7244         set_block_group_readonly(group);
7245         return 0;
7246 }
7247
7248 /*
7249  * checks to see if it's even possible to relocate this block group.
7250  *
7251  * @return -1 if it's not a good idea to relocate this block group, 0 if
7252  * it's ok to go ahead and try.
7253  */
7254 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7255 {
7256         struct btrfs_block_group_cache *block_group;
7257         struct btrfs_space_info *space_info;
7258         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7259         struct btrfs_device *device;
7260         int full = 0;
7261         int ret = 0;
7262
7263         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7264
7265         /* odd, couldn't find the block group, leave it alone */
7266         if (!block_group)
7267                 return -1;
7268
7269         /* no bytes used, we're good */
7270         if (!btrfs_block_group_used(&block_group->item))
7271                 goto out;
7272
7273         space_info = block_group->space_info;
7274         spin_lock(&space_info->lock);
7275
7276         full = space_info->full;
7277
7278         /*
7279          * if this is the last block group we have in this space, we can't
7280          * relocate it unless we're able to allocate a new chunk below.
7281          *
7282          * Otherwise, we need to make sure we have room in the space to handle
7283          * all of the extents from this block group.  If we can, we're good
7284          */
7285         if ((space_info->total_bytes != block_group->key.offset) &&
7286            (space_info->bytes_used + space_info->bytes_reserved +
7287             space_info->bytes_pinned + space_info->bytes_readonly +
7288             btrfs_block_group_used(&block_group->item) <
7289             space_info->total_bytes)) {
7290                 spin_unlock(&space_info->lock);
7291                 goto out;
7292         }
7293         spin_unlock(&space_info->lock);
7294
7295         /*
7296          * ok we don't have enough space, but maybe we have free space on our
7297          * devices to allocate new chunks for relocation, so loop through our
7298          * alloc devices and guess if we have enough space.  However, if we
7299          * were marked as full, then we know there aren't enough chunks, and we
7300          * can just return.
7301          */
7302         ret = -1;
7303         if (full)
7304                 goto out;
7305
7306         mutex_lock(&root->fs_info->chunk_mutex);
7307         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7308                 u64 min_free = btrfs_block_group_used(&block_group->item);
7309                 u64 dev_offset, max_avail;
7310
7311                 /*
7312                  * check to make sure we can actually find a chunk with enough
7313                  * space to fit our block group in.
7314                  */
7315                 if (device->total_bytes > device->bytes_used + min_free) {
7316                         ret = find_free_dev_extent(NULL, device, min_free,
7317                                                    &dev_offset, &max_avail);
7318                         if (!ret)
7319                                 break;
7320                         ret = -1;
7321                 }
7322         }
7323         mutex_unlock(&root->fs_info->chunk_mutex);
7324 out:
7325         btrfs_put_block_group(block_group);
7326         return ret;
7327 }
7328
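/*
 * find the first block group item with objectid at or after
 * key->objectid.  returns 0 with the path pointing at the item, or
 * -ENOENT if no such block group exists.
 */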
7329 static int find_first_block_group(struct btrfs_root *root,
7330                 struct btrfs_path *path, struct btrfs_key *key)
7331 {
7332         int ret = 0;
7333         struct btrfs_key found_key;
7334         struct extent_buffer *leaf;
7335         int slot;
7336
7337         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7338         if (ret < 0)
7339                 goto out;
7340
7341         while (1) {
7342                 slot = path->slots[0];
7343                 leaf = path->nodes[0];
7344                 if (slot >= btrfs_header_nritems(leaf)) {
7345                         ret = btrfs_next_leaf(root, path);
7346                         if (ret == 0)
7347                                 continue;
7348                         if (ret < 0)
7349                                 goto out;
7350                         break;
7351                 }
7352                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7353
7354                 if (found_key.objectid >= key->objectid &&
7355                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7356                         ret = 0;
7357                         goto out;
7358                 }
7359                 path->slots[0]++;
7360         }
7361         ret = -ENOENT;
7362 out:
7363         return ret;
7364 }
7365
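/*
 * tear down all cached block group and space info structures at
 * unmount time, waiting for any in-progress caching work to finish
 * first.
 */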
7366 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7367 {
7368         struct btrfs_block_group_cache *block_group;
7369         struct btrfs_space_info *space_info;
7370         struct btrfs_caching_control *caching_ctl;
7371         struct rb_node *n;
7372
7373         down_write(&info->extent_commit_sem);
7374         while (!list_empty(&info->caching_block_groups)) {
7375                 caching_ctl = list_entry(info->caching_block_groups.next,
7376                                          struct btrfs_caching_control, list);
7377                 list_del(&caching_ctl->list);
7378                 put_caching_control(caching_ctl);
7379         }
7380         up_write(&info->extent_commit_sem);
7381
7382         spin_lock(&info->block_group_cache_lock);
7383         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7384                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7385                                        cache_node);
7386                 rb_erase(&block_group->cache_node,
7387                          &info->block_group_cache_tree);
7388                 spin_unlock(&info->block_group_cache_lock);
7389
7390                 down_write(&block_group->space_info->groups_sem);
7391                 list_del(&block_group->list);
7392                 up_write(&block_group->space_info->groups_sem);
7393
7394                 if (block_group->cached == BTRFS_CACHE_STARTED)
7395                         wait_block_group_cache_done(block_group);
7396
7397                 btrfs_remove_free_space_cache(block_group);
7398
7399                 WARN_ON(atomic_read(&block_group->count) != 1);
7400                 kfree(block_group);
7401
7402                 spin_lock(&info->block_group_cache_lock);
7403         }
7404         spin_unlock(&info->block_group_cache_lock);
7405
7406         /* now that all the block groups are freed, go through and
7407          * free all the space_info structs.  This is only called during
7408          * the final stages of unmount, and so we know nobody is
7409          * using them.  We call synchronize_rcu() once before we start,
7410          * just to be on the safe side.
7411          */
7412         synchronize_rcu();
7413
7414         while (!list_empty(&info->space_info)) {
7415                 space_info = list_entry(info->space_info.next,
7416                                         struct btrfs_space_info,
7417                                         list);
7418
7419                 list_del(&space_info->list);
7420                 kfree(space_info);
7421         }
7422         return 0;
7423 }
7424
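/*
 * load every block group item from the extent tree into the in-memory
 * block group cache and hook each group into its space info.  groups
 * that are completely full or completely empty are marked cached up
 * front so they can skip the caching work entirely.
 */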
7425 int btrfs_read_block_groups(struct btrfs_root *root)
7426 {
7427         struct btrfs_path *path;
7428         int ret;
7429         struct btrfs_block_group_cache *cache;
7430         struct btrfs_fs_info *info = root->fs_info;
7431         struct btrfs_space_info *space_info;
7432         struct btrfs_key key;
7433         struct btrfs_key found_key;
7434         struct extent_buffer *leaf;
7435
7436         root = info->extent_root;
7437         key.objectid = 0;
7438         key.offset = 0;
7439         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7440         path = btrfs_alloc_path();
7441         if (!path)
7442                 return -ENOMEM;
7443
7444         while (1) {
7445                 ret = find_first_block_group(root, path, &key);
7446                 if (ret > 0) {
7447                         ret = 0;
7448                         goto error;
7449                 }
7450                 if (ret != 0)
7451                         goto error;
7452
7453                 leaf = path->nodes[0];
7454                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7455                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7456                 if (!cache) {
7457                         ret = -ENOMEM;
7458                         break;
7459                 }
7460
7461                 atomic_set(&cache->count, 1);
7462                 spin_lock_init(&cache->lock);
7463                 spin_lock_init(&cache->tree_lock);
7464                 cache->fs_info = info;
7465                 INIT_LIST_HEAD(&cache->list);
7466                 INIT_LIST_HEAD(&cache->cluster_list);
7467
7468                 /*
7469                  * we only want to have 32k of ram per block group for keeping
7470                  * track of free space, and if we pass 1/2 of that we want to
7471                  * start converting things over to using bitmaps
7472                  */
7473                 cache->extents_thresh = ((1024 * 32) / 2) /
7474                         sizeof(struct btrfs_free_space);
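                /*
                 * e.g. assuming sizeof(struct btrfs_free_space) is 64
                 * bytes on a 64-bit build (illustrative only), that is
                 * (32768 / 2) / 64 = 256 cached extents before the
                 * bitmap conversion kicks in.
                 */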
7475
7476                 read_extent_buffer(leaf, &cache->item,
7477                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7478                                    sizeof(cache->item));
7479                 memcpy(&cache->key, &found_key, sizeof(found_key));
7480
7481                 key.objectid = found_key.objectid + found_key.offset;
7482                 btrfs_release_path(root, path);
7483                 cache->flags = btrfs_block_group_flags(&cache->item);
7484                 cache->sectorsize = root->sectorsize;
7485
7486                 /*
7487                  * check for two cases, either we are full, and therefore
7488                  * don't need to bother with the caching work since we won't
7489                  * find any space, or we are empty, and we can just add all
7490                  * the space in and be done with it.  This saves us a lot of
7491                  * time, particularly in the full case.
7492                  */
7493                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7494                         exclude_super_stripes(root, cache);
7495                         cache->last_byte_to_unpin = (u64)-1;
7496                         cache->cached = BTRFS_CACHE_FINISHED;
7497                         free_excluded_extents(root, cache);
7498                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7499                         exclude_super_stripes(root, cache);
7500                         cache->last_byte_to_unpin = (u64)-1;
7501                         cache->cached = BTRFS_CACHE_FINISHED;
7502                         add_new_free_space(cache, root->fs_info,
7503                                            found_key.objectid,
7504                                            found_key.objectid +
7505                                            found_key.offset);
7506                         free_excluded_extents(root, cache);
7507                 }
7508
7509                 ret = update_space_info(info, cache->flags, found_key.offset,
7510                                         btrfs_block_group_used(&cache->item),
7511                                         &space_info);
7512                 BUG_ON(ret);
7513                 cache->space_info = space_info;
7514                 spin_lock(&cache->space_info->lock);
7515                 cache->space_info->bytes_super += cache->bytes_super;
7516                 spin_unlock(&cache->space_info->lock);
7517
7518                 down_write(&space_info->groups_sem);
7519                 list_add_tail(&cache->list, &space_info->block_groups);
7520                 up_write(&space_info->groups_sem);
7521
7522                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7523                 BUG_ON(ret);
7524
7525                 set_avail_alloc_bits(root->fs_info, cache->flags);
7526                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7527                         set_block_group_readonly(cache);
7528         }
7529         ret = 0;
7530 error:
7531         btrfs_free_path(path);
7532         return ret;
7533 }
7534
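/*
 * create a new block group for the chunk at 'chunk_offset' and insert
 * its item into the extent tree.  the group starts out fully cached
 * because its free space is known exactly.
 */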
7535 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7536                            struct btrfs_root *root, u64 bytes_used,
7537                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7538                            u64 size)
7539 {
7540         int ret;
7541         struct btrfs_root *extent_root;
7542         struct btrfs_block_group_cache *cache;
7543
7544         extent_root = root->fs_info->extent_root;
7545
7546         root->fs_info->last_trans_log_full_commit = trans->transid;
7547
7548         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7549         if (!cache)
7550                 return -ENOMEM;
7551
7552         cache->key.objectid = chunk_offset;
7553         cache->key.offset = size;
7554         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7555         cache->sectorsize = root->sectorsize;
7556
7557         /*
7558          * we only want to have 32k of ram per block group for keeping track
7559          * of free space, and if we pass 1/2 of that we want to start
7560          * converting things over to using bitmaps
7561          */
7562         cache->extents_thresh = ((1024 * 32) / 2) /
7563                 sizeof(struct btrfs_free_space);
7564         atomic_set(&cache->count, 1);
7565         spin_lock_init(&cache->lock);
7566         spin_lock_init(&cache->tree_lock);
7567         INIT_LIST_HEAD(&cache->list);
7568         INIT_LIST_HEAD(&cache->cluster_list);
7569
7570         btrfs_set_block_group_used(&cache->item, bytes_used);
7571         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7572         cache->flags = type;
7573         btrfs_set_block_group_flags(&cache->item, type);
7574
7575         cache->last_byte_to_unpin = (u64)-1;
7576         cache->cached = BTRFS_CACHE_FINISHED;
7577         exclude_super_stripes(root, cache);
7578
7579         add_new_free_space(cache, root->fs_info, chunk_offset,
7580                            chunk_offset + size);
7581
7582         free_excluded_extents(root, cache);
7583
7584         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7585                                 &cache->space_info);
7586         BUG_ON(ret);
7587
7588         spin_lock(&cache->space_info->lock);
7589         cache->space_info->bytes_super += cache->bytes_super;
7590         spin_unlock(&cache->space_info->lock);
7591
7592         down_write(&cache->space_info->groups_sem);
7593         list_add_tail(&cache->list, &cache->space_info->block_groups);
7594         up_write(&cache->space_info->groups_sem);
7595
7596         ret = btrfs_add_block_group_cache(root->fs_info, cache);
7597         BUG_ON(ret);
7598
7599         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7600                                 sizeof(cache->item));
7601         BUG_ON(ret);
7602
7603         set_avail_alloc_bits(extent_root->fs_info, type);
7604
7605         return 0;
7606 }
7607
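/*
 * remove an emptied, read-only block group: detach it from the
 * allocation clusters, the block group cache and its space info, then
 * delete its item from the extent tree.
 */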
7608 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7609                              struct btrfs_root *root, u64 group_start)
7610 {
7611         struct btrfs_path *path;
7612         struct btrfs_block_group_cache *block_group;
7613         struct btrfs_free_cluster *cluster;
7614         struct btrfs_key key;
7615         int ret;
7616
7617         root = root->fs_info->extent_root;
7618
7619         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7620         BUG_ON(!block_group);
7621         BUG_ON(!block_group->ro);
7622
7623         memcpy(&key, &block_group->key, sizeof(key));
7624
7625         /* make sure this block group isn't part of an allocation cluster */
7626         cluster = &root->fs_info->data_alloc_cluster;
7627         spin_lock(&cluster->refill_lock);
7628         btrfs_return_cluster_to_free_space(block_group, cluster);
7629         spin_unlock(&cluster->refill_lock);
7630
7631         /*
7632          * make sure this block group isn't part of a metadata
7633          * allocation cluster
7634          */
7635         cluster = &root->fs_info->meta_alloc_cluster;
7636         spin_lock(&cluster->refill_lock);
7637         btrfs_return_cluster_to_free_space(block_group, cluster);
7638         spin_unlock(&cluster->refill_lock);
7639
7640         path = btrfs_alloc_path();
7641         BUG_ON(!path);
7642
7643         spin_lock(&root->fs_info->block_group_cache_lock);
7644         rb_erase(&block_group->cache_node,
7645                  &root->fs_info->block_group_cache_tree);
7646         spin_unlock(&root->fs_info->block_group_cache_lock);
7647
7648         down_write(&block_group->space_info->groups_sem);
7649         /*
7650          * we must use list_del_init so people can check to see if they
7651          * are still on the list after taking the semaphore
7652          */
7653         list_del_init(&block_group->list);
7654         up_write(&block_group->space_info->groups_sem);
7655
7656         if (block_group->cached == BTRFS_CACHE_STARTED)
7657                 wait_block_group_cache_done(block_group);
7658
7659         btrfs_remove_free_space_cache(block_group);
7660
7661         spin_lock(&block_group->space_info->lock);
7662         block_group->space_info->total_bytes -= block_group->key.offset;
7663         block_group->space_info->bytes_readonly -= block_group->key.offset;
7664         spin_unlock(&block_group->space_info->lock);
7665
7666         btrfs_clear_space_info_full(root->fs_info);
7667
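        /*
         * put twice: once for the lookup reference taken above, and
         * once for the reference that the block group cache tree held
         * (the rb_erase earlier removed it without a put).
         */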
7668         btrfs_put_block_group(block_group);
7669         btrfs_put_block_group(block_group);
7670
7671         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7672         if (ret > 0)
7673                 ret = -EIO;
7674         if (ret < 0)
7675                 goto out;
7676
7677         ret = btrfs_del_item(trans, root, path);
7678 out:
7679         btrfs_free_path(path);
7680         return ret;
7681 }