Btrfs: Allocator fix variety pack
[linux-2.6-block.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include "hash.h"
22 #include "crc32c.h"
23 #include "ctree.h"
24 #include "disk-io.h"
25 #include "print-tree.h"
26 #include "transaction.h"
27 #include "volumes.h"
28
29 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
30 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
31 #define BLOCK_GROUP_SYSTEM   EXTENT_NEW
32
33 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
34
35 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
36                                  btrfs_root *extent_root);
37 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
38                                btrfs_root *extent_root);
39
40
41 static int cache_block_group(struct btrfs_root *root,
42                              struct btrfs_block_group_cache *block_group)
43 {
44         struct btrfs_path *path;
45         int ret;
46         struct btrfs_key key;
47         struct extent_buffer *leaf;
48         struct extent_io_tree *free_space_cache;
49         int slot;
50         u64 last = 0;
51         u64 hole_size;
52         u64 first_free;
53         int found = 0;
54
55         if (!block_group)
56                 return 0;
57
58         root = root->fs_info->extent_root;
59         free_space_cache = &root->fs_info->free_space_cache;
60
61         if (block_group->cached)
62                 return 0;
63
64         path = btrfs_alloc_path();
65         if (!path)
66                 return -ENOMEM;
67
68         path->reada = 2;
69         first_free = block_group->key.objectid;
70         key.objectid = block_group->key.objectid;
71         key.offset = 0;
72         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
73         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
74         if (ret < 0)
75                 return ret;
76         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
77         if (ret < 0)
78                 return ret;
79         if (ret == 0) {
80                 leaf = path->nodes[0];
81                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
82                 if (key.objectid + key.offset > first_free)
83                         first_free = key.objectid + key.offset;
84         }
85         while(1) {
86                 leaf = path->nodes[0];
87                 slot = path->slots[0];
88                 if (slot >= btrfs_header_nritems(leaf)) {
89                         ret = btrfs_next_leaf(root, path);
90                         if (ret < 0)
91                                 goto err;
92                         if (ret == 0) {
93                                 continue;
94                         } else {
95                                 break;
96                         }
97                 }
98                 btrfs_item_key_to_cpu(leaf, &key, slot);
99                 if (key.objectid < block_group->key.objectid) {
100                         goto next;
101                 }
102                 if (key.objectid >= block_group->key.objectid +
103                     block_group->key.offset) {
104                         break;
105                 }
106
107                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
108                         if (!found) {
109                                 last = first_free;
110                                 found = 1;
111                         }
112                         if (key.objectid > last) {
113                                 hole_size = key.objectid - last;
114                                 set_extent_dirty(free_space_cache, last,
115                                                  last + hole_size - 1,
116                                                  GFP_NOFS);
117                         }
118                         last = key.objectid + key.offset;
119                 }
120 next:
121                 path->slots[0]++;
122         }
123
124         if (!found)
125                 last = first_free;
126         if (block_group->key.objectid +
127             block_group->key.offset > last) {
128                 hole_size = block_group->key.objectid +
129                         block_group->key.offset - last;
130                 set_extent_dirty(free_space_cache, last,
131                                  last + hole_size - 1, GFP_NOFS);
132         }
133         block_group->cached = 1;
134 err:
135         btrfs_free_path(path);
136         return 0;
137 }
138
139 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
140                                                        btrfs_fs_info *info,
141                                                          u64 bytenr)
142 {
143         struct extent_io_tree *block_group_cache;
144         struct btrfs_block_group_cache *block_group = NULL;
145         u64 ptr;
146         u64 start;
147         u64 end;
148         int ret;
149
150         bytenr = max_t(u64, bytenr,
151                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
152         block_group_cache = &info->block_group_cache;
153         ret = find_first_extent_bit(block_group_cache,
154                                     bytenr, &start, &end,
155                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
156                                     BLOCK_GROUP_SYSTEM);
157         if (ret) {
158                 return NULL;
159         }
160         ret = get_state_private(block_group_cache, start, &ptr);
161         if (ret)
162                 return NULL;
163
164         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
165         return block_group;
166 }
167
168 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
169                                                          btrfs_fs_info *info,
170                                                          u64 bytenr)
171 {
172         struct extent_io_tree *block_group_cache;
173         struct btrfs_block_group_cache *block_group = NULL;
174         u64 ptr;
175         u64 start;
176         u64 end;
177         int ret;
178
179         bytenr = max_t(u64, bytenr,
180                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
181         block_group_cache = &info->block_group_cache;
182         ret = find_first_extent_bit(block_group_cache,
183                                     bytenr, &start, &end,
184                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
185                                     BLOCK_GROUP_SYSTEM);
186         if (ret) {
187                 return NULL;
188         }
189         ret = get_state_private(block_group_cache, start, &ptr);
190         if (ret)
191                 return NULL;
192
193         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
194         if (block_group->key.objectid <= bytenr && bytenr <
195             block_group->key.objectid + block_group->key.offset)
196                 return block_group;
197         return NULL;
198 }
199
200 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
201 {
202         return (cache->flags & bits) == bits;
203 }
204
205 static int noinline find_search_start(struct btrfs_root *root,
206                               struct btrfs_block_group_cache **cache_ret,
207                               u64 *start_ret, u64 num, int data)
208 {
209         int ret;
210         struct btrfs_block_group_cache *cache = *cache_ret;
211         struct extent_io_tree *free_space_cache;
212         struct extent_state *state;
213         u64 last;
214         u64 start = 0;
215         u64 cache_miss = 0;
216         u64 total_fs_bytes;
217         u64 search_start = *start_ret;
218         int wrapped = 0;
219
220         total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
221         free_space_cache = &root->fs_info->free_space_cache;
222
223         if (!cache)
224                 goto out;
225
226 again:
227         ret = cache_block_group(root, cache);
228         if (ret) {
229                 goto out;
230         }
231
232         last = max(search_start, cache->key.objectid);
233         if (!block_group_bits(cache, data) || cache->ro)
234                 goto new_group;
235
236         spin_lock_irq(&free_space_cache->lock);
237         state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
238         while(1) {
239                 if (!state) {
240                         if (!cache_miss)
241                                 cache_miss = last;
242                         spin_unlock_irq(&free_space_cache->lock);
243                         goto new_group;
244                 }
245
246                 start = max(last, state->start);
247                 last = state->end + 1;
248                 if (last - start < num) {
249                         do {
250                                 state = extent_state_next(state);
251                         } while(state && !(state->state & EXTENT_DIRTY));
252                         continue;
253                 }
254                 spin_unlock_irq(&free_space_cache->lock);
255                 if (cache->ro) {
256                         goto new_group;
257                 }
258                 if (start + num > cache->key.objectid + cache->key.offset)
259                         goto new_group;
260                 if (!block_group_bits(cache, data)) {
261                         printk("block group bits don't match %Lu %d\n", cache->flags, data);
262                 }
263                 *start_ret = start;
264                 return 0;
265         }
266 out:
267         cache = btrfs_lookup_block_group(root->fs_info, search_start);
268         if (!cache) {
269                 printk("Unable to find block group for %Lu\n", search_start);
270                 WARN_ON(1);
271         }
272         return -ENOSPC;
273
274 new_group:
275         last = cache->key.objectid + cache->key.offset;
276 wrapped:
277         cache = btrfs_lookup_first_block_group(root->fs_info, last);
278         if (!cache || cache->key.objectid >= total_fs_bytes) {
279 no_cache:
280                 if (!wrapped) {
281                         wrapped = 1;
282                         last = search_start;
283                         goto wrapped;
284                 }
285                 goto out;
286         }
287         if (cache_miss && !cache->cached) {
288                 cache_block_group(root, cache);
289                 last = cache_miss;
290                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
291         }
292         cache_miss = 0;
293         cache = btrfs_find_block_group(root, cache, last, data, 0);
294         if (!cache)
295                 goto no_cache;
296         *cache_ret = cache;
297         goto again;
298 }
299
300 static u64 div_factor(u64 num, int factor)
301 {
302         if (factor == 10)
303                 return num;
304         num *= factor;
305         do_div(num, 10);
306         return num;
307 }
308
309 static int block_group_state_bits(u64 flags)
310 {
311         int bits = 0;
312         if (flags & BTRFS_BLOCK_GROUP_DATA)
313                 bits |= BLOCK_GROUP_DATA;
314         if (flags & BTRFS_BLOCK_GROUP_METADATA)
315                 bits |= BLOCK_GROUP_METADATA;
316         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
317                 bits |= BLOCK_GROUP_SYSTEM;
318         return bits;
319 }
320
321 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
322                                                  struct btrfs_block_group_cache
323                                                  *hint, u64 search_start,
324                                                  int data, int owner)
325 {
326         struct btrfs_block_group_cache *cache;
327         struct extent_io_tree *block_group_cache;
328         struct btrfs_block_group_cache *found_group = NULL;
329         struct btrfs_fs_info *info = root->fs_info;
330         u64 used;
331         u64 last = 0;
332         u64 start;
333         u64 end;
334         u64 free_check;
335         u64 ptr;
336         int bit;
337         int ret;
338         int full_search = 0;
339         int factor = 10;
340         int wrapped = 0;
341
342         block_group_cache = &info->block_group_cache;
343
344         if (data & BTRFS_BLOCK_GROUP_METADATA)
345                 factor = 9;
346
347         bit = block_group_state_bits(data);
348
349         if (search_start) {
350                 struct btrfs_block_group_cache *shint;
351                 shint = btrfs_lookup_first_block_group(info, search_start);
352                 if (shint && block_group_bits(shint, data) && !shint->ro) {
353                         used = btrfs_block_group_used(&shint->item);
354                         if (used + shint->pinned <
355                             div_factor(shint->key.offset, factor)) {
356                                 return shint;
357                         }
358                 }
359         }
360         if (hint && !hint->ro && block_group_bits(hint, data)) {
361                 used = btrfs_block_group_used(&hint->item);
362                 if (used + hint->pinned <
363                     div_factor(hint->key.offset, factor)) {
364                         return hint;
365                 }
366                 last = hint->key.objectid + hint->key.offset;
367         } else {
368                 if (hint)
369                         last = max(hint->key.objectid, search_start);
370                 else
371                         last = search_start;
372         }
373 again:
374         while(1) {
375                 ret = find_first_extent_bit(block_group_cache, last,
376                                             &start, &end, bit);
377                 if (ret)
378                         break;
379
380                 ret = get_state_private(block_group_cache, start, &ptr);
381                 if (ret) {
382                         last = end + 1;
383                         continue;
384                 }
385
386                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
387                 last = cache->key.objectid + cache->key.offset;
388                 used = btrfs_block_group_used(&cache->item);
389
390                 if (!cache->ro && block_group_bits(cache, data)) {
391                         free_check = div_factor(cache->key.offset, factor);
392                         if (used + cache->pinned < free_check) {
393                                 found_group = cache;
394                                 goto found;
395                         }
396                 }
397                 cond_resched();
398         }
399         if (!wrapped) {
400                 last = search_start;
401                 wrapped = 1;
402                 goto again;
403         }
404         if (!full_search && factor < 10) {
405                 last = search_start;
406                 full_search = 1;
407                 factor = 10;
408                 goto again;
409         }
410 found:
411         return found_group;
412 }
413
414 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
415                            u64 owner, u64 owner_offset)
416 {
417         u32 high_crc = ~(u32)0;
418         u32 low_crc = ~(u32)0;
419         __le64 lenum;
420         lenum = cpu_to_le64(root_objectid);
421         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
422         lenum = cpu_to_le64(ref_generation);
423         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
424         if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
425                 lenum = cpu_to_le64(owner);
426                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
427                 lenum = cpu_to_le64(owner_offset);
428                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
429         }
430         return ((u64)high_crc << 32) | (u64)low_crc;
431 }
432
433 static int match_extent_ref(struct extent_buffer *leaf,
434                             struct btrfs_extent_ref *disk_ref,
435                             struct btrfs_extent_ref *cpu_ref)
436 {
437         int ret;
438         int len;
439
440         if (cpu_ref->objectid)
441                 len = sizeof(*cpu_ref);
442         else
443                 len = 2 * sizeof(u64);
444         ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
445                                    len);
446         return ret == 0;
447 }
448
449 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
450                                           struct btrfs_root *root,
451                                           struct btrfs_path *path, u64 bytenr,
452                                           u64 root_objectid,
453                                           u64 ref_generation, u64 owner,
454                                           u64 owner_offset, int del)
455 {
456         u64 hash;
457         struct btrfs_key key;
458         struct btrfs_key found_key;
459         struct btrfs_extent_ref ref;
460         struct extent_buffer *leaf;
461         struct btrfs_extent_ref *disk_ref;
462         int ret;
463         int ret2;
464
465         btrfs_set_stack_ref_root(&ref, root_objectid);
466         btrfs_set_stack_ref_generation(&ref, ref_generation);
467         btrfs_set_stack_ref_objectid(&ref, owner);
468         btrfs_set_stack_ref_offset(&ref, owner_offset);
469
470         hash = hash_extent_ref(root_objectid, ref_generation, owner,
471                                owner_offset);
472         key.offset = hash;
473         key.objectid = bytenr;
474         key.type = BTRFS_EXTENT_REF_KEY;
475
476         while (1) {
477                 ret = btrfs_search_slot(trans, root, &key, path,
478                                         del ? -1 : 0, del);
479                 if (ret < 0)
480                         goto out;
481                 leaf = path->nodes[0];
482                 if (ret != 0) {
483                         u32 nritems = btrfs_header_nritems(leaf);
484                         if (path->slots[0] >= nritems) {
485                                 ret2 = btrfs_next_leaf(root, path);
486                                 if (ret2)
487                                         goto out;
488                                 leaf = path->nodes[0];
489                         }
490                         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
491                         if (found_key.objectid != bytenr ||
492                             found_key.type != BTRFS_EXTENT_REF_KEY)
493                                 goto out;
494                         key.offset = found_key.offset;
495                         if (del) {
496                                 btrfs_release_path(root, path);
497                                 continue;
498                         }
499                 }
500                 disk_ref = btrfs_item_ptr(path->nodes[0],
501                                           path->slots[0],
502                                           struct btrfs_extent_ref);
503                 if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
504                         ret = 0;
505                         goto out;
506                 }
507                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
508                 key.offset = found_key.offset + 1;
509                 btrfs_release_path(root, path);
510         }
511 out:
512         return ret;
513 }
514
515 /*
516  * Back reference rules.  Back refs have three main goals:
517  *
518  * 1) differentiate between all holders of references to an extent so that
519  *    when a reference is dropped we can make sure it was a valid reference
520  *    before freeing the extent.
521  *
522  * 2) Provide enough information to quickly find the holders of an extent
523  *    if we notice a given block is corrupted or bad.
524  *
525  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
526  *    maintenance.  This is actually the same as #2, but with a slightly
527  *    different use case.
528  *
529  * File extents can be referenced by:
530  *
531  * - multiple snapshots, subvolumes, or different generations in one subvol
532  * - different files inside a single subvolume (in theory, not implemented yet)
533  * - different offsets inside a file (bookend extents in file.c)
534  *
535  * The extent ref structure has fields for:
536  *
537  * - Objectid of the subvolume root
538  * - Generation number of the tree holding the reference
539  * - objectid of the file holding the reference
540  * - offset in the file corresponding to the key holding the reference
541  *
542  * When a file extent is allocated the fields are filled in:
543  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
544  *
545  * When a leaf is cow'd new references are added for every file extent found
546  * in the leaf.  It looks the same as the create case, but trans->transid
547  * will be different when the block is cow'd.
548  *
549  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
550  *
551  * When a file extent is removed either during snapshot deletion or file
552  * truncation, the corresponding back reference is found
553  * by searching for:
554  *
555  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
556  *      inode objectid, offset in file)
557  *
558  * Btree extents can be referenced by:
559  *
560  * - Different subvolumes
561  * - Different generations of the same subvolume
562  *
563  * Storing sufficient information for a full reverse mapping of a btree
564  * block would require storing the lowest key of the block in the backref,
565  * and it would require updating that lowest key either before write out or
566  * every time it changed.  Instead, the objectid of the lowest key is stored
567  * along with the level of the tree block.  This provides a hint
568  * about where in the btree the block can be found.  Searches through the
569  * btree only need to look for a pointer to that block, so they stop one
570  * level higher than the level recorded in the backref.
571  *
572  * Some btrees do not do reference counting on their extents.  These
573  * include the extent tree and the tree of tree roots.  Backrefs for these
574  * trees always have a generation of zero.
575  *
576  * When a tree block is created, back references are inserted:
577  *
578  * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
579  *
580  * When a tree block is cow'd in a reference counted root,
581  * new back references are added for all the blocks it points to.
582  * These are of the form (trans->transid will have increased since creation):
583  *
584  * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
585  *
586  * Because the lowest_key_objectid and the level are just hints
587  * they are not used when backrefs are deleted.  When a backref is deleted:
588  *
589  * if backref was for a tree root:
590  *     root_objectid = root->root_key.objectid
591  * else
592  *     root_objectid = btrfs_header_owner(parent)
593  *
594  * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
595  *
596  * Back Reference Key hashing:
597  *
598  * Back references have four fields, each 64 bits long.  Unfortunately,
599  * This is hashed into a single 64 bit number and placed into the key offset.
600  * The key objectid corresponds to the first byte in the extent, and the
601  * key type is set to BTRFS_EXTENT_REF_KEY
602  */
603 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
604                                  struct btrfs_root *root,
605                                  struct btrfs_path *path, u64 bytenr,
606                                  u64 root_objectid, u64 ref_generation,
607                                  u64 owner, u64 owner_offset)
608 {
609         u64 hash;
610         struct btrfs_key key;
611         struct btrfs_extent_ref ref;
612         struct btrfs_extent_ref *disk_ref;
613         int ret;
614
615         btrfs_set_stack_ref_root(&ref, root_objectid);
616         btrfs_set_stack_ref_generation(&ref, ref_generation);
617         btrfs_set_stack_ref_objectid(&ref, owner);
618         btrfs_set_stack_ref_offset(&ref, owner_offset);
619
620         hash = hash_extent_ref(root_objectid, ref_generation, owner,
621                                owner_offset);
622         key.offset = hash;
623         key.objectid = bytenr;
624         key.type = BTRFS_EXTENT_REF_KEY;
625
626         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
627         while (ret == -EEXIST) {
628                 disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
629                                           struct btrfs_extent_ref);
630                 if (match_extent_ref(path->nodes[0], disk_ref, &ref))
631                         goto out;
632                 key.offset++;
633                 btrfs_release_path(root, path);
634                 ret = btrfs_insert_empty_item(trans, root, path, &key,
635                                               sizeof(ref));
636         }
637         if (ret)
638                 goto out;
639         disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
640                                   struct btrfs_extent_ref);
641         write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
642                             sizeof(ref));
643         btrfs_mark_buffer_dirty(path->nodes[0]);
644 out:
645         btrfs_release_path(root, path);
646         return ret;
647 }
648
649 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
650                                 struct btrfs_root *root,
651                                 u64 bytenr, u64 num_bytes,
652                                 u64 root_objectid, u64 ref_generation,
653                                 u64 owner, u64 owner_offset)
654 {
655         struct btrfs_path *path;
656         int ret;
657         struct btrfs_key key;
658         struct extent_buffer *l;
659         struct btrfs_extent_item *item;
660         u32 refs;
661
662         WARN_ON(num_bytes < root->sectorsize);
663         path = btrfs_alloc_path();
664         if (!path)
665                 return -ENOMEM;
666
667         path->reada = 1;
668         key.objectid = bytenr;
669         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
670         key.offset = num_bytes;
671         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
672                                 0, 1);
673         if (ret < 0)
674                 return ret;
675         if (ret != 0) {
676                 BUG();
677         }
678         BUG_ON(ret != 0);
679         l = path->nodes[0];
680         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
681         refs = btrfs_extent_refs(l, item);
682         btrfs_set_extent_refs(l, item, refs + 1);
683         btrfs_mark_buffer_dirty(path->nodes[0]);
684
685         btrfs_release_path(root->fs_info->extent_root, path);
686
687         path->reada = 1;
688         ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
689                                           path, bytenr, root_objectid,
690                                           ref_generation, owner, owner_offset);
691         BUG_ON(ret);
692         finish_current_insert(trans, root->fs_info->extent_root);
693         del_pending_extents(trans, root->fs_info->extent_root);
694
695         btrfs_free_path(path);
696         return 0;
697 }
698
699 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
700                          struct btrfs_root *root)
701 {
702         finish_current_insert(trans, root->fs_info->extent_root);
703         del_pending_extents(trans, root->fs_info->extent_root);
704         return 0;
705 }
706
707 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
708                              struct btrfs_root *root, u64 bytenr,
709                              u64 num_bytes, u32 *refs)
710 {
711         struct btrfs_path *path;
712         int ret;
713         struct btrfs_key key;
714         struct extent_buffer *l;
715         struct btrfs_extent_item *item;
716
717         WARN_ON(num_bytes < root->sectorsize);
718         path = btrfs_alloc_path();
719         path->reada = 1;
720         key.objectid = bytenr;
721         key.offset = num_bytes;
722         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
723         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
724                                 0, 0);
725         if (ret < 0)
726                 goto out;
727         if (ret != 0) {
728                 btrfs_print_leaf(root, path->nodes[0]);
729                 printk("failed to find block number %Lu\n", bytenr);
730                 BUG();
731         }
732         l = path->nodes[0];
733         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
734         *refs = btrfs_extent_refs(l, item);
735 out:
736         btrfs_free_path(path);
737         return 0;
738 }
739
740 u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
741                                   struct btrfs_path *count_path,
742                                   u64 expected_owner,
743                                   u64 first_extent)
744 {
745         struct btrfs_root *extent_root = root->fs_info->extent_root;
746         struct btrfs_path *path;
747         u64 bytenr;
748         u64 found_objectid;
749         u64 found_owner;
750         u64 root_objectid = root->root_key.objectid;
751         u32 total_count = 0;
752         u32 extent_refs;
753         u32 cur_count;
754         u32 nritems;
755         int ret;
756         struct btrfs_key key;
757         struct btrfs_key found_key;
758         struct extent_buffer *l;
759         struct btrfs_extent_item *item;
760         struct btrfs_extent_ref *ref_item;
761         int level = -1;
762
763         path = btrfs_alloc_path();
764 again:
765         if (level == -1)
766                 bytenr = first_extent;
767         else
768                 bytenr = count_path->nodes[level]->start;
769
770         cur_count = 0;
771         key.objectid = bytenr;
772         key.offset = 0;
773
774         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
775         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
776         if (ret < 0)
777                 goto out;
778         BUG_ON(ret == 0);
779
780         l = path->nodes[0];
781         btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
782
783         if (found_key.objectid != bytenr ||
784             found_key.type != BTRFS_EXTENT_ITEM_KEY) {
785                 goto out;
786         }
787
788         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
789         extent_refs = btrfs_extent_refs(l, item);
790         while (1) {
791                 l = path->nodes[0];
792                 nritems = btrfs_header_nritems(l);
793                 if (path->slots[0] >= nritems) {
794                         ret = btrfs_next_leaf(extent_root, path);
795                         if (ret == 0)
796                                 continue;
797                         break;
798                 }
799                 btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
800                 if (found_key.objectid != bytenr)
801                         break;
802
803                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
804                         path->slots[0]++;
805                         continue;
806                 }
807
808                 cur_count++;
809                 ref_item = btrfs_item_ptr(l, path->slots[0],
810                                           struct btrfs_extent_ref);
811                 found_objectid = btrfs_ref_root(l, ref_item);
812
813                 if (found_objectid != root_objectid) {
814                         total_count = 2;
815                         goto out;
816                 }
817                 if (level == -1) {
818                         found_owner = btrfs_ref_objectid(l, ref_item);
819                         if (found_owner != expected_owner) {
820                                 total_count = 2;
821                                 goto out;
822                         }
823                         /*
824                          * nasty.  we don't count a reference held by
825                          * the running transaction.  This allows nodatacow
826                          * to avoid cow most of the time
827                          */
828                         if (found_owner >= BTRFS_FIRST_FREE_OBJECTID &&
829                             btrfs_ref_generation(l, ref_item) ==
830                             root->fs_info->generation) {
831                                 extent_refs--;
832                         }
833                 }
834                 total_count = 1;
835                 path->slots[0]++;
836         }
837         /*
838          * if there is more than one reference against a data extent,
839          * we have to assume the other ref is another snapshot
840          */
841         if (level == -1 && extent_refs > 1) {
842                 total_count = 2;
843                 goto out;
844         }
845         if (cur_count == 0) {
846                 total_count = 0;
847                 goto out;
848         }
849         if (level >= 0 && root->node == count_path->nodes[level])
850                 goto out;
851         level++;
852         btrfs_release_path(root, path);
853         goto again;
854
855 out:
856         btrfs_free_path(path);
857         return total_count;
858 }
859 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
860                        struct btrfs_root *root, u64 owner_objectid)
861 {
862         u64 generation;
863         u64 key_objectid;
864         u64 level;
865         u32 nritems;
866         struct btrfs_disk_key disk_key;
867
868         level = btrfs_header_level(root->node);
869         generation = trans->transid;
870         nritems = btrfs_header_nritems(root->node);
871         if (nritems > 0) {
872                 if (level == 0)
873                         btrfs_item_key(root->node, &disk_key, 0);
874                 else
875                         btrfs_node_key(root->node, &disk_key, 0);
876                 key_objectid = btrfs_disk_key_objectid(&disk_key);
877         } else {
878                 key_objectid = 0;
879         }
880         return btrfs_inc_extent_ref(trans, root, root->node->start,
881                                     root->node->len, owner_objectid,
882                                     generation, level, key_objectid);
883 }
884
885 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
886                   struct extent_buffer *buf)
887 {
888         u64 bytenr;
889         u32 nritems;
890         struct btrfs_key key;
891         struct btrfs_file_extent_item *fi;
892         int i;
893         int level;
894         int ret;
895         int faili;
896
897         if (!root->ref_cows)
898                 return 0;
899
900         level = btrfs_header_level(buf);
901         nritems = btrfs_header_nritems(buf);
902         for (i = 0; i < nritems; i++) {
903                 if (level == 0) {
904                         u64 disk_bytenr;
905                         btrfs_item_key_to_cpu(buf, &key, i);
906                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
907                                 continue;
908                         fi = btrfs_item_ptr(buf, i,
909                                             struct btrfs_file_extent_item);
910                         if (btrfs_file_extent_type(buf, fi) ==
911                             BTRFS_FILE_EXTENT_INLINE)
912                                 continue;
913                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
914                         if (disk_bytenr == 0)
915                                 continue;
916                         ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
917                                     btrfs_file_extent_disk_num_bytes(buf, fi),
918                                     root->root_key.objectid, trans->transid,
919                                     key.objectid, key.offset);
920                         if (ret) {
921                                 faili = i;
922                                 goto fail;
923                         }
924                 } else {
925                         bytenr = btrfs_node_blockptr(buf, i);
926                         btrfs_node_key_to_cpu(buf, &key, i);
927                         ret = btrfs_inc_extent_ref(trans, root, bytenr,
928                                            btrfs_level_size(root, level - 1),
929                                            root->root_key.objectid,
930                                            trans->transid,
931                                            level - 1, key.objectid);
932                         if (ret) {
933                                 faili = i;
934                                 goto fail;
935                         }
936                 }
937         }
938         return 0;
939 fail:
940         WARN_ON(1);
941 #if 0
942         for (i =0; i < faili; i++) {
943                 if (level == 0) {
944                         u64 disk_bytenr;
945                         btrfs_item_key_to_cpu(buf, &key, i);
946                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
947                                 continue;
948                         fi = btrfs_item_ptr(buf, i,
949                                             struct btrfs_file_extent_item);
950                         if (btrfs_file_extent_type(buf, fi) ==
951                             BTRFS_FILE_EXTENT_INLINE)
952                                 continue;
953                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
954                         if (disk_bytenr == 0)
955                                 continue;
956                         err = btrfs_free_extent(trans, root, disk_bytenr,
957                                     btrfs_file_extent_disk_num_bytes(buf,
958                                                                       fi), 0);
959                         BUG_ON(err);
960                 } else {
961                         bytenr = btrfs_node_blockptr(buf, i);
962                         err = btrfs_free_extent(trans, root, bytenr,
963                                         btrfs_level_size(root, level - 1), 0);
964                         BUG_ON(err);
965                 }
966         }
967 #endif
968         return ret;
969 }
970
971 static int write_one_cache_group(struct btrfs_trans_handle *trans,
972                                  struct btrfs_root *root,
973                                  struct btrfs_path *path,
974                                  struct btrfs_block_group_cache *cache)
975 {
976         int ret;
977         int pending_ret;
978         struct btrfs_root *extent_root = root->fs_info->extent_root;
979         unsigned long bi;
980         struct extent_buffer *leaf;
981
982         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
983         if (ret < 0)
984                 goto fail;
985         BUG_ON(ret);
986
987         leaf = path->nodes[0];
988         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
989         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
990         btrfs_mark_buffer_dirty(leaf);
991         btrfs_release_path(extent_root, path);
992 fail:
993         finish_current_insert(trans, extent_root);
994         pending_ret = del_pending_extents(trans, extent_root);
995         if (ret)
996                 return ret;
997         if (pending_ret)
998                 return pending_ret;
999         return 0;
1000
1001 }
1002
1003 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1004                                    struct btrfs_root *root)
1005 {
1006         struct extent_io_tree *block_group_cache;
1007         struct btrfs_block_group_cache *cache;
1008         int ret;
1009         int err = 0;
1010         int werr = 0;
1011         struct btrfs_path *path;
1012         u64 last = 0;
1013         u64 start;
1014         u64 end;
1015         u64 ptr;
1016
1017         block_group_cache = &root->fs_info->block_group_cache;
1018         path = btrfs_alloc_path();
1019         if (!path)
1020                 return -ENOMEM;
1021
1022         while(1) {
1023                 ret = find_first_extent_bit(block_group_cache, last,
1024                                             &start, &end, BLOCK_GROUP_DIRTY);
1025                 if (ret)
1026                         break;
1027
1028                 last = end + 1;
1029                 ret = get_state_private(block_group_cache, start, &ptr);
1030                 if (ret)
1031                         break;
1032                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
1033                 err = write_one_cache_group(trans, root,
1034                                             path, cache);
1035                 /*
1036                  * if we fail to write the cache group, we want
1037                  * to keep it marked dirty in hopes that a later
1038                  * write will work
1039                  */
1040                 if (err) {
1041                         werr = err;
1042                         continue;
1043                 }
1044                 clear_extent_bits(block_group_cache, start, end,
1045                                   BLOCK_GROUP_DIRTY, GFP_NOFS);
1046         }
1047         btrfs_free_path(path);
1048         return werr;
1049 }
1050
1051 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
1052                                                   u64 flags)
1053 {
1054         struct list_head *head = &info->space_info;
1055         struct list_head *cur;
1056         struct btrfs_space_info *found;
1057         list_for_each(cur, head) {
1058                 found = list_entry(cur, struct btrfs_space_info, list);
1059                 if (found->flags == flags)
1060                         return found;
1061         }
1062         return NULL;
1063
1064 }
1065
1066 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1067                              u64 total_bytes, u64 bytes_used,
1068                              struct btrfs_space_info **space_info)
1069 {
1070         struct btrfs_space_info *found;
1071
1072         found = __find_space_info(info, flags);
1073         if (found) {
1074                 found->total_bytes += total_bytes;
1075                 found->bytes_used += bytes_used;
1076                 found->full = 0;
1077                 WARN_ON(found->total_bytes < found->bytes_used);
1078                 *space_info = found;
1079                 return 0;
1080         }
1081         found = kmalloc(sizeof(*found), GFP_NOFS);
1082         if (!found)
1083                 return -ENOMEM;
1084
1085         list_add(&found->list, &info->space_info);
1086         found->flags = flags;
1087         found->total_bytes = total_bytes;
1088         found->bytes_used = bytes_used;
1089         found->bytes_pinned = 0;
1090         found->full = 0;
1091         found->force_alloc = 0;
1092         *space_info = found;
1093         return 0;
1094 }
1095
1096 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1097 {
1098         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1099                                    BTRFS_BLOCK_GROUP_RAID1 |
1100                                    BTRFS_BLOCK_GROUP_RAID10 |
1101                                    BTRFS_BLOCK_GROUP_DUP);
1102         if (extra_flags) {
1103                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1104                         fs_info->avail_data_alloc_bits |= extra_flags;
1105                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1106                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1107                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1108                         fs_info->avail_system_alloc_bits |= extra_flags;
1109         }
1110 }
1111
1112 static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1113 {
1114         u64 num_devices = root->fs_info->fs_devices->num_devices;
1115
1116         if (num_devices == 1)
1117                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1118         if (num_devices < 4)
1119                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1120
1121         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1122             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1123                       BTRFS_BLOCK_GROUP_RAID10))) {
1124                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1125         }
1126
1127         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1128             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1129                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1130         }
1131
1132         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1133             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1134              (flags & BTRFS_BLOCK_GROUP_RAID10) |
1135              (flags & BTRFS_BLOCK_GROUP_DUP)))
1136                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1137         return flags;
1138 }
1139
1140 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1141                           struct btrfs_root *extent_root, u64 alloc_bytes,
1142                           u64 flags, int force)
1143 {
1144         struct btrfs_space_info *space_info;
1145         u64 thresh;
1146         u64 start;
1147         u64 num_bytes;
1148         int ret;
1149
1150         flags = reduce_alloc_profile(extent_root, flags);
1151
1152         space_info = __find_space_info(extent_root->fs_info, flags);
1153         if (!space_info) {
1154                 ret = update_space_info(extent_root->fs_info, flags,
1155                                         0, 0, &space_info);
1156                 BUG_ON(ret);
1157         }
1158         BUG_ON(!space_info);
1159
1160         if (space_info->force_alloc) {
1161                 force = 1;
1162                 space_info->force_alloc = 0;
1163         }
1164         if (space_info->full)
1165                 return 0;
1166
1167         thresh = div_factor(space_info->total_bytes, 6);
1168         if (!force &&
1169            (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
1170             thresh)
1171                 return 0;
1172
1173         ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
1174         if (ret == -ENOSPC) {
1175 printk("space info full %Lu\n", flags);
1176                 space_info->full = 1;
1177                 return 0;
1178         }
1179         BUG_ON(ret);
1180
1181         ret = btrfs_make_block_group(trans, extent_root, 0, flags,
1182                      BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
1183         BUG_ON(ret);
1184
1185         return 0;
1186 }
1187
1188 static int update_block_group(struct btrfs_trans_handle *trans,
1189                               struct btrfs_root *root,
1190                               u64 bytenr, u64 num_bytes, int alloc,
1191                               int mark_free)
1192 {
1193         struct btrfs_block_group_cache *cache;
1194         struct btrfs_fs_info *info = root->fs_info;
1195         u64 total = num_bytes;
1196         u64 old_val;
1197         u64 byte_in_group;
1198         u64 start;
1199         u64 end;
1200
1201         while(total) {
1202                 cache = btrfs_lookup_block_group(info, bytenr);
1203                 if (!cache) {
1204                         return -1;
1205                 }
1206                 byte_in_group = bytenr - cache->key.objectid;
1207                 WARN_ON(byte_in_group > cache->key.offset);
1208                 start = cache->key.objectid;
1209                 end = start + cache->key.offset - 1;
1210                 set_extent_bits(&info->block_group_cache, start, end,
1211                                 BLOCK_GROUP_DIRTY, GFP_NOFS);
1212
1213                 old_val = btrfs_block_group_used(&cache->item);
1214                 num_bytes = min(total, cache->key.offset - byte_in_group);
1215                 if (alloc) {
1216                         old_val += num_bytes;
1217                         cache->space_info->bytes_used += num_bytes;
1218                 } else {
1219                         old_val -= num_bytes;
1220                         cache->space_info->bytes_used -= num_bytes;
1221                         if (mark_free) {
1222                                 set_extent_dirty(&info->free_space_cache,
1223                                                  bytenr, bytenr + num_bytes - 1,
1224                                                  GFP_NOFS);
1225                         }
1226                 }
1227                 btrfs_set_block_group_used(&cache->item, old_val);
1228                 total -= num_bytes;
1229                 bytenr += num_bytes;
1230         }
1231         return 0;
1232 }
1233
1234 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1235 {
1236         u64 start;
1237         u64 end;
1238         int ret;
1239         ret = find_first_extent_bit(&root->fs_info->block_group_cache,
1240                                     search_start, &start, &end,
1241                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
1242                                     BLOCK_GROUP_SYSTEM);
1243         if (ret)
1244                 return 0;
1245         return start;
1246 }
1247
1248
1249 static int update_pinned_extents(struct btrfs_root *root,
1250                                 u64 bytenr, u64 num, int pin)
1251 {
1252         u64 len;
1253         struct btrfs_block_group_cache *cache;
1254         struct btrfs_fs_info *fs_info = root->fs_info;
1255
1256         if (pin) {
1257                 set_extent_dirty(&fs_info->pinned_extents,
1258                                 bytenr, bytenr + num - 1, GFP_NOFS);
1259         } else {
1260                 clear_extent_dirty(&fs_info->pinned_extents,
1261                                 bytenr, bytenr + num - 1, GFP_NOFS);
1262         }
1263         while (num > 0) {
1264                 cache = btrfs_lookup_block_group(fs_info, bytenr);
1265                 if (!cache) {
1266                         u64 first = first_logical_byte(root, bytenr);
1267                         WARN_ON(first < bytenr);
1268                         len = min(first - bytenr, num);
1269                 } else {
1270                         len = min(num, cache->key.offset -
1271                                   (bytenr - cache->key.objectid));
1272                 }
1273                 if (pin) {
1274                         if (cache) {
1275                                 cache->pinned += len;
1276                                 cache->space_info->bytes_pinned += len;
1277                         }
1278                         fs_info->total_pinned += len;
1279                 } else {
1280                         if (cache) {
1281                                 cache->pinned -= len;
1282                                 cache->space_info->bytes_pinned -= len;
1283                         }
1284                         fs_info->total_pinned -= len;
1285                 }
1286                 bytenr += len;
1287                 num -= len;
1288         }
1289         return 0;
1290 }
1291
1292 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
1293 {
1294         u64 last = 0;
1295         u64 start;
1296         u64 end;
1297         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
1298         int ret;
1299
1300         while(1) {
1301                 ret = find_first_extent_bit(pinned_extents, last,
1302                                             &start, &end, EXTENT_DIRTY);
1303                 if (ret)
1304                         break;
1305                 set_extent_dirty(copy, start, end, GFP_NOFS);
1306                 last = end + 1;
1307         }
1308         return 0;
1309 }
1310
1311 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1312                                struct btrfs_root *root,
1313                                struct extent_io_tree *unpin)
1314 {
1315         u64 start;
1316         u64 end;
1317         int ret;
1318         struct extent_io_tree *free_space_cache;
1319         free_space_cache = &root->fs_info->free_space_cache;
1320
1321         while(1) {
1322                 ret = find_first_extent_bit(unpin, 0, &start, &end,
1323                                             EXTENT_DIRTY);
1324                 if (ret)
1325                         break;
1326                 update_pinned_extents(root, start, end + 1 - start, 0);
1327                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
1328                 set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
1329         }
1330         return 0;
1331 }
1332
1333 static int finish_current_insert(struct btrfs_trans_handle *trans,
1334                                  struct btrfs_root *extent_root)
1335 {
1336         u64 start;
1337         u64 end;
1338         struct btrfs_fs_info *info = extent_root->fs_info;
1339         struct extent_buffer *eb;
1340         struct btrfs_path *path;
1341         struct btrfs_key ins;
1342         struct btrfs_disk_key first;
1343         struct btrfs_extent_item extent_item;
1344         int ret;
1345         int level;
1346         int err = 0;
1347
1348         btrfs_set_stack_extent_refs(&extent_item, 1);
1349         btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
1350         path = btrfs_alloc_path();
1351
1352         while(1) {
1353                 ret = find_first_extent_bit(&info->extent_ins, 0, &start,
1354                                             &end, EXTENT_LOCKED);
1355                 if (ret)
1356                         break;
1357
1358                 ins.objectid = start;
1359                 ins.offset = end + 1 - start;
1360                 err = btrfs_insert_item(trans, extent_root, &ins,
1361                                         &extent_item, sizeof(extent_item));
1362                 clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
1363                                   GFP_NOFS);
1364                 eb = read_tree_block(extent_root, ins.objectid, ins.offset,
1365                                      trans->transid);
1366                 level = btrfs_header_level(eb);
1367                 if (level == 0) {
1368                         btrfs_item_key(eb, &first, 0);
1369                 } else {
1370                         btrfs_node_key(eb, &first, 0);
1371                 }
1372                 err = btrfs_insert_extent_backref(trans, extent_root, path,
1373                                           start, extent_root->root_key.objectid,
1374                                           0, level,
1375                                           btrfs_disk_key_objectid(&first));
1376                 BUG_ON(err);
1377                 free_extent_buffer(eb);
1378         }
1379         btrfs_free_path(path);
1380         return 0;
1381 }
1382
1383 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
1384                           int pending)
1385 {
1386         int err = 0;
1387         struct extent_buffer *buf;
1388
1389         if (!pending) {
1390                 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
1391                 if (buf) {
1392                         if (btrfs_buffer_uptodate(buf, 0)) {
1393                                 u64 transid =
1394                                     root->fs_info->running_transaction->transid;
1395                                 u64 header_transid =
1396                                         btrfs_header_generation(buf);
1397                                 if (header_transid == transid &&
1398                                     !btrfs_header_flag(buf,
1399                                                BTRFS_HEADER_FLAG_WRITTEN)) {
1400                                         clean_tree_block(NULL, root, buf);
1401                                         free_extent_buffer(buf);
1402                                         return 1;
1403                                 }
1404                         }
1405                         free_extent_buffer(buf);
1406                 }
1407                 update_pinned_extents(root, bytenr, num_bytes, 1);
1408         } else {
1409                 set_extent_bits(&root->fs_info->pending_del,
1410                                 bytenr, bytenr + num_bytes - 1,
1411                                 EXTENT_LOCKED, GFP_NOFS);
1412         }
1413         BUG_ON(err < 0);
1414         return 0;
1415 }
1416
1417 /*
1418  * remove an extent from the root, returns 0 on success
1419  */
1420 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1421                          *root, u64 bytenr, u64 num_bytes,
1422                          u64 root_objectid, u64 ref_generation,
1423                          u64 owner_objectid, u64 owner_offset, int pin,
1424                          int mark_free)
1425 {
1426         struct btrfs_path *path;
1427         struct btrfs_key key;
1428         struct btrfs_fs_info *info = root->fs_info;
1429         struct btrfs_root *extent_root = info->extent_root;
1430         struct extent_buffer *leaf;
1431         int ret;
1432         int extent_slot = 0;
1433         int found_extent = 0;
1434         int num_to_del = 1;
1435         struct btrfs_extent_item *ei;
1436         u32 refs;
1437
1438         key.objectid = bytenr;
1439         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1440         key.offset = num_bytes;
1441         path = btrfs_alloc_path();
1442         if (!path)
1443                 return -ENOMEM;
1444
1445         path->reada = 1;
1446         ret = lookup_extent_backref(trans, extent_root, path,
1447                                     bytenr, root_objectid,
1448                                     ref_generation,
1449                                     owner_objectid, owner_offset, 1);
1450         if (ret == 0) {
1451                 struct btrfs_key found_key;
1452                 extent_slot = path->slots[0];
1453                 while(extent_slot > 0) {
1454                         extent_slot--;
1455                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1456                                               extent_slot);
1457                         if (found_key.objectid != bytenr)
1458                                 break;
1459                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
1460                             found_key.offset == num_bytes) {
1461                                 found_extent = 1;
1462                                 break;
1463                         }
1464                         if (path->slots[0] - extent_slot > 5)
1465                                 break;
1466                 }
1467                 if (!found_extent)
1468                         ret = btrfs_del_item(trans, extent_root, path);
1469         } else {
1470                 btrfs_print_leaf(extent_root, path->nodes[0]);
1471                 WARN_ON(1);
1472                 printk("Unable to find ref byte nr %Lu root %Lu "
1473                        " gen %Lu owner %Lu offset %Lu\n", bytenr,
1474                        root_objectid, ref_generation, owner_objectid,
1475                        owner_offset);
1476         }
1477         if (!found_extent) {
1478                 btrfs_release_path(extent_root, path);
1479                 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1480                 if (ret < 0)
1481                         return ret;
1482                 BUG_ON(ret);
1483                 extent_slot = path->slots[0];
1484         }
1485
1486         leaf = path->nodes[0];
1487         ei = btrfs_item_ptr(leaf, extent_slot,
1488                             struct btrfs_extent_item);
1489         refs = btrfs_extent_refs(leaf, ei);
1490         BUG_ON(refs == 0);
1491         refs -= 1;
1492         btrfs_set_extent_refs(leaf, ei, refs);
1493
1494         btrfs_mark_buffer_dirty(leaf);
1495
1496         if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
1497                 /* if the back ref and the extent are next to each other
1498                  * they get deleted below in one shot
1499                  */
1500                 path->slots[0] = extent_slot;
1501                 num_to_del = 2;
1502         } else if (found_extent) {
1503                 /* otherwise delete the extent back ref */
1504                 ret = btrfs_del_item(trans, extent_root, path);
1505                 BUG_ON(ret);
1506                 /* if refs are 0, we need to setup the path for deletion */
1507                 if (refs == 0) {
1508                         btrfs_release_path(extent_root, path);
1509                         ret = btrfs_search_slot(trans, extent_root, &key, path,
1510                                                 -1, 1);
1511                         if (ret < 0)
1512                                 return ret;
1513                         BUG_ON(ret);
1514                 }
1515         }
1516
1517         if (refs == 0) {
1518                 u64 super_used;
1519                 u64 root_used;
1520
1521                 if (pin) {
1522                         ret = pin_down_bytes(root, bytenr, num_bytes, 0);
1523                         if (ret > 0)
1524                                 mark_free = 1;
1525                         BUG_ON(ret < 0);
1526                 }
1527
1528                 /* block accounting for super block */
1529                 super_used = btrfs_super_bytes_used(&info->super_copy);
1530                 btrfs_set_super_bytes_used(&info->super_copy,
1531                                            super_used - num_bytes);
1532
1533                 /* block accounting for root item */
1534                 root_used = btrfs_root_used(&root->root_item);
1535                 btrfs_set_root_used(&root->root_item,
1536                                            root_used - num_bytes);
1537                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
1538                                       num_to_del);
1539                 if (ret) {
1540                         return ret;
1541                 }
1542                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
1543                                          mark_free);
1544                 BUG_ON(ret);
1545         }
1546         btrfs_free_path(path);
1547         finish_current_insert(trans, extent_root);
1548         return ret;
1549 }
1550
1551 /*
1552  * find all the blocks marked as pending in the radix tree and remove
1553  * them from the extent map
1554  */
1555 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
1556                                btrfs_root *extent_root)
1557 {
1558         int ret;
1559         int err = 0;
1560         u64 start;
1561         u64 end;
1562         struct extent_io_tree *pending_del;
1563         struct extent_io_tree *pinned_extents;
1564
1565         pending_del = &extent_root->fs_info->pending_del;
1566         pinned_extents = &extent_root->fs_info->pinned_extents;
1567
1568         while(1) {
1569                 ret = find_first_extent_bit(pending_del, 0, &start, &end,
1570                                             EXTENT_LOCKED);
1571                 if (ret)
1572                         break;
1573                 update_pinned_extents(extent_root, start, end + 1 - start, 1);
1574                 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
1575                                   GFP_NOFS);
1576                 ret = __free_extent(trans, extent_root,
1577                                      start, end + 1 - start,
1578                                      extent_root->root_key.objectid,
1579                                      0, 0, 0, 0, 0);
1580                 if (ret)
1581                         err = ret;
1582         }
1583         return err;
1584 }
1585
1586 /*
1587  * remove an extent from the root, returns 0 on success
1588  */
1589 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1590                       *root, u64 bytenr, u64 num_bytes,
1591                       u64 root_objectid, u64 ref_generation,
1592                       u64 owner_objectid, u64 owner_offset, int pin)
1593 {
1594         struct btrfs_root *extent_root = root->fs_info->extent_root;
1595         int pending_ret;
1596         int ret;
1597
1598         WARN_ON(num_bytes < root->sectorsize);
1599         if (!root->ref_cows)
1600                 ref_generation = 0;
1601
1602         if (root == extent_root) {
1603                 pin_down_bytes(root, bytenr, num_bytes, 1);
1604                 return 0;
1605         }
1606         ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
1607                             ref_generation, owner_objectid, owner_offset,
1608                             pin, pin == 0);
1609         pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
1610         return ret ? ret : pending_ret;
1611 }
1612
1613 static u64 stripe_align(struct btrfs_root *root, u64 val)
1614 {
1615         u64 mask = ((u64)root->stripesize - 1);
1616         u64 ret = (val + mask) & ~mask;
1617         return ret;
1618 }
1619
1620 /*
1621  * walks the btree of allocated extents and find a hole of a given size.
1622  * The key ins is changed to record the hole:
1623  * ins->objectid == block start
1624  * ins->flags = BTRFS_EXTENT_ITEM_KEY
1625  * ins->offset == number of blocks
1626  * Any available blocks before search_start are skipped.
1627  */
1628 static int noinline find_free_extent(struct btrfs_trans_handle *trans,
1629                                      struct btrfs_root *orig_root,
1630                                      u64 num_bytes, u64 empty_size,
1631                                      u64 search_start, u64 search_end,
1632                                      u64 hint_byte, struct btrfs_key *ins,
1633                                      u64 exclude_start, u64 exclude_nr,
1634                                      int data)
1635 {
1636         int ret;
1637         u64 orig_search_start;
1638         struct btrfs_root * root = orig_root->fs_info->extent_root;
1639         struct btrfs_fs_info *info = root->fs_info;
1640         u64 total_needed = num_bytes;
1641         u64 *last_ptr = NULL;
1642         struct btrfs_block_group_cache *block_group;
1643         int full_scan = 0;
1644         int wrapped = 0;
1645         int chunk_alloc_done = 0;
1646         int empty_cluster = 2 * 1024 * 1024;
1647         int allowed_chunk_alloc = 0;
1648
1649         WARN_ON(num_bytes < root->sectorsize);
1650         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
1651
1652         if (orig_root->ref_cows || empty_size)
1653                 allowed_chunk_alloc = 1;
1654
1655         if (data & BTRFS_BLOCK_GROUP_METADATA) {
1656                 last_ptr = &root->fs_info->last_alloc;
1657                 empty_cluster = 256 * 1024;
1658         }
1659
1660         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
1661                 last_ptr = &root->fs_info->last_data_alloc;
1662         }
1663
1664         if (last_ptr) {
1665                 if (*last_ptr)
1666                         hint_byte = *last_ptr;
1667                 else {
1668                         empty_size += empty_cluster;
1669                 }
1670         }
1671
1672         search_start = max(search_start, first_logical_byte(root, 0));
1673         orig_search_start = search_start;
1674
1675         if (search_end == (u64)-1)
1676                 search_end = btrfs_super_total_bytes(&info->super_copy);
1677
1678         if (hint_byte) {
1679                 block_group = btrfs_lookup_first_block_group(info, hint_byte);
1680                 if (!block_group)
1681                         hint_byte = search_start;
1682                 block_group = btrfs_find_block_group(root, block_group,
1683                                                      hint_byte, data, 1);
1684                 if (last_ptr && *last_ptr == 0 && block_group)
1685                         hint_byte = block_group->key.objectid;
1686         } else {
1687                 block_group = btrfs_find_block_group(root,
1688                                                      trans->block_group,
1689                                                      search_start, data, 1);
1690         }
1691         search_start = max(search_start, hint_byte);
1692
1693         total_needed += empty_size;
1694
1695 check_failed:
1696         if (!block_group) {
1697                 block_group = btrfs_lookup_first_block_group(info,
1698                                                              search_start);
1699                 if (!block_group)
1700                         block_group = btrfs_lookup_first_block_group(info,
1701                                                        orig_search_start);
1702         }
1703         if (full_scan && !chunk_alloc_done) {
1704                 if (allowed_chunk_alloc) {
1705                         do_chunk_alloc(trans, root,
1706                                      num_bytes + 2 * 1024 * 1024, data, 1);
1707                         allowed_chunk_alloc = 0;
1708                 } else if (block_group && block_group_bits(block_group, data)) {
1709                         block_group->space_info->force_alloc = 1;
1710                 }
1711                 chunk_alloc_done = 1;
1712         }
1713         ret = find_search_start(root, &block_group, &search_start,
1714                                 total_needed, data);
1715         if (ret == -ENOSPC && last_ptr && *last_ptr) {
1716                 *last_ptr = 0;
1717                 block_group = btrfs_lookup_first_block_group(info,
1718                                                              orig_search_start);
1719                 search_start = orig_search_start;
1720                 ret = find_search_start(root, &block_group, &search_start,
1721                                         total_needed, data);
1722         }
1723         if (ret == -ENOSPC)
1724                 goto enospc;
1725         if (ret)
1726                 goto error;
1727
1728         if (last_ptr && *last_ptr && search_start != *last_ptr) {
1729                 *last_ptr = 0;
1730                 if (!empty_size) {
1731                         empty_size += empty_cluster;
1732                         total_needed += empty_size;
1733                 }
1734                 block_group = btrfs_lookup_first_block_group(info,
1735                                                        orig_search_start);
1736                 search_start = orig_search_start;
1737                 ret = find_search_start(root, &block_group,
1738                                         &search_start, total_needed, data);
1739                 if (ret == -ENOSPC)
1740                         goto enospc;
1741                 if (ret)
1742                         goto error;
1743         }
1744
1745         search_start = stripe_align(root, search_start);
1746         ins->objectid = search_start;
1747         ins->offset = num_bytes;
1748
1749         if (ins->objectid + num_bytes >= search_end)
1750                 goto enospc;
1751
1752         if (ins->objectid + num_bytes >
1753             block_group->key.objectid + block_group->key.offset) {
1754                 search_start = block_group->key.objectid +
1755                         block_group->key.offset;
1756                 goto new_group;
1757         }
1758
1759         if (test_range_bit(&info->extent_ins, ins->objectid,
1760                            ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
1761                 search_start = ins->objectid + num_bytes;
1762                 goto new_group;
1763         }
1764
1765         if (test_range_bit(&info->pinned_extents, ins->objectid,
1766                            ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
1767                 search_start = ins->objectid + num_bytes;
1768                 goto new_group;
1769         }
1770
1771         if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
1772             ins->objectid < exclude_start + exclude_nr)) {
1773                 search_start = exclude_start + exclude_nr;
1774                 goto new_group;
1775         }
1776
1777         if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
1778                 block_group = btrfs_lookup_block_group(info, ins->objectid);
1779                 if (block_group)
1780                         trans->block_group = block_group;
1781         }
1782         ins->offset = num_bytes;
1783         if (last_ptr) {
1784                 *last_ptr = ins->objectid + ins->offset;
1785                 if (*last_ptr ==
1786                     btrfs_super_total_bytes(&root->fs_info->super_copy)) {
1787                         *last_ptr = 0;
1788                 }
1789         }
1790         return 0;
1791
1792 new_group:
1793         if (search_start + num_bytes >= search_end) {
1794 enospc:
1795                 search_start = orig_search_start;
1796                 if (full_scan) {
1797                         ret = -ENOSPC;
1798                         goto error;
1799                 }
1800                 if (wrapped) {
1801                         if (!full_scan)
1802                                 total_needed -= empty_size;
1803                         full_scan = 1;
1804                 } else
1805                         wrapped = 1;
1806         }
1807         block_group = btrfs_lookup_first_block_group(info, search_start);
1808         cond_resched();
1809         block_group = btrfs_find_block_group(root, block_group,
1810                                              search_start, data, 0);
1811         goto check_failed;
1812
1813 error:
1814         return ret;
1815 }
1816
1817 /*
1818  * finds a free extent and does all the dirty work required for allocation
1819  * returns the key for the extent through ins, and a tree buffer for
1820  * the first block of the extent through buf.
1821  *
1822  * returns 0 if everything worked, non-zero otherwise.
1823  */
1824 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1825                        struct btrfs_root *root,
1826                        u64 num_bytes, u64 min_alloc_size,
1827                        u64 root_objectid, u64 ref_generation,
1828                        u64 owner, u64 owner_offset,
1829                        u64 empty_size, u64 hint_byte,
1830                        u64 search_end, struct btrfs_key *ins, u64 data)
1831 {
1832         int ret;
1833         int pending_ret;
1834         u64 super_used;
1835         u64 root_used;
1836         u64 search_start = 0;
1837         u64 alloc_profile;
1838         u32 sizes[2];
1839         struct btrfs_fs_info *info = root->fs_info;
1840         struct btrfs_root *extent_root = info->extent_root;
1841         struct btrfs_extent_item *extent_item;
1842         struct btrfs_extent_ref *ref;
1843         struct btrfs_path *path;
1844         struct btrfs_key keys[2];
1845
1846         if (data) {
1847                 alloc_profile = info->avail_data_alloc_bits &
1848                                 info->data_alloc_profile;
1849                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1850         } else if (root == root->fs_info->chunk_root) {
1851                 alloc_profile = info->avail_system_alloc_bits &
1852                                 info->system_alloc_profile;
1853                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1854         } else {
1855                 alloc_profile = info->avail_metadata_alloc_bits &
1856                                 info->metadata_alloc_profile;
1857                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1858         }
1859 again:
1860         data = reduce_alloc_profile(root, data);
1861         /*
1862          * the only place that sets empty_size is btrfs_realloc_node, which
1863          * is not called recursively on allocations
1864          */
1865         if (empty_size || root->ref_cows) {
1866                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
1867                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1868                                      2 * 1024 * 1024,
1869                                      BTRFS_BLOCK_GROUP_METADATA |
1870                                      (info->metadata_alloc_profile &
1871                                       info->avail_metadata_alloc_bits), 0);
1872                         BUG_ON(ret);
1873                 }
1874                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1875                                      num_bytes + 2 * 1024 * 1024, data, 0);
1876                 BUG_ON(ret);
1877         }
1878
1879         WARN_ON(num_bytes < root->sectorsize);
1880         ret = find_free_extent(trans, root, num_bytes, empty_size,
1881                                search_start, search_end, hint_byte, ins,
1882                                trans->alloc_exclude_start,
1883                                trans->alloc_exclude_nr, data);
1884
1885         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
1886                 num_bytes = num_bytes >> 1;
1887                 num_bytes = max(num_bytes, min_alloc_size);
1888                 do_chunk_alloc(trans, root->fs_info->extent_root,
1889                                num_bytes, data, 1);
1890                 goto again;
1891         }
1892         if (ret) {
1893                 printk("allocation failed flags %Lu\n", data);
1894         }
1895         BUG_ON(ret);
1896         if (ret)
1897                 return ret;
1898
1899         /* block accounting for super block */
1900         super_used = btrfs_super_bytes_used(&info->super_copy);
1901         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
1902
1903         /* block accounting for root item */
1904         root_used = btrfs_root_used(&root->root_item);
1905         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
1906
1907         clear_extent_dirty(&root->fs_info->free_space_cache,
1908                            ins->objectid, ins->objectid + ins->offset - 1,
1909                            GFP_NOFS);
1910
1911         if (root == extent_root) {
1912                 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
1913                                 ins->objectid + ins->offset - 1,
1914                                 EXTENT_LOCKED, GFP_NOFS);
1915                 goto update_block;
1916         }
1917
1918         WARN_ON(trans->alloc_exclude_nr);
1919         trans->alloc_exclude_start = ins->objectid;
1920         trans->alloc_exclude_nr = ins->offset;
1921
1922         memcpy(&keys[0], ins, sizeof(*ins));
1923         keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
1924                                          owner, owner_offset);
1925         keys[1].objectid = ins->objectid;
1926         keys[1].type = BTRFS_EXTENT_REF_KEY;
1927         sizes[0] = sizeof(*extent_item);
1928         sizes[1] = sizeof(*ref);
1929
1930         path = btrfs_alloc_path();
1931         BUG_ON(!path);
1932
1933         ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
1934                                        sizes, 2);
1935
1936         BUG_ON(ret);
1937         extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1938                                      struct btrfs_extent_item);
1939         btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
1940         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
1941                              struct btrfs_extent_ref);
1942
1943         btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
1944         btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
1945         btrfs_set_ref_objectid(path->nodes[0], ref, owner);
1946         btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
1947
1948         btrfs_mark_buffer_dirty(path->nodes[0]);
1949
1950         trans->alloc_exclude_start = 0;
1951         trans->alloc_exclude_nr = 0;
1952         btrfs_free_path(path);
1953         finish_current_insert(trans, extent_root);
1954         pending_ret = del_pending_extents(trans, extent_root);
1955
1956         if (ret) {
1957                 return ret;
1958         }
1959         if (pending_ret) {
1960                 return pending_ret;
1961         }
1962
1963 update_block:
1964         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
1965         if (ret) {
1966                 printk("update block group failed for %Lu %Lu\n",
1967                        ins->objectid, ins->offset);
1968                 BUG();
1969         }
1970         return 0;
1971 }
1972
1973 /*
1974  * helper function to allocate a block for a given tree
1975  * returns the tree buffer or NULL.
1976  */
1977 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1978                                              struct btrfs_root *root,
1979                                              u32 blocksize,
1980                                              u64 root_objectid, u64 hint,
1981                                              u64 empty_size)
1982 {
1983         u64 ref_generation;
1984
1985         if (root->ref_cows)
1986                 ref_generation = trans->transid;
1987         else
1988                 ref_generation = 0;
1989
1990
1991         return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid,
1992                                         ref_generation, 0, 0, hint, empty_size);
1993 }
1994
1995 /*
1996  * helper function to allocate a block for a given tree
1997  * returns the tree buffer or NULL.
1998  */
1999 struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2000                                              struct btrfs_root *root,
2001                                              u32 blocksize,
2002                                              u64 root_objectid,
2003                                              u64 ref_generation,
2004                                              u64 first_objectid,
2005                                              int level,
2006                                              u64 hint,
2007                                              u64 empty_size)
2008 {
2009         struct btrfs_key ins;
2010         int ret;
2011         struct extent_buffer *buf;
2012
2013         ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
2014                                  root_objectid, ref_generation,
2015                                  level, first_objectid, empty_size, hint,
2016                                  (u64)-1, &ins, 0);
2017         if (ret) {
2018                 BUG_ON(ret > 0);
2019                 return ERR_PTR(ret);
2020         }
2021         buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
2022         if (!buf) {
2023                 btrfs_free_extent(trans, root, ins.objectid, blocksize,
2024                                   root->root_key.objectid, ref_generation,
2025                                   0, 0, 0);
2026                 return ERR_PTR(-ENOMEM);
2027         }
2028         btrfs_set_header_generation(buf, trans->transid);
2029         clean_tree_block(trans, root, buf);
2030         btrfs_set_buffer_uptodate(buf);
2031
2032         if (PageDirty(buf->first_page)) {
2033                 printk("page %lu dirty\n", buf->first_page->index);
2034                 WARN_ON(1);
2035         }
2036
2037         set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
2038                          buf->start + buf->len - 1, GFP_NOFS);
2039         if (!btrfs_test_opt(root, SSD))
2040                 btrfs_set_buffer_defrag(buf);
2041         trans->blocks_used++;
2042         return buf;
2043 }
2044
2045 static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
2046                                   struct btrfs_root *root,
2047                                   struct extent_buffer *leaf)
2048 {
2049         u64 leaf_owner;
2050         u64 leaf_generation;
2051         struct btrfs_key key;
2052         struct btrfs_file_extent_item *fi;
2053         int i;
2054         int nritems;
2055         int ret;
2056
2057         BUG_ON(!btrfs_is_leaf(leaf));
2058         nritems = btrfs_header_nritems(leaf);
2059         leaf_owner = btrfs_header_owner(leaf);
2060         leaf_generation = btrfs_header_generation(leaf);
2061
2062         for (i = 0; i < nritems; i++) {
2063                 u64 disk_bytenr;
2064
2065                 btrfs_item_key_to_cpu(leaf, &key, i);
2066                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2067                         continue;
2068                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2069                 if (btrfs_file_extent_type(leaf, fi) ==
2070                     BTRFS_FILE_EXTENT_INLINE)
2071                         continue;
2072                 /*
2073                  * FIXME make sure to insert a trans record that
2074                  * repeats the snapshot del on crash
2075                  */
2076                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2077                 if (disk_bytenr == 0)
2078                         continue;
2079                 ret = btrfs_free_extent(trans, root, disk_bytenr,
2080                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
2081                                 leaf_owner, leaf_generation,
2082                                 key.objectid, key.offset, 0);
2083                 BUG_ON(ret);
2084         }
2085         return 0;
2086 }
2087
2088 static void noinline reada_walk_down(struct btrfs_root *root,
2089                                      struct extent_buffer *node,
2090                                      int slot)
2091 {
2092         u64 bytenr;
2093         u64 last = 0;
2094         u32 nritems;
2095         u32 refs;
2096         u32 blocksize;
2097         int ret;
2098         int i;
2099         int level;
2100         int skipped = 0;
2101
2102         nritems = btrfs_header_nritems(node);
2103         level = btrfs_header_level(node);
2104         if (level)
2105                 return;
2106
2107         for (i = slot; i < nritems && skipped < 32; i++) {
2108                 bytenr = btrfs_node_blockptr(node, i);
2109                 if (last && ((bytenr > last && bytenr - last > 32 * 1024) ||
2110                              (last > bytenr && last - bytenr > 32 * 1024))) {
2111                         skipped++;
2112                         continue;
2113                 }
2114                 blocksize = btrfs_level_size(root, level - 1);
2115                 if (i != slot) {
2116                         ret = lookup_extent_ref(NULL, root, bytenr,
2117                                                 blocksize, &refs);
2118                         BUG_ON(ret);
2119                         if (refs != 1) {
2120                                 skipped++;
2121                                 continue;
2122                         }
2123                 }
2124                 mutex_unlock(&root->fs_info->fs_mutex);
2125                 ret = readahead_tree_block(root, bytenr, blocksize,
2126                                            btrfs_node_ptr_generation(node, i));
2127                 last = bytenr + blocksize;
2128                 cond_resched();
2129                 mutex_lock(&root->fs_info->fs_mutex);
2130                 if (ret)
2131                         break;
2132         }
2133 }
2134
2135 /*
2136  * helper function for drop_snapshot, this walks down the tree dropping ref
2137  * counts as it goes.
2138  */
2139 static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
2140                                    struct btrfs_root *root,
2141                                    struct btrfs_path *path, int *level)
2142 {
2143         u64 root_owner;
2144         u64 root_gen;
2145         u64 bytenr;
2146         u64 ptr_gen;
2147         struct extent_buffer *next;
2148         struct extent_buffer *cur;
2149         struct extent_buffer *parent;
2150         u32 blocksize;
2151         int ret;
2152         u32 refs;
2153
2154         WARN_ON(*level < 0);
2155         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2156         ret = lookup_extent_ref(trans, root,
2157                                 path->nodes[*level]->start,
2158                                 path->nodes[*level]->len, &refs);
2159         BUG_ON(ret);
2160         if (refs > 1)
2161                 goto out;
2162
2163         /*
2164          * walk down to the last node level and free all the leaves
2165          */
2166         while(*level >= 0) {
2167                 WARN_ON(*level < 0);
2168                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2169                 cur = path->nodes[*level];
2170
2171                 if (btrfs_header_level(cur) != *level)
2172                         WARN_ON(1);
2173
2174                 if (path->slots[*level] >=
2175                     btrfs_header_nritems(cur))
2176                         break;
2177                 if (*level == 0) {
2178                         ret = drop_leaf_ref(trans, root, cur);
2179                         BUG_ON(ret);
2180                         break;
2181                 }
2182                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2183                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2184                 blocksize = btrfs_level_size(root, *level - 1);
2185                 ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
2186                 BUG_ON(ret);
2187                 if (refs != 1) {
2188                         parent = path->nodes[*level];
2189                         root_owner = btrfs_header_owner(parent);
2190                         root_gen = btrfs_header_generation(parent);
2191                         path->slots[*level]++;
2192                         ret = btrfs_free_extent(trans, root, bytenr,
2193                                                 blocksize, root_owner,
2194                                                 root_gen, 0, 0, 1);
2195                         BUG_ON(ret);
2196                         continue;
2197                 }
2198                 next = btrfs_find_tree_block(root, bytenr, blocksize);
2199                 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
2200                         free_extent_buffer(next);
2201                         reada_walk_down(root, cur, path->slots[*level]);
2202
2203                         mutex_unlock(&root->fs_info->fs_mutex);
2204                         next = read_tree_block(root, bytenr, blocksize,
2205                                                ptr_gen);
2206                         mutex_lock(&root->fs_info->fs_mutex);
2207
2208                         /* we've dropped the lock, double check */
2209                         ret = lookup_extent_ref(trans, root, bytenr,
2210                                                 blocksize, &refs);
2211                         BUG_ON(ret);
2212                         if (refs != 1) {
2213                                 parent = path->nodes[*level];
2214                                 root_owner = btrfs_header_owner(parent);
2215                                 root_gen = btrfs_header_generation(parent);
2216
2217                                 path->slots[*level]++;
2218                                 free_extent_buffer(next);
2219                                 ret = btrfs_free_extent(trans, root, bytenr,
2220                                                         blocksize,
2221                                                         root_owner,
2222                                                         root_gen, 0, 0, 1);
2223                                 BUG_ON(ret);
2224                                 continue;
2225                         }
2226                 }
2227                 WARN_ON(*level <= 0);
2228                 if (path->nodes[*level-1])
2229                         free_extent_buffer(path->nodes[*level-1]);
2230                 path->nodes[*level-1] = next;
2231                 *level = btrfs_header_level(next);
2232                 path->slots[*level] = 0;
2233         }
2234 out:
2235         WARN_ON(*level < 0);
2236         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2237
2238         if (path->nodes[*level] == root->node) {
2239                 root_owner = root->root_key.objectid;
2240                 parent = path->nodes[*level];
2241         } else {
2242                 parent = path->nodes[*level + 1];
2243                 root_owner = btrfs_header_owner(parent);
2244         }
2245
2246         root_gen = btrfs_header_generation(parent);
2247         ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
2248                                 path->nodes[*level]->len,
2249                                 root_owner, root_gen, 0, 0, 1);
2250         free_extent_buffer(path->nodes[*level]);
2251         path->nodes[*level] = NULL;
2252         *level += 1;
2253         BUG_ON(ret);
2254         return 0;
2255 }
2256
2257 /*
2258  * helper for dropping snapshots.  This walks back up the tree in the path
2259  * to find the first node higher up where we haven't yet gone through
2260  * all the slots
2261  */
2262 static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
2263                                  struct btrfs_root *root,
2264                                  struct btrfs_path *path, int *level)
2265 {
2266         u64 root_owner;
2267         u64 root_gen;
2268         struct btrfs_root_item *root_item = &root->root_item;
2269         int i;
2270         int slot;
2271         int ret;
2272
2273         for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2274                 slot = path->slots[i];
2275                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
2276                         struct extent_buffer *node;
2277                         struct btrfs_disk_key disk_key;
2278                         node = path->nodes[i];
2279                         path->slots[i]++;
2280                         *level = i;
2281                         WARN_ON(*level == 0);
2282                         btrfs_node_key(node, &disk_key, path->slots[i]);
2283                         memcpy(&root_item->drop_progress,
2284                                &disk_key, sizeof(disk_key));
2285                         root_item->drop_level = i;
2286                         return 0;
2287                 } else {
2288                         if (path->nodes[*level] == root->node) {
2289                                 root_owner = root->root_key.objectid;
2290                                 root_gen =
2291                                    btrfs_header_generation(path->nodes[*level]);
2292                         } else {
2293                                 struct extent_buffer *node;
2294                                 node = path->nodes[*level + 1];
2295                                 root_owner = btrfs_header_owner(node);
2296                                 root_gen = btrfs_header_generation(node);
2297                         }
2298                         ret = btrfs_free_extent(trans, root,
2299                                                 path->nodes[*level]->start,
2300                                                 path->nodes[*level]->len,
2301                                                 root_owner, root_gen, 0, 0, 1);
2302                         BUG_ON(ret);
2303                         free_extent_buffer(path->nodes[*level]);
2304                         path->nodes[*level] = NULL;
2305                         *level = i + 1;
2306                 }
2307         }
2308         return 1;
2309 }
2310
2311 /*
2312  * drop the reference count on the tree rooted at 'snap'.  This traverses
2313  * the tree freeing any blocks that have a ref count of zero after being
2314  * decremented.
2315  */
2316 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
2317                         *root)
2318 {
2319         int ret = 0;
2320         int wret;
2321         int level;
2322         struct btrfs_path *path;
2323         int i;
2324         int orig_level;
2325         struct btrfs_root_item *root_item = &root->root_item;
2326
2327         path = btrfs_alloc_path();
2328         BUG_ON(!path);
2329
2330         level = btrfs_header_level(root->node);
2331         orig_level = level;
2332         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2333                 path->nodes[level] = root->node;
2334                 extent_buffer_get(root->node);
2335                 path->slots[level] = 0;
2336         } else {
2337                 struct btrfs_key key;
2338                 struct btrfs_disk_key found_key;
2339                 struct extent_buffer *node;
2340
2341                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2342                 level = root_item->drop_level;
2343                 path->lowest_level = level;
2344                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2345                 if (wret < 0) {
2346                         ret = wret;
2347                         goto out;
2348                 }
2349                 node = path->nodes[level];
2350                 btrfs_node_key(node, &found_key, path->slots[level]);
2351                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
2352                                sizeof(found_key)));
2353         }
2354         while(1) {
2355                 wret = walk_down_tree(trans, root, path, &level);
2356                 if (wret > 0)
2357                         break;
2358                 if (wret < 0)
2359                         ret = wret;
2360
2361                 wret = walk_up_tree(trans, root, path, &level);
2362                 if (wret > 0)
2363                         break;
2364                 if (wret < 0)
2365                         ret = wret;
2366                 ret = -EAGAIN;
2367                 break;
2368         }
2369         for (i = 0; i <= orig_level; i++) {
2370                 if (path->nodes[i]) {
2371                         free_extent_buffer(path->nodes[i]);
2372                         path->nodes[i] = NULL;
2373                 }
2374         }
2375 out:
2376         btrfs_free_path(path);
2377         return ret;
2378 }
2379
2380 int btrfs_free_block_groups(struct btrfs_fs_info *info)
2381 {
2382         u64 start;
2383         u64 end;
2384         u64 ptr;
2385         int ret;
2386         while(1) {
2387                 ret = find_first_extent_bit(&info->block_group_cache, 0,
2388                                             &start, &end, (unsigned int)-1);
2389                 if (ret)
2390                         break;
2391                 ret = get_state_private(&info->block_group_cache, start, &ptr);
2392                 if (!ret)
2393                         kfree((void *)(unsigned long)ptr);
2394                 clear_extent_bits(&info->block_group_cache, start,
2395                                   end, (unsigned int)-1, GFP_NOFS);
2396         }
2397         while(1) {
2398                 ret = find_first_extent_bit(&info->free_space_cache, 0,
2399                                             &start, &end, EXTENT_DIRTY);
2400                 if (ret)
2401                         break;
2402                 clear_extent_dirty(&info->free_space_cache, start,
2403                                    end, GFP_NOFS);
2404         }
2405         return 0;
2406 }
2407
2408 static unsigned long calc_ra(unsigned long start, unsigned long last,
2409                              unsigned long nr)
2410 {
2411         return min(last, start + nr - 1);
2412 }
2413
2414 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
2415                                          u64 len)
2416 {
2417         u64 page_start;
2418         u64 page_end;
2419         unsigned long last_index;
2420         unsigned long i;
2421         struct page *page;
2422         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2423         struct file_ra_state *ra;
2424         unsigned long total_read = 0;
2425         unsigned long ra_pages;
2426         struct btrfs_trans_handle *trans;
2427
2428         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2429
2430         mutex_lock(&inode->i_mutex);
2431         i = start >> PAGE_CACHE_SHIFT;
2432         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2433
2434         ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
2435
2436         file_ra_state_init(ra, inode->i_mapping);
2437
2438         for (; i <= last_index; i++) {
2439                 if (total_read % ra_pages == 0) {
2440                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2441                                        calc_ra(i, last_index, ra_pages));
2442                 }
2443                 total_read++;
2444                 if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
2445                         goto truncate_racing;
2446
2447                 page = grab_cache_page(inode->i_mapping, i);
2448                 if (!page) {
2449                         goto out_unlock;
2450                 }
2451                 if (!PageUptodate(page)) {
2452                         btrfs_readpage(NULL, page);
2453                         lock_page(page);
2454                         if (!PageUptodate(page)) {
2455                                 unlock_page(page);
2456                                 page_cache_release(page);
2457                                 goto out_unlock;
2458                         }
2459                 }
2460 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
2461                 ClearPageDirty(page);
2462 #else
2463                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
2464 #endif
2465                 wait_on_page_writeback(page);
2466                 set_page_extent_mapped(page);
2467                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2468                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2469
2470                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2471
2472                 set_extent_delalloc(io_tree, page_start,
2473                                     page_end, GFP_NOFS);
2474                 set_page_dirty(page);
2475
2476                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2477                 unlock_page(page);
2478                 page_cache_release(page);
2479         }
2480         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2481                                            total_read);
2482
2483 out_unlock:
2484         kfree(ra);
2485         trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
2486         if (trans) {
2487                 btrfs_add_ordered_inode(inode);
2488                 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
2489                 mark_inode_dirty(inode);
2490         }
2491         mutex_unlock(&inode->i_mutex);
2492         return 0;
2493
2494 truncate_racing:
2495         vmtruncate(inode, inode->i_size);
2496         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2497                                            total_read);
2498         goto out_unlock;
2499 }
2500
2501 /*
2502  * The back references tell us which tree holds a ref on a block,
2503  * but it is possible for the tree root field in the reference to
2504  * reflect the original root before a snapshot was made.  In this
2505  * case we should search through all the children of a given root
2506  * to find potential holders of references on a block.
2507  *
2508  * Instead, we do something a little less fancy and just search
2509  * all the roots for a given key/block combination.
2510  */
2511 static int find_root_for_ref(struct btrfs_root *root,
2512                              struct btrfs_path *path,
2513                              struct btrfs_key *key0,
2514                              int level,
2515                              int file_key,
2516                              struct btrfs_root **found_root,
2517                              u64 bytenr)
2518 {
2519         struct btrfs_key root_location;
2520         struct btrfs_root *cur_root = *found_root;
2521         struct btrfs_file_extent_item *file_extent;
2522         u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
2523         u64 found_bytenr;
2524         int ret;
2525         int i;
2526
2527         root_location.offset = (u64)-1;
2528         root_location.type = BTRFS_ROOT_ITEM_KEY;
2529         path->lowest_level = level;
2530         path->reada = 0;
2531         while(1) {
2532                 ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
2533                 found_bytenr = 0;
2534                 if (ret == 0 && file_key) {
2535                         struct extent_buffer *leaf = path->nodes[0];
2536                         file_extent = btrfs_item_ptr(leaf, path->slots[0],
2537                                              struct btrfs_file_extent_item);
2538                         if (btrfs_file_extent_type(leaf, file_extent) ==
2539                             BTRFS_FILE_EXTENT_REG) {
2540                                 found_bytenr =
2541                                         btrfs_file_extent_disk_bytenr(leaf,
2542                                                                file_extent);
2543                        }
2544                 } else if (!file_key) {
2545                         if (path->nodes[level])
2546                                 found_bytenr = path->nodes[level]->start;
2547                 }
2548
2549                 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2550                         if (!path->nodes[i])
2551                                 break;
2552                         free_extent_buffer(path->nodes[i]);
2553                         path->nodes[i] = NULL;
2554                 }
2555                 btrfs_release_path(cur_root, path);
2556
2557                 if (found_bytenr == bytenr) {
2558                         *found_root = cur_root;
2559                         ret = 0;
2560                         goto out;
2561                 }
2562                 ret = btrfs_search_root(root->fs_info->tree_root,
2563                                         root_search_start, &root_search_start);
2564                 if (ret)
2565                         break;
2566
2567                 root_location.objectid = root_search_start;
2568                 cur_root = btrfs_read_fs_root_no_name(root->fs_info,
2569                                                       &root_location);
2570                 if (!cur_root) {
2571                         ret = 1;
2572                         break;
2573                 }
2574         }
2575 out:
2576         path->lowest_level = 0;
2577         return ret;
2578 }
2579
2580 /*
2581  * note, this releases the path
2582  */
2583 static int noinline relocate_one_reference(struct btrfs_root *extent_root,
2584                                   struct btrfs_path *path,
2585                                   struct btrfs_key *extent_key,
2586                                   u64 *last_file_objectid,
2587                                   u64 *last_file_offset,
2588                                   u64 *last_file_root,
2589                                   u64 last_extent)
2590 {
2591         struct inode *inode;
2592         struct btrfs_root *found_root;
2593         struct btrfs_key root_location;
2594         struct btrfs_key found_key;
2595         struct btrfs_extent_ref *ref;
2596         u64 ref_root;
2597         u64 ref_gen;
2598         u64 ref_objectid;
2599         u64 ref_offset;
2600         int ret;
2601         int level;
2602
2603         ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
2604                              struct btrfs_extent_ref);
2605         ref_root = btrfs_ref_root(path->nodes[0], ref);
2606         ref_gen = btrfs_ref_generation(path->nodes[0], ref);
2607         ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
2608         ref_offset = btrfs_ref_offset(path->nodes[0], ref);
2609         btrfs_release_path(extent_root, path);
2610
2611         root_location.objectid = ref_root;
2612         if (ref_gen == 0)
2613                 root_location.offset = 0;
2614         else
2615                 root_location.offset = (u64)-1;
2616         root_location.type = BTRFS_ROOT_ITEM_KEY;
2617
2618         found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
2619                                                 &root_location);
2620         BUG_ON(!found_root);
2621
2622         if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2623                 found_key.objectid = ref_objectid;
2624                 found_key.type = BTRFS_EXTENT_DATA_KEY;
2625                 found_key.offset = ref_offset;
2626                 level = 0;
2627
2628                 if (last_extent == extent_key->objectid &&
2629                     *last_file_objectid == ref_objectid &&
2630                     *last_file_offset == ref_offset &&
2631                     *last_file_root == ref_root)
2632                         goto out;
2633
2634                 ret = find_root_for_ref(extent_root, path, &found_key,
2635                                         level, 1, &found_root,
2636                                         extent_key->objectid);
2637
2638                 if (ret)
2639                         goto out;
2640
2641                 if (last_extent == extent_key->objectid &&
2642                     *last_file_objectid == ref_objectid &&
2643                     *last_file_offset == ref_offset &&
2644                     *last_file_root == ref_root)
2645                         goto out;
2646
2647                 mutex_unlock(&extent_root->fs_info->fs_mutex);
2648                 inode = btrfs_iget_locked(extent_root->fs_info->sb,
2649                                           ref_objectid, found_root);
2650                 if (inode->i_state & I_NEW) {
2651                         /* the inode and parent dir are two different roots */
2652                         BTRFS_I(inode)->root = found_root;
2653                         BTRFS_I(inode)->location.objectid = ref_objectid;
2654                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
2655                         BTRFS_I(inode)->location.offset = 0;
2656                         btrfs_read_locked_inode(inode);
2657                         unlock_new_inode(inode);
2658
2659                 }
2660                 /* this can happen if the reference is not against
2661                  * the latest version of the tree root
2662                  */
2663                 if (is_bad_inode(inode)) {
2664                         mutex_lock(&extent_root->fs_info->fs_mutex);
2665                         goto out;
2666                 }
2667                 *last_file_objectid = inode->i_ino;
2668                 *last_file_root = found_root->root_key.objectid;
2669                 *last_file_offset = ref_offset;
2670
2671                 relocate_inode_pages(inode, ref_offset, extent_key->offset);
2672                 iput(inode);
2673                 mutex_lock(&extent_root->fs_info->fs_mutex);
2674         } else {
2675                 struct btrfs_trans_handle *trans;
2676                 struct extent_buffer *eb;
2677                 int i;
2678
2679                 eb = read_tree_block(found_root, extent_key->objectid,
2680                                      extent_key->offset, 0);
2681                 level = btrfs_header_level(eb);
2682
2683                 if (level == 0)
2684                         btrfs_item_key_to_cpu(eb, &found_key, 0);
2685                 else
2686                         btrfs_node_key_to_cpu(eb, &found_key, 0);
2687
2688                 free_extent_buffer(eb);
2689
2690                 ret = find_root_for_ref(extent_root, path, &found_key,
2691                                         level, 0, &found_root,
2692                                         extent_key->objectid);
2693
2694                 if (ret)
2695                         goto out;
2696
2697                 trans = btrfs_start_transaction(found_root, 1);
2698
2699                 path->lowest_level = level;
2700                 path->reada = 2;
2701                 ret = btrfs_search_slot(trans, found_root, &found_key, path,
2702                                         0, 1);
2703                 path->lowest_level = 0;
2704                 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2705                         if (!path->nodes[i])
2706                                 break;
2707                         free_extent_buffer(path->nodes[i]);
2708                         path->nodes[i] = NULL;
2709                 }
2710                 btrfs_release_path(found_root, path);
2711                 if (found_root == found_root->fs_info->extent_root)
2712                         btrfs_extent_post_op(trans, found_root);
2713                 btrfs_end_transaction(trans, found_root);
2714         }
2715
2716 out:
2717         return 0;
2718 }
2719
2720 static int noinline del_extent_zero(struct btrfs_root *extent_root,
2721                                     struct btrfs_path *path,
2722                                     struct btrfs_key *extent_key)
2723 {
2724         int ret;
2725         struct btrfs_trans_handle *trans;
2726
2727         trans = btrfs_start_transaction(extent_root, 1);
2728         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
2729         if (ret > 0) {
2730                 ret = -EIO;
2731                 goto out;
2732         }
2733         if (ret < 0)
2734                 goto out;
2735         ret = btrfs_del_item(trans, extent_root, path);
2736 out:
2737         btrfs_end_transaction(trans, extent_root);
2738         return ret;
2739 }
2740
2741 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
2742                                         struct btrfs_path *path,
2743                                         struct btrfs_key *extent_key)
2744 {
2745         struct btrfs_key key;
2746         struct btrfs_key found_key;
2747         struct extent_buffer *leaf;
2748         u64 last_file_objectid = 0;
2749         u64 last_file_root = 0;
2750         u64 last_file_offset = (u64)-1;
2751         u64 last_extent = 0;
2752         u32 nritems;
2753         u32 item_size;
2754         int ret = 0;
2755
2756         if (extent_key->objectid == 0) {
2757                 ret = del_extent_zero(extent_root, path, extent_key);
2758                 goto out;
2759         }
2760         key.objectid = extent_key->objectid;
2761         key.type = BTRFS_EXTENT_REF_KEY;
2762         key.offset = 0;
2763
2764         while(1) {
2765                 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2766
2767                 if (ret < 0)
2768                         goto out;
2769
2770                 ret = 0;
2771                 leaf = path->nodes[0];
2772                 nritems = btrfs_header_nritems(leaf);
2773                 if (path->slots[0] == nritems) {
2774                         ret = btrfs_next_leaf(extent_root, path);
2775                         if (ret > 0) {
2776                                 ret = 0;
2777                                 goto out;
2778                         }
2779                         if (ret < 0)
2780                                 goto out;
2781                         leaf = path->nodes[0];
2782                 }
2783
2784                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2785                 if (found_key.objectid != extent_key->objectid) {
2786                         break;
2787                 }
2788
2789                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
2790                         break;
2791                 }
2792
2793                 key.offset = found_key.offset + 1;
2794                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2795
2796                 ret = relocate_one_reference(extent_root, path, extent_key,
2797                                              &last_file_objectid,
2798                                              &last_file_offset,
2799                                              &last_file_root, last_extent);
2800                 if (ret)
2801                         goto out;
2802                 last_extent = extent_key->objectid;
2803         }
2804         ret = 0;
2805 out:
2806         btrfs_release_path(extent_root, path);
2807         return ret;
2808 }
2809
2810 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
2811 {
2812         u64 num_devices;
2813         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
2814                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
2815
2816         num_devices = root->fs_info->fs_devices->num_devices;
2817         if (num_devices == 1) {
2818                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2819                 stripped = flags & ~stripped;
2820
2821                 /* turn raid0 into single device chunks */
2822                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
2823                         return stripped;
2824
2825                 /* turn mirroring into duplication */
2826                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2827                              BTRFS_BLOCK_GROUP_RAID10))
2828                         return stripped | BTRFS_BLOCK_GROUP_DUP;
2829                 return flags;
2830         } else {
2831                 /* they already had raid on here, just return */
2832                 if (flags & stripped)
2833                         return flags;
2834
2835                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2836                 stripped = flags & ~stripped;
2837
2838                 /* switch duplicated blocks with raid1 */
2839                 if (flags & BTRFS_BLOCK_GROUP_DUP)
2840                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
2841
2842                 /* turn single device chunks into raid0 */
2843                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
2844         }
2845         return flags;
2846 }
2847
2848 int __alloc_chunk_for_shrink(struct btrfs_root *root,
2849                      struct btrfs_block_group_cache *shrink_block_group,
2850                      int force)
2851 {
2852         struct btrfs_trans_handle *trans;
2853         u64 new_alloc_flags;
2854         u64 calc;
2855
2856         if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
2857
2858                 trans = btrfs_start_transaction(root, 1);
2859                 new_alloc_flags = update_block_group_flags(root,
2860                                                    shrink_block_group->flags);
2861                 if (new_alloc_flags != shrink_block_group->flags) {
2862                         calc =
2863                              btrfs_block_group_used(&shrink_block_group->item);
2864                 } else {
2865                         calc = shrink_block_group->key.offset;
2866                 }
2867                 do_chunk_alloc(trans, root->fs_info->extent_root,
2868                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
2869                 btrfs_end_transaction(trans, root);
2870         }
2871         return 0;
2872 }
2873
2874 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
2875 {
2876         struct btrfs_trans_handle *trans;
2877         struct btrfs_root *tree_root = root->fs_info->tree_root;
2878         struct btrfs_path *path;
2879         u64 cur_byte;
2880         u64 total_found;
2881         u64 shrink_last_byte;
2882         struct btrfs_block_group_cache *shrink_block_group;
2883         struct btrfs_fs_info *info = root->fs_info;
2884         struct btrfs_key key;
2885         struct btrfs_key found_key;
2886         struct extent_buffer *leaf;
2887         u32 nritems;
2888         int ret;
2889         int progress;
2890
2891         shrink_block_group = btrfs_lookup_block_group(root->fs_info,
2892                                                       shrink_start);
2893         BUG_ON(!shrink_block_group);
2894
2895         shrink_last_byte = shrink_block_group->key.objectid +
2896                 shrink_block_group->key.offset;
2897
2898         shrink_block_group->space_info->total_bytes -=
2899                 shrink_block_group->key.offset;
2900         path = btrfs_alloc_path();
2901         root = root->fs_info->extent_root;
2902         path->reada = 2;
2903
2904         printk("btrfs relocating block group %llu flags %llu\n",
2905                (unsigned long long)shrink_start,
2906                (unsigned long long)shrink_block_group->flags);
2907
2908         __alloc_chunk_for_shrink(root, shrink_block_group, 1);
2909
2910 again:
2911
2912         shrink_block_group->ro = 1;
2913
2914         total_found = 0;
2915         progress = 0;
2916         key.objectid = shrink_start;
2917         key.offset = 0;
2918         key.type = 0;
2919         cur_byte = key.objectid;
2920
2921         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2922         if (ret < 0)
2923                 goto out;
2924
2925         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
2926         if (ret < 0)
2927                 goto out;
2928
2929         if (ret == 0) {
2930                 leaf = path->nodes[0];
2931                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2932                 if (found_key.objectid + found_key.offset > shrink_start &&
2933                     found_key.objectid < shrink_last_byte) {
2934                         cur_byte = found_key.objectid;
2935                         key.objectid = cur_byte;
2936                 }
2937         }
2938         btrfs_release_path(root, path);
2939
2940         while(1) {
2941                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2942                 if (ret < 0)
2943                         goto out;
2944
2945                 leaf = path->nodes[0];
2946                 nritems = btrfs_header_nritems(leaf);
2947 next:
2948                 if (path->slots[0] >= nritems) {
2949                         ret = btrfs_next_leaf(root, path);
2950                         if (ret < 0)
2951                                 goto out;
2952                         if (ret == 1) {
2953                                 ret = 0;
2954                                 break;
2955                         }
2956                         leaf = path->nodes[0];
2957                         nritems = btrfs_header_nritems(leaf);
2958                 }
2959
2960                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2961
2962                 if (found_key.objectid >= shrink_last_byte)
2963                         break;
2964
2965                 if (progress && need_resched()) {
2966                         memcpy(&key, &found_key, sizeof(key));
2967                         mutex_unlock(&root->fs_info->fs_mutex);
2968                         cond_resched();
2969                         mutex_lock(&root->fs_info->fs_mutex);
2970                         btrfs_release_path(root, path);
2971                         btrfs_search_slot(NULL, root, &key, path, 0, 0);
2972                         progress = 0;
2973                         goto next;
2974                 }
2975                 progress = 1;
2976
2977                 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
2978                     found_key.objectid + found_key.offset <= cur_byte) {
2979                         memcpy(&key, &found_key, sizeof(key));
2980                         key.offset++;
2981                         path->slots[0]++;
2982                         goto next;
2983                 }
2984
2985                 total_found++;
2986                 cur_byte = found_key.objectid + found_key.offset;
2987                 key.objectid = cur_byte;
2988                 btrfs_release_path(root, path);
2989                 ret = relocate_one_extent(root, path, &found_key);
2990                 __alloc_chunk_for_shrink(root, shrink_block_group, 0);
2991         }
2992
2993         btrfs_release_path(root, path);
2994
2995         if (total_found > 0) {
2996                 printk("btrfs relocate found %llu last extent was %llu\n",
2997                        (unsigned long long)total_found,
2998                        (unsigned long long)found_key.objectid);
2999                 trans = btrfs_start_transaction(tree_root, 1);
3000                 btrfs_commit_transaction(trans, tree_root);
3001
3002                 mutex_unlock(&root->fs_info->fs_mutex);
3003                 btrfs_clean_old_snapshots(tree_root);
3004                 mutex_lock(&root->fs_info->fs_mutex);
3005
3006                 trans = btrfs_start_transaction(tree_root, 1);
3007                 btrfs_commit_transaction(trans, tree_root);
3008                 goto again;
3009         }
3010
3011         /*
3012          * we've freed all the extents, now remove the block
3013          * group item from the tree
3014          */
3015         trans = btrfs_start_transaction(root, 1);
3016         memcpy(&key, &shrink_block_group->key, sizeof(key));
3017
3018         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3019         if (ret > 0)
3020                 ret = -EIO;
3021         if (ret < 0)
3022                 goto out;
3023
3024         clear_extent_bits(&info->block_group_cache, key.objectid,
3025                           key.objectid + key.offset - 1,
3026                           (unsigned int)-1, GFP_NOFS);
3027
3028
3029         clear_extent_bits(&info->free_space_cache,
3030                            key.objectid, key.objectid + key.offset - 1,
3031                            (unsigned int)-1, GFP_NOFS);
3032
3033         memset(shrink_block_group, 0, sizeof(*shrink_block_group));
3034         kfree(shrink_block_group);
3035
3036         btrfs_del_item(trans, root, path);
3037         btrfs_commit_transaction(trans, root);
3038
3039         /* the code to unpin extents might set a few bits in the free
3040          * space cache for this range again
3041          */
3042         clear_extent_bits(&info->free_space_cache,
3043                            key.objectid, key.objectid + key.offset - 1,
3044                            (unsigned int)-1, GFP_NOFS);
3045 out:
3046         btrfs_free_path(path);
3047         return ret;
3048 }
3049
3050 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
3051                            struct btrfs_key *key)
3052 {
3053         int ret;
3054         struct btrfs_key found_key;
3055         struct extent_buffer *leaf;
3056         int slot;
3057
3058         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
3059         if (ret < 0)
3060                 return ret;
3061         while(1) {
3062                 slot = path->slots[0];
3063                 leaf = path->nodes[0];
3064                 if (slot >= btrfs_header_nritems(leaf)) {
3065                         ret = btrfs_next_leaf(root, path);
3066                         if (ret == 0)
3067                                 continue;
3068                         if (ret < 0)
3069                                 goto error;
3070                         break;
3071                 }
3072                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3073
3074                 if (found_key.objectid >= key->objectid &&
3075                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
3076                         return 0;
3077                 path->slots[0]++;
3078         }
3079         ret = -ENOENT;
3080 error:
3081         return ret;
3082 }
3083
3084 int btrfs_read_block_groups(struct btrfs_root *root)
3085 {
3086         struct btrfs_path *path;
3087         int ret;
3088         int bit;
3089         struct btrfs_block_group_cache *cache;
3090         struct btrfs_fs_info *info = root->fs_info;
3091         struct btrfs_space_info *space_info;
3092         struct extent_io_tree *block_group_cache;
3093         struct btrfs_key key;
3094         struct btrfs_key found_key;
3095         struct extent_buffer *leaf;
3096
3097         block_group_cache = &info->block_group_cache;
3098         root = info->extent_root;
3099         key.objectid = 0;
3100         key.offset = 0;
3101         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3102         path = btrfs_alloc_path();
3103         if (!path)
3104                 return -ENOMEM;
3105
3106         while(1) {
3107                 ret = find_first_block_group(root, path, &key);
3108                 if (ret > 0) {
3109                         ret = 0;
3110                         goto error;
3111                 }
3112                 if (ret != 0)
3113                         goto error;
3114
3115                 leaf = path->nodes[0];
3116                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3117                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
3118                 if (!cache) {
3119                         ret = -ENOMEM;
3120                         break;
3121                 }
3122
3123                 read_extent_buffer(leaf, &cache->item,
3124                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
3125                                    sizeof(cache->item));
3126                 memcpy(&cache->key, &found_key, sizeof(found_key));
3127
3128                 key.objectid = found_key.objectid + found_key.offset;
3129                 btrfs_release_path(root, path);
3130                 cache->flags = btrfs_block_group_flags(&cache->item);
3131                 bit = 0;
3132                 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3133                         bit = BLOCK_GROUP_DATA;
3134                 } else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3135                         bit = BLOCK_GROUP_SYSTEM;
3136                 } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
3137                         bit = BLOCK_GROUP_METADATA;
3138                 }
3139                 set_avail_alloc_bits(info, cache->flags);
3140
3141                 ret = update_space_info(info, cache->flags, found_key.offset,
3142                                         btrfs_block_group_used(&cache->item),
3143                                         &space_info);
3144                 BUG_ON(ret);
3145                 cache->space_info = space_info;
3146
3147                 /* use EXTENT_LOCKED to prevent merging */
3148                 set_extent_bits(block_group_cache, found_key.objectid,
3149                                 found_key.objectid + found_key.offset - 1,
3150                                 bit | EXTENT_LOCKED, GFP_NOFS);
3151                 set_state_private(block_group_cache, found_key.objectid,
3152                                   (unsigned long)cache);
3153
3154                 if (key.objectid >=
3155                     btrfs_super_total_bytes(&info->super_copy))
3156                         break;
3157         }
3158         ret = 0;
3159 error:
3160         btrfs_free_path(path);
3161         return ret;
3162 }
3163
3164 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3165                            struct btrfs_root *root, u64 bytes_used,
3166                            u64 type, u64 chunk_objectid, u64 chunk_offset,
3167                            u64 size)
3168 {
3169         int ret;
3170         int bit = 0;
3171         struct btrfs_root *extent_root;
3172         struct btrfs_block_group_cache *cache;
3173         struct extent_io_tree *block_group_cache;
3174
3175         extent_root = root->fs_info->extent_root;
3176         block_group_cache = &root->fs_info->block_group_cache;
3177
3178         cache = kzalloc(sizeof(*cache), GFP_NOFS);
3179         BUG_ON(!cache);
3180         cache->key.objectid = chunk_offset;
3181         cache->key.offset = size;
3182         btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3183
3184         btrfs_set_block_group_used(&cache->item, bytes_used);
3185         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
3186         cache->flags = type;
3187         btrfs_set_block_group_flags(&cache->item, type);
3188
3189         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
3190                                 &cache->space_info);
3191         BUG_ON(ret);
3192
3193         bit = block_group_state_bits(type);
3194         set_extent_bits(block_group_cache, chunk_offset,
3195                         chunk_offset + size - 1,
3196                         bit | EXTENT_LOCKED, GFP_NOFS);
3197
3198         set_state_private(block_group_cache, chunk_offset,
3199                           (unsigned long)cache);
3200         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
3201                                 sizeof(cache->item));
3202         BUG_ON(ret);
3203
3204         finish_current_insert(trans, extent_root);
3205         ret = del_pending_extents(trans, extent_root);
3206         BUG_ON(ret);
3207         set_avail_alloc_bits(extent_root->fs_info, type);
3208         return 0;
3209 }