fs/btrfs/ctree.c (from linux-2.6-block.git, at commit "Btrfs: Seed device support")
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include "ctree.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "print-tree.h"
24 #include "locking.h"
25
26 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27                       *root, struct btrfs_path *path, int level);
28 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_key *ins_key,
30                       struct btrfs_path *path, int data_size, int extend);
31 static int push_node_left(struct btrfs_trans_handle *trans,
32                           struct btrfs_root *root, struct extent_buffer *dst,
33                           struct extent_buffer *src, int empty);
34 static int balance_node_right(struct btrfs_trans_handle *trans,
35                               struct btrfs_root *root,
36                               struct extent_buffer *dst_buf,
37                               struct extent_buffer *src_buf);
38 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39                    struct btrfs_path *path, int level, int slot);
40
41 inline void btrfs_init_path(struct btrfs_path *p)
42 {
43         memset(p, 0, sizeof(*p));
44 }
45
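/* allocate a path from the slab cache; readahead is enabled by default */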
46 struct btrfs_path *btrfs_alloc_path(void)
47 {
48         struct btrfs_path *path;
49         path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50         if (path) {
51                 btrfs_init_path(path);
52                 path->reada = 1;
53         }
54         return path;
55 }
56
57 /* this also releases the path */
58 void btrfs_free_path(struct btrfs_path *p)
59 {
60         btrfs_release_path(NULL, p);
61         kmem_cache_free(btrfs_path_cachep, p);
62 }
63
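/*
 * Typical call pattern for the path helpers (an illustrative sketch only,
 * not code taken from this file; error handling of the search is trimmed):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 *	... examine path->nodes[0] and path->slots[0] ...
 *	btrfs_free_path(path);
 */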
64 /*
65  * path release drops references on the extent buffers in the path
66  * and it drops any locks held by this path
67  *
68  * It is safe to call this on paths that hold no locks or extent buffers.
69  */
70 void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71 {
72         int i;
73
74         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75                 p->slots[i] = 0;
76                 if (!p->nodes[i])
77                         continue;
78                 if (p->locks[i]) {
79                         btrfs_tree_unlock(p->nodes[i]);
80                         p->locks[i] = 0;
81                 }
82                 free_extent_buffer(p->nodes[i]);
83                 p->nodes[i] = NULL;
84         }
85 }
86
87 /*
88  * safely gets a reference on the root node of a tree.  A lock
89  * is not taken, so a concurrent writer may put a different node
90  * at the root of the tree.  See btrfs_lock_root_node for the
91  * looping required.
92  *
93  * The extent buffer returned by this has a reference taken, so
94  * it won't disappear.  It may stop being the root of the tree
95  * at any time because there are no locks held.
96  */
97 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98 {
99         struct extent_buffer *eb;
100         spin_lock(&root->node_lock);
101         eb = root->node;
102         extent_buffer_get(eb);
103         spin_unlock(&root->node_lock);
104         return eb;
105 }
106
107 /* loop around taking references on and locking the root node of the
108  * tree until you end up with a lock on the root.  A locked buffer
109  * is returned, with a reference held.
110  */
111 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112 {
113         struct extent_buffer *eb;
114
115         while (1) {
116                 eb = btrfs_root_node(root);
117                 btrfs_tree_lock(eb);
118
119                 spin_lock(&root->node_lock);
120                 if (eb == root->node) {
121                         spin_unlock(&root->node_lock);
122                         break;
123                 }
124                 spin_unlock(&root->node_lock);
125
126                 btrfs_tree_unlock(eb);
127                 free_extent_buffer(eb);
128         }
129         return eb;
130 }
131
132 /* cowonly roots (everything not a reference counted cow subvolume) just get
133  * put onto a simple dirty list.  transaction.c walks this to make sure they
134  * get properly updated on disk.
135  */
136 static void add_root_to_dirty_list(struct btrfs_root *root)
137 {
138         if (root->track_dirty && list_empty(&root->dirty_list)) {
139                 list_add(&root->dirty_list,
140                          &root->fs_info->dirty_cowonly_roots);
141         }
142 }
143
144 /*
145  * used by snapshot creation to make a copy of a root for a tree with
146  * a given objectid.  The buffer with the new root node is returned in
147  * cow_ret, and this func returns zero on success or a negative error code.
148  */
149 int btrfs_copy_root(struct btrfs_trans_handle *trans,
150                       struct btrfs_root *root,
151                       struct extent_buffer *buf,
152                       struct extent_buffer **cow_ret, u64 new_root_objectid)
153 {
154         struct extent_buffer *cow;
155         u32 nritems;
156         int ret = 0;
157         int level;
158         struct btrfs_root *new_root;
159
160         new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161         if (!new_root)
162                 return -ENOMEM;
163
164         memcpy(new_root, root, sizeof(*new_root));
165         new_root->root_key.objectid = new_root_objectid;
166
167         WARN_ON(root->ref_cows && trans->transid !=
168                 root->fs_info->running_transaction->transid);
169         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171         level = btrfs_header_level(buf);
172         nritems = btrfs_header_nritems(buf);
173
174         cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175                                      new_root_objectid, trans->transid,
176                                      level, buf->start, 0);
177         if (IS_ERR(cow)) {
178                 kfree(new_root);
179                 return PTR_ERR(cow);
180         }
181
182         copy_extent_buffer(cow, buf, 0, 0, cow->len);
183         btrfs_set_header_bytenr(cow, cow->start);
184         btrfs_set_header_generation(cow, trans->transid);
185         btrfs_set_header_owner(cow, new_root_objectid);
186         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188         write_extent_buffer(cow, root->fs_info->fsid,
189                             (unsigned long)btrfs_header_fsid(cow),
190                             BTRFS_FSID_SIZE);
191
192         WARN_ON(btrfs_header_generation(buf) > trans->transid);
193         ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194         kfree(new_root);
195
196         if (ret)
197                 return ret;
198
199         btrfs_mark_buffer_dirty(cow);
200         *cow_ret = cow;
201         return 0;
202 }
203
204 /*
205  * does the dirty work in cow of a single block.  The parent block
206  * (if supplied) is updated to point to the new cow copy.  The new
207  * buffer is marked dirty and returned locked.  If you modify the block
208  * it needs to be marked dirty again.
209  *
210  * search_start -- an allocation hint for the new block
211  *
212  * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
213  * the allocator should try to find free next to the block it returns.  This is
214  * just a hint and may be ignored by the allocator.
215  *
216  * prealloc_dest -- if you have already reserved a destination for the cow,
217  * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
218  * is used to finish the allocation.
219  */
220 int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
221                              struct btrfs_root *root,
222                              struct extent_buffer *buf,
223                              struct extent_buffer *parent, int parent_slot,
224                              struct extent_buffer **cow_ret,
225                              u64 search_start, u64 empty_size,
226                              u64 prealloc_dest)
227 {
228         u64 parent_start;
229         struct extent_buffer *cow;
230         u32 nritems;
231         int ret = 0;
232         int level;
233         int unlock_orig = 0;
234
235         if (*cow_ret == buf)
236                 unlock_orig = 1;
237
238         WARN_ON(!btrfs_tree_locked(buf));
239
240         if (parent)
241                 parent_start = parent->start;
242         else
243                 parent_start = 0;
244
245         WARN_ON(root->ref_cows && trans->transid !=
246                 root->fs_info->running_transaction->transid);
247         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249         level = btrfs_header_level(buf);
250         nritems = btrfs_header_nritems(buf);
251
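        /*
         * if the caller already reserved a destination block, finish that
         * reservation and reuse it; otherwise allocate a fresh block near
         * search_start
         */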
252         if (prealloc_dest) {
253                 struct btrfs_key ins;
254
255                 ins.objectid = prealloc_dest;
256                 ins.offset = buf->len;
257                 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259                 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260                                                   root->root_key.objectid,
261                                                   trans->transid, level, &ins);
262                 BUG_ON(ret);
263                 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264                                             buf->len);
265         } else {
266                 cow = btrfs_alloc_free_block(trans, root, buf->len,
267                                              parent_start,
268                                              root->root_key.objectid,
269                                              trans->transid, level,
270                                              search_start, empty_size);
271         }
272         if (IS_ERR(cow))
273                 return PTR_ERR(cow);
274
275         copy_extent_buffer(cow, buf, 0, 0, cow->len);
276         btrfs_set_header_bytenr(cow, cow->start);
277         btrfs_set_header_generation(cow, trans->transid);
278         btrfs_set_header_owner(cow, root->root_key.objectid);
279         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281         write_extent_buffer(cow, root->fs_info->fsid,
282                             (unsigned long)btrfs_header_fsid(cow),
283                             BTRFS_FSID_SIZE);
284
285         WARN_ON(btrfs_header_generation(buf) > trans->transid);
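        /*
         * a block last cow'd in an earlier transaction still shares its
         * pointers with the old copy, so the new copy must take its own
         * references on everything it points to
         */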
286         if (btrfs_header_generation(buf) != trans->transid) {
287                 u32 nr_extents;
288                 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289                 if (ret)
290                         return ret;
291
292                 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293                 WARN_ON(ret);
294         } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295                 /*
296                  * There are only two places that can drop reference to
297                  * tree blocks owned by living reloc trees, one is here,
298                  * the other place is btrfs_drop_subtree. In both places,
299                  * we check reference count while tree block is locked.
300                  * Furthermore, if reference count is one, it won't get
301                  * increased by someone else.
302                  */
303                 u32 refs;
304                 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305                                               buf->len, &refs);
306                 BUG_ON(ret);
307                 if (refs == 1) {
308                         ret = btrfs_update_ref(trans, root, buf, cow,
309                                                0, nritems);
310                         clean_tree_block(trans, root, buf);
311                 } else {
312                         ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313                 }
314                 BUG_ON(ret);
315         } else {
316                 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317                 if (ret)
318                         return ret;
319                 clean_tree_block(trans, root, buf);
320         }
321
322         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323                 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324                 WARN_ON(ret);
325         }
326
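        /*
         * if the root itself was cow'd, swap in the new copy under node_lock;
         * otherwise just repoint the parent slot at the new block
         */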
327         if (buf == root->node) {
328                 WARN_ON(parent && parent != buf);
329
330                 spin_lock(&root->node_lock);
331                 root->node = cow;
332                 extent_buffer_get(cow);
333                 spin_unlock(&root->node_lock);
334
335                 if (buf != root->commit_root) {
336                         btrfs_free_extent(trans, root, buf->start,
337                                           buf->len, buf->start,
338                                           root->root_key.objectid,
339                                           btrfs_header_generation(buf),
340                                           level, 1);
341                 }
342                 free_extent_buffer(buf);
343                 add_root_to_dirty_list(root);
344         } else {
345                 btrfs_set_node_blockptr(parent, parent_slot,
346                                         cow->start);
347                 WARN_ON(trans->transid == 0);
348                 btrfs_set_node_ptr_generation(parent, parent_slot,
349                                               trans->transid);
350                 btrfs_mark_buffer_dirty(parent);
351                 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352                 btrfs_free_extent(trans, root, buf->start, buf->len,
353                                   parent_start, btrfs_header_owner(parent),
354                                   btrfs_header_generation(parent), level, 1);
355         }
356         if (unlock_orig)
357                 btrfs_tree_unlock(buf);
358         free_extent_buffer(buf);
359         btrfs_mark_buffer_dirty(cow);
360         *cow_ret = cow;
361         return 0;
362 }
363
364 /*
365  * cows a single block, see __btrfs_cow_block for the real work.
366  * This version of it has extra checks so that a block isn't cow'd more than
367  * once per transaction, as long as it hasn't been written yet
368  */
369 int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
370                     struct btrfs_root *root, struct extent_buffer *buf,
371                     struct extent_buffer *parent, int parent_slot,
372                     struct extent_buffer **cow_ret, u64 prealloc_dest)
373 {
374         u64 search_start;
375         int ret;
376
377         if (trans->transaction != root->fs_info->running_transaction) {
378                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
379                        root->fs_info->running_transaction->transid);
380                 WARN_ON(1);
381         }
382         if (trans->transid != root->fs_info->generation) {
383                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
384                        root->fs_info->generation);
385                 WARN_ON(1);
386         }
387
388         spin_lock(&root->fs_info->hash_lock);
389         if (btrfs_header_generation(buf) == trans->transid &&
390             btrfs_header_owner(buf) == root->root_key.objectid &&
391             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
392                 *cow_ret = buf;
393                 spin_unlock(&root->fs_info->hash_lock);
394                 WARN_ON(prealloc_dest);
395                 return 0;
396         }
397         spin_unlock(&root->fs_info->hash_lock);
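        /* hint the allocator toward the 1GB-aligned chunk the block lives in */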
398         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
399         ret = __btrfs_cow_block(trans, root, buf, parent,
400                                  parent_slot, cow_ret, search_start, 0,
401                                  prealloc_dest);
402         return ret;
403 }
404
405 /*
406  * helper function for defrag to decide if two blocks pointed to by a
407  * node are actually close by
408  */
409 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
410 {
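        /* two blocks within 32k of each other are considered close enough */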
411         if (blocknr < other && other - (blocknr + blocksize) < 32768)
412                 return 1;
413         if (blocknr > other && blocknr - (other + blocksize) < 32768)
414                 return 1;
415         return 0;
416 }
417
418 /*
419  * compare two keys in a memcmp fashion
420  */
421 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
422 {
423         struct btrfs_key k1;
424
425         btrfs_disk_key_to_cpu(&k1, disk);
426
427         if (k1.objectid > k2->objectid)
428                 return 1;
429         if (k1.objectid < k2->objectid)
430                 return -1;
431         if (k1.type > k2->type)
432                 return 1;
433         if (k1.type < k2->type)
434                 return -1;
435         if (k1.offset > k2->offset)
436                 return 1;
437         if (k1.offset < k2->offset)
438                 return -1;
439         return 0;
440 }
441
442 /*
443  * same as comp_keys only with two btrfs_key's
444  */
445 static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
446 {
447         if (k1->objectid > k2->objectid)
448                 return 1;
449         if (k1->objectid < k2->objectid)
450                 return -1;
451         if (k1->type > k2->type)
452                 return 1;
453         if (k1->type < k2->type)
454                 return -1;
455         if (k1->offset > k2->offset)
456                 return 1;
457         if (k1->offset < k2->offset)
458                 return -1;
459         return 0;
460 }
461
462 /*
463  * this is used by the defrag code to go through all the
464  * leaves pointed to by a node and reallocate them so that
465  * disk order is close to key order
466  */
467 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
468                        struct btrfs_root *root, struct extent_buffer *parent,
469                        int start_slot, int cache_only, u64 *last_ret,
470                        struct btrfs_key *progress)
471 {
472         struct extent_buffer *cur;
473         u64 blocknr;
474         u64 gen;
475         u64 search_start = *last_ret;
476         u64 last_block = 0;
477         u64 other;
478         u32 parent_nritems;
479         int end_slot;
480         int i;
481         int err = 0;
482         int parent_level;
483         int uptodate;
484         u32 blocksize;
485         int progress_passed = 0;
486         struct btrfs_disk_key disk_key;
487
488         parent_level = btrfs_header_level(parent);
489         if (cache_only && parent_level != 1)
490                 return 0;
491
492         if (trans->transaction != root->fs_info->running_transaction) {
493                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
494                        root->fs_info->running_transaction->transid);
495                 WARN_ON(1);
496         }
497         if (trans->transid != root->fs_info->generation) {
498                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
499                        root->fs_info->generation);
500                 WARN_ON(1);
501         }
502
503         parent_nritems = btrfs_header_nritems(parent);
504         blocksize = btrfs_level_size(root, parent_level - 1);
505         end_slot = parent_nritems;
506
507         if (parent_nritems == 1)
508                 return 0;
509
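        /*
         * walk the pointers in this node and cow any child block that is not
         * already close to its neighbors on disk
         */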
510         for (i = start_slot; i < end_slot; i++) {
511                 int close = 1;
512
513                 if (!parent->map_token) {
514                         map_extent_buffer(parent,
515                                         btrfs_node_key_ptr_offset(i),
516                                         sizeof(struct btrfs_key_ptr),
517                                         &parent->map_token, &parent->kaddr,
518                                         &parent->map_start, &parent->map_len,
519                                         KM_USER1);
520                 }
521                 btrfs_node_key(parent, &disk_key, i);
522                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
523                         continue;
524
525                 progress_passed = 1;
526                 blocknr = btrfs_node_blockptr(parent, i);
527                 gen = btrfs_node_ptr_generation(parent, i);
528                 if (last_block == 0)
529                         last_block = blocknr;
530
531                 if (i > 0) {
532                         other = btrfs_node_blockptr(parent, i - 1);
533                         close = close_blocks(blocknr, other, blocksize);
534                 }
535                 if (!close && i < end_slot - 2) {
536                         other = btrfs_node_blockptr(parent, i + 1);
537                         close = close_blocks(blocknr, other, blocksize);
538                 }
539                 if (close) {
540                         last_block = blocknr;
541                         continue;
542                 }
543                 if (parent->map_token) {
544                         unmap_extent_buffer(parent, parent->map_token,
545                                             KM_USER1);
546                         parent->map_token = NULL;
547                 }
548
549                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
550                 if (cur)
551                         uptodate = btrfs_buffer_uptodate(cur, gen);
552                 else
553                         uptodate = 0;
554                 if (!cur || !uptodate) {
555                         if (cache_only) {
556                                 free_extent_buffer(cur);
557                                 continue;
558                         }
559                         if (!cur) {
560                                 cur = read_tree_block(root, blocknr,
561                                                          blocksize, gen);
562                         } else if (!uptodate) {
563                                 btrfs_read_buffer(cur, gen);
564                         }
565                 }
566                 if (search_start == 0)
567                         search_start = last_block;
568
569                 btrfs_tree_lock(cur);
570                 err = __btrfs_cow_block(trans, root, cur, parent, i,
571                                         &cur, search_start,
572                                         min(16 * blocksize,
573                                             (end_slot - i) * blocksize), 0);
574                 if (err) {
575                         btrfs_tree_unlock(cur);
576                         free_extent_buffer(cur);
577                         break;
578                 }
579                 search_start = cur->start;
580                 last_block = cur->start;
581                 *last_ret = search_start;
582                 btrfs_tree_unlock(cur);
583                 free_extent_buffer(cur);
584         }
585         if (parent->map_token) {
586                 unmap_extent_buffer(parent, parent->map_token,
587                                     KM_USER1);
588                 parent->map_token = NULL;
589         }
590         return err;
591 }
592
593 /*
594  * The leaf data grows from end-to-front in the node.
595  * this returns the address of the start of the last item,
596  * which is the stop of the leaf data stack
597  */
598 static inline unsigned int leaf_data_end(struct btrfs_root *root,
599                                          struct extent_buffer *leaf)
600 {
601         u32 nr = btrfs_header_nritems(leaf);
602         if (nr == 0)
603                 return BTRFS_LEAF_DATA_SIZE(root);
604         return btrfs_item_offset_nr(leaf, nr - 1);
605 }
606
607 /*
608  * extra debugging checks to make sure all the items in a node are
609  * well formed and in the proper order
610  */
611 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
612                       int level)
613 {
614         struct extent_buffer *parent = NULL;
615         struct extent_buffer *node = path->nodes[level];
616         struct btrfs_disk_key parent_key;
617         struct btrfs_disk_key node_key;
618         int parent_slot;
619         int slot;
620         struct btrfs_key cpukey;
621         u32 nritems = btrfs_header_nritems(node);
622
623         if (path->nodes[level + 1])
624                 parent = path->nodes[level + 1];
625
626         slot = path->slots[level];
627         BUG_ON(nritems == 0);
628         if (parent) {
629                 parent_slot = path->slots[level + 1];
630                 btrfs_node_key(parent, &parent_key, parent_slot);
631                 btrfs_node_key(node, &node_key, 0);
632                 BUG_ON(memcmp(&parent_key, &node_key,
633                               sizeof(struct btrfs_disk_key)));
634                 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
635                        btrfs_header_bytenr(node));
636         }
637         BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
638         if (slot != 0) {
639                 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
640                 btrfs_node_key(node, &node_key, slot);
641                 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
642         }
643         if (slot < nritems - 1) {
644                 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
645                 btrfs_node_key(node, &node_key, slot);
646                 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
647         }
648         return 0;
649 }
650
651 /*
652  * extra checking to make sure all the items in a leaf are
653  * well formed and in the proper order
654  */
655 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
656                       int level)
657 {
658         struct extent_buffer *leaf = path->nodes[level];
659         struct extent_buffer *parent = NULL;
660         int parent_slot;
661         struct btrfs_key cpukey;
662         struct btrfs_disk_key parent_key;
663         struct btrfs_disk_key leaf_key;
664         int slot = path->slots[0];
665
666         u32 nritems = btrfs_header_nritems(leaf);
667
668         if (path->nodes[level + 1])
669                 parent = path->nodes[level + 1];
670
671         if (nritems == 0)
672                 return 0;
673
674         if (parent) {
675                 parent_slot = path->slots[level + 1];
676                 btrfs_node_key(parent, &parent_key, parent_slot);
677                 btrfs_item_key(leaf, &leaf_key, 0);
678
679                 BUG_ON(memcmp(&parent_key, &leaf_key,
680                        sizeof(struct btrfs_disk_key)));
681                 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
682                        btrfs_header_bytenr(leaf));
683         }
684 #if 0
685         for (i = 0; nritems > 1 && i < nritems - 2; i++) {
686                 btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
687                 btrfs_item_key(leaf, &leaf_key, i);
688                 if (comp_keys(&leaf_key, &cpukey) >= 0) {
689                         btrfs_print_leaf(root, leaf);
690                         printk("slot %d offset bad key\n", i);
691                         BUG_ON(1);
692                 }
693                 if (btrfs_item_offset_nr(leaf, i) !=
694                         btrfs_item_end_nr(leaf, i + 1)) {
695                         btrfs_print_leaf(root, leaf);
696                         printk("slot %d offset bad\n", i);
697                         BUG_ON(1);
698                 }
699                 if (i == 0) {
700                         if (btrfs_item_offset_nr(leaf, i) +
701                                btrfs_item_size_nr(leaf, i) !=
702                                BTRFS_LEAF_DATA_SIZE(root)) {
703                                 btrfs_print_leaf(root, leaf);
704                                 printk("slot %d first offset bad\n", i);
705                                 BUG_ON(1);
706                         }
707                 }
708         }
709         if (nritems > 0) {
710                 if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
711                                 btrfs_print_leaf(root, leaf);
712                                 printk("slot %d bad size \n", nritems - 1);
713                                 BUG_ON(1);
714                 }
715         }
716 #endif
717         if (slot != 0 && slot < nritems - 1) {
718                 btrfs_item_key(leaf, &leaf_key, slot);
719                 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
720                 if (comp_keys(&leaf_key, &cpukey) <= 0) {
721                         btrfs_print_leaf(root, leaf);
722                         printk("slot %d offset bad key\n", slot);
723                         BUG_ON(1);
724                 }
725                 if (btrfs_item_offset_nr(leaf, slot - 1) !=
726                        btrfs_item_end_nr(leaf, slot)) {
727                         btrfs_print_leaf(root, leaf);
728                         printk("slot %d offset bad\n", slot);
729                         BUG_ON(1);
730                 }
731         }
732         if (slot < nritems - 1) {
733                 btrfs_item_key(leaf, &leaf_key, slot);
734                 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
735                 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
736                 if (btrfs_item_offset_nr(leaf, slot) !=
737                         btrfs_item_end_nr(leaf, slot + 1)) {
738                         btrfs_print_leaf(root, leaf);
739                         printk("slot %d offset bad\n", slot);
740                         BUG_ON(1);
741                 }
742         }
743         BUG_ON(btrfs_item_offset_nr(leaf, 0) +
744                btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
745         return 0;
746 }
747
748 static int noinline check_block(struct btrfs_root *root,
749                                 struct btrfs_path *path, int level)
750 {
751         u64 found_start;
752         return 0;
753         if (btrfs_header_level(path->nodes[level]) != level)
754             printk("warning: bad level %Lu wanted %d found %d\n",
755                    path->nodes[level]->start, level,
756                    btrfs_header_level(path->nodes[level]));
757         found_start = btrfs_header_bytenr(path->nodes[level]);
758         if (found_start != path->nodes[level]->start) {
759             printk("warning: bad bytenr %Lu found %Lu\n",
760                    path->nodes[level]->start, found_start);
761         }
762 #if 0
763         struct extent_buffer *buf = path->nodes[level];
764
765         if (memcmp_extent_buffer(buf, root->fs_info->fsid,
766                                  (unsigned long)btrfs_header_fsid(buf),
767                                  BTRFS_FSID_SIZE)) {
768                 printk("warning bad block %Lu\n", buf->start);
769                 return 1;
770         }
771 #endif
772         if (level == 0)
773                 return check_leaf(root, path, level);
774         return check_node(root, path, level);
775 }
776
777 /*
778  * search for key in the extent_buffer.  The items start at offset p,
779  * and they are item_size apart.  There are 'max' items in p.
780  *
781  * the slot in the array is returned via slot, and it points to
782  * the place where you would insert key if it is not found in
783  * the array.
784  *
785  * slot may point to max if the key is bigger than all of the keys
786  */
787 static noinline int generic_bin_search(struct extent_buffer *eb,
788                                        unsigned long p,
789                                        int item_size, struct btrfs_key *key,
790                                        int max, int *slot)
791 {
792         int low = 0;
793         int high = max;
794         int mid;
795         int ret;
796         struct btrfs_disk_key *tmp = NULL;
797         struct btrfs_disk_key unaligned;
798         unsigned long offset;
799         char *map_token = NULL;
800         char *kaddr = NULL;
801         unsigned long map_start = 0;
802         unsigned long map_len = 0;
803         int err;
804
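        /*
         * standard binary search over the packed on-disk key array, mapping
         * the extent buffer page that holds each key as we go
         */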
805         while (low < high) {
806                 mid = (low + high) / 2;
807                 offset = p + mid * item_size;
808
809                 if (!map_token || offset < map_start ||
810                     (offset + sizeof(struct btrfs_disk_key)) >
811                     map_start + map_len) {
812                         if (map_token) {
813                                 unmap_extent_buffer(eb, map_token, KM_USER0);
814                                 map_token = NULL;
815                         }
816                         err = map_extent_buffer(eb, offset,
817                                                 sizeof(struct btrfs_disk_key),
818                                                 &map_token, &kaddr,
819                                                 &map_start, &map_len, KM_USER0);
820
821                         if (!err) {
822                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
823                                                         map_start);
824                         } else {
825                                 read_extent_buffer(eb, &unaligned,
826                                                    offset, sizeof(unaligned));
827                                 tmp = &unaligned;
828                         }
829
830                 } else {
831                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
832                                                         map_start);
833                 }
834                 ret = comp_keys(tmp, key);
835
836                 if (ret < 0)
837                         low = mid + 1;
838                 else if (ret > 0)
839                         high = mid;
840                 else {
841                         *slot = mid;
842                         if (map_token)
843                                 unmap_extent_buffer(eb, map_token, KM_USER0);
844                         return 0;
845                 }
846         }
847         *slot = low;
848         if (map_token)
849                 unmap_extent_buffer(eb, map_token, KM_USER0);
850         return 1;
851 }
852
853 /*
854  * simple bin_search frontend that does the right thing for
855  * leaves vs nodes
856  */
857 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
858                       int level, int *slot)
859 {
860         if (level == 0) {
861                 return generic_bin_search(eb,
862                                           offsetof(struct btrfs_leaf, items),
863                                           sizeof(struct btrfs_item),
864                                           key, btrfs_header_nritems(eb),
865                                           slot);
866         } else {
867                 return generic_bin_search(eb,
868                                           offsetof(struct btrfs_node, ptrs),
869                                           sizeof(struct btrfs_key_ptr),
870                                           key, btrfs_header_nritems(eb),
871                                           slot);
872         }
873         return -1;
874 }
875
876 /* given a node and slot number, this reads the block it points to.  The
877  * extent buffer is returned with a reference taken (but unlocked).
878  * NULL is returned on error.
879  */
880 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
881                                    struct extent_buffer *parent, int slot)
882 {
883         int level = btrfs_header_level(parent);
884         if (slot < 0)
885                 return NULL;
886         if (slot >= btrfs_header_nritems(parent))
887                 return NULL;
888
889         BUG_ON(level == 0);
890
891         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
892                        btrfs_level_size(root, level - 1),
893                        btrfs_node_ptr_generation(parent, slot));
894 }
895
896 /*
897  * node level balancing, used to make sure nodes are in proper order for
898  * item deletion.  We balance from the top down, so we have to make sure
899  * that a deletion won't leave a node completely empty later on.
900  */
901 static noinline int balance_level(struct btrfs_trans_handle *trans,
902                          struct btrfs_root *root,
903                          struct btrfs_path *path, int level)
904 {
905         struct extent_buffer *right = NULL;
906         struct extent_buffer *mid;
907         struct extent_buffer *left = NULL;
908         struct extent_buffer *parent = NULL;
909         int ret = 0;
910         int wret;
911         int pslot;
912         int orig_slot = path->slots[level];
913         int err_on_enospc = 0;
914         u64 orig_ptr;
915
916         if (level == 0)
917                 return 0;
918
919         mid = path->nodes[level];
920         WARN_ON(!path->locks[level]);
921         WARN_ON(btrfs_header_generation(mid) != trans->transid);
922
923         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
924
925         if (level < BTRFS_MAX_LEVEL - 1)
926                 parent = path->nodes[level + 1];
927         pslot = path->slots[level + 1];
928
929         /*
930          * deal with the case where there is only one pointer in the root
931          * by promoting the node below to a root
932          */
933         if (!parent) {
934                 struct extent_buffer *child;
935
936                 if (btrfs_header_nritems(mid) != 1)
937                         return 0;
938
939                 /* promote the child to a root */
940                 child = read_node_slot(root, mid, 0);
941                 BUG_ON(!child);
942                 btrfs_tree_lock(child);
943                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
944                 BUG_ON(ret);
945
946                 spin_lock(&root->node_lock);
947                 root->node = child;
948                 spin_unlock(&root->node_lock);
949
950                 ret = btrfs_update_extent_ref(trans, root, child->start,
951                                               mid->start, child->start,
952                                               root->root_key.objectid,
953                                               trans->transid, level - 1);
954                 BUG_ON(ret);
955
956                 add_root_to_dirty_list(root);
957                 btrfs_tree_unlock(child);
958                 path->locks[level] = 0;
959                 path->nodes[level] = NULL;
960                 clean_tree_block(trans, root, mid);
961                 btrfs_tree_unlock(mid);
962                 /* once for the path */
963                 free_extent_buffer(mid);
964                 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
965                                         mid->start, root->root_key.objectid,
966                                         btrfs_header_generation(mid),
967                                         level, 1);
968                 /* once for the root ptr */
969                 free_extent_buffer(mid);
970                 return ret;
971         }
972         if (btrfs_header_nritems(mid) >
973             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
974                 return 0;
975
976         if (btrfs_header_nritems(mid) < 2)
977                 err_on_enospc = 1;
978
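        /*
         * read and cow the left and right neighbors (when they exist) so
         * pointers can be shifted into or out of them
         */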
979         left = read_node_slot(root, parent, pslot - 1);
980         if (left) {
981                 btrfs_tree_lock(left);
982                 wret = btrfs_cow_block(trans, root, left,
983                                        parent, pslot - 1, &left, 0);
984                 if (wret) {
985                         ret = wret;
986                         goto enospc;
987                 }
988         }
989         right = read_node_slot(root, parent, pslot + 1);
990         if (right) {
991                 btrfs_tree_lock(right);
992                 wret = btrfs_cow_block(trans, root, right,
993                                        parent, pslot + 1, &right, 0);
994                 if (wret) {
995                         ret = wret;
996                         goto enospc;
997                 }
998         }
999
1000         /* first, try to make some room in the middle buffer */
1001         if (left) {
1002                 orig_slot += btrfs_header_nritems(left);
1003                 wret = push_node_left(trans, root, left, mid, 1);
1004                 if (wret < 0)
1005                         ret = wret;
1006                 if (btrfs_header_nritems(mid) < 2)
1007                         err_on_enospc = 1;
1008         }
1009
1010         /*
1011          * then try to empty the right most buffer into the middle
1012          */
1013         if (right) {
1014                 wret = push_node_left(trans, root, mid, right, 1);
1015                 if (wret < 0 && wret != -ENOSPC)
1016                         ret = wret;
1017                 if (btrfs_header_nritems(right) == 0) {
1018                         u64 bytenr = right->start;
1019                         u64 generation = btrfs_header_generation(parent);
1020                         u32 blocksize = right->len;
1021
1022                         clean_tree_block(trans, root, right);
1023                         btrfs_tree_unlock(right);
1024                         free_extent_buffer(right);
1025                         right = NULL;
1026                         wret = del_ptr(trans, root, path, level + 1, pslot +
1027                                        1);
1028                         if (wret)
1029                                 ret = wret;
1030                         wret = btrfs_free_extent(trans, root, bytenr,
1031                                                  blocksize, parent->start,
1032                                                  btrfs_header_owner(parent),
1033                                                  generation, level, 1);
1034                         if (wret)
1035                                 ret = wret;
1036                 } else {
1037                         struct btrfs_disk_key right_key;
1038                         btrfs_node_key(right, &right_key, 0);
1039                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1040                         btrfs_mark_buffer_dirty(parent);
1041                 }
1042         }
1043         if (btrfs_header_nritems(mid) == 1) {
1044                 /*
1045                  * we're not allowed to leave a node with one item in the
1046                  * tree during a delete.  A deletion from lower in the tree
1047                  * could try to delete the only pointer in this node.
1048                  * So, pull some keys from the left.
1049                  * There has to be a left pointer at this point because
1050                  * otherwise we would have pulled some pointers from the
1051                  * right
1052                  */
1053                 BUG_ON(!left);
1054                 wret = balance_node_right(trans, root, mid, left);
1055                 if (wret < 0) {
1056                         ret = wret;
1057                         goto enospc;
1058                 }
1059                 if (wret == 1) {
1060                         wret = push_node_left(trans, root, left, mid, 1);
1061                         if (wret < 0)
1062                                 ret = wret;
1063                 }
1064                 BUG_ON(wret == 1);
1065         }
1066         if (btrfs_header_nritems(mid) == 0) {
1067                 /* we've managed to empty the middle node, drop it */
1068                 u64 root_gen = btrfs_header_generation(parent);
1069                 u64 bytenr = mid->start;
1070                 u32 blocksize = mid->len;
1071
1072                 clean_tree_block(trans, root, mid);
1073                 btrfs_tree_unlock(mid);
1074                 free_extent_buffer(mid);
1075                 mid = NULL;
1076                 wret = del_ptr(trans, root, path, level + 1, pslot);
1077                 if (wret)
1078                         ret = wret;
1079                 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1080                                          parent->start,
1081                                          btrfs_header_owner(parent),
1082                                          root_gen, level, 1);
1083                 if (wret)
1084                         ret = wret;
1085         } else {
1086                 /* update the parent key to reflect our changes */
1087                 struct btrfs_disk_key mid_key;
1088                 btrfs_node_key(mid, &mid_key, 0);
1089                 btrfs_set_node_key(parent, &mid_key, pslot);
1090                 btrfs_mark_buffer_dirty(parent);
1091         }
1092
1093         /* update the path */
1094         if (left) {
1095                 if (btrfs_header_nritems(left) > orig_slot) {
1096                         extent_buffer_get(left);
1097                         /* left was locked after cow */
1098                         path->nodes[level] = left;
1099                         path->slots[level + 1] -= 1;
1100                         path->slots[level] = orig_slot;
1101                         if (mid) {
1102                                 btrfs_tree_unlock(mid);
1103                                 free_extent_buffer(mid);
1104                         }
1105                 } else {
1106                         orig_slot -= btrfs_header_nritems(left);
1107                         path->slots[level] = orig_slot;
1108                 }
1109         }
1110         /* double check we haven't messed things up */
1111         check_block(root, path, level);
1112         if (orig_ptr !=
1113             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1114                 BUG();
1115 enospc:
1116         if (right) {
1117                 btrfs_tree_unlock(right);
1118                 free_extent_buffer(right);
1119         }
1120         if (left) {
1121                 if (path->nodes[level] != left)
1122                         btrfs_tree_unlock(left);
1123                 free_extent_buffer(left);
1124         }
1125         return ret;
1126 }
1127
1128 /* Node balancing for insertion.  Here we only split or push nodes around
1129  * when they are completely full.  This is also done top down, so we
1130  * have to be pessimistic.
1131  */
1132 static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
1133                                           struct btrfs_root *root,
1134                                           struct btrfs_path *path, int level)
1135 {
1136         struct extent_buffer *right = NULL;
1137         struct extent_buffer *mid;
1138         struct extent_buffer *left = NULL;
1139         struct extent_buffer *parent = NULL;
1140         int ret = 0;
1141         int wret;
1142         int pslot;
1143         int orig_slot = path->slots[level];
1144         u64 orig_ptr;
1145
1146         if (level == 0)
1147                 return 1;
1148
1149         mid = path->nodes[level];
1150         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1151         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1152
1153         if (level < BTRFS_MAX_LEVEL - 1)
1154                 parent = path->nodes[level + 1];
1155         pslot = path->slots[level + 1];
1156
1157         if (!parent)
1158                 return 1;
1159
1160         left = read_node_slot(root, parent, pslot - 1);
1161
1162         /* first, try to make some room in the middle buffer */
1163         if (left) {
1164                 u32 left_nr;
1165
1166                 btrfs_tree_lock(left);
1167                 left_nr = btrfs_header_nritems(left);
1168                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1169                         wret = 1;
1170                 } else {
1171                         ret = btrfs_cow_block(trans, root, left, parent,
1172                                               pslot - 1, &left, 0);
1173                         if (ret)
1174                                 wret = 1;
1175                         else {
1176                                 wret = push_node_left(trans, root,
1177                                                       left, mid, 0);
1178                         }
1179                 }
1180                 if (wret < 0)
1181                         ret = wret;
1182                 if (wret == 0) {
1183                         struct btrfs_disk_key disk_key;
1184                         orig_slot += left_nr;
1185                         btrfs_node_key(mid, &disk_key, 0);
1186                         btrfs_set_node_key(parent, &disk_key, pslot);
1187                         btrfs_mark_buffer_dirty(parent);
1188                         if (btrfs_header_nritems(left) > orig_slot) {
1189                                 path->nodes[level] = left;
1190                                 path->slots[level + 1] -= 1;
1191                                 path->slots[level] = orig_slot;
1192                                 btrfs_tree_unlock(mid);
1193                                 free_extent_buffer(mid);
1194                         } else {
1195                                 orig_slot -=
1196                                         btrfs_header_nritems(left);
1197                                 path->slots[level] = orig_slot;
1198                                 btrfs_tree_unlock(left);
1199                                 free_extent_buffer(left);
1200                         }
1201                         return 0;
1202                 }
1203                 btrfs_tree_unlock(left);
1204                 free_extent_buffer(left);
1205         }
1206         right = read_node_slot(root, parent, pslot + 1);
1207
1208         /*
1209          * then try to empty the right most buffer into the middle
1210          */
1211         if (right) {
1212                 u32 right_nr;
1213                 btrfs_tree_lock(right);
1214                 right_nr = btrfs_header_nritems(right);
1215                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1216                         wret = 1;
1217                 } else {
1218                         ret = btrfs_cow_block(trans, root, right,
1219                                               parent, pslot + 1,
1220                                               &right, 0);
1221                         if (ret)
1222                                 wret = 1;
1223                         else {
1224                                 wret = balance_node_right(trans, root,
1225                                                           right, mid);
1226                         }
1227                 }
1228                 if (wret < 0)
1229                         ret = wret;
1230                 if (wret == 0) {
1231                         struct btrfs_disk_key disk_key;
1232
1233                         btrfs_node_key(right, &disk_key, 0);
1234                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1235                         btrfs_mark_buffer_dirty(parent);
1236
1237                         if (btrfs_header_nritems(mid) <= orig_slot) {
1238                                 path->nodes[level] = right;
1239                                 path->slots[level + 1] += 1;
1240                                 path->slots[level] = orig_slot -
1241                                         btrfs_header_nritems(mid);
1242                                 btrfs_tree_unlock(mid);
1243                                 free_extent_buffer(mid);
1244                         } else {
1245                                 btrfs_tree_unlock(right);
1246                                 free_extent_buffer(right);
1247                         }
1248                         return 0;
1249                 }
1250                 btrfs_tree_unlock(right);
1251                 free_extent_buffer(right);
1252         }
1253         return 1;
1254 }
1255
1256 /*
1257  * readahead one full node of leaves, finding things that are close
1258  * to the block in 'slot', and triggering ra on them.
1259  */
1260 static noinline void reada_for_search(struct btrfs_root *root,
1261                                       struct btrfs_path *path,
1262                                       int level, int slot, u64 objectid)
1263 {
1264         struct extent_buffer *node;
1265         struct btrfs_disk_key disk_key;
1266         u32 nritems;
1267         u64 search;
1268         u64 lowest_read;
1269         u64 highest_read;
1270         u64 nread = 0;
1271         int direction = path->reada;
1272         struct extent_buffer *eb;
1273         u32 nr;
1274         u32 blocksize;
1275         u32 nscan = 0;
1276
1277         if (level != 1)
1278                 return;
1279
1280         if (!path->nodes[level])
1281                 return;
1282
1283         node = path->nodes[level];
1284
1285         search = btrfs_node_blockptr(node, slot);
1286         blocksize = btrfs_level_size(root, level - 1);
1287         eb = btrfs_find_tree_block(root, search, blocksize);
1288         if (eb) {
1289                 free_extent_buffer(eb);
1290                 return;
1291         }
1292
1293         highest_read = search;
1294         lowest_read = search;
1295
1296         nritems = btrfs_header_nritems(node);
1297         nr = slot;
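        /*
         * scan outward from 'slot' in the readahead direction, issuing
         * readahead for nearby blocks until the size or scan limits are hit
         */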
1298         while (1) {
1299                 if (direction < 0) {
1300                         if (nr == 0)
1301                                 break;
1302                         nr--;
1303                 } else if (direction > 0) {
1304                         nr++;
1305                         if (nr >= nritems)
1306                                 break;
1307                 }
1308                 if (path->reada < 0 && objectid) {
1309                         btrfs_node_key(node, &disk_key, nr);
1310                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1311                                 break;
1312                 }
1313                 search = btrfs_node_blockptr(node, nr);
1314                 if ((search >= lowest_read && search <= highest_read) ||
1315                     (search < lowest_read && lowest_read - search <= 16384) ||
1316                     (search > highest_read && search - highest_read <= 16384)) {
1317                         readahead_tree_block(root, search, blocksize,
1318                                      btrfs_node_ptr_generation(node, nr));
1319                         nread += blocksize;
1320                 }
1321                 nscan++;
1322                 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1323                         break;
1324                 if (nread > (256 * 1024) || nscan > 128)
1325                         break;
1326
1327                 if (search < lowest_read)
1328                         lowest_read = search;
1329                 if (search > highest_read)
1330                         highest_read = search;
1331         }
1332 }
1333
1334 /*
1335  * when we walk down the tree, it is usually safe to unlock the higher
1336  * layers of the tree.  The exception is when our path goes through slot 0,
1337  * because operations on the tree may need to change key pointers higher up.
1338  *
1339  * callers might also have set path->keep_locks, which tells this code to
1340  * keep the lock if the path points to the last slot in the block.  This is
1341  * part of walking through the tree and selecting the next slot in the
1342  * higher block.
1343  *
1344  * lowest_unlock sets the lowest level in the tree we're allowed to unlock,
1345  * so if lowest_unlock is 1, level 0 won't be unlocked.
1346  */
1347 static noinline void unlock_up(struct btrfs_path *path, int level,
1348                                int lowest_unlock)
1349 {
1350         int i;
1351         int skip_level = level;
1352         int no_skips = 0;
1353         struct extent_buffer *t;
1354
1355         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1356                 if (!path->nodes[i])
1357                         break;
1358                 if (!path->locks[i])
1359                         break;
1360                 if (!no_skips && path->slots[i] == 0) {
1361                         skip_level = i + 1;
1362                         continue;
1363                 }
1364                 if (!no_skips && path->keep_locks) {
1365                         u32 nritems;
1366                         t = path->nodes[i];
1367                         nritems = btrfs_header_nritems(t);
1368                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
1369                                 skip_level = i + 1;
1370                                 continue;
1371                         }
1372                 }
1373                 if (skip_level < i && i >= lowest_unlock)
1374                         no_skips = 1;
1375
1376                 t = path->nodes[i];
1377                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1378                         btrfs_tree_unlock(t);
1379                         path->locks[i] = 0;
1380                 }
1381         }
1382 }
1383
1384 /*
1385  * look for key in the tree.  path is filled in with nodes along the way.
1386  * if the key is found, we return zero and you can find the item in the
1387  * leaf level of the path (level 0).
1388  *
1389  * If the key isn't found, the path points to the slot where it should
1390  * be inserted, and 1 is returned.  If there are other errors during the
1391  * search, a negative error number is returned.
1392  *
1393  * if ins_len > 0, nodes and leaves will be split as we walk down the
1394  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree
1395  * (if possible).
1396  */
1397 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1398                       *root, struct btrfs_key *key, struct btrfs_path *p, int
1399                       ins_len, int cow)
1400 {
1401         struct extent_buffer *b;
1402         struct extent_buffer *tmp;
1403         int slot;
1404         int ret;
1405         int level;
1406         int should_reada = p->reada;
1407         int lowest_unlock = 1;
1408         int blocksize;
1409         u8 lowest_level = 0;
1410         u64 blocknr;
1411         u64 gen;
1412         struct btrfs_key prealloc_block;
1413
1414         lowest_level = p->lowest_level;
1415         WARN_ON(lowest_level && ins_len > 0);
1416         WARN_ON(p->nodes[0] != NULL);
1417
1418         if (ins_len < 0)
1419                 lowest_unlock = 2;
1420
1421         prealloc_block.objectid = 0;
1422
1423 again:
1424         if (p->skip_locking)
1425                 b = btrfs_root_node(root);
1426         else
1427                 b = btrfs_lock_root_node(root);
1428
1429         while (b) {
1430                 level = btrfs_header_level(b);
1431
1432                 /*
1433                  * setup the path here so we can release it under lock
1434                  * contention with the cow code
1435                  */
1436                 p->nodes[level] = b;
1437                 if (!p->skip_locking)
1438                         p->locks[level] = 1;
1439
1440                 if (cow) {
1441                         int wret;
1442
1443                         /* can we skip cowing this block? */
1444                         spin_lock(&root->fs_info->hash_lock);
1445                         if (btrfs_header_generation(b) == trans->transid &&
1446                             btrfs_header_owner(b) == root->root_key.objectid &&
1447                             !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1448                                 spin_unlock(&root->fs_info->hash_lock);
1449                                 goto cow_done;
1450                         }
1451                         spin_unlock(&root->fs_info->hash_lock);
1452
1453                         /* ok, we have to cow, is our old prealloc the right
1454                          * size?
1455                          */
1456                         if (prealloc_block.objectid &&
1457                             prealloc_block.offset != b->len) {
1458                                 btrfs_free_reserved_extent(root,
1459                                            prealloc_block.objectid,
1460                                            prealloc_block.offset);
1461                                 prealloc_block.objectid = 0;
1462                         }
1463
1464                         /*
1465                          * for higher level blocks, try not to allocate new
1466                          * blocks while this block and its parent are locked.
1467                          */
1468                         if (level > 1 && !prealloc_block.objectid &&
1469                             btrfs_path_lock_waiting(p, level)) {
1470                                 u32 size = b->len;
1471                                 u64 hint = b->start;
1472
1473                                 btrfs_release_path(root, p);
1474                                 ret = btrfs_reserve_extent(trans, root,
1475                                                            size, size, 0,
1476                                                            hint, (u64)-1,
1477                                                            &prealloc_block, 0);
1478                                 BUG_ON(ret);
1479                                 goto again;
1480                         }
1481
1482                         wret = btrfs_cow_block(trans, root, b,
1483                                                p->nodes[level + 1],
1484                                                p->slots[level + 1],
1485                                                &b, prealloc_block.objectid);
1486                         prealloc_block.objectid = 0;
1487                         if (wret) {
1488                                 free_extent_buffer(b);
1489                                 ret = wret;
1490                                 goto done;
1491                         }
1492                 }
1493 cow_done:
1494                 BUG_ON(!cow && ins_len);
1495                 if (level != btrfs_header_level(b))
1496                         WARN_ON(1);
1497                 level = btrfs_header_level(b);
1498
1499                 p->nodes[level] = b;
1500                 if (!p->skip_locking)
1501                         p->locks[level] = 1;
1502
1503                 ret = check_block(root, p, level);
1504                 if (ret) {
1505                         ret = -1;
1506                         goto done;
1507                 }
1508
1509                 ret = bin_search(b, key, level, &slot);
1510                 if (level != 0) {
1511                         if (ret && slot > 0)
1512                                 slot -= 1;
1513                         p->slots[level] = slot;
1514                         if (ins_len > 0 && btrfs_header_nritems(b) >=
1515                             BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1516                                 int sret = split_node(trans, root, p, level);
1517                                 BUG_ON(sret > 0);
1518                                 if (sret) {
1519                                         ret = sret;
1520                                         goto done;
1521                                 }
1522                                 b = p->nodes[level];
1523                                 slot = p->slots[level];
1524                         } else if (ins_len < 0) {
1525                                 int sret = balance_level(trans, root, p,
1526                                                          level);
1527                                 if (sret) {
1528                                         ret = sret;
1529                                         goto done;
1530                                 }
1531                                 b = p->nodes[level];
1532                                 if (!b) {
1533                                         btrfs_release_path(NULL, p);
1534                                         goto again;
1535                                 }
1536                                 slot = p->slots[level];
1537                                 BUG_ON(btrfs_header_nritems(b) == 1);
1538                         }
1539                         unlock_up(p, level, lowest_unlock);
1540
1541                         /* this is only true while dropping a snapshot */
1542                         if (level == lowest_level) {
1543                                 ret = 0;
1544                                 goto done;
1545                         }
1546
1547                         blocknr = btrfs_node_blockptr(b, slot);
1548                         gen = btrfs_node_ptr_generation(b, slot);
1549                         blocksize = btrfs_level_size(root, level - 1);
1550
1551                         tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1552                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1553                                 b = tmp;
1554                         } else {
1555                                 /*
1556                                  * reduce lock contention at high levels
1557                                  * of the btree by dropping locks before
1558                                  * we read.
1559                                  */
1560                                 if (level > 1) {
1561                                         btrfs_release_path(NULL, p);
1562                                         if (tmp)
1563                                                 free_extent_buffer(tmp);
1564                                         if (should_reada)
1565                                                 reada_for_search(root, p,
1566                                                                  level, slot,
1567                                                                  key->objectid);
1568
1569                                         tmp = read_tree_block(root, blocknr,
1570                                                          blocksize, gen);
1571                                         if (tmp)
1572                                                 free_extent_buffer(tmp);
1573                                         goto again;
1574                                 } else {
1575                                         if (tmp)
1576                                                 free_extent_buffer(tmp);
1577                                         if (should_reada)
1578                                                 reada_for_search(root, p,
1579                                                                  level, slot,
1580                                                                  key->objectid);
1581                                         b = read_node_slot(root, b, slot);
1582                                 }
1583                         }
1584                         if (!p->skip_locking)
1585                                 btrfs_tree_lock(b);
1586                 } else {
1587                         p->slots[level] = slot;
1588                         if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
1589                             sizeof(struct btrfs_item) + ins_len) {
1590                                 int sret = split_leaf(trans, root, key,
1591                                                       p, ins_len, ret == 0);
1592                                 BUG_ON(sret > 0);
1593                                 if (sret) {
1594                                         ret = sret;
1595                                         goto done;
1596                                 }
1597                         }
1598                         unlock_up(p, level, lowest_unlock);
1599                         goto done;
1600                 }
1601         }
1602         ret = 1;
1603 done:
1604         if (prealloc_block.objectid) {
1605                 btrfs_free_reserved_extent(root,
1606                            prealloc_block.objectid,
1607                            prealloc_block.offset);
1608         }
1609
1610         return ret;
1611 }
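
/*
 * Editor's illustrative sketch, not part of the original file: a typical
 * read-only lookup with btrfs_search_slot().  trans is NULL and cow is 0
 * because nothing is modified, and ins_len is 0 so nothing is split or
 * merged on the way down.  The inode item key is only an example.
 */
static inline int example_lookup_inode_item(struct btrfs_root *root,
					    u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match: the item is at path->nodes[0], path->slots[0] */
	} else if (ret == 1) {
		/* not found: the path points at the slot where it would go */
	}
	/* ret < 0 means the search itself failed */
	btrfs_free_path(path);
	return ret;
}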
1612
1613 int btrfs_merge_path(struct btrfs_trans_handle *trans,
1614                      struct btrfs_root *root,
1615                      struct btrfs_key *node_keys,
1616                      u64 *nodes, int lowest_level)
1617 {
1618         struct extent_buffer *eb;
1619         struct extent_buffer *parent;
1620         struct btrfs_key key;
1621         u64 bytenr;
1622         u64 generation;
1623         u32 blocksize;
1624         int level;
1625         int slot;
1626         int key_match;
1627         int ret;
1628
1629         eb = btrfs_lock_root_node(root);
1630         ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1631         BUG_ON(ret);
1632
1633         parent = eb;
1634         while (1) {
1635                 level = btrfs_header_level(parent);
1636                 if (level == 0 || level <= lowest_level)
1637                         break;
1638
1639                 ret = bin_search(parent, &node_keys[lowest_level], level,
1640                                  &slot);
1641                 if (ret && slot > 0)
1642                         slot--;
1643
1644                 bytenr = btrfs_node_blockptr(parent, slot);
1645                 if (nodes[level - 1] == bytenr)
1646                         break;
1647
1648                 blocksize = btrfs_level_size(root, level - 1);
1649                 generation = btrfs_node_ptr_generation(parent, slot);
1650                 btrfs_node_key_to_cpu(eb, &key, slot);
1651                 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1652
1653                 if (generation == trans->transid) {
1654                         eb = read_tree_block(root, bytenr, blocksize,
1655                                              generation);
1656                         btrfs_tree_lock(eb);
1657                 }
1658
1659                 /*
1660                  * if the node keys match and the node pointer hasn't been
1661                  * modified in the running transaction, we can merge the
1662                  * path.  for blocks owned by reloc trees, the node pointer
1663                  * check is skipped because these blocks are fully controlled
1664                  * by the space balance code and no one else can modify them.
1665                  */
1666                 if (!nodes[level - 1] || !key_match ||
1667                     (generation == trans->transid &&
1668                      btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1669                         if (level == 1 || level == lowest_level + 1) {
1670                                 if (generation == trans->transid) {
1671                                         btrfs_tree_unlock(eb);
1672                                         free_extent_buffer(eb);
1673                                 }
1674                                 break;
1675                         }
1676
1677                         if (generation != trans->transid) {
1678                                 eb = read_tree_block(root, bytenr, blocksize,
1679                                                 generation);
1680                                 btrfs_tree_lock(eb);
1681                         }
1682
1683                         ret = btrfs_cow_block(trans, root, eb, parent, slot,
1684                                               &eb, 0);
1685                         BUG_ON(ret);
1686
1687                         if (root->root_key.objectid ==
1688                             BTRFS_TREE_RELOC_OBJECTID) {
1689                                 if (!nodes[level - 1]) {
1690                                         nodes[level - 1] = eb->start;
1691                                         memcpy(&node_keys[level - 1], &key,
1692                                                sizeof(node_keys[0]));
1693                                 } else {
1694                                         WARN_ON(1);
1695                                 }
1696                         }
1697
1698                         btrfs_tree_unlock(parent);
1699                         free_extent_buffer(parent);
1700                         parent = eb;
1701                         continue;
1702                 }
1703
1704                 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1705                 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1706                 btrfs_mark_buffer_dirty(parent);
1707
1708                 ret = btrfs_inc_extent_ref(trans, root,
1709                                         nodes[level - 1],
1710                                         blocksize, parent->start,
1711                                         btrfs_header_owner(parent),
1712                                         btrfs_header_generation(parent),
1713                                         level - 1);
1714                 BUG_ON(ret);
1715
1716                 /*
1717                  * If the block was created in the running transaction,
1718                  * it's possible this is the last reference to it, so we
1719                  * should drop the subtree.
1720                  */
1721                 if (generation == trans->transid) {
1722                         ret = btrfs_drop_subtree(trans, root, eb, parent);
1723                         BUG_ON(ret);
1724                         btrfs_tree_unlock(eb);
1725                         free_extent_buffer(eb);
1726                 } else {
1727                         ret = btrfs_free_extent(trans, root, bytenr,
1728                                         blocksize, parent->start,
1729                                         btrfs_header_owner(parent),
1730                                         btrfs_header_generation(parent),
1731                                         level - 1, 1);
1732                         BUG_ON(ret);
1733                 }
1734                 break;
1735         }
1736         btrfs_tree_unlock(parent);
1737         free_extent_buffer(parent);
1738         return 0;
1739 }
1740
1741 /*
1742  * adjust the pointers going up the tree, starting at level,
1743  * making sure the right key of each node points to 'key'.
1744  * This is used after shifting pointers to the left, so it stops
1745  * fixing up pointers when a given leaf/node is not in slot 0 of the
1746  * higher levels.
1747  *
1748  * If this fails to write a tree block, it returns -1, but continues
1749  * fixing up the blocks in ram so the tree is consistent.
1750  */
1751 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1752                           struct btrfs_root *root, struct btrfs_path *path,
1753                           struct btrfs_disk_key *key, int level)
1754 {
1755         int i;
1756         int ret = 0;
1757         struct extent_buffer *t;
1758
1759         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1760                 int tslot = path->slots[i];
1761                 if (!path->nodes[i])
1762                         break;
1763                 t = path->nodes[i];
1764                 btrfs_set_node_key(t, key, tslot);
1765                 btrfs_mark_buffer_dirty(path->nodes[i]);
1766                 if (tslot != 0)
1767                         break;
1768         }
1769         return ret;
1770 }
1771
1772 /*
1773  * update item key.
1774  *
1775  * This function isn't completely safe. It's the caller's responsibility
1776  * that the new key won't break the order
1777  */
1778 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1779                             struct btrfs_root *root, struct btrfs_path *path,
1780                             struct btrfs_key *new_key)
1781 {
1782         struct btrfs_disk_key disk_key;
1783         struct extent_buffer *eb;
1784         int slot;
1785
1786         eb = path->nodes[0];
1787         slot = path->slots[0];
1788         if (slot > 0) {
1789                 btrfs_item_key(eb, &disk_key, slot - 1);
1790                 if (comp_keys(&disk_key, new_key) >= 0)
1791                         return -1;
1792         }
1793         if (slot < btrfs_header_nritems(eb) - 1) {
1794                 btrfs_item_key(eb, &disk_key, slot + 1);
1795                 if (comp_keys(&disk_key, new_key) <= 0)
1796                         return -1;
1797         }
1798
1799         btrfs_cpu_key_to_disk(&disk_key, new_key);
1800         btrfs_set_item_key(eb, &disk_key, slot);
1801         btrfs_mark_buffer_dirty(eb);
1802         if (slot == 0)
1803                 fixup_low_keys(trans, root, path, &disk_key, 1);
1804         return 0;
1805 }
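
/*
 * Editor's illustrative sketch, not part of the original file: a caller of
 * btrfs_set_item_key_safe() must already hold a path to the item and must
 * pick a new key that keeps the leaf sorted; the helper only checks the
 * neighbouring items and then rewrites the key in place.  The offset
 * change below is a made-up example.
 */
static inline void example_bump_key_offset(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   struct btrfs_key *key,
					   u64 new_offset)
{
	struct btrfs_key new_key = *key;

	new_key.offset = new_offset;
	/* returns -1 if the new key would break the ordering in the leaf */
	if (btrfs_set_item_key_safe(trans, root, path, &new_key) == 0)
		*key = new_key;
}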
1806
1807 /*
1808  * try to push data from one node into the next node left in the
1809  * tree.
1810  *
1811  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1812  * error, and > 0 if there was no room in the left hand block.
1813  */
1814 static int push_node_left(struct btrfs_trans_handle *trans,
1815                           struct btrfs_root *root, struct extent_buffer *dst,
1816                           struct extent_buffer *src, int empty)
1817 {
1818         int push_items = 0;
1819         int src_nritems;
1820         int dst_nritems;
1821         int ret = 0;
1822
1823         src_nritems = btrfs_header_nritems(src);
1824         dst_nritems = btrfs_header_nritems(dst);
1825         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1826         WARN_ON(btrfs_header_generation(src) != trans->transid);
1827         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1828
1829         if (!empty && src_nritems <= 8)
1830                 return 1;
1831
1832         if (push_items <= 0) {
1833                 return 1;
1834         }
1835
1836         if (empty) {
1837                 push_items = min(src_nritems, push_items);
1838                 if (push_items < src_nritems) {
1839                         /* leave at least 8 pointers in the node if
1840                          * we aren't going to empty it
1841                          */
1842                         if (src_nritems - push_items < 8) {
1843                                 if (push_items <= 8)
1844                                         return 1;
1845                                 push_items -= 8;
1846                         }
1847                 }
1848         } else
1849                 push_items = min(src_nritems - 8, push_items);
1850
1851         copy_extent_buffer(dst, src,
1852                            btrfs_node_key_ptr_offset(dst_nritems),
1853                            btrfs_node_key_ptr_offset(0),
1854                            push_items * sizeof(struct btrfs_key_ptr));
1855
1856         if (push_items < src_nritems) {
1857                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1858                                       btrfs_node_key_ptr_offset(push_items),
1859                                       (src_nritems - push_items) *
1860                                       sizeof(struct btrfs_key_ptr));
1861         }
1862         btrfs_set_header_nritems(src, src_nritems - push_items);
1863         btrfs_set_header_nritems(dst, dst_nritems + push_items);
1864         btrfs_mark_buffer_dirty(src);
1865         btrfs_mark_buffer_dirty(dst);
1866
1867         ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1868         BUG_ON(ret);
1869
1870         return ret;
1871 }
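
/*
 * Editor's note (not in the original source): the magic number above is a
 * balance heuristic.  Unless the caller asked to empty src completely,
 * push_node_left() will not touch a node with 8 or fewer pointers and
 * always leaves at least 8 pointers behind in src.
 */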
1872
1873 /*
1874  * try to push data from one node into the next node right in the
1875  * tree.
1876  *
1877  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1878  * error, and > 0 if there was no room in the right hand block.
1879  *
1880  * this will only push up to 1/2 the contents of the left node over
1881  */
1882 static int balance_node_right(struct btrfs_trans_handle *trans,
1883                               struct btrfs_root *root,
1884                               struct extent_buffer *dst,
1885                               struct extent_buffer *src)
1886 {
1887         int push_items = 0;
1888         int max_push;
1889         int src_nritems;
1890         int dst_nritems;
1891         int ret = 0;
1892
1893         WARN_ON(btrfs_header_generation(src) != trans->transid);
1894         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1895
1896         src_nritems = btrfs_header_nritems(src);
1897         dst_nritems = btrfs_header_nritems(dst);
1898         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1899         if (push_items <= 0) {
1900                 return 1;
1901         }
1902
1903         if (src_nritems < 4) {
1904                 return 1;
1905         }
1906
1907         max_push = src_nritems / 2 + 1;
1908         /* don't try to empty the node */
1909         if (max_push >= src_nritems) {
1910                 return 1;
1911         }
1912
1913         if (max_push < push_items)
1914                 push_items = max_push;
1915
1916         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1917                                       btrfs_node_key_ptr_offset(0),
1918                                       (dst_nritems) *
1919                                       sizeof(struct btrfs_key_ptr));
1920
1921         copy_extent_buffer(dst, src,
1922                            btrfs_node_key_ptr_offset(0),
1923                            btrfs_node_key_ptr_offset(src_nritems - push_items),
1924                            push_items * sizeof(struct btrfs_key_ptr));
1925
1926         btrfs_set_header_nritems(src, src_nritems - push_items);
1927         btrfs_set_header_nritems(dst, dst_nritems + push_items);
1928
1929         btrfs_mark_buffer_dirty(src);
1930         btrfs_mark_buffer_dirty(dst);
1931
1932         ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1933         BUG_ON(ret);
1934
1935         return ret;
1936 }
1937
1938 /*
1939  * helper function to insert a new root level in the tree.
1940  * A new node is allocated, and a single item is inserted to
1941  * point to the existing root
1942  *
1943  * returns zero on success or < 0 on failure.
1944  */
1945 static int noinline insert_new_root(struct btrfs_trans_handle *trans,
1946                            struct btrfs_root *root,
1947                            struct btrfs_path *path, int level)
1948 {
1949         u64 lower_gen;
1950         struct extent_buffer *lower;
1951         struct extent_buffer *c;
1952         struct extent_buffer *old;
1953         struct btrfs_disk_key lower_key;
1954         int ret;
1955
1956         BUG_ON(path->nodes[level]);
1957         BUG_ON(path->nodes[level-1] != root->node);
1958
1959         lower = path->nodes[level-1];
1960         if (level == 1)
1961                 btrfs_item_key(lower, &lower_key, 0);
1962         else
1963                 btrfs_node_key(lower, &lower_key, 0);
1964
1965         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1966                                    root->root_key.objectid, trans->transid,
1967                                    level, root->node->start, 0);
1968         if (IS_ERR(c))
1969                 return PTR_ERR(c);
1970
1971         memset_extent_buffer(c, 0, 0, root->nodesize);
1972         btrfs_set_header_nritems(c, 1);
1973         btrfs_set_header_level(c, level);
1974         btrfs_set_header_bytenr(c, c->start);
1975         btrfs_set_header_generation(c, trans->transid);
1976         btrfs_set_header_owner(c, root->root_key.objectid);
1977
1978         write_extent_buffer(c, root->fs_info->fsid,
1979                             (unsigned long)btrfs_header_fsid(c),
1980                             BTRFS_FSID_SIZE);
1981
1982         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1983                             (unsigned long)btrfs_header_chunk_tree_uuid(c),
1984                             BTRFS_UUID_SIZE);
1985
1986         btrfs_set_node_key(c, &lower_key, 0);
1987         btrfs_set_node_blockptr(c, 0, lower->start);
1988         lower_gen = btrfs_header_generation(lower);
1989         WARN_ON(lower_gen != trans->transid);
1990
1991         btrfs_set_node_ptr_generation(c, 0, lower_gen);
1992
1993         btrfs_mark_buffer_dirty(c);
1994
1995         spin_lock(&root->node_lock);
1996         old = root->node;
1997         root->node = c;
1998         spin_unlock(&root->node_lock);
1999
2000         ret = btrfs_update_extent_ref(trans, root, lower->start,
2001                                       lower->start, c->start,
2002                                       root->root_key.objectid,
2003                                       trans->transid, level - 1);
2004         BUG_ON(ret);
2005
2006         /* the super has an extra ref to root->node */
2007         free_extent_buffer(old);
2008
2009         add_root_to_dirty_list(root);
2010         extent_buffer_get(c);
2011         path->nodes[level] = c;
2012         path->locks[level] = 1;
2013         path->slots[level] = 0;
2014         return 0;
2015 }
2016
2017 /*
2018  * worker function to insert a single pointer in a node.
2019  * the node should have enough room for the pointer already
2020  *
2021  * slot and level indicate where you want the key to go, and
2022  * bytenr is the block the key points to.
2023  *
2024  * returns zero on success and < 0 on any error
2025  */
2026 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2027                       *root, struct btrfs_path *path, struct btrfs_disk_key
2028                       *key, u64 bytenr, int slot, int level)
2029 {
2030         struct extent_buffer *lower;
2031         int nritems;
2032
2033         BUG_ON(!path->nodes[level]);
2034         lower = path->nodes[level];
2035         nritems = btrfs_header_nritems(lower);
2036         if (slot > nritems)
2037                 BUG();
2038         if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2039                 BUG();
2040         if (slot != nritems) {
2041                 memmove_extent_buffer(lower,
2042                               btrfs_node_key_ptr_offset(slot + 1),
2043                               btrfs_node_key_ptr_offset(slot),
2044                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2045         }
2046         btrfs_set_node_key(lower, key, slot);
2047         btrfs_set_node_blockptr(lower, slot, bytenr);
2048         WARN_ON(trans->transid == 0);
2049         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2050         btrfs_set_header_nritems(lower, nritems + 1);
2051         btrfs_mark_buffer_dirty(lower);
2052         return 0;
2053 }
2054
2055 /*
2056  * split the node at the specified level in path in two.
2057  * The path is corrected to point to the appropriate node after the split
2058  *
2059  * Before splitting this tries to make some room in the node by pushing
2060  * left and right, if either one works, it returns right away.
2061  *
2062  * returns 0 on success and < 0 on failure
2063  */
2064 static noinline int split_node(struct btrfs_trans_handle *trans,
2065                                struct btrfs_root *root,
2066                                struct btrfs_path *path, int level)
2067 {
2068         struct extent_buffer *c;
2069         struct extent_buffer *split;
2070         struct btrfs_disk_key disk_key;
2071         int mid;
2072         int ret;
2073         int wret;
2074         u32 c_nritems;
2075
2076         c = path->nodes[level];
2077         WARN_ON(btrfs_header_generation(c) != trans->transid);
2078         if (c == root->node) {
2079                 /* trying to split the root, let's make a new one */
2080                 ret = insert_new_root(trans, root, path, level + 1);
2081                 if (ret)
2082                         return ret;
2083         } else {
2084                 ret = push_nodes_for_insert(trans, root, path, level);
2085                 c = path->nodes[level];
2086                 if (!ret && btrfs_header_nritems(c) <
2087                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2088                         return 0;
2089                 if (ret < 0)
2090                         return ret;
2091         }
2092
2093         c_nritems = btrfs_header_nritems(c);
2094
2095         split = btrfs_alloc_free_block(trans, root, root->nodesize,
2096                                         path->nodes[level + 1]->start,
2097                                         root->root_key.objectid,
2098                                         trans->transid, level, c->start, 0);
2099         if (IS_ERR(split))
2100                 return PTR_ERR(split);
2101
2102         btrfs_set_header_flags(split, btrfs_header_flags(c));
2103         btrfs_set_header_level(split, btrfs_header_level(c));
2104         btrfs_set_header_bytenr(split, split->start);
2105         btrfs_set_header_generation(split, trans->transid);
2106         btrfs_set_header_owner(split, root->root_key.objectid);
2107         btrfs_set_header_flags(split, 0);
2108         write_extent_buffer(split, root->fs_info->fsid,
2109                             (unsigned long)btrfs_header_fsid(split),
2110                             BTRFS_FSID_SIZE);
2111         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2112                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
2113                             BTRFS_UUID_SIZE);
2114
2115         mid = (c_nritems + 1) / 2;
2116
2117         copy_extent_buffer(split, c,
2118                            btrfs_node_key_ptr_offset(0),
2119                            btrfs_node_key_ptr_offset(mid),
2120                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2121         btrfs_set_header_nritems(split, c_nritems - mid);
2122         btrfs_set_header_nritems(c, mid);
2123         ret = 0;
2124
2125         btrfs_mark_buffer_dirty(c);
2126         btrfs_mark_buffer_dirty(split);
2127
2128         btrfs_node_key(split, &disk_key, 0);
2129         wret = insert_ptr(trans, root, path, &disk_key, split->start,
2130                           path->slots[level + 1] + 1,
2131                           level + 1);
2132         if (wret)
2133                 ret = wret;
2134
2135         ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2136         BUG_ON(ret);
2137
2138         if (path->slots[level] >= mid) {
2139                 path->slots[level] -= mid;
2140                 btrfs_tree_unlock(c);
2141                 free_extent_buffer(c);
2142                 path->nodes[level] = split;
2143                 path->slots[level + 1] += 1;
2144         } else {
2145                 btrfs_tree_unlock(split);
2146                 free_extent_buffer(split);
2147         }
2148         return ret;
2149 }
2150
2151 /*
2152  * how many bytes are required to store the items in a leaf.  start
2153  * and nr indicate which items in the leaf to check.  This totals up the
2154  * space used both by the item structs and the item data
2155  */
2156 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2157 {
2158         int data_len;
2159         int nritems = btrfs_header_nritems(l);
2160         int end = min(nritems, start + nr) - 1;
2161
2162         if (!nr)
2163                 return 0;
2164         data_len = btrfs_item_end_nr(l, start);
2165         data_len = data_len - btrfs_item_offset_nr(l, end);
2166         data_len += sizeof(struct btrfs_item) * nr;
2167         WARN_ON(data_len < 0);
2168         return data_len;
2169 }
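
/*
 * Editor's note (not in the original source): leaf item data is packed
 * back to front, so the data of item 'start' ends highest in the block
 * and the data of item 'end' starts lowest, with no gaps in between for
 * consecutive items.  For example, with start = 5 and nr = 3, if item 5's
 * data ends at byte 3000 and item 7's data starts at byte 2800, the three
 * items use 3000 - 2800 = 200 bytes of data plus 3 item headers.
 */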
2170
2171 /*
2172  * The space between the end of the leaf items and
2173  * the start of the leaf data.  IOW, how much room
2174  * the leaf has left for both items and data
2175  */
2176 int noinline btrfs_leaf_free_space(struct btrfs_root *root,
2177                                    struct extent_buffer *leaf)
2178 {
2179         int nritems = btrfs_header_nritems(leaf);
2180         int ret;
2181         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2182         if (ret < 0) {
2183                 printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
2184                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2185                        leaf_space_used(leaf, 0, nritems), nritems);
2186         }
2187         return ret;
2188 }
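
/*
 * Editor's illustrative sketch, not part of the original file: the usual
 * check before inserting data_size bytes into a leaf, matching the tests
 * made in btrfs_search_slot() and the leaf push helpers in this file:
 * a new item needs room for its data plus a struct btrfs_item header.
 */
static inline int example_leaf_has_room(struct btrfs_root *root,
					struct extent_buffer *leaf,
					u32 data_size)
{
	int needed = data_size + sizeof(struct btrfs_item);

	return btrfs_leaf_free_space(root, leaf) >= needed;
}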
2189
2190 /*
2191  * push some data in the path leaf to the right, trying to free up at
2192  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2193  *
2194  * returns 1 if the push failed because the other node didn't have enough
2195  * room, 0 if everything worked out and < 0 if there were major errors.
2196  */
2197 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2198                            *root, struct btrfs_path *path, int data_size,
2199                            int empty)
2200 {
2201         struct extent_buffer *left = path->nodes[0];
2202         struct extent_buffer *right;
2203         struct extent_buffer *upper;
2204         struct btrfs_disk_key disk_key;
2205         int slot;
2206         u32 i;
2207         int free_space;
2208         int push_space = 0;
2209         int push_items = 0;
2210         struct btrfs_item *item;
2211         u32 left_nritems;
2212         u32 nr;
2213         u32 right_nritems;
2214         u32 data_end;
2215         u32 this_item_size;
2216         int ret;
2217
2218         slot = path->slots[1];
2219         if (!path->nodes[1]) {
2220                 return 1;
2221         }
2222         upper = path->nodes[1];
2223         if (slot >= btrfs_header_nritems(upper) - 1)
2224                 return 1;
2225
2226         WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2227
2228         right = read_node_slot(root, upper, slot + 1);
2229         btrfs_tree_lock(right);
2230         free_space = btrfs_leaf_free_space(root, right);
2231         if (free_space < data_size + sizeof(struct btrfs_item))
2232                 goto out_unlock;
2233
2234         /* cow and double check */
2235         ret = btrfs_cow_block(trans, root, right, upper,
2236                               slot + 1, &right, 0);
2237         if (ret)
2238                 goto out_unlock;
2239
2240         free_space = btrfs_leaf_free_space(root, right);
2241         if (free_space < data_size + sizeof(struct btrfs_item))
2242                 goto out_unlock;
2243
2244         left_nritems = btrfs_header_nritems(left);
2245         if (left_nritems == 0)
2246                 goto out_unlock;
2247
2248         if (empty)
2249                 nr = 0;
2250         else
2251                 nr = 1;
2252
2253         if (path->slots[0] >= left_nritems)
2254                 push_space += data_size + sizeof(*item);
2255
2256         i = left_nritems - 1;
2257         while (i >= nr) {
2258                 item = btrfs_item_nr(left, i);
2259
2260                 if (!empty && push_items > 0) {
2261                         if (path->slots[0] > i)
2262                                 break;
2263                         if (path->slots[0] == i) {
2264                                 int space = btrfs_leaf_free_space(root, left);
2265                                 if (space + push_space * 2 > free_space)
2266                                         break;
2267                         }
2268                 }
2269
2270                 if (path->slots[0] == i)
2271                         push_space += data_size + sizeof(*item);
2272
2273                 if (!left->map_token) {
2274                         map_extent_buffer(left, (unsigned long)item,
2275                                         sizeof(struct btrfs_item),
2276                                         &left->map_token, &left->kaddr,
2277                                         &left->map_start, &left->map_len,
2278                                         KM_USER1);
2279                 }
2280
2281                 this_item_size = btrfs_item_size(left, item);
2282                 if (this_item_size + sizeof(*item) + push_space > free_space)
2283                         break;
2284
2285                 push_items++;
2286                 push_space += this_item_size + sizeof(*item);
2287                 if (i == 0)
2288                         break;
2289                 i--;
2290         }
2291         if (left->map_token) {
2292                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2293                 left->map_token = NULL;
2294         }
2295
2296         if (push_items == 0)
2297                 goto out_unlock;
2298
2299         if (!empty && push_items == left_nritems)
2300                 WARN_ON(1);
2301
2302         /* push left to right */
2303         right_nritems = btrfs_header_nritems(right);
2304
2305         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2306         push_space -= leaf_data_end(root, left);
2307
2308         /* make room in the right data area */
2309         data_end = leaf_data_end(root, right);
2310         memmove_extent_buffer(right,
2311                               btrfs_leaf_data(right) + data_end - push_space,
2312                               btrfs_leaf_data(right) + data_end,
2313                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
2314
2315         /* copy from the left data area */
2316         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2317                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2318                      btrfs_leaf_data(left) + leaf_data_end(root, left),
2319                      push_space);
2320
2321         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2322                               btrfs_item_nr_offset(0),
2323                               right_nritems * sizeof(struct btrfs_item));
2324
2325         /* copy the items from left to right */
2326         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2327                    btrfs_item_nr_offset(left_nritems - push_items),
2328                    push_items * sizeof(struct btrfs_item));
2329
2330         /* update the item pointers */
2331         right_nritems += push_items;
2332         btrfs_set_header_nritems(right, right_nritems);
2333         push_space = BTRFS_LEAF_DATA_SIZE(root);
2334         for (i = 0; i < right_nritems; i++) {
2335                 item = btrfs_item_nr(right, i);
2336                 if (!right->map_token) {
2337                         map_extent_buffer(right, (unsigned long)item,
2338                                         sizeof(struct btrfs_item),
2339                                         &right->map_token, &right->kaddr,
2340                                         &right->map_start, &right->map_len,
2341                                         KM_USER1);
2342                 }
2343                 push_space -= btrfs_item_size(right, item);
2344                 btrfs_set_item_offset(right, item, push_space);
2345         }
2346
2347         if (right->map_token) {
2348                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2349                 right->map_token = NULL;
2350         }
2351         left_nritems -= push_items;
2352         btrfs_set_header_nritems(left, left_nritems);
2353
2354         if (left_nritems)
2355                 btrfs_mark_buffer_dirty(left);
2356         btrfs_mark_buffer_dirty(right);
2357
2358         ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2359         BUG_ON(ret);
2360
2361         btrfs_item_key(right, &disk_key, 0);
2362         btrfs_set_node_key(upper, &disk_key, slot + 1);
2363         btrfs_mark_buffer_dirty(upper);
2364
2365         /* then fixup the leaf pointer in the path */
2366         if (path->slots[0] >= left_nritems) {
2367                 path->slots[0] -= left_nritems;
2368                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2369                         clean_tree_block(trans, root, path->nodes[0]);
2370                 btrfs_tree_unlock(path->nodes[0]);
2371                 free_extent_buffer(path->nodes[0]);
2372                 path->nodes[0] = right;
2373                 path->slots[1] += 1;
2374         } else {
2375                 btrfs_tree_unlock(right);
2376                 free_extent_buffer(right);
2377         }
2378         return 0;
2379
2380 out_unlock:
2381         btrfs_tree_unlock(right);
2382         free_extent_buffer(right);
2383         return 1;
2384 }
2385
2386 /*
2387  * push some data in the path leaf to the left, trying to free up at
2388  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2389  */
2390 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2391                           *root, struct btrfs_path *path, int data_size,
2392                           int empty)
2393 {
2394         struct btrfs_disk_key disk_key;
2395         struct extent_buffer *right = path->nodes[0];
2396         struct extent_buffer *left;
2397         int slot;
2398         int i;
2399         int free_space;
2400         int push_space = 0;
2401         int push_items = 0;
2402         struct btrfs_item *item;
2403         u32 old_left_nritems;
2404         u32 right_nritems;
2405         u32 nr;
2406         int ret = 0;
2407         int wret;
2408         u32 this_item_size;
2409         u32 old_left_item_size;
2410
2411         slot = path->slots[1];
2412         if (slot == 0)
2413                 return 1;
2414         if (!path->nodes[1])
2415                 return 1;
2416
2417         right_nritems = btrfs_header_nritems(right);
2418         if (right_nritems == 0) {
2419                 return 1;
2420         }
2421
2422         WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2423
2424         left = read_node_slot(root, path->nodes[1], slot - 1);
2425         btrfs_tree_lock(left);
2426         free_space = btrfs_leaf_free_space(root, left);
2427         if (free_space < data_size + sizeof(struct btrfs_item)) {
2428                 ret = 1;
2429                 goto out;
2430         }
2431
2432         /* cow and double check */
2433         ret = btrfs_cow_block(trans, root, left,
2434                               path->nodes[1], slot - 1, &left, 0);
2435         if (ret) {
2436                 /* we hit -ENOSPC, but it isn't fatal here */
2437                 ret = 1;
2438                 goto out;
2439         }
2440
2441         free_space = btrfs_leaf_free_space(root, left);
2442         if (free_space < data_size + sizeof(struct btrfs_item)) {
2443                 ret = 1;
2444                 goto out;
2445         }
2446
2447         if (empty)
2448                 nr = right_nritems;
2449         else
2450                 nr = right_nritems - 1;
2451
2452         for (i = 0; i < nr; i++) {
2453                 item = btrfs_item_nr(right, i);
2454                 if (!right->map_token) {
2455                         map_extent_buffer(right, (unsigned long)item,
2456                                         sizeof(struct btrfs_item),
2457                                         &right->map_token, &right->kaddr,
2458                                         &right->map_start, &right->map_len,
2459                                         KM_USER1);
2460                 }
2461
2462                 if (!empty && push_items > 0) {
2463                         if (path->slots[0] < i)
2464                                 break;
2465                         if (path->slots[0] == i) {
2466                                 int space = btrfs_leaf_free_space(root, right);
2467                                 if (space + push_space * 2 > free_space)
2468                                         break;
2469                         }
2470                 }
2471
2472                 if (path->slots[0] == i)
2473                         push_space += data_size + sizeof(*item);
2474
2475                 this_item_size = btrfs_item_size(right, item);
2476                 if (this_item_size + sizeof(*item) + push_space > free_space)
2477                         break;
2478
2479                 push_items++;
2480                 push_space += this_item_size + sizeof(*item);
2481         }
2482
2483         if (right->map_token) {
2484                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2485                 right->map_token = NULL;
2486         }
2487
2488         if (push_items == 0) {
2489                 ret = 1;
2490                 goto out;
2491         }
2492         if (!empty && push_items == btrfs_header_nritems(right))
2493                 WARN_ON(1);
2494
2495         /* push data from right to left */
2496         copy_extent_buffer(left, right,
2497                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
2498                            btrfs_item_nr_offset(0),
2499                            push_items * sizeof(struct btrfs_item));
2500
2501         push_space = BTRFS_LEAF_DATA_SIZE(root) -
2502                      btrfs_item_offset_nr(right, push_items - 1);
2503
2504         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2505                      leaf_data_end(root, left) - push_space,
2506                      btrfs_leaf_data(right) +
2507                      btrfs_item_offset_nr(right, push_items - 1),
2508                      push_space);
2509         old_left_nritems = btrfs_header_nritems(left);
2510         BUG_ON(old_left_nritems < 0);
2511
2512         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2513         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2514                 u32 ioff;
2515
2516                 item = btrfs_item_nr(left, i);
2517                 if (!left->map_token) {
2518                         map_extent_buffer(left, (unsigned long)item,
2519                                         sizeof(struct btrfs_item),
2520                                         &left->map_token, &left->kaddr,
2521                                         &left->map_start, &left->map_len,
2522                                         KM_USER1);
2523                 }
2524
2525                 ioff = btrfs_item_offset(left, item);
2526                 btrfs_set_item_offset(left, item,
2527                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2528         }
2529         btrfs_set_header_nritems(left, old_left_nritems + push_items);
2530         if (left->map_token) {
2531                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2532                 left->map_token = NULL;
2533         }
2534
2535         /* fixup right node */
2536         if (push_items > right_nritems) {
2537                 printk("push items %d nr %u\n", push_items, right_nritems);
2538                 WARN_ON(1);
2539         }
2540
2541         if (push_items < right_nritems) {
2542                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2543                                                   leaf_data_end(root, right);
2544                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2545                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
2546                                       btrfs_leaf_data(right) +
2547                                       leaf_data_end(root, right), push_space);
2548
2549                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2550                               btrfs_item_nr_offset(push_items),
2551                              (btrfs_header_nritems(right) - push_items) *
2552                              sizeof(struct btrfs_item));
2553         }
2554         right_nritems -= push_items;
2555         btrfs_set_header_nritems(right, right_nritems);
2556         push_space = BTRFS_LEAF_DATA_SIZE(root);
2557         for (i = 0; i < right_nritems; i++) {
2558                 item = btrfs_item_nr(right, i);
2559
2560                 if (!right->map_token) {
2561                         map_extent_buffer(right, (unsigned long)item,
2562                                         sizeof(struct btrfs_item),
2563                                         &right->map_token, &right->kaddr,
2564                                         &right->map_start, &right->map_len,
2565                                         KM_USER1);
2566                 }
2567
2568                 push_space = push_space - btrfs_item_size(right, item);
2569                 btrfs_set_item_offset(right, item, push_space);
2570         }
2571         if (right->map_token) {
2572                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2573                 right->map_token = NULL;
2574         }
2575
2576         btrfs_mark_buffer_dirty(left);
2577         if (right_nritems)
2578                 btrfs_mark_buffer_dirty(right);
2579
2580         ret = btrfs_update_ref(trans, root, right, left,
2581                                old_left_nritems, push_items);
2582         BUG_ON(ret);
2583
2584         btrfs_item_key(right, &disk_key, 0);
2585         wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2586         if (wret)
2587                 ret = wret;
2588
2589         /* then fixup the leaf pointer in the path */
2590         if (path->slots[0] < push_items) {
2591                 path->slots[0] += old_left_nritems;
2592                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2593                         clean_tree_block(trans, root, path->nodes[0]);
2594                 btrfs_tree_unlock(path->nodes[0]);
2595                 free_extent_buffer(path->nodes[0]);
2596                 path->nodes[0] = left;
2597                 path->slots[1] -= 1;
2598         } else {
2599                 btrfs_tree_unlock(left);
2600                 free_extent_buffer(left);
2601                 path->slots[0] -= push_items;
2602         }
2603         BUG_ON(path->slots[0] < 0);
2604         return ret;
2605 out:
2606         btrfs_tree_unlock(left);
2607         free_extent_buffer(left);
2608         return ret;
2609 }
2610
2611 /*
2612  * split the path's leaf in two, making sure there are at least data_size
2613  * bytes available for the resulting leaf level of the path.
2614  *
2615  * returns 0 if all went well and < 0 on failure.
2616  */
2617 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2618                                struct btrfs_root *root,
2619                                struct btrfs_key *ins_key,
2620                                struct btrfs_path *path, int data_size,
2621                                int extend)
2622 {
2623         struct extent_buffer *l;
2624         u32 nritems;
2625         int mid;
2626         int slot;
2627         struct extent_buffer *right;
2628         int space_needed = data_size + sizeof(struct btrfs_item);
2629         int data_copy_size;
2630         int rt_data_off;
2631         int i;
2632         int ret = 0;
2633         int wret;
2634         int double_split;
2635         int num_doubles = 0;
2636         struct btrfs_disk_key disk_key;
2637
2638         if (extend)
2639                 space_needed = data_size;
2640
2641         /* first try to make some room by pushing left and right */
2642         if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
2643                 wret = push_leaf_right(trans, root, path, data_size, 0);
2644                 if (wret < 0) {
2645                         return wret;
2646                 }
2647                 if (wret) {
2648                         wret = push_leaf_left(trans, root, path, data_size, 0);
2649                         if (wret < 0)
2650                                 return wret;
2651                 }
2652                 l = path->nodes[0];
2653
2654                 /* did the pushes work? */
2655                 if (btrfs_leaf_free_space(root, l) >= space_needed)
2656                         return 0;
2657         }
2658
2659         if (!path->nodes[1]) {
2660                 ret = insert_new_root(trans, root, path, 1);
2661                 if (ret)
2662                         return ret;
2663         }
2664 again:
2665         double_split = 0;
2666         l = path->nodes[0];
2667         slot = path->slots[0];
2668         nritems = btrfs_header_nritems(l);
2669         mid = (nritems + 1) / 2;
2670
2671         right = btrfs_alloc_free_block(trans, root, root->leafsize,
2672                                         path->nodes[1]->start,
2673                                         root->root_key.objectid,
2674                                         trans->transid, 0, l->start, 0);
2675         if (IS_ERR(right)) {
2676                 BUG_ON(1);
2677                 return PTR_ERR(right);
2678         }
2679
2680         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2681         btrfs_set_header_bytenr(right, right->start);
2682         btrfs_set_header_generation(right, trans->transid);
2683         btrfs_set_header_owner(right, root->root_key.objectid);
2684         btrfs_set_header_level(right, 0);
2685         write_extent_buffer(right, root->fs_info->fsid,
2686                             (unsigned long)btrfs_header_fsid(right),
2687                             BTRFS_FSID_SIZE);
2688
2689         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2690                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
2691                             BTRFS_UUID_SIZE);
2692         if (mid <= slot) {
2693                 if (nritems == 1 ||
2694                     leaf_space_used(l, mid, nritems - mid) + space_needed >
2695                         BTRFS_LEAF_DATA_SIZE(root)) {
2696                         if (slot >= nritems) {
2697                                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2698                                 btrfs_set_header_nritems(right, 0);
2699                                 wret = insert_ptr(trans, root, path,
2700                                                   &disk_key, right->start,
2701                                                   path->slots[1] + 1, 1);
2702                                 if (wret)
2703                                         ret = wret;
2704
2705                                 btrfs_tree_unlock(path->nodes[0]);
2706                                 free_extent_buffer(path->nodes[0]);
2707                                 path->nodes[0] = right;
2708                                 path->slots[0] = 0;
2709                                 path->slots[1] += 1;
2710                                 btrfs_mark_buffer_dirty(right);
2711                                 return ret;
2712                         }
2713                         mid = slot;
2714                         if (mid != nritems &&
2715                             leaf_space_used(l, mid, nritems - mid) +
2716                             space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2717                                 double_split = 1;
2718                         }
2719                 }
2720         } else {
2721                 if (leaf_space_used(l, 0, mid + 1) + space_needed >
2722                         BTRFS_LEAF_DATA_SIZE(root)) {
2723                         if (!extend && slot == 0) {
2724                                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2725                                 btrfs_set_header_nritems(right, 0);
2726                                 wret = insert_ptr(trans, root, path,
2727                                                   &disk_key,
2728                                                   right->start,
2729                                                   path->slots[1], 1);
2730                                 if (wret)
2731                                         ret = wret;
2732                                 btrfs_tree_unlock(path->nodes[0]);
2733                                 free_extent_buffer(path->nodes[0]);
2734                                 path->nodes[0] = right;
2735                                 path->slots[0] = 0;
2736                                 if (path->slots[1] == 0) {
2737                                         wret = fixup_low_keys(trans, root,
2738                                                    path, &disk_key, 1);
2739                                         if (wret)
2740                                                 ret = wret;
2741                                 }
2742                                 btrfs_mark_buffer_dirty(right);
2743                                 return ret;
2744                         } else if (extend && slot == 0) {
2745                                 mid = 1;
2746                         } else {
2747                                 mid = slot;
2748                                 if (mid != nritems &&
2749                                     leaf_space_used(l, mid, nritems - mid) +
2750                                     space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2751                                         double_split = 1;
2752                                 }
2753                         }
2754                 }
2755         }
2756         nritems = nritems - mid;
2757         btrfs_set_header_nritems(right, nritems);
2758         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2759
2760         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2761                            btrfs_item_nr_offset(mid),
2762                            nritems * sizeof(struct btrfs_item));
2763
2764         copy_extent_buffer(right, l,
2765                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2766                      data_copy_size, btrfs_leaf_data(l) +
2767                      leaf_data_end(root, l), data_copy_size);
2768
2769         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2770                       btrfs_item_end_nr(l, mid);
2771
2772         for (i = 0; i < nritems; i++) {
2773                 struct btrfs_item *item = btrfs_item_nr(right, i);
2774                 u32 ioff;
2775
2776                 if (!right->map_token) {
2777                         map_extent_buffer(right, (unsigned long)item,
2778                                         sizeof(struct btrfs_item),
2779                                         &right->map_token, &right->kaddr,
2780                                         &right->map_start, &right->map_len,
2781                                         KM_USER1);
2782                 }
2783
2784                 ioff = btrfs_item_offset(right, item);
2785                 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2786         }
2787
2788         if (right->map_token) {
2789                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2790                 right->map_token = NULL;
2791         }
2792
2793         btrfs_set_header_nritems(l, mid);
2794         ret = 0;
2795         btrfs_item_key(right, &disk_key, 0);
2796         wret = insert_ptr(trans, root, path, &disk_key, right->start,
2797                           path->slots[1] + 1, 1);
2798         if (wret)
2799                 ret = wret;
2800
2801         btrfs_mark_buffer_dirty(right);
2802         btrfs_mark_buffer_dirty(l);
2803         BUG_ON(path->slots[0] != slot);
2804
2805         ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2806         BUG_ON(ret);
2807
2808         if (mid <= slot) {
2809                 btrfs_tree_unlock(path->nodes[0]);
2810                 free_extent_buffer(path->nodes[0]);
2811                 path->nodes[0] = right;
2812                 path->slots[0] -= mid;
2813                 path->slots[1] += 1;
2814         } else {
2815                 btrfs_tree_unlock(right);
2816                 free_extent_buffer(right);
2817         }
2818
2819         BUG_ON(path->slots[0] < 0);
2820
2821         if (double_split) {
2822                 BUG_ON(num_doubles != 0);
2823                 num_doubles++;
2824                 goto again;
2825         }
2826         return ret;
2827 }
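
/*
 * split_leaf is not called directly by users of the tree; it runs from
 * btrfs_search_slot when the caller reserves insertion room up front.  A
 * minimal sketch of that pattern, where ins_key and data_size stand in
 * for whatever the caller wants to insert:
 *
 *      ret = btrfs_search_slot(trans, root, &ins_key, path,
 *                              data_size + sizeof(struct btrfs_item), 1);
 *
 * A return of 1 means the key was not found; path->slots[0] is the slot
 * to insert at and the leaf now has enough free space, split if that was
 * needed.
 */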
2828
2829 /*
2830  * make the item pointed to by the path smaller.  new_size indicates
2831  * how small to make it, and from_end tells us if we just chop bytes
2832  * off the end of the item or if we shift the item to chop bytes off
2833  * the front.
2834  */
2835 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2836                         struct btrfs_root *root,
2837                         struct btrfs_path *path,
2838                         u32 new_size, int from_end)
2839 {
2840         int ret = 0;
2841         int slot;
2842         int slot_orig;
2843         struct extent_buffer *leaf;
2844         struct btrfs_item *item;
2845         u32 nritems;
2846         unsigned int data_end;
2847         unsigned int old_data_start;
2848         unsigned int old_size;
2849         unsigned int size_diff;
2850         int i;
2851
2852         slot_orig = path->slots[0];
2853         leaf = path->nodes[0];
2854         slot = path->slots[0];
2855
2856         old_size = btrfs_item_size_nr(leaf, slot);
2857         if (old_size == new_size)
2858                 return 0;
2859
2860         nritems = btrfs_header_nritems(leaf);
2861         data_end = leaf_data_end(root, leaf);
2862
2863         old_data_start = btrfs_item_offset_nr(leaf, slot);
2864
2865         size_diff = old_size - new_size;
2866
2867         BUG_ON(slot < 0);
2868         BUG_ON(slot >= nritems);
2869
2870         /*
2871          * item0..itemN ... dataN.offset..dataN.size .. data0.size
2872          */
2873         /* first correct the data pointers */
2874         for (i = slot; i < nritems; i++) {
2875                 u32 ioff;
2876                 item = btrfs_item_nr(leaf, i);
2877
2878                 if (!leaf->map_token) {
2879                         map_extent_buffer(leaf, (unsigned long)item,
2880                                         sizeof(struct btrfs_item),
2881                                         &leaf->map_token, &leaf->kaddr,
2882                                         &leaf->map_start, &leaf->map_len,
2883                                         KM_USER1);
2884                 }
2885
2886                 ioff = btrfs_item_offset(leaf, item);
2887                 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2888         }
2889
2890         if (leaf->map_token) {
2891                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2892                 leaf->map_token = NULL;
2893         }
2894
2895         /* shift the data */
2896         if (from_end) {
2897                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2898                               data_end + size_diff, btrfs_leaf_data(leaf) +
2899                               data_end, old_data_start + new_size - data_end);
2900         } else {
2901                 struct btrfs_disk_key disk_key;
2902                 u64 offset;
2903
2904                 btrfs_item_key(leaf, &disk_key, slot);
2905
2906                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2907                         unsigned long ptr;
2908                         struct btrfs_file_extent_item *fi;
2909
2910                         fi = btrfs_item_ptr(leaf, slot,
2911                                             struct btrfs_file_extent_item);
2912                         fi = (struct btrfs_file_extent_item *)(
2913                              (unsigned long)fi - size_diff);
2914
2915                         if (btrfs_file_extent_type(leaf, fi) ==
2916                             BTRFS_FILE_EXTENT_INLINE) {
2917                                 ptr = btrfs_item_ptr_offset(leaf, slot);
2918                                 memmove_extent_buffer(leaf, ptr,
2919                                         (unsigned long)fi,
2920                                         offsetof(struct btrfs_file_extent_item,
2921                                                  disk_bytenr));
2922                         }
2923                 }
2924
2925                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2926                               data_end + size_diff, btrfs_leaf_data(leaf) +
2927                               data_end, old_data_start - data_end);
2928
2929                 offset = btrfs_disk_key_offset(&disk_key);
2930                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2931                 btrfs_set_item_key(leaf, &disk_key, slot);
2932                 if (slot == 0)
2933                         fixup_low_keys(trans, root, path, &disk_key, 1);
2934         }
2935
2936         item = btrfs_item_nr(leaf, slot);
2937         btrfs_set_item_size(leaf, item, new_size);
2938         btrfs_mark_buffer_dirty(leaf);
2939
2940         ret = 0;
2941         if (btrfs_leaf_free_space(root, leaf) < 0) {
2942                 btrfs_print_leaf(root, leaf);
2943                 BUG();
2944         }
2945         return ret;
2946 }
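
/*
 * A minimal caller sketch for btrfs_truncate_item: position the path on
 * the item with a normal search, then shrink it in place.  trans, root,
 * key and new_size are assumed to come from the caller's context.
 *
 *      path = btrfs_alloc_path();
 *      if (!path)
 *              return -ENOMEM;
 *      ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 *      if (ret > 0)
 *              ret = -ENOENT;
 *      if (ret == 0)
 *              ret = btrfs_truncate_item(trans, root, path, new_size, 1);
 *      btrfs_free_path(path);
 *
 * from_end == 1 chops bytes off the end of the item; from_end == 0 shifts
 * the data so the bytes come off the front and bumps the key offset.
 */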
2947
2948 /*
2949  * make the item pointed to by the path bigger, data_size is the new size.
2950  */
2951 int btrfs_extend_item(struct btrfs_trans_handle *trans,
2952                       struct btrfs_root *root, struct btrfs_path *path,
2953                       u32 data_size)
2954 {
2955         int ret = 0;
2956         int slot;
2957         int slot_orig;
2958         struct extent_buffer *leaf;
2959         struct btrfs_item *item;
2960         u32 nritems;
2961         unsigned int data_end;
2962         unsigned int old_data;
2963         unsigned int old_size;
2964         int i;
2965
2966         slot_orig = path->slots[0];
2967         leaf = path->nodes[0];
2968
2969         nritems = btrfs_header_nritems(leaf);
2970         data_end = leaf_data_end(root, leaf);
2971
2972         if (btrfs_leaf_free_space(root, leaf) < data_size) {
2973                 btrfs_print_leaf(root, leaf);
2974                 BUG();
2975         }
2976         slot = path->slots[0];
2977         old_data = btrfs_item_end_nr(leaf, slot);
2978
2979         BUG_ON(slot < 0);
2980         if (slot >= nritems) {
2981                 btrfs_print_leaf(root, leaf);
2982                 printk(KERN_CRIT "slot %d too large, nritems %d\n", slot, nritems);
2983                 BUG_ON(1);
2984         }
2985
2986         /*
2987          * item0..itemN ... dataN.offset..dataN.size .. data0.size
2988          */
2989         /* first correct the data pointers */
2990         for (i = slot; i < nritems; i++) {
2991                 u32 ioff;
2992                 item = btrfs_item_nr(leaf, i);
2993
2994                 if (!leaf->map_token) {
2995                         map_extent_buffer(leaf, (unsigned long)item,
2996                                         sizeof(struct btrfs_item),
2997                                         &leaf->map_token, &leaf->kaddr,
2998                                         &leaf->map_start, &leaf->map_len,
2999                                         KM_USER1);
3000                 }
3001                 ioff = btrfs_item_offset(leaf, item);
3002                 btrfs_set_item_offset(leaf, item, ioff - data_size);
3003         }
3004
3005         if (leaf->map_token) {
3006                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3007                 leaf->map_token = NULL;
3008         }
3009
3010         /* shift the data */
3011         memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3012                       data_end - data_size, btrfs_leaf_data(leaf) +
3013                       data_end, old_data - data_end);
3014
3015         data_end = old_data;
3016         old_size = btrfs_item_size_nr(leaf, slot);
3017         item = btrfs_item_nr(leaf, slot);
3018         btrfs_set_item_size(leaf, item, old_size + data_size);
3019         btrfs_mark_buffer_dirty(leaf);
3020
3021         ret = 0;
3022         if (btrfs_leaf_free_space(root, leaf) < 0) {
3023                 btrfs_print_leaf(root, leaf);
3024                 BUG();
3025         }
3026         return ret;
3027 }
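
/*
 * A minimal caller sketch for btrfs_extend_item.  The leaf must already
 * have grow_by bytes free, so the search reserves that room; trans, root,
 * key and grow_by are assumed caller context.
 *
 *      ret = btrfs_search_slot(trans, root, &key, path, grow_by, 1);
 *      if (ret > 0)
 *              ret = -ENOENT;
 *      if (ret == 0)
 *              ret = btrfs_extend_item(trans, root, path, grow_by);
 *
 * After a successful extend the caller fills in the new tail of the item,
 * typically with write_extent_buffer starting at btrfs_item_ptr_offset
 * plus the old item size.
 */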
3028
3029 /*
3030  * Given a key and some data, insert items into the tree.
3031  * This does all the path init required, making room in the tree if needed.
3032  * Returns the number of keys that were inserted.
3033  */
3034 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3035                             struct btrfs_root *root,
3036                             struct btrfs_path *path,
3037                             struct btrfs_key *cpu_key, u32 *data_size,
3038                             int nr)
3039 {
3040         struct extent_buffer *leaf;
3041         struct btrfs_item *item;
3042         int ret = 0;
3043         int slot;
3044         int slot_orig;
3045         int i;
3046         u32 nritems;
3047         u32 total_data = 0;
3048         u32 total_size = 0;
3049         unsigned int data_end;
3050         struct btrfs_disk_key disk_key;
3051         struct btrfs_key found_key;
3052
3053         found_key.objectid = 0;
3054         nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));
3055
3056         for (i = 0; i < nr; i++)
3057                 total_data += data_size[i];
3058
3059         total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
3060         total_size = total_data + (nr * sizeof(struct btrfs_item));
3061         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3062         if (ret == 0)
3063                 return -EEXIST;
3064         if (ret < 0)
3065                 goto out;
3066
3067         slot_orig = path->slots[0];
3068         leaf = path->nodes[0];
3069
3070         nritems = btrfs_header_nritems(leaf);
3071         data_end = leaf_data_end(root, leaf);
3072
3073         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3074                 for (i = nr - 1; i >= 0; i--) {
3075                         total_data -= data_size[i];
3076                         total_size -= data_size[i] + sizeof(struct btrfs_item);
3077                         if (total_size < btrfs_leaf_free_space(root, leaf))
3078                                 break;
3079                 }
3080                 nr = i;
3081         }
3082
3083         slot = path->slots[0];
3084         BUG_ON(slot < 0);
3085
3086         if (slot != nritems) {
3087                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3088
3089                 item = btrfs_item_nr(leaf, slot);
3090                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3091
3092                 /* figure out how many keys we can insert in here */
3093                 total_data = data_size[0];
3094                 for (i = 1; i < nr; i++) {
3095                         if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3096                                 break;
3097                         total_data += data_size[i];
3098                 }
3099                 nr = i;
3100
3101                 if (old_data < data_end) {
3102                         btrfs_print_leaf(root, leaf);
3103                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3104                                slot, old_data, data_end);
3105                         BUG_ON(1);
3106                 }
3107                 /*
3108                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3109                  */
3110                 /* first correct the data pointers */
3111                 WARN_ON(leaf->map_token);
3112                 for (i = slot; i < nritems; i++) {
3113                         u32 ioff;
3114
3115                         item = btrfs_item_nr(leaf, i);
3116                         if (!leaf->map_token) {
3117                                 map_extent_buffer(leaf, (unsigned long)item,
3118                                         sizeof(struct btrfs_item),
3119                                         &leaf->map_token, &leaf->kaddr,
3120                                         &leaf->map_start, &leaf->map_len,
3121                                         KM_USER1);
3122                         }
3123
3124                         ioff = btrfs_item_offset(leaf, item);
3125                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3126                 }
3127                 if (leaf->map_token) {
3128                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3129                         leaf->map_token = NULL;
3130                 }
3131
3132                 /* shift the items */
3133                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3134                               btrfs_item_nr_offset(slot),
3135                               (nritems - slot) * sizeof(struct btrfs_item));
3136
3137                 /* shift the data */
3138                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3139                               data_end - total_data, btrfs_leaf_data(leaf) +
3140                               data_end, old_data - data_end);
3141                 data_end = old_data;
3142         } else {
3143                 /*
3144                  * this sucks but it has to be done: if we are inserting at
3145                  * the end of the leaf we can only insert 1 of the items,
3146                  * since we have no way of knowing what's on the next leaf
3147                  * and we'd have to drop our current locks to figure it out
3148                  */
3149                 nr = 1;
3150         }
3151
3152         /* setup the item for the new data */
3153         for (i = 0; i < nr; i++) {
3154                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3155                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3156                 item = btrfs_item_nr(leaf, slot + i);
3157                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3158                 data_end -= data_size[i];
3159                 btrfs_set_item_size(leaf, item, data_size[i]);
3160         }
3161         btrfs_set_header_nritems(leaf, nritems + nr);
3162         btrfs_mark_buffer_dirty(leaf);
3163
3164         ret = 0;
3165         if (slot == 0) {
3166                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3167                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3168         }
3169
3170         if (btrfs_leaf_free_space(root, leaf) < 0) {
3171                 btrfs_print_leaf(root, leaf);
3172                 BUG();
3173         }
3174 out:
3175         if (!ret)
3176                 ret = nr;
3177         return ret;
3178 }
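
/*
 * A caller sketch for btrfs_insert_some_items.  On success the return
 * value is the number of keys that actually fit, so callers loop until
 * the whole batch is in; the keys/sizes arrays are assumed to be prepared
 * by the caller in sorted key order (declarations omitted).
 *
 *      int done = 0;
 *
 *      while (done < total) {
 *              ret = btrfs_insert_some_items(trans, root, path,
 *                                            keys + done, sizes + done,
 *                                            total - done);
 *              if (ret <= 0)
 *                      break;
 *              ... write the data for the ret new items into
 *                  path->nodes[0] while the leaf is still locked ...
 *              done += ret;
 *              btrfs_release_path(root, path);
 *      }
 */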
3179
3180 /*
3181  * Given a key and some data, insert items into the tree.
3182  * This does all the path init required, making room in the tree if needed.
3183  */
3184 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3185                             struct btrfs_root *root,
3186                             struct btrfs_path *path,
3187                             struct btrfs_key *cpu_key, u32 *data_size,
3188                             int nr)
3189 {
3190         struct extent_buffer *leaf;
3191         struct btrfs_item *item;
3192         int ret = 0;
3193         int slot;
3194         int slot_orig;
3195         int i;
3196         u32 nritems;
3197         u32 total_size = 0;
3198         u32 total_data = 0;
3199         unsigned int data_end;
3200         struct btrfs_disk_key disk_key;
3201
3202         for (i = 0; i < nr; i++) {
3203                 total_data += data_size[i];
3204         }
3205
3206         total_size = total_data + (nr * sizeof(struct btrfs_item));
3207         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3208         if (ret == 0)
3209                 return -EEXIST;
3210         if (ret < 0)
3211                 goto out;
3212
3213         slot_orig = path->slots[0];
3214         leaf = path->nodes[0];
3215
3216         nritems = btrfs_header_nritems(leaf);
3217         data_end = leaf_data_end(root, leaf);
3218
3219         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3220                 btrfs_print_leaf(root, leaf);
3221                 printk(KERN_CRIT "not enough free space: need %u have %d\n",
3222                        total_size, btrfs_leaf_free_space(root, leaf));
3223                 BUG();
3224         }
3225
3226         slot = path->slots[0];
3227         BUG_ON(slot < 0);
3228
3229         if (slot != nritems) {
3230                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3231
3232                 if (old_data < data_end) {
3233                         btrfs_print_leaf(root, leaf);
3234                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3235                                slot, old_data, data_end);
3236                         BUG_ON(1);
3237                 }
3238                 /*
3239                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3240                  */
3241                 /* first correct the data pointers */
3242                 WARN_ON(leaf->map_token);
3243                 for (i = slot; i < nritems; i++) {
3244                         u32 ioff;
3245
3246                         item = btrfs_item_nr(leaf, i);
3247                         if (!leaf->map_token) {
3248                                 map_extent_buffer(leaf, (unsigned long)item,
3249                                         sizeof(struct btrfs_item),
3250                                         &leaf->map_token, &leaf->kaddr,
3251                                         &leaf->map_start, &leaf->map_len,
3252                                         KM_USER1);
3253                         }
3254
3255                         ioff = btrfs_item_offset(leaf, item);
3256                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3257                 }
3258                 if (leaf->map_token) {
3259                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3260                         leaf->map_token = NULL;
3261                 }
3262
3263                 /* shift the items */
3264                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3265                               btrfs_item_nr_offset(slot),
3266                               (nritems - slot) * sizeof(struct btrfs_item));
3267
3268                 /* shift the data */
3269                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3270                               data_end - total_data, btrfs_leaf_data(leaf) +
3271                               data_end, old_data - data_end);
3272                 data_end = old_data;
3273         }
3274
3275         /* setup the item for the new data */
3276         for (i = 0; i < nr; i++) {
3277                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3278                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3279                 item = btrfs_item_nr(leaf, slot + i);
3280                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3281                 data_end -= data_size[i];
3282                 btrfs_set_item_size(leaf, item, data_size[i]);
3283         }
3284         btrfs_set_header_nritems(leaf, nritems + nr);
3285         btrfs_mark_buffer_dirty(leaf);
3286
3287         ret = 0;
3288         if (slot == 0) {
3289                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3290                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3291         }
3292
3293         if (btrfs_leaf_free_space(root, leaf) < 0) {
3294                 btrfs_print_leaf(root, leaf);
3295                 BUG();
3296         }
3297 out:
3298         return ret;
3299 }
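
/*
 * A sketch of inserting two empty items at once and filling them in while
 * the leaf is still locked; the keys, sizes and data pointers here are
 * placeholder assumptions:
 *
 *      struct btrfs_key keys[2];
 *      u32 sizes[2] = { first_size, second_size };
 *      unsigned long ptr;
 *      int i;
 *
 *      ... fill in keys[0] and keys[1] in sorted order ...
 *      ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
 *      if (ret)
 *              goto out;
 *      for (i = 0; i < 2; i++) {
 *              ptr = btrfs_item_ptr_offset(path->nodes[0],
 *                                          path->slots[0] + i);
 *              write_extent_buffer(path->nodes[0], data[i], ptr, sizes[i]);
 *      }
 *      btrfs_mark_buffer_dirty(path->nodes[0]);
 */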
3300
3301 /*
3302  * Given a key and some data, insert an item into the tree.
3303  * This does all the path init required, making room in the tree if needed.
3304  */
3305 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3306                       *root, struct btrfs_key *cpu_key, void *data, u32
3307                       data_size)
3308 {
3309         int ret = 0;
3310         struct btrfs_path *path;
3311         struct extent_buffer *leaf;
3312         unsigned long ptr;
3313
3314         path = btrfs_alloc_path();
3315         BUG_ON(!path);
3316         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3317         if (!ret) {
3318                 leaf = path->nodes[0];
3319                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3320                 write_extent_buffer(leaf, data, ptr, data_size);
3321                 btrfs_mark_buffer_dirty(leaf);
3322         }
3323         btrfs_free_path(path);
3324         return ret;
3325 }
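
/*
 * A minimal sketch of inserting one fixed-size item with
 * btrfs_insert_item; the key fields and the my_item structure are
 * placeholders for whatever the caller stores:
 *
 *      struct btrfs_key key;
 *
 *      key.objectid = objectid;
 *      key.type = type;
 *      key.offset = offset;
 *      ret = btrfs_insert_item(trans, root, &key, &my_item,
 *                              sizeof(my_item));
 *
 * -EEXIST means an item with this exact key is already in the tree.
 * Variable-sized items are normally inserted with btrfs_insert_empty_items
 * so the caller can format the item data in place.
 */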
3326
3327 /*
3328  * delete the pointer from a given node.
3329  *
3330  * the tree should have been previously balanced so the deletion does not
3331  * empty a node.
3332  */
3333 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3334                    struct btrfs_path *path, int level, int slot)
3335 {
3336         struct extent_buffer *parent = path->nodes[level];
3337         u32 nritems;
3338         int ret = 0;
3339         int wret;
3340
3341         nritems = btrfs_header_nritems(parent);
3342         if (slot != nritems - 1) {
3343                 memmove_extent_buffer(parent,
3344                               btrfs_node_key_ptr_offset(slot),
3345                               btrfs_node_key_ptr_offset(slot + 1),
3346                               sizeof(struct btrfs_key_ptr) *
3347                               (nritems - slot - 1));
3348         }
3349         nritems--;
3350         btrfs_set_header_nritems(parent, nritems);
3351         if (nritems == 0 && parent == root->node) {
3352                 BUG_ON(btrfs_header_level(root->node) != 1);
3353                 /* just turn the root into a leaf and break */
3354                 btrfs_set_header_level(root->node, 0);
3355         } else if (slot == 0) {
3356                 struct btrfs_disk_key disk_key;
3357
3358                 btrfs_node_key(parent, &disk_key, 0);
3359                 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3360                 if (wret)
3361                         ret = wret;
3362         }
3363         btrfs_mark_buffer_dirty(parent);
3364         return ret;
3365 }
3366
3367 /*
3368  * a helper function to delete the leaf pointed to by path->slots[1] and
3369  * path->nodes[1].  bytenr is the node block pointer, but since the callers
3370  * already know it, it is faster to have them pass it down than to
3371  * read it out of the node again.
3372  *
3373  * This deletes the pointer in path->nodes[1] and frees the leaf
3374  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3375  *
3376  * The path must have already been setup for deleting the leaf, including
3377  * all the proper balancing.  path->nodes[1] must be locked.
3378  */
3379 noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3380                             struct btrfs_root *root,
3381                             struct btrfs_path *path, u64 bytenr)
3382 {
3383         int ret;
3384         u64 root_gen = btrfs_header_generation(path->nodes[1]);
3385
3386         ret = del_ptr(trans, root, path, 1, path->slots[1]);
3387         if (ret)
3388                 return ret;
3389
3390         ret = btrfs_free_extent(trans, root, bytenr,
3391                                 btrfs_level_size(root, 0),
3392                                 path->nodes[1]->start,
3393                                 btrfs_header_owner(path->nodes[1]),
3394                                 root_gen, 0, 1);
3395         return ret;
3396 }
3397 /*
3398  * delete the item at the leaf level in path.  If that empties
3399  * the leaf, remove it from the tree
3400  */
3401 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3402                     struct btrfs_path *path, int slot, int nr)
3403 {
3404         struct extent_buffer *leaf;
3405         struct btrfs_item *item;
3406         int last_off;
3407         int dsize = 0;
3408         int ret = 0;
3409         int wret;
3410         int i;
3411         u32 nritems;
3412
3413         leaf = path->nodes[0];
3414         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3415
3416         for (i = 0; i < nr; i++)
3417                 dsize += btrfs_item_size_nr(leaf, slot + i);
3418
3419         nritems = btrfs_header_nritems(leaf);
3420
3421         if (slot + nr != nritems) {
3422                 int data_end = leaf_data_end(root, leaf);
3423
3424                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3425                               data_end + dsize,
3426                               btrfs_leaf_data(leaf) + data_end,
3427                               last_off - data_end);
3428
3429                 for (i = slot + nr; i < nritems; i++) {
3430                         u32 ioff;
3431
3432                         item = btrfs_item_nr(leaf, i);
3433                         if (!leaf->map_token) {
3434                                 map_extent_buffer(leaf, (unsigned long)item,
3435                                         sizeof(struct btrfs_item),
3436                                         &leaf->map_token, &leaf->kaddr,
3437                                         &leaf->map_start, &leaf->map_len,
3438                                         KM_USER1);
3439                         }
3440                         ioff = btrfs_item_offset(leaf, item);
3441                         btrfs_set_item_offset(leaf, item, ioff + dsize);
3442                 }
3443
3444                 if (leaf->map_token) {
3445                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3446                         leaf->map_token = NULL;
3447                 }
3448
3449                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3450                               btrfs_item_nr_offset(slot + nr),
3451                               sizeof(struct btrfs_item) *
3452                               (nritems - slot - nr));
3453         }
3454         btrfs_set_header_nritems(leaf, nritems - nr);
3455         nritems -= nr;
3456
3457         /* delete the leaf if we've emptied it */
3458         if (nritems == 0) {
3459                 if (leaf == root->node) {
3460                         btrfs_set_header_level(leaf, 0);
3461                 } else {
3462                         ret = btrfs_del_leaf(trans, root, path, leaf->start);
3463                         BUG_ON(ret);
3464                 }
3465         } else {
3466                 int used = leaf_space_used(leaf, 0, nritems);
3467                 if (slot == 0) {
3468                         struct btrfs_disk_key disk_key;
3469
3470                         btrfs_item_key(leaf, &disk_key, 0);
3471                         wret = fixup_low_keys(trans, root, path,
3472                                               &disk_key, 1);
3473                         if (wret)
3474                                 ret = wret;
3475                 }
3476
3477                 /* delete the leaf if it is mostly empty */
3478                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3479                         /* push_leaf_left fixes the path.
3480                          * make sure the path still points to our leaf
3481                          * for possible call to del_ptr below
3482                          */
3483                         slot = path->slots[1];
3484                         extent_buffer_get(leaf);
3485
3486                         wret = push_leaf_left(trans, root, path, 1, 1);
3487                         if (wret < 0 && wret != -ENOSPC)
3488                                 ret = wret;
3489
3490                         if (path->nodes[0] == leaf &&
3491                             btrfs_header_nritems(leaf)) {
3492                                 wret = push_leaf_right(trans, root, path, 1, 1);
3493                                 if (wret < 0 && wret != -ENOSPC)
3494                                         ret = wret;
3495                         }
3496
3497                         if (btrfs_header_nritems(leaf) == 0) {
3498                                 path->slots[1] = slot;
3499                                 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3500                                 BUG_ON(ret);
3501                                 free_extent_buffer(leaf);
3502                         } else {
3503                                 /* if we're still in the path, make sure
3504                                  * we're dirty.  Otherwise, one of the
3505                                  * push_leaf functions must have already
3506                                  * dirtied this buffer
3507                                  */
3508                                 if (path->nodes[0] == leaf)
3509                                         btrfs_mark_buffer_dirty(leaf);
3510                                 free_extent_buffer(leaf);
3511                         }
3512                 } else {
3513                         btrfs_mark_buffer_dirty(leaf);
3514                 }
3515         }
3516         return ret;
3517 }
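
/*
 * A minimal caller sketch for btrfs_del_items: position the path on the
 * first item to remove and delete nr adjacent items from that leaf.  The
 * key and nr are assumed caller context; passing -1 for ins_len tells the
 * search to rebalance the tree for a deletion.
 *
 *      path = btrfs_alloc_path();
 *      if (!path)
 *              return -ENOMEM;
 *      ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *      if (ret == 0)
 *              ret = btrfs_del_items(trans, root, path,
 *                                    path->slots[0], nr);
 *      btrfs_free_path(path);
 */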
3518
3519 /*
3520  * search the tree again to find a leaf with lesser keys
3521  * returns 0 if it found something or 1 if there are no lesser leaves.
3522  * returns < 0 on io errors.
3523  *
3524  * This may release the path, and so you may lose any locks held at the
3525  * time you call it.
3526  */
3527 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3528 {
3529         struct btrfs_key key;
3530         struct btrfs_disk_key found_key;
3531         int ret;
3532
3533         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3534
3535         if (key.offset > 0)
3536                 key.offset--;
3537         else if (key.type > 0)
3538                 key.type--;
3539         else if (key.objectid > 0)
3540                 key.objectid--;
3541         else
3542                 return 1;
3543
3544         btrfs_release_path(root, path);
3545         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3546         if (ret < 0)
3547                 return ret;
3548         btrfs_item_key(path->nodes[0], &found_key, 0);
3549         ret = comp_keys(&found_key, &key);
3550         if (ret < 0)
3551                 return 0;
3552         return 1;
3553 }
3554
3555 /*
3556  * A helper function to walk down the tree starting at min_key, looking
3557  * for nodes or leaves that are either in cache or have a minimum
3558  * transaction id.  This is used by the btree defrag code and tree logging.
3559  *
3560  * This does not cow, but it does stuff the starting key it finds back
3561  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3562  * key and get a writable path.
3563  *
3564  * This does lock as it descends, and path->keep_locks should be set
3565  * to 1 by the caller.
3566  *
3567  * This honors path->lowest_level to prevent descent past a given level
3568  * of the tree.
3569  *
3570  * min_trans indicates the oldest transaction that you are interested
3571  * in walking through.  Any nodes or leaves older than min_trans are
3572  * skipped over (without reading them).
3573  *
3574  * returns zero if something useful was found, < 0 on error and 1 if there
3575  * was nothing in the tree that matched the search criteria.
3576  */
3577 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3578                          struct btrfs_key *max_key,
3579                          struct btrfs_path *path, int cache_only,
3580                          u64 min_trans)
3581 {
3582         struct extent_buffer *cur;
3583         struct btrfs_key found_key;
3584         int slot;
3585         int sret;
3586         u32 nritems;
3587         int level;
3588         int ret = 1;
3589
3590 again:
3591         cur = btrfs_lock_root_node(root);
3592         level = btrfs_header_level(cur);
3593         WARN_ON(path->nodes[level]);
3594         path->nodes[level] = cur;
3595         path->locks[level] = 1;
3596
3597         if (btrfs_header_generation(cur) < min_trans) {
3598                 ret = 1;
3599                 goto out;
3600         }
3601         while (1) {
3602                 nritems = btrfs_header_nritems(cur);
3603                 level = btrfs_header_level(cur);
3604                 sret = bin_search(cur, min_key, level, &slot);
3605
3606                 /* at the lowest level, we're done, setup the path and exit */
3607                 if (level == path->lowest_level) {
3608                         if (slot >= nritems)
3609                                 goto find_next_key;
3610                         ret = 0;
3611                         path->slots[level] = slot;
3612                         btrfs_item_key_to_cpu(cur, &found_key, slot);
3613                         goto out;
3614                 }
3615                 if (sret && slot > 0)
3616                         slot--;
3617                 /*
3618                  * check this node pointer against the cache_only and
3619                  * min_trans parameters.  If it isn't in cache or is too
3620                  * old, skip to the next one.
3621                  */
3622                 while (slot < nritems) {
3623                         u64 blockptr;
3624                         u64 gen;
3625                         struct extent_buffer *tmp;
3626                         struct btrfs_disk_key disk_key;
3627
3628                         blockptr = btrfs_node_blockptr(cur, slot);
3629                         gen = btrfs_node_ptr_generation(cur, slot);
3630                         if (gen < min_trans) {
3631                                 slot++;
3632                                 continue;
3633                         }
3634                         if (!cache_only)
3635                                 break;
3636
3637                         if (max_key) {
3638                                 btrfs_node_key(cur, &disk_key, slot);
3639                                 if (comp_keys(&disk_key, max_key) >= 0) {
3640                                         ret = 1;
3641                                         goto out;
3642                                 }
3643                         }
3644
3645                         tmp = btrfs_find_tree_block(root, blockptr,
3646                                             btrfs_level_size(root, level - 1));
3647
3648                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3649                                 free_extent_buffer(tmp);
3650                                 break;
3651                         }
3652                         if (tmp)
3653                                 free_extent_buffer(tmp);
3654                         slot++;
3655                 }
3656 find_next_key:
3657                 /*
3658                  * we didn't find a candidate key in this node, walk forward
3659                  * and find another one
3660                  */
3661                 if (slot >= nritems) {
3662                         path->slots[level] = slot;
3663                         sret = btrfs_find_next_key(root, path, min_key, level,
3664                                                   cache_only, min_trans);
3665                         if (sret == 0) {
3666                                 btrfs_release_path(root, path);
3667                                 goto again;
3668                         } else {
3669                                 goto out;
3670                         }
3671                 }
3672                 /* save our key for returning back */
3673                 btrfs_node_key_to_cpu(cur, &found_key, slot);
3674                 path->slots[level] = slot;
3675                 if (level == path->lowest_level) {
3676                         ret = 0;
3677                         unlock_up(path, level, 1);
3678                         goto out;
3679                 }
3680                 cur = read_node_slot(root, cur, slot);
3681
3682                 btrfs_tree_lock(cur);
3683                 path->locks[level - 1] = 1;
3684                 path->nodes[level - 1] = cur;
3685                 unlock_up(path, level, 1);
3686         }
3687 out:
3688         if (ret == 0)
3689                 memcpy(min_key, &found_key, sizeof(found_key));
3690         return ret;
3691 }
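
/*
 * A sketch of how tree logging and defrag drive btrfs_search_forward:
 * repeatedly pull out the next key newer than min_trans and bump the
 * search key past it.  process_item is a placeholder for the caller's
 * work, and only the offset is advanced here for brevity; real callers
 * also roll the type and objectid over when the offset wraps.
 *
 *      path->keep_locks = 1;
 *      while (1) {
 *              ret = btrfs_search_forward(root, &min_key, &max_key,
 *                                         path, 0, min_trans);
 *              if (ret != 0)
 *                      break;
 *              process_item(path, &min_key);
 *              btrfs_release_path(root, path);
 *              if (min_key.offset == (u64)-1)
 *                      break;
 *              min_key.offset++;
 *      }
 */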
3692
3693 /*
3694  * this is similar to btrfs_next_leaf, but does not try to preserve
3695  * and fixup the path.  It looks for and returns the next key in the
3696  * tree based on the current path and the cache_only and min_trans
3697  * parameters.
3698  *
3699  * 0 is returned if another key is found, < 0 if there are any errors
3700  * and 1 is returned if there are no higher keys in the tree
3701  *
3702  * path->keep_locks should be set to 1 on the search made before
3703  * calling this function.
3704  */
3705 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3706                         struct btrfs_key *key, int lowest_level,
3707                         int cache_only, u64 min_trans)
3708 {
3709         int level = lowest_level;
3710         int slot;
3711         struct extent_buffer *c;
3712
3713         while (level < BTRFS_MAX_LEVEL) {
3714                 if (!path->nodes[level])
3715                         return 1;
3716
3717                 slot = path->slots[level] + 1;
3718                 c = path->nodes[level];
3719 next:
3720                 if (slot >= btrfs_header_nritems(c)) {
3721                         level++;
3722                         if (level == BTRFS_MAX_LEVEL) {
3723                                 return 1;
3724                         }
3725                         continue;
3726                 }
3727                 if (level == 0)
3728                         btrfs_item_key_to_cpu(c, key, slot);
3729                 else {
3730                         u64 blockptr = btrfs_node_blockptr(c, slot);
3731                         u64 gen = btrfs_node_ptr_generation(c, slot);
3732
3733                         if (cache_only) {
3734                                 struct extent_buffer *cur;
3735                                 cur = btrfs_find_tree_block(root, blockptr,
3736                                             btrfs_level_size(root, level - 1));
3737                                 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3738                                         slot++;
3739                                         if (cur)
3740                                                 free_extent_buffer(cur);
3741                                         goto next;
3742                                 }
3743                                 free_extent_buffer(cur);
3744                         }
3745                         if (gen < min_trans) {
3746                                 slot++;
3747                                 goto next;
3748                         }
3749                         btrfs_node_key_to_cpu(c, key, slot);
3750                 }
3751                 return 0;
3752         }
3753         return 1;
3754 }
3755
3756 /*
3757  * search the tree again to find a leaf with greater keys
3758  * returns 0 if it found something or 1 if there are no greater leaves.
3759  * returns < 0 on io errors.
3760  */
3761 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3762 {
3763         int slot;
3764         int level = 1;
3765         struct extent_buffer *c;
3766         struct extent_buffer *next = NULL;
3767         struct btrfs_key key;
3768         u32 nritems;
3769         int ret;
3770
3771         nritems = btrfs_header_nritems(path->nodes[0]);
3772         if (nritems == 0) {
3773                 return 1;
3774         }
3775
3776         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3777
3778         btrfs_release_path(root, path);
3779         path->keep_locks = 1;
3780         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3781         path->keep_locks = 0;
3782
3783         if (ret < 0)
3784                 return ret;
3785
3786         nritems = btrfs_header_nritems(path->nodes[0]);
3787         /*
3788          * by releasing the path above we dropped all our locks.  A balance
3789          * could have added more items next to the key that used to be
3790          * at the very end of the block.  So, check again here and
3791          * advance the path if there are now more items available.
3792          */
3793         if (nritems > 0 && path->slots[0] < nritems - 1) {
3794                 path->slots[0]++;
3795                 goto done;
3796         }
3797
3798         while (level < BTRFS_MAX_LEVEL) {
3799                 if (!path->nodes[level])
3800                         return 1;
3801
3802                 slot = path->slots[level] + 1;
3803                 c = path->nodes[level];
3804                 if (slot >= btrfs_header_nritems(c)) {
3805                         level++;
3806                         if (level == BTRFS_MAX_LEVEL) {
3807                                 return 1;
3808                         }
3809                         continue;
3810                 }
3811
3812                 if (next) {
3813                         btrfs_tree_unlock(next);
3814                         free_extent_buffer(next);
3815                 }
3816
3817                 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3818                     path->reada)
3819                         reada_for_search(root, path, level, slot, 0);
3820
3821                 next = read_node_slot(root, c, slot);
3822                 if (!path->skip_locking) {
3823                         WARN_ON(!btrfs_tree_locked(c));
3824                         btrfs_tree_lock(next);
3825                 }
3826                 break;
3827         }
3828         path->slots[level] = slot;
3829         while (1) {
3830                 level--;
3831                 c = path->nodes[level];
3832                 if (path->locks[level])
3833                         btrfs_tree_unlock(c);
3834                 free_extent_buffer(c);
3835                 path->nodes[level] = next;
3836                 path->slots[level] = 0;
3837                 if (!path->skip_locking)
3838                         path->locks[level] = 1;
3839                 if (!level)
3840                         break;
3841                 if (level == 1 && path->locks[1] && path->reada)
3842                         reada_for_search(root, path, level, slot, 0);
3843                 next = read_node_slot(root, next, 0);
3844                 if (!path->skip_locking) {
3845                         WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3846                         btrfs_tree_lock(next);
3847                 }
3848         }
3849 done:
3850         unlock_up(path, 0, 1);
3851         return 0;
3852 }
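
/*
 * The usual pattern for walking every item in a tree combines
 * btrfs_search_slot with btrfs_next_leaf; the starting key and the
 * per-item work are placeholders in this sketch (declarations omitted).
 *
 *      key.objectid = 0;
 *      key.type = 0;
 *      key.offset = 0;
 *      ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *      if (ret < 0)
 *              goto out;
 *      while (1) {
 *              leaf = path->nodes[0];
 *              if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *                      ret = btrfs_next_leaf(root, path);
 *                      if (ret)
 *                              break;
 *                      leaf = path->nodes[0];
 *              }
 *              btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 *              ... look at the item, then ...
 *              path->slots[0]++;
 *      }
 */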
3853
3854 /*
3855  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3856  * searching until it gets past min_objectid or finds an item of 'type'
3857  *
3858  * returns 0 if something is found, 1 if nothing was found and < 0 on error
3859  */
3860 int btrfs_previous_item(struct btrfs_root *root,
3861                         struct btrfs_path *path, u64 min_objectid,
3862                         int type)
3863 {
3864         struct btrfs_key found_key;
3865         struct extent_buffer *leaf;
3866         u32 nritems;
3867         int ret;
3868
3869         while (1) {
3870                 if (path->slots[0] == 0) {
3871                         ret = btrfs_prev_leaf(root, path);
3872                         if (ret != 0)
3873                                 return ret;
3874                 } else {
3875                         path->slots[0]--;
3876                 }
3877                 leaf = path->nodes[0];
3878                 nritems = btrfs_header_nritems(leaf);
3879                 if (nritems == 0)
3880                         return 1;
3881                 if (path->slots[0] == nritems)
3882                         path->slots[0]--;
3883
3884                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3885                 if (found_key.type == type)
3886                         return 0;
3887                 if (found_key.objectid < min_objectid)
3888                         break;
3889                 if (found_key.objectid == min_objectid &&
3890                     found_key.type < type)
3891                         break;
3892         }
3893         return 1;
3894 }
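
/*
 * A short sketch of the backwards walk btrfs_previous_item provides:
 * position the path with a search, then step back until an item of the
 * wanted type or the min_objectid bound is reached.  The key and
 * wanted_type are placeholders.
 *
 *      ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *      if (ret < 0)
 *              goto out;
 *      ret = btrfs_previous_item(root, path, key.objectid, wanted_type);
 *      if (ret == 0)
 *              btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *                                    path->slots[0]);
 */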