Merge tag 'rpmsg-v6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc...
[linux-block.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include <linux/error-injection.h>
11 #include "messages.h"
12 #include "ctree.h"
13 #include "disk-io.h"
14 #include "transaction.h"
15 #include "print-tree.h"
16 #include "locking.h"
17 #include "volumes.h"
18 #include "qgroup.h"
19 #include "tree-mod-log.h"
20 #include "tree-checker.h"
21 #include "fs.h"
22 #include "accessors.h"
23 #include "extent-tree.h"
24 #include "relocation.h"
25 #include "file-item.h"
26
27 static struct kmem_cache *btrfs_path_cachep;
28
29 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
30                       *root, struct btrfs_path *path, int level);
31 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
32                       const struct btrfs_key *ins_key, struct btrfs_path *path,
33                       int data_size, int extend);
34 static int push_node_left(struct btrfs_trans_handle *trans,
35                           struct extent_buffer *dst,
36                           struct extent_buffer *src, int empty);
37 static int balance_node_right(struct btrfs_trans_handle *trans,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42
43 static const struct btrfs_csums {
44         u16             size;
45         const char      name[10];
46         const char      driver[12];
47 } btrfs_csums[] = {
48         [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
49         [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
50         [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
51         [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
52                                      .driver = "blake2b-256" },
53 };
54
55 /*
56  * The leaf data grows from end-to-front in the node.  this returns the address
57  * of the start of the last item, which is the stop of the leaf data stack.
58  */
59 static unsigned int leaf_data_end(const struct extent_buffer *leaf)
60 {
61         u32 nr = btrfs_header_nritems(leaf);
62
63         if (nr == 0)
64                 return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
65         return btrfs_item_offset(leaf, nr - 1);
66 }
67
68 /*
69  * Move data in a @leaf (using memmove, safe for overlapping ranges).
70  *
71  * @leaf:       leaf that we're doing a memmove on
72  * @dst_offset: item data offset we're moving to
73  * @src_offset: item data offset were' moving from
74  * @len:        length of the data we're moving
75  *
76  * Wrapper around memmove_extent_buffer() that takes into account the header on
77  * the leaf.  The btrfs_item offset's start directly after the header, so we
78  * have to adjust any offsets to account for the header in the leaf.  This
79  * handles that math to simplify the callers.
80  */
81 static inline void memmove_leaf_data(const struct extent_buffer *leaf,
82                                      unsigned long dst_offset,
83                                      unsigned long src_offset,
84                                      unsigned long len)
85 {
86         memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
87                               btrfs_item_nr_offset(leaf, 0) + src_offset, len);
88 }
89
90 /*
91  * Copy item data from @src into @dst at the given @offset.
92  *
93  * @dst:        destination leaf that we're copying into
94  * @src:        source leaf that we're copying from
95  * @dst_offset: item data offset we're copying to
96  * @src_offset: item data offset were' copying from
97  * @len:        length of the data we're copying
98  *
99  * Wrapper around copy_extent_buffer() that takes into account the header on
100  * the leaf.  The btrfs_item offset's start directly after the header, so we
101  * have to adjust any offsets to account for the header in the leaf.  This
102  * handles that math to simplify the callers.
103  */
104 static inline void copy_leaf_data(const struct extent_buffer *dst,
105                                   const struct extent_buffer *src,
106                                   unsigned long dst_offset,
107                                   unsigned long src_offset, unsigned long len)
108 {
109         copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
110                            btrfs_item_nr_offset(src, 0) + src_offset, len);
111 }
112
113 /*
114  * Move items in a @leaf (using memmove).
115  *
116  * @dst:        destination leaf for the items
117  * @dst_item:   the item nr we're copying into
118  * @src_item:   the item nr we're copying from
119  * @nr_items:   the number of items to copy
120  *
121  * Wrapper around memmove_extent_buffer() that does the math to get the
122  * appropriate offsets into the leaf from the item numbers.
123  */
124 static inline void memmove_leaf_items(const struct extent_buffer *leaf,
125                                       int dst_item, int src_item, int nr_items)
126 {
127         memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
128                               btrfs_item_nr_offset(leaf, src_item),
129                               nr_items * sizeof(struct btrfs_item));
130 }
131
132 /*
133  * Copy items from @src into @dst at the given @offset.
134  *
135  * @dst:        destination leaf for the items
136  * @src:        source leaf for the items
137  * @dst_item:   the item nr we're copying into
138  * @src_item:   the item nr we're copying from
139  * @nr_items:   the number of items to copy
140  *
141  * Wrapper around copy_extent_buffer() that does the math to get the
142  * appropriate offsets into the leaf from the item numbers.
143  */
144 static inline void copy_leaf_items(const struct extent_buffer *dst,
145                                    const struct extent_buffer *src,
146                                    int dst_item, int src_item, int nr_items)
147 {
148         copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
149                               btrfs_item_nr_offset(src, src_item),
150                               nr_items * sizeof(struct btrfs_item));
151 }
152
153 int btrfs_super_csum_size(const struct btrfs_super_block *s)
154 {
155         u16 t = btrfs_super_csum_type(s);
156         /*
157          * csum type is validated at mount time
158          */
159         return btrfs_csums[t].size;
160 }
161
162 const char *btrfs_super_csum_name(u16 csum_type)
163 {
164         /* csum type is validated at mount time */
165         return btrfs_csums[csum_type].name;
166 }
167
168 /*
169  * Return driver name if defined, otherwise the name that's also a valid driver
170  * name
171  */
172 const char *btrfs_super_csum_driver(u16 csum_type)
173 {
174         /* csum type is validated at mount time */
175         return btrfs_csums[csum_type].driver[0] ?
176                 btrfs_csums[csum_type].driver :
177                 btrfs_csums[csum_type].name;
178 }
179
180 size_t __attribute_const__ btrfs_get_num_csums(void)
181 {
182         return ARRAY_SIZE(btrfs_csums);
183 }
184
185 struct btrfs_path *btrfs_alloc_path(void)
186 {
187         might_sleep();
188
189         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
190 }
191
192 /* this also releases the path */
193 void btrfs_free_path(struct btrfs_path *p)
194 {
195         if (!p)
196                 return;
197         btrfs_release_path(p);
198         kmem_cache_free(btrfs_path_cachep, p);
199 }
200
201 /*
202  * path release drops references on the extent buffers in the path
203  * and it drops any locks held by this path
204  *
205  * It is safe to call this on paths that no locks or extent buffers held.
206  */
207 noinline void btrfs_release_path(struct btrfs_path *p)
208 {
209         int i;
210
211         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
212                 p->slots[i] = 0;
213                 if (!p->nodes[i])
214                         continue;
215                 if (p->locks[i]) {
216                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
217                         p->locks[i] = 0;
218                 }
219                 free_extent_buffer(p->nodes[i]);
220                 p->nodes[i] = NULL;
221         }
222 }
223
224 /*
225  * We want the transaction abort to print stack trace only for errors where the
226  * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
227  * caused by external factors.
228  */
229 bool __cold abort_should_print_stack(int errno)
230 {
231         switch (errno) {
232         case -EIO:
233         case -EROFS:
234         case -ENOMEM:
235                 return false;
236         }
237         return true;
238 }
239
240 /*
241  * safely gets a reference on the root node of a tree.  A lock
242  * is not taken, so a concurrent writer may put a different node
243  * at the root of the tree.  See btrfs_lock_root_node for the
244  * looping required.
245  *
246  * The extent buffer returned by this has a reference taken, so
247  * it won't disappear.  It may stop being the root of the tree
248  * at any time because there are no locks held.
249  */
250 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
251 {
252         struct extent_buffer *eb;
253
254         while (1) {
255                 rcu_read_lock();
256                 eb = rcu_dereference(root->node);
257
258                 /*
259                  * RCU really hurts here, we could free up the root node because
260                  * it was COWed but we may not get the new root node yet so do
261                  * the inc_not_zero dance and if it doesn't work then
262                  * synchronize_rcu and try again.
263                  */
264                 if (atomic_inc_not_zero(&eb->refs)) {
265                         rcu_read_unlock();
266                         break;
267                 }
268                 rcu_read_unlock();
269                 synchronize_rcu();
270         }
271         return eb;
272 }
273
274 /*
275  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
276  * just get put onto a simple dirty list.  Transaction walks this list to make
277  * sure they get properly updated on disk.
278  */
279 static void add_root_to_dirty_list(struct btrfs_root *root)
280 {
281         struct btrfs_fs_info *fs_info = root->fs_info;
282
283         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
284             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
285                 return;
286
287         spin_lock(&fs_info->trans_lock);
288         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
289                 /* Want the extent tree to be the last on the list */
290                 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
291                         list_move_tail(&root->dirty_list,
292                                        &fs_info->dirty_cowonly_roots);
293                 else
294                         list_move(&root->dirty_list,
295                                   &fs_info->dirty_cowonly_roots);
296         }
297         spin_unlock(&fs_info->trans_lock);
298 }
299
300 /*
301  * used by snapshot creation to make a copy of a root for a tree with
302  * a given objectid.  The buffer with the new root node is returned in
303  * cow_ret, and this func returns zero on success or a negative error code.
304  */
305 int btrfs_copy_root(struct btrfs_trans_handle *trans,
306                       struct btrfs_root *root,
307                       struct extent_buffer *buf,
308                       struct extent_buffer **cow_ret, u64 new_root_objectid)
309 {
310         struct btrfs_fs_info *fs_info = root->fs_info;
311         struct extent_buffer *cow;
312         int ret = 0;
313         int level;
314         struct btrfs_disk_key disk_key;
315
316         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
317                 trans->transid != fs_info->running_transaction->transid);
318         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
319                 trans->transid != root->last_trans);
320
321         level = btrfs_header_level(buf);
322         if (level == 0)
323                 btrfs_item_key(buf, &disk_key, 0);
324         else
325                 btrfs_node_key(buf, &disk_key, 0);
326
327         cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
328                                      &disk_key, level, buf->start, 0,
329                                      BTRFS_NESTING_NEW_ROOT);
330         if (IS_ERR(cow))
331                 return PTR_ERR(cow);
332
333         copy_extent_buffer_full(cow, buf);
334         btrfs_set_header_bytenr(cow, cow->start);
335         btrfs_set_header_generation(cow, trans->transid);
336         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
337         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
338                                      BTRFS_HEADER_FLAG_RELOC);
339         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
340                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
341         else
342                 btrfs_set_header_owner(cow, new_root_objectid);
343
344         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
345
346         WARN_ON(btrfs_header_generation(buf) > trans->transid);
347         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
348                 ret = btrfs_inc_ref(trans, root, cow, 1);
349         else
350                 ret = btrfs_inc_ref(trans, root, cow, 0);
351         if (ret) {
352                 btrfs_tree_unlock(cow);
353                 free_extent_buffer(cow);
354                 btrfs_abort_transaction(trans, ret);
355                 return ret;
356         }
357
358         btrfs_mark_buffer_dirty(cow);
359         *cow_ret = cow;
360         return 0;
361 }
362
363 /*
364  * check if the tree block can be shared by multiple trees
365  */
366 int btrfs_block_can_be_shared(struct btrfs_root *root,
367                               struct extent_buffer *buf)
368 {
369         /*
370          * Tree blocks not in shareable trees and tree roots are never shared.
371          * If a block was allocated after the last snapshot and the block was
372          * not allocated by tree relocation, we know the block is not shared.
373          */
374         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
375             buf != root->node && buf != root->commit_root &&
376             (btrfs_header_generation(buf) <=
377              btrfs_root_last_snapshot(&root->root_item) ||
378              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
379                 return 1;
380
381         return 0;
382 }
383
384 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
385                                        struct btrfs_root *root,
386                                        struct extent_buffer *buf,
387                                        struct extent_buffer *cow,
388                                        int *last_ref)
389 {
390         struct btrfs_fs_info *fs_info = root->fs_info;
391         u64 refs;
392         u64 owner;
393         u64 flags;
394         u64 new_flags = 0;
395         int ret;
396
397         /*
398          * Backrefs update rules:
399          *
400          * Always use full backrefs for extent pointers in tree block
401          * allocated by tree relocation.
402          *
403          * If a shared tree block is no longer referenced by its owner
404          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
405          * use full backrefs for extent pointers in tree block.
406          *
407          * If a tree block is been relocating
408          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
409          * use full backrefs for extent pointers in tree block.
410          * The reason for this is some operations (such as drop tree)
411          * are only allowed for blocks use full backrefs.
412          */
413
414         if (btrfs_block_can_be_shared(root, buf)) {
415                 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
416                                                btrfs_header_level(buf), 1,
417                                                &refs, &flags);
418                 if (ret)
419                         return ret;
420                 if (refs == 0) {
421                         ret = -EROFS;
422                         btrfs_handle_fs_error(fs_info, ret, NULL);
423                         return ret;
424                 }
425         } else {
426                 refs = 1;
427                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
428                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
429                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
430                 else
431                         flags = 0;
432         }
433
434         owner = btrfs_header_owner(buf);
435         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
436                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
437
438         if (refs > 1) {
439                 if ((owner == root->root_key.objectid ||
440                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
441                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
442                         ret = btrfs_inc_ref(trans, root, buf, 1);
443                         if (ret)
444                                 return ret;
445
446                         if (root->root_key.objectid ==
447                             BTRFS_TREE_RELOC_OBJECTID) {
448                                 ret = btrfs_dec_ref(trans, root, buf, 0);
449                                 if (ret)
450                                         return ret;
451                                 ret = btrfs_inc_ref(trans, root, cow, 1);
452                                 if (ret)
453                                         return ret;
454                         }
455                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
456                 } else {
457
458                         if (root->root_key.objectid ==
459                             BTRFS_TREE_RELOC_OBJECTID)
460                                 ret = btrfs_inc_ref(trans, root, cow, 1);
461                         else
462                                 ret = btrfs_inc_ref(trans, root, cow, 0);
463                         if (ret)
464                                 return ret;
465                 }
466                 if (new_flags != 0) {
467                         int level = btrfs_header_level(buf);
468
469                         ret = btrfs_set_disk_extent_flags(trans, buf,
470                                                           new_flags, level);
471                         if (ret)
472                                 return ret;
473                 }
474         } else {
475                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
476                         if (root->root_key.objectid ==
477                             BTRFS_TREE_RELOC_OBJECTID)
478                                 ret = btrfs_inc_ref(trans, root, cow, 1);
479                         else
480                                 ret = btrfs_inc_ref(trans, root, cow, 0);
481                         if (ret)
482                                 return ret;
483                         ret = btrfs_dec_ref(trans, root, buf, 1);
484                         if (ret)
485                                 return ret;
486                 }
487                 btrfs_clear_buffer_dirty(trans, buf);
488                 *last_ref = 1;
489         }
490         return 0;
491 }
492
493 /*
494  * does the dirty work in cow of a single block.  The parent block (if
495  * supplied) is updated to point to the new cow copy.  The new buffer is marked
496  * dirty and returned locked.  If you modify the block it needs to be marked
497  * dirty again.
498  *
499  * search_start -- an allocation hint for the new block
500  *
501  * empty_size -- a hint that you plan on doing more cow.  This is the size in
502  * bytes the allocator should try to find free next to the block it returns.
503  * This is just a hint and may be ignored by the allocator.
504  */
505 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
506                              struct btrfs_root *root,
507                              struct extent_buffer *buf,
508                              struct extent_buffer *parent, int parent_slot,
509                              struct extent_buffer **cow_ret,
510                              u64 search_start, u64 empty_size,
511                              enum btrfs_lock_nesting nest)
512 {
513         struct btrfs_fs_info *fs_info = root->fs_info;
514         struct btrfs_disk_key disk_key;
515         struct extent_buffer *cow;
516         int level, ret;
517         int last_ref = 0;
518         int unlock_orig = 0;
519         u64 parent_start = 0;
520
521         if (*cow_ret == buf)
522                 unlock_orig = 1;
523
524         btrfs_assert_tree_write_locked(buf);
525
526         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
527                 trans->transid != fs_info->running_transaction->transid);
528         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
529                 trans->transid != root->last_trans);
530
531         level = btrfs_header_level(buf);
532
533         if (level == 0)
534                 btrfs_item_key(buf, &disk_key, 0);
535         else
536                 btrfs_node_key(buf, &disk_key, 0);
537
538         if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
539                 parent_start = parent->start;
540
541         cow = btrfs_alloc_tree_block(trans, root, parent_start,
542                                      root->root_key.objectid, &disk_key, level,
543                                      search_start, empty_size, nest);
544         if (IS_ERR(cow))
545                 return PTR_ERR(cow);
546
547         /* cow is set to blocking by btrfs_init_new_buffer */
548
549         copy_extent_buffer_full(cow, buf);
550         btrfs_set_header_bytenr(cow, cow->start);
551         btrfs_set_header_generation(cow, trans->transid);
552         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
553         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
554                                      BTRFS_HEADER_FLAG_RELOC);
555         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
556                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
557         else
558                 btrfs_set_header_owner(cow, root->root_key.objectid);
559
560         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
561
562         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
563         if (ret) {
564                 btrfs_tree_unlock(cow);
565                 free_extent_buffer(cow);
566                 btrfs_abort_transaction(trans, ret);
567                 return ret;
568         }
569
570         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
571                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
572                 if (ret) {
573                         btrfs_tree_unlock(cow);
574                         free_extent_buffer(cow);
575                         btrfs_abort_transaction(trans, ret);
576                         return ret;
577                 }
578         }
579
580         if (buf == root->node) {
581                 WARN_ON(parent && parent != buf);
582                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
583                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
584                         parent_start = buf->start;
585
586                 atomic_inc(&cow->refs);
587                 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
588                 BUG_ON(ret < 0);
589                 rcu_assign_pointer(root->node, cow);
590
591                 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
592                                       parent_start, last_ref);
593                 free_extent_buffer(buf);
594                 add_root_to_dirty_list(root);
595         } else {
596                 WARN_ON(trans->transid != btrfs_header_generation(parent));
597                 btrfs_tree_mod_log_insert_key(parent, parent_slot,
598                                               BTRFS_MOD_LOG_KEY_REPLACE);
599                 btrfs_set_node_blockptr(parent, parent_slot,
600                                         cow->start);
601                 btrfs_set_node_ptr_generation(parent, parent_slot,
602                                               trans->transid);
603                 btrfs_mark_buffer_dirty(parent);
604                 if (last_ref) {
605                         ret = btrfs_tree_mod_log_free_eb(buf);
606                         if (ret) {
607                                 btrfs_tree_unlock(cow);
608                                 free_extent_buffer(cow);
609                                 btrfs_abort_transaction(trans, ret);
610                                 return ret;
611                         }
612                 }
613                 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
614                                       parent_start, last_ref);
615         }
616         if (unlock_orig)
617                 btrfs_tree_unlock(buf);
618         free_extent_buffer_stale(buf);
619         btrfs_mark_buffer_dirty(cow);
620         *cow_ret = cow;
621         return 0;
622 }
623
624 static inline int should_cow_block(struct btrfs_trans_handle *trans,
625                                    struct btrfs_root *root,
626                                    struct extent_buffer *buf)
627 {
628         if (btrfs_is_testing(root->fs_info))
629                 return 0;
630
631         /* Ensure we can see the FORCE_COW bit */
632         smp_mb__before_atomic();
633
634         /*
635          * We do not need to cow a block if
636          * 1) this block is not created or changed in this transaction;
637          * 2) this block does not belong to TREE_RELOC tree;
638          * 3) the root is not forced COW.
639          *
640          * What is forced COW:
641          *    when we create snapshot during committing the transaction,
642          *    after we've finished copying src root, we must COW the shared
643          *    block to ensure the metadata consistency.
644          */
645         if (btrfs_header_generation(buf) == trans->transid &&
646             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
647             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
648               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
649             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
650                 return 0;
651         return 1;
652 }
653
654 /*
655  * cows a single block, see __btrfs_cow_block for the real work.
656  * This version of it has extra checks so that a block isn't COWed more than
657  * once per transaction, as long as it hasn't been written yet
658  */
659 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
660                     struct btrfs_root *root, struct extent_buffer *buf,
661                     struct extent_buffer *parent, int parent_slot,
662                     struct extent_buffer **cow_ret,
663                     enum btrfs_lock_nesting nest)
664 {
665         struct btrfs_fs_info *fs_info = root->fs_info;
666         u64 search_start;
667         int ret;
668
669         if (test_bit(BTRFS_ROOT_DELETING, &root->state))
670                 btrfs_err(fs_info,
671                         "COW'ing blocks on a fs root that's being dropped");
672
673         if (trans->transaction != fs_info->running_transaction)
674                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
675                        trans->transid,
676                        fs_info->running_transaction->transid);
677
678         if (trans->transid != fs_info->generation)
679                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
680                        trans->transid, fs_info->generation);
681
682         if (!should_cow_block(trans, root, buf)) {
683                 *cow_ret = buf;
684                 return 0;
685         }
686
687         search_start = buf->start & ~((u64)SZ_1G - 1);
688
689         /*
690          * Before CoWing this block for later modification, check if it's
691          * the subtree root and do the delayed subtree trace if needed.
692          *
693          * Also We don't care about the error, as it's handled internally.
694          */
695         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
696         ret = __btrfs_cow_block(trans, root, buf, parent,
697                                  parent_slot, cow_ret, search_start, 0, nest);
698
699         trace_btrfs_cow_block(root, buf, *cow_ret);
700
701         return ret;
702 }
703 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
704
705 /*
706  * helper function for defrag to decide if two blocks pointed to by a
707  * node are actually close by
708  */
709 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
710 {
711         if (blocknr < other && other - (blocknr + blocksize) < 32768)
712                 return 1;
713         if (blocknr > other && blocknr - (other + blocksize) < 32768)
714                 return 1;
715         return 0;
716 }
717
718 #ifdef __LITTLE_ENDIAN
719
720 /*
721  * Compare two keys, on little-endian the disk order is same as CPU order and
722  * we can avoid the conversion.
723  */
724 static int comp_keys(const struct btrfs_disk_key *disk_key,
725                      const struct btrfs_key *k2)
726 {
727         const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
728
729         return btrfs_comp_cpu_keys(k1, k2);
730 }
731
732 #else
733
734 /*
735  * compare two keys in a memcmp fashion
736  */
737 static int comp_keys(const struct btrfs_disk_key *disk,
738                      const struct btrfs_key *k2)
739 {
740         struct btrfs_key k1;
741
742         btrfs_disk_key_to_cpu(&k1, disk);
743
744         return btrfs_comp_cpu_keys(&k1, k2);
745 }
746 #endif
747
748 /*
749  * same as comp_keys only with two btrfs_key's
750  */
751 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
752 {
753         if (k1->objectid > k2->objectid)
754                 return 1;
755         if (k1->objectid < k2->objectid)
756                 return -1;
757         if (k1->type > k2->type)
758                 return 1;
759         if (k1->type < k2->type)
760                 return -1;
761         if (k1->offset > k2->offset)
762                 return 1;
763         if (k1->offset < k2->offset)
764                 return -1;
765         return 0;
766 }
767
768 /*
769  * this is used by the defrag code to go through all the
770  * leaves pointed to by a node and reallocate them so that
771  * disk order is close to key order
772  */
773 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
774                        struct btrfs_root *root, struct extent_buffer *parent,
775                        int start_slot, u64 *last_ret,
776                        struct btrfs_key *progress)
777 {
778         struct btrfs_fs_info *fs_info = root->fs_info;
779         struct extent_buffer *cur;
780         u64 blocknr;
781         u64 search_start = *last_ret;
782         u64 last_block = 0;
783         u64 other;
784         u32 parent_nritems;
785         int end_slot;
786         int i;
787         int err = 0;
788         u32 blocksize;
789         int progress_passed = 0;
790         struct btrfs_disk_key disk_key;
791
792         WARN_ON(trans->transaction != fs_info->running_transaction);
793         WARN_ON(trans->transid != fs_info->generation);
794
795         parent_nritems = btrfs_header_nritems(parent);
796         blocksize = fs_info->nodesize;
797         end_slot = parent_nritems - 1;
798
799         if (parent_nritems <= 1)
800                 return 0;
801
802         for (i = start_slot; i <= end_slot; i++) {
803                 int close = 1;
804
805                 btrfs_node_key(parent, &disk_key, i);
806                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
807                         continue;
808
809                 progress_passed = 1;
810                 blocknr = btrfs_node_blockptr(parent, i);
811                 if (last_block == 0)
812                         last_block = blocknr;
813
814                 if (i > 0) {
815                         other = btrfs_node_blockptr(parent, i - 1);
816                         close = close_blocks(blocknr, other, blocksize);
817                 }
818                 if (!close && i < end_slot) {
819                         other = btrfs_node_blockptr(parent, i + 1);
820                         close = close_blocks(blocknr, other, blocksize);
821                 }
822                 if (close) {
823                         last_block = blocknr;
824                         continue;
825                 }
826
827                 cur = btrfs_read_node_slot(parent, i);
828                 if (IS_ERR(cur))
829                         return PTR_ERR(cur);
830                 if (search_start == 0)
831                         search_start = last_block;
832
833                 btrfs_tree_lock(cur);
834                 err = __btrfs_cow_block(trans, root, cur, parent, i,
835                                         &cur, search_start,
836                                         min(16 * blocksize,
837                                             (end_slot - i) * blocksize),
838                                         BTRFS_NESTING_COW);
839                 if (err) {
840                         btrfs_tree_unlock(cur);
841                         free_extent_buffer(cur);
842                         break;
843                 }
844                 search_start = cur->start;
845                 last_block = cur->start;
846                 *last_ret = search_start;
847                 btrfs_tree_unlock(cur);
848                 free_extent_buffer(cur);
849         }
850         return err;
851 }
852
853 /*
854  * Search for a key in the given extent_buffer.
855  *
856  * The lower boundary for the search is specified by the slot number @first_slot.
857  * Use a value of 0 to search over the whole extent buffer.
858  *
859  * The slot in the extent buffer is returned via @slot. If the key exists in the
860  * extent buffer, then @slot will point to the slot where the key is, otherwise
861  * it points to the slot where you would insert the key.
862  *
863  * Slot may point to the total number of items (i.e. one position beyond the last
864  * key) if the key is bigger than the last key in the extent buffer.
865  */
866 int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
867                              const struct btrfs_key *key, int *slot)
868 {
869         unsigned long p;
870         int item_size;
871         /*
872          * Use unsigned types for the low and high slots, so that we get a more
873          * efficient division in the search loop below.
874          */
875         u32 low = first_slot;
876         u32 high = btrfs_header_nritems(eb);
877         int ret;
878         const int key_size = sizeof(struct btrfs_disk_key);
879
880         if (unlikely(low > high)) {
881                 btrfs_err(eb->fs_info,
882                  "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
883                           __func__, low, high, eb->start,
884                           btrfs_header_owner(eb), btrfs_header_level(eb));
885                 return -EINVAL;
886         }
887
888         if (btrfs_header_level(eb) == 0) {
889                 p = offsetof(struct btrfs_leaf, items);
890                 item_size = sizeof(struct btrfs_item);
891         } else {
892                 p = offsetof(struct btrfs_node, ptrs);
893                 item_size = sizeof(struct btrfs_key_ptr);
894         }
895
896         while (low < high) {
897                 unsigned long oip;
898                 unsigned long offset;
899                 struct btrfs_disk_key *tmp;
900                 struct btrfs_disk_key unaligned;
901                 int mid;
902
903                 mid = (low + high) / 2;
904                 offset = p + mid * item_size;
905                 oip = offset_in_page(offset);
906
907                 if (oip + key_size <= PAGE_SIZE) {
908                         const unsigned long idx = get_eb_page_index(offset);
909                         char *kaddr = page_address(eb->pages[idx]);
910
911                         oip = get_eb_offset_in_page(eb, offset);
912                         tmp = (struct btrfs_disk_key *)(kaddr + oip);
913                 } else {
914                         read_extent_buffer(eb, &unaligned, offset, key_size);
915                         tmp = &unaligned;
916                 }
917
918                 ret = comp_keys(tmp, key);
919
920                 if (ret < 0)
921                         low = mid + 1;
922                 else if (ret > 0)
923                         high = mid;
924                 else {
925                         *slot = mid;
926                         return 0;
927                 }
928         }
929         *slot = low;
930         return 1;
931 }
932
933 static void root_add_used(struct btrfs_root *root, u32 size)
934 {
935         spin_lock(&root->accounting_lock);
936         btrfs_set_root_used(&root->root_item,
937                             btrfs_root_used(&root->root_item) + size);
938         spin_unlock(&root->accounting_lock);
939 }
940
941 static void root_sub_used(struct btrfs_root *root, u32 size)
942 {
943         spin_lock(&root->accounting_lock);
944         btrfs_set_root_used(&root->root_item,
945                             btrfs_root_used(&root->root_item) - size);
946         spin_unlock(&root->accounting_lock);
947 }
948
949 /* given a node and slot number, this reads the blocks it points to.  The
950  * extent buffer is returned with a reference taken (but unlocked).
951  */
952 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
953                                            int slot)
954 {
955         int level = btrfs_header_level(parent);
956         struct btrfs_tree_parent_check check = { 0 };
957         struct extent_buffer *eb;
958
959         if (slot < 0 || slot >= btrfs_header_nritems(parent))
960                 return ERR_PTR(-ENOENT);
961
962         BUG_ON(level == 0);
963
964         check.level = level - 1;
965         check.transid = btrfs_node_ptr_generation(parent, slot);
966         check.owner_root = btrfs_header_owner(parent);
967         check.has_first_key = true;
968         btrfs_node_key_to_cpu(parent, &check.first_key, slot);
969
970         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
971                              &check);
972         if (IS_ERR(eb))
973                 return eb;
974         if (!extent_buffer_uptodate(eb)) {
975                 free_extent_buffer(eb);
976                 return ERR_PTR(-EIO);
977         }
978
979         return eb;
980 }
981
982 /*
983  * node level balancing, used to make sure nodes are in proper order for
984  * item deletion.  We balance from the top down, so we have to make sure
985  * that a deletion won't leave an node completely empty later on.
986  */
987 static noinline int balance_level(struct btrfs_trans_handle *trans,
988                          struct btrfs_root *root,
989                          struct btrfs_path *path, int level)
990 {
991         struct btrfs_fs_info *fs_info = root->fs_info;
992         struct extent_buffer *right = NULL;
993         struct extent_buffer *mid;
994         struct extent_buffer *left = NULL;
995         struct extent_buffer *parent = NULL;
996         int ret = 0;
997         int wret;
998         int pslot;
999         int orig_slot = path->slots[level];
1000         u64 orig_ptr;
1001
1002         ASSERT(level > 0);
1003
1004         mid = path->nodes[level];
1005
1006         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
1007         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1008
1009         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1010
1011         if (level < BTRFS_MAX_LEVEL - 1) {
1012                 parent = path->nodes[level + 1];
1013                 pslot = path->slots[level + 1];
1014         }
1015
1016         /*
1017          * deal with the case where there is only one pointer in the root
1018          * by promoting the node below to a root
1019          */
1020         if (!parent) {
1021                 struct extent_buffer *child;
1022
1023                 if (btrfs_header_nritems(mid) != 1)
1024                         return 0;
1025
1026                 /* promote the child to a root */
1027                 child = btrfs_read_node_slot(mid, 0);
1028                 if (IS_ERR(child)) {
1029                         ret = PTR_ERR(child);
1030                         btrfs_handle_fs_error(fs_info, ret, NULL);
1031                         goto enospc;
1032                 }
1033
1034                 btrfs_tree_lock(child);
1035                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1036                                       BTRFS_NESTING_COW);
1037                 if (ret) {
1038                         btrfs_tree_unlock(child);
1039                         free_extent_buffer(child);
1040                         goto enospc;
1041                 }
1042
1043                 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
1044                 BUG_ON(ret < 0);
1045                 rcu_assign_pointer(root->node, child);
1046
1047                 add_root_to_dirty_list(root);
1048                 btrfs_tree_unlock(child);
1049
1050                 path->locks[level] = 0;
1051                 path->nodes[level] = NULL;
1052                 btrfs_clear_buffer_dirty(trans, mid);
1053                 btrfs_tree_unlock(mid);
1054                 /* once for the path */
1055                 free_extent_buffer(mid);
1056
1057                 root_sub_used(root, mid->len);
1058                 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1059                 /* once for the root ptr */
1060                 free_extent_buffer_stale(mid);
1061                 return 0;
1062         }
1063         if (btrfs_header_nritems(mid) >
1064             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1065                 return 0;
1066
1067         left = btrfs_read_node_slot(parent, pslot - 1);
1068         if (IS_ERR(left))
1069                 left = NULL;
1070
1071         if (left) {
1072                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1073                 wret = btrfs_cow_block(trans, root, left,
1074                                        parent, pslot - 1, &left,
1075                                        BTRFS_NESTING_LEFT_COW);
1076                 if (wret) {
1077                         ret = wret;
1078                         goto enospc;
1079                 }
1080         }
1081
1082         right = btrfs_read_node_slot(parent, pslot + 1);
1083         if (IS_ERR(right))
1084                 right = NULL;
1085
1086         if (right) {
1087                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1088                 wret = btrfs_cow_block(trans, root, right,
1089                                        parent, pslot + 1, &right,
1090                                        BTRFS_NESTING_RIGHT_COW);
1091                 if (wret) {
1092                         ret = wret;
1093                         goto enospc;
1094                 }
1095         }
1096
1097         /* first, try to make some room in the middle buffer */
1098         if (left) {
1099                 orig_slot += btrfs_header_nritems(left);
1100                 wret = push_node_left(trans, left, mid, 1);
1101                 if (wret < 0)
1102                         ret = wret;
1103         }
1104
1105         /*
1106          * then try to empty the right most buffer into the middle
1107          */
1108         if (right) {
1109                 wret = push_node_left(trans, mid, right, 1);
1110                 if (wret < 0 && wret != -ENOSPC)
1111                         ret = wret;
1112                 if (btrfs_header_nritems(right) == 0) {
1113                         btrfs_clear_buffer_dirty(trans, right);
1114                         btrfs_tree_unlock(right);
1115                         del_ptr(root, path, level + 1, pslot + 1);
1116                         root_sub_used(root, right->len);
1117                         btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1118                                               0, 1);
1119                         free_extent_buffer_stale(right);
1120                         right = NULL;
1121                 } else {
1122                         struct btrfs_disk_key right_key;
1123                         btrfs_node_key(right, &right_key, 0);
1124                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1125                                         BTRFS_MOD_LOG_KEY_REPLACE);
1126                         BUG_ON(ret < 0);
1127                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1128                         btrfs_mark_buffer_dirty(parent);
1129                 }
1130         }
1131         if (btrfs_header_nritems(mid) == 1) {
1132                 /*
1133                  * we're not allowed to leave a node with one item in the
1134                  * tree during a delete.  A deletion from lower in the tree
1135                  * could try to delete the only pointer in this node.
1136                  * So, pull some keys from the left.
1137                  * There has to be a left pointer at this point because
1138                  * otherwise we would have pulled some pointers from the
1139                  * right
1140                  */
1141                 if (!left) {
1142                         ret = -EROFS;
1143                         btrfs_handle_fs_error(fs_info, ret, NULL);
1144                         goto enospc;
1145                 }
1146                 wret = balance_node_right(trans, mid, left);
1147                 if (wret < 0) {
1148                         ret = wret;
1149                         goto enospc;
1150                 }
1151                 if (wret == 1) {
1152                         wret = push_node_left(trans, left, mid, 1);
1153                         if (wret < 0)
1154                                 ret = wret;
1155                 }
1156                 BUG_ON(wret == 1);
1157         }
1158         if (btrfs_header_nritems(mid) == 0) {
1159                 btrfs_clear_buffer_dirty(trans, mid);
1160                 btrfs_tree_unlock(mid);
1161                 del_ptr(root, path, level + 1, pslot);
1162                 root_sub_used(root, mid->len);
1163                 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1164                 free_extent_buffer_stale(mid);
1165                 mid = NULL;
1166         } else {
1167                 /* update the parent key to reflect our changes */
1168                 struct btrfs_disk_key mid_key;
1169                 btrfs_node_key(mid, &mid_key, 0);
1170                 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1171                                                     BTRFS_MOD_LOG_KEY_REPLACE);
1172                 BUG_ON(ret < 0);
1173                 btrfs_set_node_key(parent, &mid_key, pslot);
1174                 btrfs_mark_buffer_dirty(parent);
1175         }
1176
1177         /* update the path */
1178         if (left) {
1179                 if (btrfs_header_nritems(left) > orig_slot) {
1180                         atomic_inc(&left->refs);
1181                         /* left was locked after cow */
1182                         path->nodes[level] = left;
1183                         path->slots[level + 1] -= 1;
1184                         path->slots[level] = orig_slot;
1185                         if (mid) {
1186                                 btrfs_tree_unlock(mid);
1187                                 free_extent_buffer(mid);
1188                         }
1189                 } else {
1190                         orig_slot -= btrfs_header_nritems(left);
1191                         path->slots[level] = orig_slot;
1192                 }
1193         }
1194         /* double check we haven't messed things up */
1195         if (orig_ptr !=
1196             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1197                 BUG();
1198 enospc:
1199         if (right) {
1200                 btrfs_tree_unlock(right);
1201                 free_extent_buffer(right);
1202         }
1203         if (left) {
1204                 if (path->nodes[level] != left)
1205                         btrfs_tree_unlock(left);
1206                 free_extent_buffer(left);
1207         }
1208         return ret;
1209 }
1210
1211 /* Node balancing for insertion.  Here we only split or push nodes around
1212  * when they are completely full.  This is also done top down, so we
1213  * have to be pessimistic.
1214  */
1215 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1216                                           struct btrfs_root *root,
1217                                           struct btrfs_path *path, int level)
1218 {
1219         struct btrfs_fs_info *fs_info = root->fs_info;
1220         struct extent_buffer *right = NULL;
1221         struct extent_buffer *mid;
1222         struct extent_buffer *left = NULL;
1223         struct extent_buffer *parent = NULL;
1224         int ret = 0;
1225         int wret;
1226         int pslot;
1227         int orig_slot = path->slots[level];
1228
1229         if (level == 0)
1230                 return 1;
1231
1232         mid = path->nodes[level];
1233         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1234
1235         if (level < BTRFS_MAX_LEVEL - 1) {
1236                 parent = path->nodes[level + 1];
1237                 pslot = path->slots[level + 1];
1238         }
1239
1240         if (!parent)
1241                 return 1;
1242
1243         left = btrfs_read_node_slot(parent, pslot - 1);
1244         if (IS_ERR(left))
1245                 left = NULL;
1246
1247         /* first, try to make some room in the middle buffer */
1248         if (left) {
1249                 u32 left_nr;
1250
1251                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1252
1253                 left_nr = btrfs_header_nritems(left);
1254                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1255                         wret = 1;
1256                 } else {
1257                         ret = btrfs_cow_block(trans, root, left, parent,
1258                                               pslot - 1, &left,
1259                                               BTRFS_NESTING_LEFT_COW);
1260                         if (ret)
1261                                 wret = 1;
1262                         else {
1263                                 wret = push_node_left(trans, left, mid, 0);
1264                         }
1265                 }
1266                 if (wret < 0)
1267                         ret = wret;
1268                 if (wret == 0) {
1269                         struct btrfs_disk_key disk_key;
1270                         orig_slot += left_nr;
1271                         btrfs_node_key(mid, &disk_key, 0);
1272                         ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1273                                         BTRFS_MOD_LOG_KEY_REPLACE);
1274                         BUG_ON(ret < 0);
1275                         btrfs_set_node_key(parent, &disk_key, pslot);
1276                         btrfs_mark_buffer_dirty(parent);
1277                         if (btrfs_header_nritems(left) > orig_slot) {
1278                                 path->nodes[level] = left;
1279                                 path->slots[level + 1] -= 1;
1280                                 path->slots[level] = orig_slot;
1281                                 btrfs_tree_unlock(mid);
1282                                 free_extent_buffer(mid);
1283                         } else {
1284                                 orig_slot -=
1285                                         btrfs_header_nritems(left);
1286                                 path->slots[level] = orig_slot;
1287                                 btrfs_tree_unlock(left);
1288                                 free_extent_buffer(left);
1289                         }
1290                         return 0;
1291                 }
1292                 btrfs_tree_unlock(left);
1293                 free_extent_buffer(left);
1294         }
1295         right = btrfs_read_node_slot(parent, pslot + 1);
1296         if (IS_ERR(right))
1297                 right = NULL;
1298
1299         /*
1300          * then try to empty the right most buffer into the middle
1301          */
1302         if (right) {
1303                 u32 right_nr;
1304
1305                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1306
1307                 right_nr = btrfs_header_nritems(right);
1308                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1309                         wret = 1;
1310                 } else {
1311                         ret = btrfs_cow_block(trans, root, right,
1312                                               parent, pslot + 1,
1313                                               &right, BTRFS_NESTING_RIGHT_COW);
1314                         if (ret)
1315                                 wret = 1;
1316                         else {
1317                                 wret = balance_node_right(trans, right, mid);
1318                         }
1319                 }
1320                 if (wret < 0)
1321                         ret = wret;
1322                 if (wret == 0) {
1323                         struct btrfs_disk_key disk_key;
1324
1325                         btrfs_node_key(right, &disk_key, 0);
1326                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1327                                         BTRFS_MOD_LOG_KEY_REPLACE);
1328                         BUG_ON(ret < 0);
1329                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1330                         btrfs_mark_buffer_dirty(parent);
1331
1332                         if (btrfs_header_nritems(mid) <= orig_slot) {
1333                                 path->nodes[level] = right;
1334                                 path->slots[level + 1] += 1;
1335                                 path->slots[level] = orig_slot -
1336                                         btrfs_header_nritems(mid);
1337                                 btrfs_tree_unlock(mid);
1338                                 free_extent_buffer(mid);
1339                         } else {
1340                                 btrfs_tree_unlock(right);
1341                                 free_extent_buffer(right);
1342                         }
1343                         return 0;
1344                 }
1345                 btrfs_tree_unlock(right);
1346                 free_extent_buffer(right);
1347         }
1348         return 1;
1349 }
1350
1351 /*
1352  * readahead one full node of leaves, finding things that are close
1353  * to the block in 'slot', and triggering ra on them.
1354  */
1355 static void reada_for_search(struct btrfs_fs_info *fs_info,
1356                              struct btrfs_path *path,
1357                              int level, int slot, u64 objectid)
1358 {
1359         struct extent_buffer *node;
1360         struct btrfs_disk_key disk_key;
1361         u32 nritems;
1362         u64 search;
1363         u64 target;
1364         u64 nread = 0;
1365         u64 nread_max;
1366         u32 nr;
1367         u32 blocksize;
1368         u32 nscan = 0;
1369
1370         if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1371                 return;
1372
1373         if (!path->nodes[level])
1374                 return;
1375
1376         node = path->nodes[level];
1377
1378         /*
1379          * Since the time between visiting leaves is much shorter than the time
1380          * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1381          * much IO at once (possibly random).
1382          */
1383         if (path->reada == READA_FORWARD_ALWAYS) {
1384                 if (level > 1)
1385                         nread_max = node->fs_info->nodesize;
1386                 else
1387                         nread_max = SZ_128K;
1388         } else {
1389                 nread_max = SZ_64K;
1390         }
1391
1392         search = btrfs_node_blockptr(node, slot);
1393         blocksize = fs_info->nodesize;
1394         if (path->reada != READA_FORWARD_ALWAYS) {
1395                 struct extent_buffer *eb;
1396
1397                 eb = find_extent_buffer(fs_info, search);
1398                 if (eb) {
1399                         free_extent_buffer(eb);
1400                         return;
1401                 }
1402         }
1403
1404         target = search;
1405
1406         nritems = btrfs_header_nritems(node);
1407         nr = slot;
1408
1409         while (1) {
1410                 if (path->reada == READA_BACK) {
1411                         if (nr == 0)
1412                                 break;
1413                         nr--;
1414                 } else if (path->reada == READA_FORWARD ||
1415                            path->reada == READA_FORWARD_ALWAYS) {
1416                         nr++;
1417                         if (nr >= nritems)
1418                                 break;
1419                 }
1420                 if (path->reada == READA_BACK && objectid) {
1421                         btrfs_node_key(node, &disk_key, nr);
1422                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1423                                 break;
1424                 }
1425                 search = btrfs_node_blockptr(node, nr);
1426                 if (path->reada == READA_FORWARD_ALWAYS ||
1427                     (search <= target && target - search <= 65536) ||
1428                     (search > target && search - target <= 65536)) {
1429                         btrfs_readahead_node_child(node, nr);
1430                         nread += blocksize;
1431                 }
1432                 nscan++;
1433                 if (nread > nread_max || nscan > 32)
1434                         break;
1435         }
1436 }
1437
1438 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1439 {
1440         struct extent_buffer *parent;
1441         int slot;
1442         int nritems;
1443
1444         parent = path->nodes[level + 1];
1445         if (!parent)
1446                 return;
1447
1448         nritems = btrfs_header_nritems(parent);
1449         slot = path->slots[level + 1];
1450
1451         if (slot > 0)
1452                 btrfs_readahead_node_child(parent, slot - 1);
1453         if (slot + 1 < nritems)
1454                 btrfs_readahead_node_child(parent, slot + 1);
1455 }
1456
1457
1458 /*
1459  * when we walk down the tree, it is usually safe to unlock the higher layers
1460  * in the tree.  The exceptions are when our path goes through slot 0, because
1461  * operations on the tree might require changing key pointers higher up in the
1462  * tree.
1463  *
1464  * callers might also have set path->keep_locks, which tells this code to keep
1465  * the lock if the path points to the last slot in the block.  This is part of
1466  * walking through the tree, and selecting the next slot in the higher block.
1467  *
1468  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1469  * if lowest_unlock is 1, level 0 won't be unlocked
1470  */
1471 static noinline void unlock_up(struct btrfs_path *path, int level,
1472                                int lowest_unlock, int min_write_lock_level,
1473                                int *write_lock_level)
1474 {
1475         int i;
1476         int skip_level = level;
1477         bool check_skip = true;
1478
1479         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1480                 if (!path->nodes[i])
1481                         break;
1482                 if (!path->locks[i])
1483                         break;
1484
1485                 if (check_skip) {
1486                         if (path->slots[i] == 0) {
1487                                 skip_level = i + 1;
1488                                 continue;
1489                         }
1490
1491                         if (path->keep_locks) {
1492                                 u32 nritems;
1493
1494                                 nritems = btrfs_header_nritems(path->nodes[i]);
1495                                 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1496                                         skip_level = i + 1;
1497                                         continue;
1498                                 }
1499                         }
1500                 }
1501
1502                 if (i >= lowest_unlock && i > skip_level) {
1503                         check_skip = false;
1504                         btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1505                         path->locks[i] = 0;
1506                         if (write_lock_level &&
1507                             i > min_write_lock_level &&
1508                             i <= *write_lock_level) {
1509                                 *write_lock_level = i - 1;
1510                         }
1511                 }
1512         }
1513 }
1514
1515 /*
1516  * Helper function for btrfs_search_slot() and other functions that do a search
1517  * on a btree. The goal is to find a tree block in the cache (the radix tree at
1518  * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1519  * its pages from disk.
1520  *
1521  * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1522  * whole btree search, starting again from the current root node.
1523  */
1524 static int
1525 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1526                       struct extent_buffer **eb_ret, int level, int slot,
1527                       const struct btrfs_key *key)
1528 {
1529         struct btrfs_fs_info *fs_info = root->fs_info;
1530         struct btrfs_tree_parent_check check = { 0 };
1531         u64 blocknr;
1532         u64 gen;
1533         struct extent_buffer *tmp;
1534         int ret;
1535         int parent_level;
1536         bool unlock_up;
1537
1538         unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
1539         blocknr = btrfs_node_blockptr(*eb_ret, slot);
1540         gen = btrfs_node_ptr_generation(*eb_ret, slot);
1541         parent_level = btrfs_header_level(*eb_ret);
1542         btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
1543         check.has_first_key = true;
1544         check.level = parent_level - 1;
1545         check.transid = gen;
1546         check.owner_root = root->root_key.objectid;
1547
1548         /*
1549          * If we need to read an extent buffer from disk and we are holding locks
1550          * on upper level nodes, we unlock all the upper nodes before reading the
1551          * extent buffer, and then return -EAGAIN to the caller as it needs to
1552          * restart the search. We don't release the lock on the current level
1553          * because we need to walk this node to figure out which blocks to read.
1554          */
1555         tmp = find_extent_buffer(fs_info, blocknr);
1556         if (tmp) {
1557                 if (p->reada == READA_FORWARD_ALWAYS)
1558                         reada_for_search(fs_info, p, level, slot, key->objectid);
1559
1560                 /* first we do an atomic uptodate check */
1561                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1562                         /*
1563                          * Do extra check for first_key, eb can be stale due to
1564                          * being cached, read from scrub, or have multiple
1565                          * parents (shared tree blocks).
1566                          */
1567                         if (btrfs_verify_level_key(tmp,
1568                                         parent_level - 1, &check.first_key, gen)) {
1569                                 free_extent_buffer(tmp);
1570                                 return -EUCLEAN;
1571                         }
1572                         *eb_ret = tmp;
1573                         return 0;
1574                 }
1575
1576                 if (p->nowait) {
1577                         free_extent_buffer(tmp);
1578                         return -EAGAIN;
1579                 }
1580
1581                 if (unlock_up)
1582                         btrfs_unlock_up_safe(p, level + 1);
1583
1584                 /* now we're allowed to do a blocking uptodate check */
1585                 ret = btrfs_read_extent_buffer(tmp, &check);
1586                 if (ret) {
1587                         free_extent_buffer(tmp);
1588                         btrfs_release_path(p);
1589                         return -EIO;
1590                 }
1591                 if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1592                         free_extent_buffer(tmp);
1593                         btrfs_release_path(p);
1594                         return -EUCLEAN;
1595                 }
1596
1597                 if (unlock_up)
1598                         ret = -EAGAIN;
1599
1600                 goto out;
1601         } else if (p->nowait) {
1602                 return -EAGAIN;
1603         }
1604
1605         if (unlock_up) {
1606                 btrfs_unlock_up_safe(p, level + 1);
1607                 ret = -EAGAIN;
1608         } else {
1609                 ret = 0;
1610         }
1611
1612         if (p->reada != READA_NONE)
1613                 reada_for_search(fs_info, p, level, slot, key->objectid);
1614
1615         tmp = read_tree_block(fs_info, blocknr, &check);
1616         if (IS_ERR(tmp)) {
1617                 btrfs_release_path(p);
1618                 return PTR_ERR(tmp);
1619         }
1620         /*
1621          * If the read above didn't mark this buffer up to date,
1622          * it will never end up being up to date.  Set ret to EIO now
1623          * and give up so that our caller doesn't loop forever
1624          * on our EAGAINs.
1625          */
1626         if (!extent_buffer_uptodate(tmp))
1627                 ret = -EIO;
1628
1629 out:
1630         if (ret == 0) {
1631                 *eb_ret = tmp;
1632         } else {
1633                 free_extent_buffer(tmp);
1634                 btrfs_release_path(p);
1635         }
1636
1637         return ret;
1638 }
1639
1640 /*
1641  * helper function for btrfs_search_slot.  This does all of the checks
1642  * for node-level blocks and does any balancing required based on
1643  * the ins_len.
1644  *
1645  * If no extra work was required, zero is returned.  If we had to
1646  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1647  * start over
1648  */
1649 static int
1650 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1651                        struct btrfs_root *root, struct btrfs_path *p,
1652                        struct extent_buffer *b, int level, int ins_len,
1653                        int *write_lock_level)
1654 {
1655         struct btrfs_fs_info *fs_info = root->fs_info;
1656         int ret = 0;
1657
1658         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1659             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1660
1661                 if (*write_lock_level < level + 1) {
1662                         *write_lock_level = level + 1;
1663                         btrfs_release_path(p);
1664                         return -EAGAIN;
1665                 }
1666
1667                 reada_for_balance(p, level);
1668                 ret = split_node(trans, root, p, level);
1669
1670                 b = p->nodes[level];
1671         } else if (ins_len < 0 && btrfs_header_nritems(b) <
1672                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1673
1674                 if (*write_lock_level < level + 1) {
1675                         *write_lock_level = level + 1;
1676                         btrfs_release_path(p);
1677                         return -EAGAIN;
1678                 }
1679
1680                 reada_for_balance(p, level);
1681                 ret = balance_level(trans, root, p, level);
1682                 if (ret)
1683                         return ret;
1684
1685                 b = p->nodes[level];
1686                 if (!b) {
1687                         btrfs_release_path(p);
1688                         return -EAGAIN;
1689                 }
1690                 BUG_ON(btrfs_header_nritems(b) == 1);
1691         }
1692         return ret;
1693 }
1694
1695 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1696                 u64 iobjectid, u64 ioff, u8 key_type,
1697                 struct btrfs_key *found_key)
1698 {
1699         int ret;
1700         struct btrfs_key key;
1701         struct extent_buffer *eb;
1702
1703         ASSERT(path);
1704         ASSERT(found_key);
1705
1706         key.type = key_type;
1707         key.objectid = iobjectid;
1708         key.offset = ioff;
1709
1710         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1711         if (ret < 0)
1712                 return ret;
1713
1714         eb = path->nodes[0];
1715         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1716                 ret = btrfs_next_leaf(fs_root, path);
1717                 if (ret)
1718                         return ret;
1719                 eb = path->nodes[0];
1720         }
1721
1722         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1723         if (found_key->type != key.type ||
1724                         found_key->objectid != key.objectid)
1725                 return 1;
1726
1727         return 0;
1728 }
1729
1730 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1731                                                         struct btrfs_path *p,
1732                                                         int write_lock_level)
1733 {
1734         struct extent_buffer *b;
1735         int root_lock = 0;
1736         int level = 0;
1737
1738         if (p->search_commit_root) {
1739                 b = root->commit_root;
1740                 atomic_inc(&b->refs);
1741                 level = btrfs_header_level(b);
1742                 /*
1743                  * Ensure that all callers have set skip_locking when
1744                  * p->search_commit_root = 1.
1745                  */
1746                 ASSERT(p->skip_locking == 1);
1747
1748                 goto out;
1749         }
1750
1751         if (p->skip_locking) {
1752                 b = btrfs_root_node(root);
1753                 level = btrfs_header_level(b);
1754                 goto out;
1755         }
1756
1757         /* We try very hard to do read locks on the root */
1758         root_lock = BTRFS_READ_LOCK;
1759
1760         /*
1761          * If the level is set to maximum, we can skip trying to get the read
1762          * lock.
1763          */
1764         if (write_lock_level < BTRFS_MAX_LEVEL) {
1765                 /*
1766                  * We don't know the level of the root node until we actually
1767                  * have it read locked
1768                  */
1769                 if (p->nowait) {
1770                         b = btrfs_try_read_lock_root_node(root);
1771                         if (IS_ERR(b))
1772                                 return b;
1773                 } else {
1774                         b = btrfs_read_lock_root_node(root);
1775                 }
1776                 level = btrfs_header_level(b);
1777                 if (level > write_lock_level)
1778                         goto out;
1779
1780                 /* Whoops, must trade for write lock */
1781                 btrfs_tree_read_unlock(b);
1782                 free_extent_buffer(b);
1783         }
1784
1785         b = btrfs_lock_root_node(root);
1786         root_lock = BTRFS_WRITE_LOCK;
1787
1788         /* The level might have changed, check again */
1789         level = btrfs_header_level(b);
1790
1791 out:
1792         /*
1793          * The root may have failed to write out at some point, and thus is no
1794          * longer valid, return an error in this case.
1795          */
1796         if (!extent_buffer_uptodate(b)) {
1797                 if (root_lock)
1798                         btrfs_tree_unlock_rw(b, root_lock);
1799                 free_extent_buffer(b);
1800                 return ERR_PTR(-EIO);
1801         }
1802
1803         p->nodes[level] = b;
1804         if (!p->skip_locking)
1805                 p->locks[level] = root_lock;
1806         /*
1807          * Callers are responsible for dropping b's references.
1808          */
1809         return b;
1810 }
1811
1812 /*
1813  * Replace the extent buffer at the lowest level of the path with a cloned
1814  * version. The purpose is to be able to use it safely, after releasing the
1815  * commit root semaphore, even if relocation is happening in parallel, the
1816  * transaction used for relocation is committed and the extent buffer is
1817  * reallocated in the next transaction.
1818  *
1819  * This is used in a context where the caller does not prevent transaction
1820  * commits from happening, either by holding a transaction handle or holding
1821  * some lock, while it's doing searches through a commit root.
1822  * At the moment it's only used for send operations.
1823  */
1824 static int finish_need_commit_sem_search(struct btrfs_path *path)
1825 {
1826         const int i = path->lowest_level;
1827         const int slot = path->slots[i];
1828         struct extent_buffer *lowest = path->nodes[i];
1829         struct extent_buffer *clone;
1830
1831         ASSERT(path->need_commit_sem);
1832
1833         if (!lowest)
1834                 return 0;
1835
1836         lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1837
1838         clone = btrfs_clone_extent_buffer(lowest);
1839         if (!clone)
1840                 return -ENOMEM;
1841
1842         btrfs_release_path(path);
1843         path->nodes[i] = clone;
1844         path->slots[i] = slot;
1845
1846         return 0;
1847 }
1848
1849 static inline int search_for_key_slot(struct extent_buffer *eb,
1850                                       int search_low_slot,
1851                                       const struct btrfs_key *key,
1852                                       int prev_cmp,
1853                                       int *slot)
1854 {
1855         /*
1856          * If a previous call to btrfs_bin_search() on a parent node returned an
1857          * exact match (prev_cmp == 0), we can safely assume the target key will
1858          * always be at slot 0 on lower levels, since each key pointer
1859          * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1860          * subtree it points to. Thus we can skip searching lower levels.
1861          */
1862         if (prev_cmp == 0) {
1863                 *slot = 0;
1864                 return 0;
1865         }
1866
1867         return btrfs_generic_bin_search(eb, search_low_slot, key, slot);
1868 }
1869
1870 static int search_leaf(struct btrfs_trans_handle *trans,
1871                        struct btrfs_root *root,
1872                        const struct btrfs_key *key,
1873                        struct btrfs_path *path,
1874                        int ins_len,
1875                        int prev_cmp)
1876 {
1877         struct extent_buffer *leaf = path->nodes[0];
1878         int leaf_free_space = -1;
1879         int search_low_slot = 0;
1880         int ret;
1881         bool do_bin_search = true;
1882
1883         /*
1884          * If we are doing an insertion, the leaf has enough free space and the
1885          * destination slot for the key is not slot 0, then we can unlock our
1886          * write lock on the parent, and any other upper nodes, before doing the
1887          * binary search on the leaf (with search_for_key_slot()), allowing other
1888          * tasks to lock the parent and any other upper nodes.
1889          */
1890         if (ins_len > 0) {
1891                 /*
1892                  * Cache the leaf free space, since we will need it later and it
1893                  * will not change until then.
1894                  */
1895                 leaf_free_space = btrfs_leaf_free_space(leaf);
1896
1897                 /*
1898                  * !path->locks[1] means we have a single node tree, the leaf is
1899                  * the root of the tree.
1900                  */
1901                 if (path->locks[1] && leaf_free_space >= ins_len) {
1902                         struct btrfs_disk_key first_key;
1903
1904                         ASSERT(btrfs_header_nritems(leaf) > 0);
1905                         btrfs_item_key(leaf, &first_key, 0);
1906
1907                         /*
1908                          * Doing the extra comparison with the first key is cheap,
1909                          * taking into account that the first key is very likely
1910                          * already in a cache line because it immediately follows
1911                          * the extent buffer's header and we have recently accessed
1912                          * the header's level field.
1913                          */
1914                         ret = comp_keys(&first_key, key);
1915                         if (ret < 0) {
1916                                 /*
1917                                  * The first key is smaller than the key we want
1918                                  * to insert, so we are safe to unlock all upper
1919                                  * nodes and we have to do the binary search.
1920                                  *
1921                                  * We do use btrfs_unlock_up_safe() and not
1922                                  * unlock_up() because the later does not unlock
1923                                  * nodes with a slot of 0 - we can safely unlock
1924                                  * any node even if its slot is 0 since in this
1925                                  * case the key does not end up at slot 0 of the
1926                                  * leaf and there's no need to split the leaf.
1927                                  */
1928                                 btrfs_unlock_up_safe(path, 1);
1929                                 search_low_slot = 1;
1930                         } else {
1931                                 /*
1932                                  * The first key is >= then the key we want to
1933                                  * insert, so we can skip the binary search as
1934                                  * the target key will be at slot 0.
1935                                  *
1936                                  * We can not unlock upper nodes when the key is
1937                                  * less than the first key, because we will need
1938                                  * to update the key at slot 0 of the parent node
1939                                  * and possibly of other upper nodes too.
1940                                  * If the key matches the first key, then we can
1941                                  * unlock all the upper nodes, using
1942                                  * btrfs_unlock_up_safe() instead of unlock_up()
1943                                  * as stated above.
1944                                  */
1945                                 if (ret == 0)
1946                                         btrfs_unlock_up_safe(path, 1);
1947                                 /*
1948                                  * ret is already 0 or 1, matching the result of
1949                                  * a btrfs_bin_search() call, so there is no need
1950                                  * to adjust it.
1951                                  */
1952                                 do_bin_search = false;
1953                                 path->slots[0] = 0;
1954                         }
1955                 }
1956         }
1957
1958         if (do_bin_search) {
1959                 ret = search_for_key_slot(leaf, search_low_slot, key,
1960                                           prev_cmp, &path->slots[0]);
1961                 if (ret < 0)
1962                         return ret;
1963         }
1964
1965         if (ins_len > 0) {
1966                 /*
1967                  * Item key already exists. In this case, if we are allowed to
1968                  * insert the item (for example, in dir_item case, item key
1969                  * collision is allowed), it will be merged with the original
1970                  * item. Only the item size grows, no new btrfs item will be
1971                  * added. If search_for_extension is not set, ins_len already
1972                  * accounts the size btrfs_item, deduct it here so leaf space
1973                  * check will be correct.
1974                  */
1975                 if (ret == 0 && !path->search_for_extension) {
1976                         ASSERT(ins_len >= sizeof(struct btrfs_item));
1977                         ins_len -= sizeof(struct btrfs_item);
1978                 }
1979
1980                 ASSERT(leaf_free_space >= 0);
1981
1982                 if (leaf_free_space < ins_len) {
1983                         int err;
1984
1985                         err = split_leaf(trans, root, key, path, ins_len,
1986                                          (ret == 0));
1987                         ASSERT(err <= 0);
1988                         if (WARN_ON(err > 0))
1989                                 err = -EUCLEAN;
1990                         if (err)
1991                                 ret = err;
1992                 }
1993         }
1994
1995         return ret;
1996 }
1997
1998 /*
1999  * btrfs_search_slot - look for a key in a tree and perform necessary
2000  * modifications to preserve tree invariants.
2001  *
2002  * @trans:      Handle of transaction, used when modifying the tree
2003  * @p:          Holds all btree nodes along the search path
2004  * @root:       The root node of the tree
2005  * @key:        The key we are looking for
2006  * @ins_len:    Indicates purpose of search:
2007  *              >0  for inserts it's size of item inserted (*)
2008  *              <0  for deletions
2009  *               0  for plain searches, not modifying the tree
2010  *
2011  *              (*) If size of item inserted doesn't include
2012  *              sizeof(struct btrfs_item), then p->search_for_extension must
2013  *              be set.
2014  * @cow:        boolean should CoW operations be performed. Must always be 1
2015  *              when modifying the tree.
2016  *
2017  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2018  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2019  *
2020  * If @key is found, 0 is returned and you can find the item in the leaf level
2021  * of the path (level 0)
2022  *
2023  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2024  * points to the slot where it should be inserted
2025  *
2026  * If an error is encountered while searching the tree a negative error number
2027  * is returned
2028  */
2029 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2030                       const struct btrfs_key *key, struct btrfs_path *p,
2031                       int ins_len, int cow)
2032 {
2033         struct btrfs_fs_info *fs_info = root->fs_info;
2034         struct extent_buffer *b;
2035         int slot;
2036         int ret;
2037         int err;
2038         int level;
2039         int lowest_unlock = 1;
2040         /* everything at write_lock_level or lower must be write locked */
2041         int write_lock_level = 0;
2042         u8 lowest_level = 0;
2043         int min_write_lock_level;
2044         int prev_cmp;
2045
2046         might_sleep();
2047
2048         lowest_level = p->lowest_level;
2049         WARN_ON(lowest_level && ins_len > 0);
2050         WARN_ON(p->nodes[0] != NULL);
2051         BUG_ON(!cow && ins_len);
2052
2053         /*
2054          * For now only allow nowait for read only operations.  There's no
2055          * strict reason why we can't, we just only need it for reads so it's
2056          * only implemented for reads.
2057          */
2058         ASSERT(!p->nowait || !cow);
2059
2060         if (ins_len < 0) {
2061                 lowest_unlock = 2;
2062
2063                 /* when we are removing items, we might have to go up to level
2064                  * two as we update tree pointers  Make sure we keep write
2065                  * for those levels as well
2066                  */
2067                 write_lock_level = 2;
2068         } else if (ins_len > 0) {
2069                 /*
2070                  * for inserting items, make sure we have a write lock on
2071                  * level 1 so we can update keys
2072                  */
2073                 write_lock_level = 1;
2074         }
2075
2076         if (!cow)
2077                 write_lock_level = -1;
2078
2079         if (cow && (p->keep_locks || p->lowest_level))
2080                 write_lock_level = BTRFS_MAX_LEVEL;
2081
2082         min_write_lock_level = write_lock_level;
2083
2084         if (p->need_commit_sem) {
2085                 ASSERT(p->search_commit_root);
2086                 if (p->nowait) {
2087                         if (!down_read_trylock(&fs_info->commit_root_sem))
2088                                 return -EAGAIN;
2089                 } else {
2090                         down_read(&fs_info->commit_root_sem);
2091                 }
2092         }
2093
2094 again:
2095         prev_cmp = -1;
2096         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2097         if (IS_ERR(b)) {
2098                 ret = PTR_ERR(b);
2099                 goto done;
2100         }
2101
2102         while (b) {
2103                 int dec = 0;
2104
2105                 level = btrfs_header_level(b);
2106
2107                 if (cow) {
2108                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2109
2110                         /*
2111                          * if we don't really need to cow this block
2112                          * then we don't want to set the path blocking,
2113                          * so we test it here
2114                          */
2115                         if (!should_cow_block(trans, root, b))
2116                                 goto cow_done;
2117
2118                         /*
2119                          * must have write locks on this node and the
2120                          * parent
2121                          */
2122                         if (level > write_lock_level ||
2123                             (level + 1 > write_lock_level &&
2124                             level + 1 < BTRFS_MAX_LEVEL &&
2125                             p->nodes[level + 1])) {
2126                                 write_lock_level = level + 1;
2127                                 btrfs_release_path(p);
2128                                 goto again;
2129                         }
2130
2131                         if (last_level)
2132                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2133                                                       &b,
2134                                                       BTRFS_NESTING_COW);
2135                         else
2136                                 err = btrfs_cow_block(trans, root, b,
2137                                                       p->nodes[level + 1],
2138                                                       p->slots[level + 1], &b,
2139                                                       BTRFS_NESTING_COW);
2140                         if (err) {
2141                                 ret = err;
2142                                 goto done;
2143                         }
2144                 }
2145 cow_done:
2146                 p->nodes[level] = b;
2147
2148                 /*
2149                  * we have a lock on b and as long as we aren't changing
2150                  * the tree, there is no way to for the items in b to change.
2151                  * It is safe to drop the lock on our parent before we
2152                  * go through the expensive btree search on b.
2153                  *
2154                  * If we're inserting or deleting (ins_len != 0), then we might
2155                  * be changing slot zero, which may require changing the parent.
2156                  * So, we can't drop the lock until after we know which slot
2157                  * we're operating on.
2158                  */
2159                 if (!ins_len && !p->keep_locks) {
2160                         int u = level + 1;
2161
2162                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2163                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2164                                 p->locks[u] = 0;
2165                         }
2166                 }
2167
2168                 if (level == 0) {
2169                         if (ins_len > 0)
2170                                 ASSERT(write_lock_level >= 1);
2171
2172                         ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2173                         if (!p->search_for_split)
2174                                 unlock_up(p, level, lowest_unlock,
2175                                           min_write_lock_level, NULL);
2176                         goto done;
2177                 }
2178
2179                 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2180                 if (ret < 0)
2181                         goto done;
2182                 prev_cmp = ret;
2183
2184                 if (ret && slot > 0) {
2185                         dec = 1;
2186                         slot--;
2187                 }
2188                 p->slots[level] = slot;
2189                 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2190                                              &write_lock_level);
2191                 if (err == -EAGAIN)
2192                         goto again;
2193                 if (err) {
2194                         ret = err;
2195                         goto done;
2196                 }
2197                 b = p->nodes[level];
2198                 slot = p->slots[level];
2199
2200                 /*
2201                  * Slot 0 is special, if we change the key we have to update
2202                  * the parent pointer which means we must have a write lock on
2203                  * the parent
2204                  */
2205                 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2206                         write_lock_level = level + 1;
2207                         btrfs_release_path(p);
2208                         goto again;
2209                 }
2210
2211                 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2212                           &write_lock_level);
2213
2214                 if (level == lowest_level) {
2215                         if (dec)
2216                                 p->slots[level]++;
2217                         goto done;
2218                 }
2219
2220                 err = read_block_for_search(root, p, &b, level, slot, key);
2221                 if (err == -EAGAIN)
2222                         goto again;
2223                 if (err) {
2224                         ret = err;
2225                         goto done;
2226                 }
2227
2228                 if (!p->skip_locking) {
2229                         level = btrfs_header_level(b);
2230
2231                         btrfs_maybe_reset_lockdep_class(root, b);
2232
2233                         if (level <= write_lock_level) {
2234                                 btrfs_tree_lock(b);
2235                                 p->locks[level] = BTRFS_WRITE_LOCK;
2236                         } else {
2237                                 if (p->nowait) {
2238                                         if (!btrfs_try_tree_read_lock(b)) {
2239                                                 free_extent_buffer(b);
2240                                                 ret = -EAGAIN;
2241                                                 goto done;
2242                                         }
2243                                 } else {
2244                                         btrfs_tree_read_lock(b);
2245                                 }
2246                                 p->locks[level] = BTRFS_READ_LOCK;
2247                         }
2248                         p->nodes[level] = b;
2249                 }
2250         }
2251         ret = 1;
2252 done:
2253         if (ret < 0 && !p->skip_release_on_error)
2254                 btrfs_release_path(p);
2255
2256         if (p->need_commit_sem) {
2257                 int ret2;
2258
2259                 ret2 = finish_need_commit_sem_search(p);
2260                 up_read(&fs_info->commit_root_sem);
2261                 if (ret2)
2262                         ret = ret2;
2263         }
2264
2265         return ret;
2266 }
2267 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2268
2269 /*
2270  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2271  * current state of the tree together with the operations recorded in the tree
2272  * modification log to search for the key in a previous version of this tree, as
2273  * denoted by the time_seq parameter.
2274  *
2275  * Naturally, there is no support for insert, delete or cow operations.
2276  *
2277  * The resulting path and return value will be set up as if we called
2278  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2279  */
2280 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2281                           struct btrfs_path *p, u64 time_seq)
2282 {
2283         struct btrfs_fs_info *fs_info = root->fs_info;
2284         struct extent_buffer *b;
2285         int slot;
2286         int ret;
2287         int err;
2288         int level;
2289         int lowest_unlock = 1;
2290         u8 lowest_level = 0;
2291
2292         lowest_level = p->lowest_level;
2293         WARN_ON(p->nodes[0] != NULL);
2294         ASSERT(!p->nowait);
2295
2296         if (p->search_commit_root) {
2297                 BUG_ON(time_seq);
2298                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2299         }
2300
2301 again:
2302         b = btrfs_get_old_root(root, time_seq);
2303         if (!b) {
2304                 ret = -EIO;
2305                 goto done;
2306         }
2307         level = btrfs_header_level(b);
2308         p->locks[level] = BTRFS_READ_LOCK;
2309
2310         while (b) {
2311                 int dec = 0;
2312
2313                 level = btrfs_header_level(b);
2314                 p->nodes[level] = b;
2315
2316                 /*
2317                  * we have a lock on b and as long as we aren't changing
2318                  * the tree, there is no way to for the items in b to change.
2319                  * It is safe to drop the lock on our parent before we
2320                  * go through the expensive btree search on b.
2321                  */
2322                 btrfs_unlock_up_safe(p, level + 1);
2323
2324                 ret = btrfs_bin_search(b, key, &slot);
2325                 if (ret < 0)
2326                         goto done;
2327
2328                 if (level == 0) {
2329                         p->slots[level] = slot;
2330                         unlock_up(p, level, lowest_unlock, 0, NULL);
2331                         goto done;
2332                 }
2333
2334                 if (ret && slot > 0) {
2335                         dec = 1;
2336                         slot--;
2337                 }
2338                 p->slots[level] = slot;
2339                 unlock_up(p, level, lowest_unlock, 0, NULL);
2340
2341                 if (level == lowest_level) {
2342                         if (dec)
2343                                 p->slots[level]++;
2344                         goto done;
2345                 }
2346
2347                 err = read_block_for_search(root, p, &b, level, slot, key);
2348                 if (err == -EAGAIN)
2349                         goto again;
2350                 if (err) {
2351                         ret = err;
2352                         goto done;
2353                 }
2354
2355                 level = btrfs_header_level(b);
2356                 btrfs_tree_read_lock(b);
2357                 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2358                 if (!b) {
2359                         ret = -ENOMEM;
2360                         goto done;
2361                 }
2362                 p->locks[level] = BTRFS_READ_LOCK;
2363                 p->nodes[level] = b;
2364         }
2365         ret = 1;
2366 done:
2367         if (ret < 0)
2368                 btrfs_release_path(p);
2369
2370         return ret;
2371 }
2372
2373 /*
2374  * helper to use instead of search slot if no exact match is needed but
2375  * instead the next or previous item should be returned.
2376  * When find_higher is true, the next higher item is returned, the next lower
2377  * otherwise.
2378  * When return_any and find_higher are both true, and no higher item is found,
2379  * return the next lower instead.
2380  * When return_any is true and find_higher is false, and no lower item is found,
2381  * return the next higher instead.
2382  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2383  * < 0 on error
2384  */
2385 int btrfs_search_slot_for_read(struct btrfs_root *root,
2386                                const struct btrfs_key *key,
2387                                struct btrfs_path *p, int find_higher,
2388                                int return_any)
2389 {
2390         int ret;
2391         struct extent_buffer *leaf;
2392
2393 again:
2394         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2395         if (ret <= 0)
2396                 return ret;
2397         /*
2398          * a return value of 1 means the path is at the position where the
2399          * item should be inserted. Normally this is the next bigger item,
2400          * but in case the previous item is the last in a leaf, path points
2401          * to the first free slot in the previous leaf, i.e. at an invalid
2402          * item.
2403          */
2404         leaf = p->nodes[0];
2405
2406         if (find_higher) {
2407                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2408                         ret = btrfs_next_leaf(root, p);
2409                         if (ret <= 0)
2410                                 return ret;
2411                         if (!return_any)
2412                                 return 1;
2413                         /*
2414                          * no higher item found, return the next
2415                          * lower instead
2416                          */
2417                         return_any = 0;
2418                         find_higher = 0;
2419                         btrfs_release_path(p);
2420                         goto again;
2421                 }
2422         } else {
2423                 if (p->slots[0] == 0) {
2424                         ret = btrfs_prev_leaf(root, p);
2425                         if (ret < 0)
2426                                 return ret;
2427                         if (!ret) {
2428                                 leaf = p->nodes[0];
2429                                 if (p->slots[0] == btrfs_header_nritems(leaf))
2430                                         p->slots[0]--;
2431                                 return 0;
2432                         }
2433                         if (!return_any)
2434                                 return 1;
2435                         /*
2436                          * no lower item found, return the next
2437                          * higher instead
2438                          */
2439                         return_any = 0;
2440                         find_higher = 1;
2441                         btrfs_release_path(p);
2442                         goto again;
2443                 } else {
2444                         --p->slots[0];
2445                 }
2446         }
2447         return 0;
2448 }
2449
2450 /*
2451  * Execute search and call btrfs_previous_item to traverse backwards if the item
2452  * was not found.
2453  *
2454  * Return 0 if found, 1 if not found and < 0 if error.
2455  */
2456 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2457                            struct btrfs_path *path)
2458 {
2459         int ret;
2460
2461         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2462         if (ret > 0)
2463                 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2464
2465         if (ret == 0)
2466                 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2467
2468         return ret;
2469 }
2470
2471 /*
2472  * Search for a valid slot for the given path.
2473  *
2474  * @root:       The root node of the tree.
2475  * @key:        Will contain a valid item if found.
2476  * @path:       The starting point to validate the slot.
2477  *
2478  * Return: 0  if the item is valid
2479  *         1  if not found
2480  *         <0 if error.
2481  */
2482 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2483                               struct btrfs_path *path)
2484 {
2485         while (1) {
2486                 int ret;
2487                 const int slot = path->slots[0];
2488                 const struct extent_buffer *leaf = path->nodes[0];
2489
2490                 /* This is where we start walking the path. */
2491                 if (slot >= btrfs_header_nritems(leaf)) {
2492                         /*
2493                          * If we've reached the last slot in this leaf we need
2494                          * to go to the next leaf and reset the path.
2495                          */
2496                         ret = btrfs_next_leaf(root, path);
2497                         if (ret)
2498                                 return ret;
2499                         continue;
2500                 }
2501                 /* Store the found, valid item in @key. */
2502                 btrfs_item_key_to_cpu(leaf, key, slot);
2503                 break;
2504         }
2505         return 0;
2506 }
2507
2508 /*
2509  * adjust the pointers going up the tree, starting at level
2510  * making sure the right key of each node is points to 'key'.
2511  * This is used after shifting pointers to the left, so it stops
2512  * fixing up pointers when a given leaf/node is not in slot 0 of the
2513  * higher levels
2514  *
2515  */
2516 static void fixup_low_keys(struct btrfs_path *path,
2517                            struct btrfs_disk_key *key, int level)
2518 {
2519         int i;
2520         struct extent_buffer *t;
2521         int ret;
2522
2523         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2524                 int tslot = path->slots[i];
2525
2526                 if (!path->nodes[i])
2527                         break;
2528                 t = path->nodes[i];
2529                 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2530                                                     BTRFS_MOD_LOG_KEY_REPLACE);
2531                 BUG_ON(ret < 0);
2532                 btrfs_set_node_key(t, key, tslot);
2533                 btrfs_mark_buffer_dirty(path->nodes[i]);
2534                 if (tslot != 0)
2535                         break;
2536         }
2537 }
2538
2539 /*
2540  * update item key.
2541  *
2542  * This function isn't completely safe. It's the caller's responsibility
2543  * that the new key won't break the order
2544  */
2545 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2546                              struct btrfs_path *path,
2547                              const struct btrfs_key *new_key)
2548 {
2549         struct btrfs_disk_key disk_key;
2550         struct extent_buffer *eb;
2551         int slot;
2552
2553         eb = path->nodes[0];
2554         slot = path->slots[0];
2555         if (slot > 0) {
2556                 btrfs_item_key(eb, &disk_key, slot - 1);
2557                 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2558                         btrfs_crit(fs_info,
2559                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2560                                    slot, btrfs_disk_key_objectid(&disk_key),
2561                                    btrfs_disk_key_type(&disk_key),
2562                                    btrfs_disk_key_offset(&disk_key),
2563                                    new_key->objectid, new_key->type,
2564                                    new_key->offset);
2565                         btrfs_print_leaf(eb);
2566                         BUG();
2567                 }
2568         }
2569         if (slot < btrfs_header_nritems(eb) - 1) {
2570                 btrfs_item_key(eb, &disk_key, slot + 1);
2571                 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2572                         btrfs_crit(fs_info,
2573                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2574                                    slot, btrfs_disk_key_objectid(&disk_key),
2575                                    btrfs_disk_key_type(&disk_key),
2576                                    btrfs_disk_key_offset(&disk_key),
2577                                    new_key->objectid, new_key->type,
2578                                    new_key->offset);
2579                         btrfs_print_leaf(eb);
2580                         BUG();
2581                 }
2582         }
2583
2584         btrfs_cpu_key_to_disk(&disk_key, new_key);
2585         btrfs_set_item_key(eb, &disk_key, slot);
2586         btrfs_mark_buffer_dirty(eb);
2587         if (slot == 0)
2588                 fixup_low_keys(path, &disk_key, 1);
2589 }
2590
2591 /*
2592  * Check key order of two sibling extent buffers.
2593  *
2594  * Return true if something is wrong.
2595  * Return false if everything is fine.
2596  *
2597  * Tree-checker only works inside one tree block, thus the following
2598  * corruption can not be detected by tree-checker:
2599  *
2600  * Leaf @left                   | Leaf @right
2601  * --------------------------------------------------------------
2602  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2603  *
2604  * Key f6 in leaf @left itself is valid, but not valid when the next
2605  * key in leaf @right is 7.
2606  * This can only be checked at tree block merge time.
2607  * And since tree checker has ensured all key order in each tree block
2608  * is correct, we only need to bother the last key of @left and the first
2609  * key of @right.
2610  */
2611 static bool check_sibling_keys(struct extent_buffer *left,
2612                                struct extent_buffer *right)
2613 {
2614         struct btrfs_key left_last;
2615         struct btrfs_key right_first;
2616         int level = btrfs_header_level(left);
2617         int nr_left = btrfs_header_nritems(left);
2618         int nr_right = btrfs_header_nritems(right);
2619
2620         /* No key to check in one of the tree blocks */
2621         if (!nr_left || !nr_right)
2622                 return false;
2623
2624         if (level) {
2625                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2626                 btrfs_node_key_to_cpu(right, &right_first, 0);
2627         } else {
2628                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2629                 btrfs_item_key_to_cpu(right, &right_first, 0);
2630         }
2631
2632         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2633                 btrfs_crit(left->fs_info,
2634 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2635                            left_last.objectid, left_last.type,
2636                            left_last.offset, right_first.objectid,
2637                            right_first.type, right_first.offset);
2638                 return true;
2639         }
2640         return false;
2641 }
2642
2643 /*
2644  * try to push data from one node into the next node left in the
2645  * tree.
2646  *
2647  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2648  * error, and > 0 if there was no room in the left hand block.
2649  */
2650 static int push_node_left(struct btrfs_trans_handle *trans,
2651                           struct extent_buffer *dst,
2652                           struct extent_buffer *src, int empty)
2653 {
2654         struct btrfs_fs_info *fs_info = trans->fs_info;
2655         int push_items = 0;
2656         int src_nritems;
2657         int dst_nritems;
2658         int ret = 0;
2659
2660         src_nritems = btrfs_header_nritems(src);
2661         dst_nritems = btrfs_header_nritems(dst);
2662         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2663         WARN_ON(btrfs_header_generation(src) != trans->transid);
2664         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2665
2666         if (!empty && src_nritems <= 8)
2667                 return 1;
2668
2669         if (push_items <= 0)
2670                 return 1;
2671
2672         if (empty) {
2673                 push_items = min(src_nritems, push_items);
2674                 if (push_items < src_nritems) {
2675                         /* leave at least 8 pointers in the node if
2676                          * we aren't going to empty it
2677                          */
2678                         if (src_nritems - push_items < 8) {
2679                                 if (push_items <= 8)
2680                                         return 1;
2681                                 push_items -= 8;
2682                         }
2683                 }
2684         } else
2685                 push_items = min(src_nritems - 8, push_items);
2686
2687         /* dst is the left eb, src is the middle eb */
2688         if (check_sibling_keys(dst, src)) {
2689                 ret = -EUCLEAN;
2690                 btrfs_abort_transaction(trans, ret);
2691                 return ret;
2692         }
2693         ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2694         if (ret) {
2695                 btrfs_abort_transaction(trans, ret);
2696                 return ret;
2697         }
2698         copy_extent_buffer(dst, src,
2699                            btrfs_node_key_ptr_offset(dst, dst_nritems),
2700                            btrfs_node_key_ptr_offset(src, 0),
2701                            push_items * sizeof(struct btrfs_key_ptr));
2702
2703         if (push_items < src_nritems) {
2704                 /*
2705                  * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2706                  * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
2707                  */
2708                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2709                                       btrfs_node_key_ptr_offset(src, push_items),
2710                                       (src_nritems - push_items) *
2711                                       sizeof(struct btrfs_key_ptr));
2712         }
2713         btrfs_set_header_nritems(src, src_nritems - push_items);
2714         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2715         btrfs_mark_buffer_dirty(src);
2716         btrfs_mark_buffer_dirty(dst);
2717
2718         return ret;
2719 }
2720
2721 /*
2722  * try to push data from one node into the next node right in the
2723  * tree.
2724  *
2725  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2726  * error, and > 0 if there was no room in the right hand block.
2727  *
2728  * this will  only push up to 1/2 the contents of the left node over
2729  */
2730 static int balance_node_right(struct btrfs_trans_handle *trans,
2731                               struct extent_buffer *dst,
2732                               struct extent_buffer *src)
2733 {
2734         struct btrfs_fs_info *fs_info = trans->fs_info;
2735         int push_items = 0;
2736         int max_push;
2737         int src_nritems;
2738         int dst_nritems;
2739         int ret = 0;
2740
2741         WARN_ON(btrfs_header_generation(src) != trans->transid);
2742         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2743
2744         src_nritems = btrfs_header_nritems(src);
2745         dst_nritems = btrfs_header_nritems(dst);
2746         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2747         if (push_items <= 0)
2748                 return 1;
2749
2750         if (src_nritems < 4)
2751                 return 1;
2752
2753         max_push = src_nritems / 2 + 1;
2754         /* don't try to empty the node */
2755         if (max_push >= src_nritems)
2756                 return 1;
2757
2758         if (max_push < push_items)
2759                 push_items = max_push;
2760
2761         /* dst is the right eb, src is the middle eb */
2762         if (check_sibling_keys(src, dst)) {
2763                 ret = -EUCLEAN;
2764                 btrfs_abort_transaction(trans, ret);
2765                 return ret;
2766         }
2767         ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2768         BUG_ON(ret < 0);
2769         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2770                                       btrfs_node_key_ptr_offset(dst, 0),
2771                                       (dst_nritems) *
2772                                       sizeof(struct btrfs_key_ptr));
2773
2774         ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2775                                          push_items);
2776         if (ret) {
2777                 btrfs_abort_transaction(trans, ret);
2778                 return ret;
2779         }
2780         copy_extent_buffer(dst, src,
2781                            btrfs_node_key_ptr_offset(dst, 0),
2782                            btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2783                            push_items * sizeof(struct btrfs_key_ptr));
2784
2785         btrfs_set_header_nritems(src, src_nritems - push_items);
2786         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2787
2788         btrfs_mark_buffer_dirty(src);
2789         btrfs_mark_buffer_dirty(dst);
2790
2791         return ret;
2792 }
2793
2794 /*
2795  * helper function to insert a new root level in the tree.
2796  * A new node is allocated, and a single item is inserted to
2797  * point to the existing root
2798  *
2799  * returns zero on success or < 0 on failure.
2800  */
2801 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2802                            struct btrfs_root *root,
2803                            struct btrfs_path *path, int level)
2804 {
2805         struct btrfs_fs_info *fs_info = root->fs_info;
2806         u64 lower_gen;
2807         struct extent_buffer *lower;
2808         struct extent_buffer *c;
2809         struct extent_buffer *old;
2810         struct btrfs_disk_key lower_key;
2811         int ret;
2812
2813         BUG_ON(path->nodes[level]);
2814         BUG_ON(path->nodes[level-1] != root->node);
2815
2816         lower = path->nodes[level-1];
2817         if (level == 1)
2818                 btrfs_item_key(lower, &lower_key, 0);
2819         else
2820                 btrfs_node_key(lower, &lower_key, 0);
2821
2822         c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2823                                    &lower_key, level, root->node->start, 0,
2824                                    BTRFS_NESTING_NEW_ROOT);
2825         if (IS_ERR(c))
2826                 return PTR_ERR(c);
2827
2828         root_add_used(root, fs_info->nodesize);
2829
2830         btrfs_set_header_nritems(c, 1);
2831         btrfs_set_node_key(c, &lower_key, 0);
2832         btrfs_set_node_blockptr(c, 0, lower->start);
2833         lower_gen = btrfs_header_generation(lower);
2834         WARN_ON(lower_gen != trans->transid);
2835
2836         btrfs_set_node_ptr_generation(c, 0, lower_gen);
2837
2838         btrfs_mark_buffer_dirty(c);
2839
2840         old = root->node;
2841         ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2842         BUG_ON(ret < 0);
2843         rcu_assign_pointer(root->node, c);
2844
2845         /* the super has an extra ref to root->node */
2846         free_extent_buffer(old);
2847
2848         add_root_to_dirty_list(root);
2849         atomic_inc(&c->refs);
2850         path->nodes[level] = c;
2851         path->locks[level] = BTRFS_WRITE_LOCK;
2852         path->slots[level] = 0;
2853         return 0;
2854 }
2855
2856 /*
2857  * worker function to insert a single pointer in a node.
2858  * the node should have enough room for the pointer already
2859  *
2860  * slot and level indicate where you want the key to go, and
2861  * blocknr is the block the key points to.
2862  */
2863 static void insert_ptr(struct btrfs_trans_handle *trans,
2864                        struct btrfs_path *path,
2865                        struct btrfs_disk_key *key, u64 bytenr,
2866                        int slot, int level)
2867 {
2868         struct extent_buffer *lower;
2869         int nritems;
2870         int ret;
2871
2872         BUG_ON(!path->nodes[level]);
2873         btrfs_assert_tree_write_locked(path->nodes[level]);
2874         lower = path->nodes[level];
2875         nritems = btrfs_header_nritems(lower);
2876         BUG_ON(slot > nritems);
2877         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2878         if (slot != nritems) {
2879                 if (level) {
2880                         ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2881                                         slot, nritems - slot);
2882                         BUG_ON(ret < 0);
2883                 }
2884                 memmove_extent_buffer(lower,
2885                               btrfs_node_key_ptr_offset(lower, slot + 1),
2886                               btrfs_node_key_ptr_offset(lower, slot),
2887                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2888         }
2889         if (level) {
2890                 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2891                                                     BTRFS_MOD_LOG_KEY_ADD);
2892                 BUG_ON(ret < 0);
2893         }
2894         btrfs_set_node_key(lower, key, slot);
2895         btrfs_set_node_blockptr(lower, slot, bytenr);
2896         WARN_ON(trans->transid == 0);
2897         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2898         btrfs_set_header_nritems(lower, nritems + 1);
2899         btrfs_mark_buffer_dirty(lower);
2900 }
2901
2902 /*
2903  * split the node at the specified level in path in two.
2904  * The path is corrected to point to the appropriate node after the split
2905  *
2906  * Before splitting this tries to make some room in the node by pushing
2907  * left and right, if either one works, it returns right away.
2908  *
2909  * returns 0 on success and < 0 on failure
2910  */
2911 static noinline int split_node(struct btrfs_trans_handle *trans,
2912                                struct btrfs_root *root,
2913                                struct btrfs_path *path, int level)
2914 {
2915         struct btrfs_fs_info *fs_info = root->fs_info;
2916         struct extent_buffer *c;
2917         struct extent_buffer *split;
2918         struct btrfs_disk_key disk_key;
2919         int mid;
2920         int ret;
2921         u32 c_nritems;
2922
2923         c = path->nodes[level];
2924         WARN_ON(btrfs_header_generation(c) != trans->transid);
2925         if (c == root->node) {
2926                 /*
2927                  * trying to split the root, lets make a new one
2928                  *
2929                  * tree mod log: We don't log_removal old root in
2930                  * insert_new_root, because that root buffer will be kept as a
2931                  * normal node. We are going to log removal of half of the
2932                  * elements below with btrfs_tree_mod_log_eb_copy(). We're
2933                  * holding a tree lock on the buffer, which is why we cannot
2934                  * race with other tree_mod_log users.
2935                  */
2936                 ret = insert_new_root(trans, root, path, level + 1);
2937                 if (ret)
2938                         return ret;
2939         } else {
2940                 ret = push_nodes_for_insert(trans, root, path, level);
2941                 c = path->nodes[level];
2942                 if (!ret && btrfs_header_nritems(c) <
2943                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2944                         return 0;
2945                 if (ret < 0)
2946                         return ret;
2947         }
2948
2949         c_nritems = btrfs_header_nritems(c);
2950         mid = (c_nritems + 1) / 2;
2951         btrfs_node_key(c, &disk_key, mid);
2952
2953         split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2954                                        &disk_key, level, c->start, 0,
2955                                        BTRFS_NESTING_SPLIT);
2956         if (IS_ERR(split))
2957                 return PTR_ERR(split);
2958
2959         root_add_used(root, fs_info->nodesize);
2960         ASSERT(btrfs_header_level(c) == level);
2961
2962         ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2963         if (ret) {
2964                 btrfs_abort_transaction(trans, ret);
2965                 return ret;
2966         }
2967         copy_extent_buffer(split, c,
2968                            btrfs_node_key_ptr_offset(split, 0),
2969                            btrfs_node_key_ptr_offset(c, mid),
2970                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2971         btrfs_set_header_nritems(split, c_nritems - mid);
2972         btrfs_set_header_nritems(c, mid);
2973
2974         btrfs_mark_buffer_dirty(c);
2975         btrfs_mark_buffer_dirty(split);
2976
2977         insert_ptr(trans, path, &disk_key, split->start,
2978                    path->slots[level + 1] + 1, level + 1);
2979
2980         if (path->slots[level] >= mid) {
2981                 path->slots[level] -= mid;
2982                 btrfs_tree_unlock(c);
2983                 free_extent_buffer(c);
2984                 path->nodes[level] = split;
2985                 path->slots[level + 1] += 1;
2986         } else {
2987                 btrfs_tree_unlock(split);
2988                 free_extent_buffer(split);
2989         }
2990         return 0;
2991 }
2992
2993 /*
2994  * how many bytes are required to store the items in a leaf.  start
2995  * and nr indicate which items in the leaf to check.  This totals up the
2996  * space used both by the item structs and the item data
2997  */
2998 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2999 {
3000         int data_len;
3001         int nritems = btrfs_header_nritems(l);
3002         int end = min(nritems, start + nr) - 1;
3003
3004         if (!nr)
3005                 return 0;
3006         data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3007         data_len = data_len - btrfs_item_offset(l, end);
3008         data_len += sizeof(struct btrfs_item) * nr;
3009         WARN_ON(data_len < 0);
3010         return data_len;
3011 }
3012
3013 /*
3014  * The space between the end of the leaf items and
3015  * the start of the leaf data.  IOW, how much room
3016  * the leaf has left for both items and data
3017  */
3018 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3019 {
3020         struct btrfs_fs_info *fs_info = leaf->fs_info;
3021         int nritems = btrfs_header_nritems(leaf);
3022         int ret;
3023
3024         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3025         if (ret < 0) {
3026                 btrfs_crit(fs_info,
3027                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3028                            ret,
3029                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3030                            leaf_space_used(leaf, 0, nritems), nritems);
3031         }
3032         return ret;
3033 }
3034
3035 /*
3036  * min slot controls the lowest index we're willing to push to the
3037  * right.  We'll push up to and including min_slot, but no lower
3038  */
3039 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3040                                       struct btrfs_path *path,
3041                                       int data_size, int empty,
3042                                       struct extent_buffer *right,
3043                                       int free_space, u32 left_nritems,
3044                                       u32 min_slot)
3045 {
3046         struct btrfs_fs_info *fs_info = right->fs_info;
3047         struct extent_buffer *left = path->nodes[0];
3048         struct extent_buffer *upper = path->nodes[1];
3049         struct btrfs_map_token token;
3050         struct btrfs_disk_key disk_key;
3051         int slot;
3052         u32 i;
3053         int push_space = 0;
3054         int push_items = 0;
3055         u32 nr;
3056         u32 right_nritems;
3057         u32 data_end;
3058         u32 this_item_size;
3059
3060         if (empty)
3061                 nr = 0;
3062         else
3063                 nr = max_t(u32, 1, min_slot);
3064
3065         if (path->slots[0] >= left_nritems)
3066                 push_space += data_size;
3067
3068         slot = path->slots[1];
3069         i = left_nritems - 1;
3070         while (i >= nr) {
3071                 if (!empty && push_items > 0) {
3072                         if (path->slots[0] > i)
3073                                 break;
3074                         if (path->slots[0] == i) {
3075                                 int space = btrfs_leaf_free_space(left);
3076
3077                                 if (space + push_space * 2 > free_space)
3078                                         break;
3079                         }
3080                 }
3081
3082                 if (path->slots[0] == i)
3083                         push_space += data_size;
3084
3085                 this_item_size = btrfs_item_size(left, i);
3086                 if (this_item_size + sizeof(struct btrfs_item) +
3087                     push_space > free_space)
3088                         break;
3089
3090                 push_items++;
3091                 push_space += this_item_size + sizeof(struct btrfs_item);
3092                 if (i == 0)
3093                         break;
3094                 i--;
3095         }
3096
3097         if (push_items == 0)
3098                 goto out_unlock;
3099
3100         WARN_ON(!empty && push_items == left_nritems);
3101
3102         /* push left to right */
3103         right_nritems = btrfs_header_nritems(right);
3104
3105         push_space = btrfs_item_data_end(left, left_nritems - push_items);
3106         push_space -= leaf_data_end(left);
3107
3108         /* make room in the right data area */
3109         data_end = leaf_data_end(right);
3110         memmove_leaf_data(right, data_end - push_space, data_end,
3111                           BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3112
3113         /* copy from the left data area */
3114         copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3115                        leaf_data_end(left), push_space);
3116
3117         memmove_leaf_items(right, push_items, 0, right_nritems);
3118
3119         /* copy the items from left to right */
3120         copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3121
3122         /* update the item pointers */
3123         btrfs_init_map_token(&token, right);
3124         right_nritems += push_items;
3125         btrfs_set_header_nritems(right, right_nritems);
3126         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3127         for (i = 0; i < right_nritems; i++) {
3128                 push_space -= btrfs_token_item_size(&token, i);
3129                 btrfs_set_token_item_offset(&token, i, push_space);
3130         }
3131
3132         left_nritems -= push_items;
3133         btrfs_set_header_nritems(left, left_nritems);
3134
3135         if (left_nritems)
3136                 btrfs_mark_buffer_dirty(left);
3137         else
3138                 btrfs_clear_buffer_dirty(trans, left);
3139
3140         btrfs_mark_buffer_dirty(right);
3141
3142         btrfs_item_key(right, &disk_key, 0);
3143         btrfs_set_node_key(upper, &disk_key, slot + 1);
3144         btrfs_mark_buffer_dirty(upper);
3145
3146         /* then fixup the leaf pointer in the path */
3147         if (path->slots[0] >= left_nritems) {
3148                 path->slots[0] -= left_nritems;
3149                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3150                         btrfs_clear_buffer_dirty(trans, path->nodes[0]);
3151                 btrfs_tree_unlock(path->nodes[0]);
3152                 free_extent_buffer(path->nodes[0]);
3153                 path->nodes[0] = right;
3154                 path->slots[1] += 1;
3155         } else {
3156                 btrfs_tree_unlock(right);
3157                 free_extent_buffer(right);
3158         }
3159         return 0;
3160
3161 out_unlock:
3162         btrfs_tree_unlock(right);
3163         free_extent_buffer(right);
3164         return 1;
3165 }
3166
3167 /*
3168  * push some data in the path leaf to the right, trying to free up at
3169  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3170  *
3171  * returns 1 if the push failed because the other node didn't have enough
3172  * room, 0 if everything worked out and < 0 if there were major errors.
3173  *
3174  * this will push starting from min_slot to the end of the leaf.  It won't
3175  * push any slot lower than min_slot
3176  */
3177 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3178                            *root, struct btrfs_path *path,
3179                            int min_data_size, int data_size,
3180                            int empty, u32 min_slot)
3181 {
3182         struct extent_buffer *left = path->nodes[0];
3183         struct extent_buffer *right;
3184         struct extent_buffer *upper;
3185         int slot;
3186         int free_space;
3187         u32 left_nritems;
3188         int ret;
3189
3190         if (!path->nodes[1])
3191                 return 1;
3192
3193         slot = path->slots[1];
3194         upper = path->nodes[1];
3195         if (slot >= btrfs_header_nritems(upper) - 1)
3196                 return 1;
3197
3198         btrfs_assert_tree_write_locked(path->nodes[1]);
3199
3200         right = btrfs_read_node_slot(upper, slot + 1);
3201         /*
3202          * slot + 1 is not valid or we fail to read the right node,
3203          * no big deal, just return.
3204          */
3205         if (IS_ERR(right))
3206                 return 1;
3207
3208         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3209
3210         free_space = btrfs_leaf_free_space(right);
3211         if (free_space < data_size)
3212                 goto out_unlock;
3213
3214         ret = btrfs_cow_block(trans, root, right, upper,
3215                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3216         if (ret)
3217                 goto out_unlock;
3218
3219         left_nritems = btrfs_header_nritems(left);
3220         if (left_nritems == 0)
3221                 goto out_unlock;
3222
3223         if (check_sibling_keys(left, right)) {
3224                 ret = -EUCLEAN;
3225                 btrfs_tree_unlock(right);
3226                 free_extent_buffer(right);
3227                 return ret;
3228         }
3229         if (path->slots[0] == left_nritems && !empty) {
3230                 /* Key greater than all keys in the leaf, right neighbor has
3231                  * enough room for it and we're not emptying our leaf to delete
3232                  * it, therefore use right neighbor to insert the new item and
3233                  * no need to touch/dirty our left leaf. */
3234                 btrfs_tree_unlock(left);
3235                 free_extent_buffer(left);
3236                 path->nodes[0] = right;
3237                 path->slots[0] = 0;
3238                 path->slots[1]++;
3239                 return 0;
3240         }
3241
3242         return __push_leaf_right(trans, path, min_data_size, empty, right,
3243                                  free_space, left_nritems, min_slot);
3244 out_unlock:
3245         btrfs_tree_unlock(right);
3246         free_extent_buffer(right);
3247         return 1;
3248 }
3249
3250 /*
3251  * push some data in the path leaf to the left, trying to free up at
3252  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3253  *
3254  * max_slot can put a limit on how far into the leaf we'll push items.  The
3255  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3256  * items
3257  */
3258 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3259                                      struct btrfs_path *path, int data_size,
3260                                      int empty, struct extent_buffer *left,
3261                                      int free_space, u32 right_nritems,
3262                                      u32 max_slot)
3263 {
3264         struct btrfs_fs_info *fs_info = left->fs_info;
3265         struct btrfs_disk_key disk_key;
3266         struct extent_buffer *right = path->nodes[0];
3267         int i;
3268         int push_space = 0;
3269         int push_items = 0;
3270         u32 old_left_nritems;
3271         u32 nr;
3272         int ret = 0;
3273         u32 this_item_size;
3274         u32 old_left_item_size;
3275         struct btrfs_map_token token;
3276
3277         if (empty)
3278                 nr = min(right_nritems, max_slot);
3279         else
3280                 nr = min(right_nritems - 1, max_slot);
3281
3282         for (i = 0; i < nr; i++) {
3283                 if (!empty && push_items > 0) {
3284                         if (path->slots[0] < i)
3285                                 break;
3286                         if (path->slots[0] == i) {
3287                                 int space = btrfs_leaf_free_space(right);
3288
3289                                 if (space + push_space * 2 > free_space)
3290                                         break;
3291                         }
3292                 }
3293
3294                 if (path->slots[0] == i)
3295                         push_space += data_size;
3296
3297                 this_item_size = btrfs_item_size(right, i);
3298                 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3299                     free_space)
3300                         break;
3301
3302                 push_items++;
3303                 push_space += this_item_size + sizeof(struct btrfs_item);
3304         }
3305
3306         if (push_items == 0) {
3307                 ret = 1;
3308                 goto out;
3309         }
3310         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3311
3312         /* push data from right to left */
3313         copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3314
3315         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3316                      btrfs_item_offset(right, push_items - 1);
3317
3318         copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3319                        btrfs_item_offset(right, push_items - 1), push_space);
3320         old_left_nritems = btrfs_header_nritems(left);
3321         BUG_ON(old_left_nritems <= 0);
3322
3323         btrfs_init_map_token(&token, left);
3324         old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3325         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3326                 u32 ioff;
3327
3328                 ioff = btrfs_token_item_offset(&token, i);
3329                 btrfs_set_token_item_offset(&token, i,
3330                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3331         }
3332         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3333
3334         /* fixup right node */
3335         if (push_items > right_nritems)
3336                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3337                        right_nritems);
3338
3339         if (push_items < right_nritems) {
3340                 push_space = btrfs_item_offset(right, push_items - 1) -
3341                                                   leaf_data_end(right);
3342                 memmove_leaf_data(right,
3343                                   BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3344                                   leaf_data_end(right), push_space);
3345
3346                 memmove_leaf_items(right, 0, push_items,
3347                                    btrfs_header_nritems(right) - push_items);
3348         }
3349
3350         btrfs_init_map_token(&token, right);
3351         right_nritems -= push_items;
3352         btrfs_set_header_nritems(right, right_nritems);
3353         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3354         for (i = 0; i < right_nritems; i++) {
3355                 push_space = push_space - btrfs_token_item_size(&token, i);
3356                 btrfs_set_token_item_offset(&token, i, push_space);
3357         }
3358
3359         btrfs_mark_buffer_dirty(left);
3360         if (right_nritems)
3361                 btrfs_mark_buffer_dirty(right);
3362         else
3363                 btrfs_clear_buffer_dirty(trans, right);
3364
3365         btrfs_item_key(right, &disk_key, 0);
3366         fixup_low_keys(path, &disk_key, 1);
3367
3368         /* then fixup the leaf pointer in the path */
3369         if (path->slots[0] < push_items) {
3370                 path->slots[0] += old_left_nritems;
3371                 btrfs_tree_unlock(path->nodes[0]);
3372                 free_extent_buffer(path->nodes[0]);
3373                 path->nodes[0] = left;
3374                 path->slots[1] -= 1;
3375         } else {
3376                 btrfs_tree_unlock(left);
3377                 free_extent_buffer(left);
3378                 path->slots[0] -= push_items;
3379         }
3380         BUG_ON(path->slots[0] < 0);
3381         return ret;
3382 out:
3383         btrfs_tree_unlock(left);
3384         free_extent_buffer(left);
3385         return ret;
3386 }
3387
3388 /*
3389  * push some data in the path leaf to the left, trying to free up at
3390  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3391  *
3392  * max_slot can put a limit on how far into the leaf we'll push items.  The
3393  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3394  * items
3395  */
3396 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3397                           *root, struct btrfs_path *path, int min_data_size,
3398                           int data_size, int empty, u32 max_slot)
3399 {
3400         struct extent_buffer *right = path->nodes[0];
3401         struct extent_buffer *left;
3402         int slot;
3403         int free_space;
3404         u32 right_nritems;
3405         int ret = 0;
3406
3407         slot = path->slots[1];
3408         if (slot == 0)
3409                 return 1;
3410         if (!path->nodes[1])
3411                 return 1;
3412
3413         right_nritems = btrfs_header_nritems(right);
3414         if (right_nritems == 0)
3415                 return 1;
3416
3417         btrfs_assert_tree_write_locked(path->nodes[1]);
3418
3419         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3420         /*
3421          * slot - 1 is not valid or we fail to read the left node,
3422          * no big deal, just return.
3423          */
3424         if (IS_ERR(left))
3425                 return 1;
3426
3427         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3428
3429         free_space = btrfs_leaf_free_space(left);
3430         if (free_space < data_size) {
3431                 ret = 1;
3432                 goto out;
3433         }
3434
3435         ret = btrfs_cow_block(trans, root, left,
3436                               path->nodes[1], slot - 1, &left,
3437                               BTRFS_NESTING_LEFT_COW);
3438         if (ret) {
3439                 /* we hit -ENOSPC, but it isn't fatal here */
3440                 if (ret == -ENOSPC)
3441                         ret = 1;
3442                 goto out;
3443         }
3444
3445         if (check_sibling_keys(left, right)) {
3446                 ret = -EUCLEAN;
3447                 goto out;
3448         }
3449         return __push_leaf_left(trans, path, min_data_size, empty, left,
3450                                 free_space, right_nritems, max_slot);
3451 out:
3452         btrfs_tree_unlock(left);
3453         free_extent_buffer(left);
3454         return ret;
3455 }
3456
3457 /*
3458  * split the path's leaf in two, making sure there is at least data_size
3459  * available for the resulting leaf level of the path.
3460  */
3461 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3462                                     struct btrfs_path *path,
3463                                     struct extent_buffer *l,
3464                                     struct extent_buffer *right,
3465                                     int slot, int mid, int nritems)
3466 {
3467         struct btrfs_fs_info *fs_info = trans->fs_info;
3468         int data_copy_size;
3469         int rt_data_off;
3470         int i;
3471         struct btrfs_disk_key disk_key;
3472         struct btrfs_map_token token;
3473
3474         nritems = nritems - mid;
3475         btrfs_set_header_nritems(right, nritems);
3476         data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3477
3478         copy_leaf_items(right, l, 0, mid, nritems);
3479
3480         copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3481                        leaf_data_end(l), data_copy_size);
3482
3483         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3484
3485         btrfs_init_map_token(&token, right);
3486         for (i = 0; i < nritems; i++) {
3487                 u32 ioff;
3488
3489                 ioff = btrfs_token_item_offset(&token, i);
3490                 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
3491         }
3492
3493         btrfs_set_header_nritems(l, mid);
3494         btrfs_item_key(right, &disk_key, 0);
3495         insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3496
3497         btrfs_mark_buffer_dirty(right);
3498         btrfs_mark_buffer_dirty(l);
3499         BUG_ON(path->slots[0] != slot);
3500
3501         if (mid <= slot) {
3502                 btrfs_tree_unlock(path->nodes[0]);
3503                 free_extent_buffer(path->nodes[0]);
3504                 path->nodes[0] = right;
3505                 path->slots[0] -= mid;
3506                 path->slots[1] += 1;
3507         } else {
3508                 btrfs_tree_unlock(right);
3509                 free_extent_buffer(right);
3510         }
3511
3512         BUG_ON(path->slots[0] < 0);
3513 }
3514
3515 /*
3516  * double splits happen when we need to insert a big item in the middle
3517  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3518  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3519  *          A                 B                 C
3520  *
3521  * We avoid this by trying to push the items on either side of our target
3522  * into the adjacent leaves.  If all goes well we can avoid the double split
3523  * completely.
3524  */
3525 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3526                                           struct btrfs_root *root,
3527                                           struct btrfs_path *path,
3528                                           int data_size)
3529 {
3530         int ret;
3531         int progress = 0;
3532         int slot;
3533         u32 nritems;
3534         int space_needed = data_size;
3535
3536         slot = path->slots[0];
3537         if (slot < btrfs_header_nritems(path->nodes[0]))
3538                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3539
3540         /*
3541          * try to push all the items after our slot into the
3542          * right leaf
3543          */
3544         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3545         if (ret < 0)
3546                 return ret;
3547
3548         if (ret == 0)
3549                 progress++;
3550
3551         nritems = btrfs_header_nritems(path->nodes[0]);
3552         /*
3553          * our goal is to get our slot at the start or end of a leaf.  If
3554          * we've done so we're done
3555          */
3556         if (path->slots[0] == 0 || path->slots[0] == nritems)
3557                 return 0;
3558
3559         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3560                 return 0;
3561
3562         /* try to push all the items before our slot into the next leaf */
3563         slot = path->slots[0];
3564         space_needed = data_size;
3565         if (slot > 0)
3566                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3567         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3568         if (ret < 0)
3569                 return ret;
3570
3571         if (ret == 0)
3572                 progress++;
3573
3574         if (progress)
3575                 return 0;
3576         return 1;
3577 }
3578
3579 /*
3580  * split the path's leaf in two, making sure there is at least data_size
3581  * available for the resulting leaf level of the path.
3582  *
3583  * returns 0 if all went well and < 0 on failure.
3584  */
3585 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3586                                struct btrfs_root *root,
3587                                const struct btrfs_key *ins_key,
3588                                struct btrfs_path *path, int data_size,
3589                                int extend)
3590 {
3591         struct btrfs_disk_key disk_key;
3592         struct extent_buffer *l;
3593         u32 nritems;
3594         int mid;
3595         int slot;
3596         struct extent_buffer *right;
3597         struct btrfs_fs_info *fs_info = root->fs_info;
3598         int ret = 0;
3599         int wret;
3600         int split;
3601         int num_doubles = 0;
3602         int tried_avoid_double = 0;
3603
3604         l = path->nodes[0];
3605         slot = path->slots[0];
3606         if (extend && data_size + btrfs_item_size(l, slot) +
3607             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3608                 return -EOVERFLOW;
3609
3610         /* first try to make some room by pushing left and right */
3611         if (data_size && path->nodes[1]) {
3612                 int space_needed = data_size;
3613
3614                 if (slot < btrfs_header_nritems(l))
3615                         space_needed -= btrfs_leaf_free_space(l);
3616
3617                 wret = push_leaf_right(trans, root, path, space_needed,
3618                                        space_needed, 0, 0);
3619                 if (wret < 0)
3620                         return wret;
3621                 if (wret) {
3622                         space_needed = data_size;
3623                         if (slot > 0)
3624                                 space_needed -= btrfs_leaf_free_space(l);
3625                         wret = push_leaf_left(trans, root, path, space_needed,
3626                                               space_needed, 0, (u32)-1);
3627                         if (wret < 0)
3628                                 return wret;
3629                 }
3630                 l = path->nodes[0];
3631
3632                 /* did the pushes work? */
3633                 if (btrfs_leaf_free_space(l) >= data_size)
3634                         return 0;
3635         }
3636
3637         if (!path->nodes[1]) {
3638                 ret = insert_new_root(trans, root, path, 1);
3639                 if (ret)
3640                         return ret;
3641         }
3642 again:
3643         split = 1;
3644         l = path->nodes[0];
3645         slot = path->slots[0];
3646         nritems = btrfs_header_nritems(l);
3647         mid = (nritems + 1) / 2;
3648
3649         if (mid <= slot) {
3650                 if (nritems == 1 ||
3651                     leaf_space_used(l, mid, nritems - mid) + data_size >
3652                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3653                         if (slot >= nritems) {
3654                                 split = 0;
3655                         } else {
3656                                 mid = slot;
3657                                 if (mid != nritems &&
3658                                     leaf_space_used(l, mid, nritems - mid) +
3659                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3660                                         if (data_size && !tried_avoid_double)
3661                                                 goto push_for_double;
3662                                         split = 2;
3663                                 }
3664                         }
3665                 }
3666         } else {
3667                 if (leaf_space_used(l, 0, mid) + data_size >
3668                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3669                         if (!extend && data_size && slot == 0) {
3670                                 split = 0;
3671                         } else if ((extend || !data_size) && slot == 0) {
3672                                 mid = 1;
3673                         } else {
3674                                 mid = slot;
3675                                 if (mid != nritems &&
3676                                     leaf_space_used(l, mid, nritems - mid) +
3677                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3678                                         if (data_size && !tried_avoid_double)
3679                                                 goto push_for_double;
3680                                         split = 2;
3681                                 }
3682                         }
3683                 }
3684         }
3685
3686         if (split == 0)
3687                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3688         else
3689                 btrfs_item_key(l, &disk_key, mid);
3690
3691         /*
3692          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3693          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3694          * subclasses, which is 8 at the time of this patch, and we've maxed it
3695          * out.  In the future we could add a
3696          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3697          * use BTRFS_NESTING_NEW_ROOT.
3698          */
3699         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3700                                        &disk_key, 0, l->start, 0,
3701                                        num_doubles ? BTRFS_NESTING_NEW_ROOT :
3702                                        BTRFS_NESTING_SPLIT);
3703         if (IS_ERR(right))
3704                 return PTR_ERR(right);
3705
3706         root_add_used(root, fs_info->nodesize);
3707
3708         if (split == 0) {
3709                 if (mid <= slot) {
3710                         btrfs_set_header_nritems(right, 0);
3711                         insert_ptr(trans, path, &disk_key,
3712                                    right->start, path->slots[1] + 1, 1);
3713                         btrfs_tree_unlock(path->nodes[0]);
3714                         free_extent_buffer(path->nodes[0]);
3715                         path->nodes[0] = right;
3716                         path->slots[0] = 0;
3717                         path->slots[1] += 1;
3718                 } else {
3719                         btrfs_set_header_nritems(right, 0);
3720                         insert_ptr(trans, path, &disk_key,
3721                                    right->start, path->slots[1], 1);
3722                         btrfs_tree_unlock(path->nodes[0]);
3723                         free_extent_buffer(path->nodes[0]);
3724                         path->nodes[0] = right;
3725                         path->slots[0] = 0;
3726                         if (path->slots[1] == 0)
3727                                 fixup_low_keys(path, &disk_key, 1);
3728                 }
3729                 /*
3730                  * We create a new leaf 'right' for the required ins_len and
3731                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3732                  * the content of ins_len to 'right'.
3733                  */
3734                 return ret;
3735         }
3736
3737         copy_for_split(trans, path, l, right, slot, mid, nritems);
3738
3739         if (split == 2) {
3740                 BUG_ON(num_doubles != 0);
3741                 num_doubles++;
3742                 goto again;
3743         }
3744
3745         return 0;
3746
3747 push_for_double:
3748         push_for_double_split(trans, root, path, data_size);
3749         tried_avoid_double = 1;
3750         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3751                 return 0;
3752         goto again;
3753 }
3754
3755 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3756                                          struct btrfs_root *root,
3757                                          struct btrfs_path *path, int ins_len)
3758 {
3759         struct btrfs_key key;
3760         struct extent_buffer *leaf;
3761         struct btrfs_file_extent_item *fi;
3762         u64 extent_len = 0;
3763         u32 item_size;
3764         int ret;
3765
3766         leaf = path->nodes[0];
3767         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3768
3769         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3770                key.type != BTRFS_EXTENT_CSUM_KEY);
3771
3772         if (btrfs_leaf_free_space(leaf) >= ins_len)
3773                 return 0;
3774
3775         item_size = btrfs_item_size(leaf, path->slots[0]);
3776         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3777                 fi = btrfs_item_ptr(leaf, path->slots[0],
3778                                     struct btrfs_file_extent_item);
3779                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3780         }
3781         btrfs_release_path(path);
3782
3783         path->keep_locks = 1;
3784         path->search_for_split = 1;
3785         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3786         path->search_for_split = 0;
3787         if (ret > 0)
3788                 ret = -EAGAIN;
3789         if (ret < 0)
3790                 goto err;
3791
3792         ret = -EAGAIN;
3793         leaf = path->nodes[0];
3794         /* if our item isn't there, return now */
3795         if (item_size != btrfs_item_size(leaf, path->slots[0]))
3796                 goto err;
3797
3798         /* the leaf has  changed, it now has room.  return now */
3799         if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3800                 goto err;
3801
3802         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3803                 fi = btrfs_item_ptr(leaf, path->slots[0],
3804                                     struct btrfs_file_extent_item);
3805                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3806                         goto err;
3807         }
3808
3809         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3810         if (ret)
3811                 goto err;
3812
3813         path->keep_locks = 0;
3814         btrfs_unlock_up_safe(path, 1);
3815         return 0;
3816 err:
3817         path->keep_locks = 0;
3818         return ret;
3819 }
3820
3821 static noinline int split_item(struct btrfs_path *path,
3822                                const struct btrfs_key *new_key,
3823                                unsigned long split_offset)
3824 {
3825         struct extent_buffer *leaf;
3826         int orig_slot, slot;
3827         char *buf;
3828         u32 nritems;
3829         u32 item_size;
3830         u32 orig_offset;
3831         struct btrfs_disk_key disk_key;
3832
3833         leaf = path->nodes[0];
3834         BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
3835
3836         orig_slot = path->slots[0];
3837         orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3838         item_size = btrfs_item_size(leaf, path->slots[0]);
3839
3840         buf = kmalloc(item_size, GFP_NOFS);
3841         if (!buf)
3842                 return -ENOMEM;
3843
3844         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3845                             path->slots[0]), item_size);
3846
3847         slot = path->slots[0] + 1;
3848         nritems = btrfs_header_nritems(leaf);
3849         if (slot != nritems) {
3850                 /* shift the items */
3851                 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
3852         }
3853
3854         btrfs_cpu_key_to_disk(&disk_key, new_key);
3855         btrfs_set_item_key(leaf, &disk_key, slot);
3856
3857         btrfs_set_item_offset(leaf, slot, orig_offset);
3858         btrfs_set_item_size(leaf, slot, item_size - split_offset);
3859
3860         btrfs_set_item_offset(leaf, orig_slot,
3861                                  orig_offset + item_size - split_offset);
3862         btrfs_set_item_size(leaf, orig_slot, split_offset);
3863
3864         btrfs_set_header_nritems(leaf, nritems + 1);
3865
3866         /* write the data for the start of the original item */
3867         write_extent_buffer(leaf, buf,
3868                             btrfs_item_ptr_offset(leaf, path->slots[0]),
3869                             split_offset);
3870
3871         /* write the data for the new item */
3872         write_extent_buffer(leaf, buf + split_offset,
3873                             btrfs_item_ptr_offset(leaf, slot),
3874                             item_size - split_offset);
3875         btrfs_mark_buffer_dirty(leaf);
3876
3877         BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3878         kfree(buf);
3879         return 0;
3880 }
3881
3882 /*
3883  * This function splits a single item into two items,
3884  * giving 'new_key' to the new item and splitting the
3885  * old one at split_offset (from the start of the item).
3886  *
3887  * The path may be released by this operation.  After
3888  * the split, the path is pointing to the old item.  The
3889  * new item is going to be in the same node as the old one.
3890  *
3891  * Note, the item being split must be smaller enough to live alone on
3892  * a tree block with room for one extra struct btrfs_item
3893  *
3894  * This allows us to split the item in place, keeping a lock on the
3895  * leaf the entire time.
3896  */
3897 int btrfs_split_item(struct btrfs_trans_handle *trans,
3898                      struct btrfs_root *root,
3899                      struct btrfs_path *path,
3900                      const struct btrfs_key *new_key,
3901                      unsigned long split_offset)
3902 {
3903         int ret;
3904         ret = setup_leaf_for_split(trans, root, path,
3905                                    sizeof(struct btrfs_item));
3906         if (ret)
3907                 return ret;
3908
3909         ret = split_item(path, new_key, split_offset);
3910         return ret;
3911 }
3912
3913 /*
3914  * make the item pointed to by the path smaller.  new_size indicates
3915  * how small to make it, and from_end tells us if we just chop bytes
3916  * off the end of the item or if we shift the item to chop bytes off
3917  * the front.
3918  */
3919 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3920 {
3921         int slot;
3922         struct extent_buffer *leaf;
3923         u32 nritems;
3924         unsigned int data_end;
3925         unsigned int old_data_start;
3926         unsigned int old_size;
3927         unsigned int size_diff;
3928         int i;
3929         struct btrfs_map_token token;
3930
3931         leaf = path->nodes[0];
3932         slot = path->slots[0];
3933
3934         old_size = btrfs_item_size(leaf, slot);
3935         if (old_size == new_size)
3936                 return;
3937
3938         nritems = btrfs_header_nritems(leaf);
3939         data_end = leaf_data_end(leaf);
3940
3941         old_data_start = btrfs_item_offset(leaf, slot);
3942
3943         size_diff = old_size - new_size;
3944
3945         BUG_ON(slot < 0);
3946         BUG_ON(slot >= nritems);
3947
3948         /*
3949          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3950          */
3951         /* first correct the data pointers */
3952         btrfs_init_map_token(&token, leaf);
3953         for (i = slot; i < nritems; i++) {
3954                 u32 ioff;
3955
3956                 ioff = btrfs_token_item_offset(&token, i);
3957                 btrfs_set_token_item_offset(&token, i, ioff + size_diff);
3958         }
3959
3960         /* shift the data */
3961         if (from_end) {
3962                 memmove_leaf_data(leaf, data_end + size_diff, data_end,
3963                                   old_data_start + new_size - data_end);
3964         } else {
3965                 struct btrfs_disk_key disk_key;
3966                 u64 offset;
3967
3968                 btrfs_item_key(leaf, &disk_key, slot);
3969
3970                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3971                         unsigned long ptr;
3972                         struct btrfs_file_extent_item *fi;
3973
3974                         fi = btrfs_item_ptr(leaf, slot,
3975                                             struct btrfs_file_extent_item);
3976                         fi = (struct btrfs_file_extent_item *)(
3977                              (unsigned long)fi - size_diff);
3978
3979                         if (btrfs_file_extent_type(leaf, fi) ==
3980                             BTRFS_FILE_EXTENT_INLINE) {
3981                                 ptr = btrfs_item_ptr_offset(leaf, slot);
3982                                 memmove_extent_buffer(leaf, ptr,
3983                                       (unsigned long)fi,
3984                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
3985                         }
3986                 }
3987
3988                 memmove_leaf_data(leaf, data_end + size_diff, data_end,
3989                                   old_data_start - data_end);
3990
3991                 offset = btrfs_disk_key_offset(&disk_key);
3992                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3993                 btrfs_set_item_key(leaf, &disk_key, slot);
3994                 if (slot == 0)
3995                         fixup_low_keys(path, &disk_key, 1);
3996         }
3997
3998         btrfs_set_item_size(leaf, slot, new_size);
3999         btrfs_mark_buffer_dirty(leaf);
4000
4001         if (btrfs_leaf_free_space(leaf) < 0) {
4002                 btrfs_print_leaf(leaf);
4003                 BUG();
4004         }
4005 }
4006
4007 /*
4008  * make the item pointed to by the path bigger, data_size is the added size.
4009  */
4010 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4011 {
4012         int slot;
4013         struct extent_buffer *leaf;
4014         u32 nritems;
4015         unsigned int data_end;
4016         unsigned int old_data;
4017         unsigned int old_size;
4018         int i;
4019         struct btrfs_map_token token;
4020
4021         leaf = path->nodes[0];
4022
4023         nritems = btrfs_header_nritems(leaf);
4024         data_end = leaf_data_end(leaf);
4025
4026         if (btrfs_leaf_free_space(leaf) < data_size) {
4027                 btrfs_print_leaf(leaf);
4028                 BUG();
4029         }
4030         slot = path->slots[0];
4031         old_data = btrfs_item_data_end(leaf, slot);
4032
4033         BUG_ON(slot < 0);
4034         if (slot >= nritems) {
4035                 btrfs_print_leaf(leaf);
4036                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4037                            slot, nritems);
4038                 BUG();
4039         }
4040
4041         /*
4042          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4043          */
4044         /* first correct the data pointers */
4045         btrfs_init_map_token(&token, leaf);
4046         for (i = slot; i < nritems; i++) {
4047                 u32 ioff;
4048
4049                 ioff = btrfs_token_item_offset(&token, i);
4050                 btrfs_set_token_item_offset(&token, i, ioff - data_size);
4051         }
4052
4053         /* shift the data */
4054         memmove_leaf_data(leaf, data_end - data_size, data_end,
4055                           old_data - data_end);
4056
4057         data_end = old_data;
4058         old_size = btrfs_item_size(leaf, slot);
4059         btrfs_set_item_size(leaf, slot, old_size + data_size);
4060         btrfs_mark_buffer_dirty(leaf);
4061
4062         if (btrfs_leaf_free_space(leaf) < 0) {
4063                 btrfs_print_leaf(leaf);
4064                 BUG();
4065         }
4066 }
4067
4068 /*
4069  * Make space in the node before inserting one or more items.
4070  *
4071  * @root:       root we are inserting items to
4072  * @path:       points to the leaf/slot where we are going to insert new items
4073  * @batch:      information about the batch of items to insert
4074  *
4075  * Main purpose is to save stack depth by doing the bulk of the work in a
4076  * function that doesn't call btrfs_search_slot
4077  */
4078 static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4079                                    const struct btrfs_item_batch *batch)
4080 {
4081         struct btrfs_fs_info *fs_info = root->fs_info;
4082         int i;
4083         u32 nritems;
4084         unsigned int data_end;
4085         struct btrfs_disk_key disk_key;
4086         struct extent_buffer *leaf;
4087         int slot;
4088         struct btrfs_map_token token;
4089         u32 total_size;
4090
4091         /*
4092          * Before anything else, update keys in the parent and other ancestors
4093          * if needed, then release the write locks on them, so that other tasks
4094          * can use them while we modify the leaf.
4095          */
4096         if (path->slots[0] == 0) {
4097                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4098                 fixup_low_keys(path, &disk_key, 1);
4099         }
4100         btrfs_unlock_up_safe(path, 1);
4101
4102         leaf = path->nodes[0];
4103         slot = path->slots[0];
4104
4105         nritems = btrfs_header_nritems(leaf);
4106         data_end = leaf_data_end(leaf);
4107         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4108
4109         if (btrfs_leaf_free_space(leaf) < total_size) {
4110                 btrfs_print_leaf(leaf);
4111                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4112                            total_size, btrfs_leaf_free_space(leaf));
4113                 BUG();
4114         }
4115
4116         btrfs_init_map_token(&token, leaf);
4117         if (slot != nritems) {
4118                 unsigned int old_data = btrfs_item_data_end(leaf, slot);
4119
4120                 if (old_data < data_end) {
4121                         btrfs_print_leaf(leaf);
4122                         btrfs_crit(fs_info,
4123                 "item at slot %d with data offset %u beyond data end of leaf %u",
4124                                    slot, old_data, data_end);
4125                         BUG();
4126                 }
4127                 /*
4128                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4129                  */
4130                 /* first correct the data pointers */
4131                 for (i = slot; i < nritems; i++) {
4132                         u32 ioff;
4133
4134                         ioff = btrfs_token_item_offset(&token, i);
4135                         btrfs_set_token_item_offset(&token, i,
4136                                                        ioff - batch->total_data_size);
4137                 }
4138                 /* shift the items */
4139                 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4140
4141                 /* shift the data */
4142                 memmove_leaf_data(leaf, data_end - batch->total_data_size,
4143                                   data_end, old_data - data_end);
4144                 data_end = old_data;
4145         }
4146
4147         /* setup the item for the new data */
4148         for (i = 0; i < batch->nr; i++) {
4149                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4150                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4151                 data_end -= batch->data_sizes[i];
4152                 btrfs_set_token_item_offset(&token, slot + i, data_end);
4153                 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
4154         }
4155
4156         btrfs_set_header_nritems(leaf, nritems + batch->nr);
4157         btrfs_mark_buffer_dirty(leaf);
4158
4159         if (btrfs_leaf_free_space(leaf) < 0) {
4160                 btrfs_print_leaf(leaf);
4161                 BUG();
4162         }
4163 }
4164
4165 /*
4166  * Insert a new item into a leaf.
4167  *
4168  * @root:      The root of the btree.
4169  * @path:      A path pointing to the target leaf and slot.
4170  * @key:       The key of the new item.
4171  * @data_size: The size of the data associated with the new key.
4172  */
4173 void btrfs_setup_item_for_insert(struct btrfs_root *root,
4174                                  struct btrfs_path *path,
4175                                  const struct btrfs_key *key,
4176                                  u32 data_size)
4177 {
4178         struct btrfs_item_batch batch;
4179
4180         batch.keys = key;
4181         batch.data_sizes = &data_size;
4182         batch.total_data_size = data_size;
4183         batch.nr = 1;
4184
4185         setup_items_for_insert(root, path, &batch);
4186 }
4187
4188 /*
4189  * Given a key and some data, insert items into the tree.
4190  * This does all the path init required, making room in the tree if needed.
4191  */
4192 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4193                             struct btrfs_root *root,
4194                             struct btrfs_path *path,
4195                             const struct btrfs_item_batch *batch)
4196 {
4197         int ret = 0;
4198         int slot;
4199         u32 total_size;
4200
4201         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4202         ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4203         if (ret == 0)
4204                 return -EEXIST;
4205         if (ret < 0)
4206                 return ret;
4207
4208         slot = path->slots[0];
4209         BUG_ON(slot < 0);
4210
4211         setup_items_for_insert(root, path, batch);
4212         return 0;
4213 }
4214
4215 /*
4216  * Given a key and some data, insert an item into the tree.
4217  * This does all the path init required, making room in the tree if needed.
4218  */
4219 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4220                       const struct btrfs_key *cpu_key, void *data,
4221                       u32 data_size)
4222 {
4223         int ret = 0;
4224         struct btrfs_path *path;
4225         struct extent_buffer *leaf;
4226         unsigned long ptr;
4227
4228         path = btrfs_alloc_path();
4229         if (!path)
4230                 return -ENOMEM;
4231         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4232         if (!ret) {
4233                 leaf = path->nodes[0];
4234                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4235                 write_extent_buffer(leaf, data, ptr, data_size);
4236                 btrfs_mark_buffer_dirty(leaf);
4237         }
4238         btrfs_free_path(path);
4239         return ret;
4240 }
4241
4242 /*
4243  * This function duplicates an item, giving 'new_key' to the new item.
4244  * It guarantees both items live in the same tree leaf and the new item is
4245  * contiguous with the original item.
4246  *
4247  * This allows us to split a file extent in place, keeping a lock on the leaf
4248  * the entire time.
4249  */
4250 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4251                          struct btrfs_root *root,
4252                          struct btrfs_path *path,
4253                          const struct btrfs_key *new_key)
4254 {
4255         struct extent_buffer *leaf;
4256         int ret;
4257         u32 item_size;
4258
4259         leaf = path->nodes[0];
4260         item_size = btrfs_item_size(leaf, path->slots[0]);
4261         ret = setup_leaf_for_split(trans, root, path,
4262                                    item_size + sizeof(struct btrfs_item));
4263         if (ret)
4264                 return ret;
4265
4266         path->slots[0]++;
4267         btrfs_setup_item_for_insert(root, path, new_key, item_size);
4268         leaf = path->nodes[0];
4269         memcpy_extent_buffer(leaf,
4270                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4271                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4272                              item_size);
4273         return 0;
4274 }
4275
4276 /*
4277  * delete the pointer from a given node.
4278  *
4279  * the tree should have been previously balanced so the deletion does not
4280  * empty a node.
4281  */
4282 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4283                     int level, int slot)
4284 {
4285         struct extent_buffer *parent = path->nodes[level];
4286         u32 nritems;
4287         int ret;
4288
4289         nritems = btrfs_header_nritems(parent);
4290         if (slot != nritems - 1) {
4291                 if (level) {
4292                         ret = btrfs_tree_mod_log_insert_move(parent, slot,
4293                                         slot + 1, nritems - slot - 1);
4294                         BUG_ON(ret < 0);
4295                 }
4296                 memmove_extent_buffer(parent,
4297                               btrfs_node_key_ptr_offset(parent, slot),
4298                               btrfs_node_key_ptr_offset(parent, slot + 1),
4299                               sizeof(struct btrfs_key_ptr) *
4300                               (nritems - slot - 1));
4301         } else if (level) {
4302                 ret = btrfs_tree_mod_log_insert_key(parent, slot,
4303                                                     BTRFS_MOD_LOG_KEY_REMOVE);
4304                 BUG_ON(ret < 0);
4305         }
4306
4307         nritems--;
4308         btrfs_set_header_nritems(parent, nritems);
4309         if (nritems == 0 && parent == root->node) {
4310                 BUG_ON(btrfs_header_level(root->node) != 1);
4311                 /* just turn the root into a leaf and break */
4312                 btrfs_set_header_level(root->node, 0);
4313         } else if (slot == 0) {
4314                 struct btrfs_disk_key disk_key;
4315
4316                 btrfs_node_key(parent, &disk_key, 0);
4317                 fixup_low_keys(path, &disk_key, level + 1);
4318         }
4319         btrfs_mark_buffer_dirty(parent);
4320 }
4321
4322 /*
4323  * a helper function to delete the leaf pointed to by path->slots[1] and
4324  * path->nodes[1].
4325  *
4326  * This deletes the pointer in path->nodes[1] and frees the leaf
4327  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4328  *
4329  * The path must have already been setup for deleting the leaf, including
4330  * all the proper balancing.  path->nodes[1] must be locked.
4331  */
4332 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4333                                     struct btrfs_root *root,
4334                                     struct btrfs_path *path,
4335                                     struct extent_buffer *leaf)
4336 {
4337         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4338         del_ptr(root, path, 1, path->slots[1]);
4339
4340         /*
4341          * btrfs_free_extent is expensive, we want to make sure we
4342          * aren't holding any locks when we call it
4343          */
4344         btrfs_unlock_up_safe(path, 0);
4345
4346         root_sub_used(root, leaf->len);
4347
4348         atomic_inc(&leaf->refs);
4349         btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4350         free_extent_buffer_stale(leaf);
4351 }
4352 /*
4353  * delete the item at the leaf level in path.  If that empties
4354  * the leaf, remove it from the tree
4355  */
4356 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4357                     struct btrfs_path *path, int slot, int nr)
4358 {
4359         struct btrfs_fs_info *fs_info = root->fs_info;
4360         struct extent_buffer *leaf;
4361         int ret = 0;
4362         int wret;
4363         u32 nritems;
4364
4365         leaf = path->nodes[0];
4366         nritems = btrfs_header_nritems(leaf);
4367
4368         if (slot + nr != nritems) {
4369                 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4370                 const int data_end = leaf_data_end(leaf);
4371                 struct btrfs_map_token token;
4372                 u32 dsize = 0;
4373                 int i;
4374
4375                 for (i = 0; i < nr; i++)
4376                         dsize += btrfs_item_size(leaf, slot + i);
4377
4378                 memmove_leaf_data(leaf, data_end + dsize, data_end,
4379                                   last_off - data_end);
4380
4381                 btrfs_init_map_token(&token, leaf);
4382                 for (i = slot + nr; i < nritems; i++) {
4383                         u32 ioff;
4384
4385                         ioff = btrfs_token_item_offset(&token, i);
4386                         btrfs_set_token_item_offset(&token, i, ioff + dsize);
4387                 }
4388
4389                 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4390         }
4391         btrfs_set_header_nritems(leaf, nritems - nr);
4392         nritems -= nr;
4393
4394         /* delete the leaf if we've emptied it */
4395         if (nritems == 0) {
4396                 if (leaf == root->node) {
4397                         btrfs_set_header_level(leaf, 0);
4398                 } else {
4399                         btrfs_clear_buffer_dirty(trans, leaf);
4400                         btrfs_del_leaf(trans, root, path, leaf);
4401                 }
4402         } else {
4403                 int used = leaf_space_used(leaf, 0, nritems);
4404                 if (slot == 0) {
4405                         struct btrfs_disk_key disk_key;
4406
4407                         btrfs_item_key(leaf, &disk_key, 0);
4408                         fixup_low_keys(path, &disk_key, 1);
4409                 }
4410
4411                 /*
4412                  * Try to delete the leaf if it is mostly empty. We do this by
4413                  * trying to move all its items into its left and right neighbours.
4414                  * If we can't move all the items, then we don't delete it - it's
4415                  * not ideal, but future insertions might fill the leaf with more
4416                  * items, or items from other leaves might be moved later into our
4417                  * leaf due to deletions on those leaves.
4418                  */
4419                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4420                         u32 min_push_space;
4421
4422                         /* push_leaf_left fixes the path.
4423                          * make sure the path still points to our leaf
4424                          * for possible call to del_ptr below
4425                          */
4426                         slot = path->slots[1];
4427                         atomic_inc(&leaf->refs);
4428                         /*
4429                          * We want to be able to at least push one item to the
4430                          * left neighbour leaf, and that's the first item.
4431                          */
4432                         min_push_space = sizeof(struct btrfs_item) +
4433                                 btrfs_item_size(leaf, 0);
4434                         wret = push_leaf_left(trans, root, path, 0,
4435                                               min_push_space, 1, (u32)-1);
4436                         if (wret < 0 && wret != -ENOSPC)
4437                                 ret = wret;
4438
4439                         if (path->nodes[0] == leaf &&
4440                             btrfs_header_nritems(leaf)) {
4441                                 /*
4442                                  * If we were not able to push all items from our
4443                                  * leaf to its left neighbour, then attempt to
4444                                  * either push all the remaining items to the
4445                                  * right neighbour or none. There's no advantage
4446                                  * in pushing only some items, instead of all, as
4447                                  * it's pointless to end up with a leaf having
4448                                  * too few items while the neighbours can be full
4449                                  * or nearly full.
4450                                  */
4451                                 nritems = btrfs_header_nritems(leaf);
4452                                 min_push_space = leaf_space_used(leaf, 0, nritems);
4453                                 wret = push_leaf_right(trans, root, path, 0,
4454                                                        min_push_space, 1, 0);
4455                                 if (wret < 0 && wret != -ENOSPC)
4456                                         ret = wret;
4457                         }
4458
4459                         if (btrfs_header_nritems(leaf) == 0) {
4460                                 path->slots[1] = slot;
4461                                 btrfs_del_leaf(trans, root, path, leaf);
4462                                 free_extent_buffer(leaf);
4463                                 ret = 0;
4464                         } else {
4465                                 /* if we're still in the path, make sure
4466                                  * we're dirty.  Otherwise, one of the
4467                                  * push_leaf functions must have already
4468                                  * dirtied this buffer
4469                                  */
4470                                 if (path->nodes[0] == leaf)
4471                                         btrfs_mark_buffer_dirty(leaf);
4472                                 free_extent_buffer(leaf);
4473                         }
4474                 } else {
4475                         btrfs_mark_buffer_dirty(leaf);
4476                 }
4477         }
4478         return ret;
4479 }
4480
4481 /*
4482  * search the tree again to find a leaf with lesser keys
4483  * returns 0 if it found something or 1 if there are no lesser leaves.
4484  * returns < 0 on io errors.
4485  *
4486  * This may release the path, and so you may lose any locks held at the
4487  * time you call it.
4488  */
4489 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4490 {
4491         struct btrfs_key key;
4492         struct btrfs_disk_key found_key;
4493         int ret;
4494
4495         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4496
4497         if (key.offset > 0) {
4498                 key.offset--;
4499         } else if (key.type > 0) {
4500                 key.type--;
4501                 key.offset = (u64)-1;
4502         } else if (key.objectid > 0) {
4503                 key.objectid--;
4504                 key.type = (u8)-1;
4505                 key.offset = (u64)-1;
4506         } else {
4507                 return 1;
4508         }
4509
4510         btrfs_release_path(path);
4511         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4512         if (ret < 0)
4513                 return ret;
4514         btrfs_item_key(path->nodes[0], &found_key, 0);
4515         ret = comp_keys(&found_key, &key);
4516         /*
4517          * We might have had an item with the previous key in the tree right
4518          * before we released our path. And after we released our path, that
4519          * item might have been pushed to the first slot (0) of the leaf we
4520          * were holding due to a tree balance. Alternatively, an item with the
4521          * previous key can exist as the only element of a leaf (big fat item).
4522          * Therefore account for these 2 cases, so that our callers (like
4523          * btrfs_previous_item) don't miss an existing item with a key matching
4524          * the previous key we computed above.
4525          */
4526         if (ret <= 0)
4527                 return 0;
4528         return 1;
4529 }
4530
4531 /*
4532  * A helper function to walk down the tree starting at min_key, and looking
4533  * for nodes or leaves that are have a minimum transaction id.
4534  * This is used by the btree defrag code, and tree logging
4535  *
4536  * This does not cow, but it does stuff the starting key it finds back
4537  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4538  * key and get a writable path.
4539  *
4540  * This honors path->lowest_level to prevent descent past a given level
4541  * of the tree.
4542  *
4543  * min_trans indicates the oldest transaction that you are interested
4544  * in walking through.  Any nodes or leaves older than min_trans are
4545  * skipped over (without reading them).
4546  *
4547  * returns zero if something useful was found, < 0 on error and 1 if there
4548  * was nothing in the tree that matched the search criteria.
4549  */
4550 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4551                          struct btrfs_path *path,
4552                          u64 min_trans)
4553 {
4554         struct extent_buffer *cur;
4555         struct btrfs_key found_key;
4556         int slot;
4557         int sret;
4558         u32 nritems;
4559         int level;
4560         int ret = 1;
4561         int keep_locks = path->keep_locks;
4562
4563         ASSERT(!path->nowait);
4564         path->keep_locks = 1;
4565 again:
4566         cur = btrfs_read_lock_root_node(root);
4567         level = btrfs_header_level(cur);
4568         WARN_ON(path->nodes[level]);
4569         path->nodes[level] = cur;
4570         path->locks[level] = BTRFS_READ_LOCK;
4571
4572         if (btrfs_header_generation(cur) < min_trans) {
4573                 ret = 1;
4574                 goto out;
4575         }
4576         while (1) {
4577                 nritems = btrfs_header_nritems(cur);
4578                 level = btrfs_header_level(cur);
4579                 sret = btrfs_bin_search(cur, min_key, &slot);
4580                 if (sret < 0) {
4581                         ret = sret;
4582                         goto out;
4583                 }
4584
4585                 /* at the lowest level, we're done, setup the path and exit */
4586                 if (level == path->lowest_level) {
4587                         if (slot >= nritems)
4588                                 goto find_next_key;
4589                         ret = 0;
4590                         path->slots[level] = slot;
4591                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4592                         goto out;
4593                 }
4594                 if (sret && slot > 0)
4595                         slot--;
4596                 /*
4597                  * check this node pointer against the min_trans parameters.
4598                  * If it is too old, skip to the next one.
4599                  */
4600                 while (slot < nritems) {
4601                         u64 gen;
4602
4603                         gen = btrfs_node_ptr_generation(cur, slot);
4604                         if (gen < min_trans) {
4605                                 slot++;
4606                                 continue;
4607                         }
4608                         break;
4609                 }
4610 find_next_key:
4611                 /*
4612                  * we didn't find a candidate key in this node, walk forward
4613                  * and find another one
4614                  */
4615                 if (slot >= nritems) {
4616                         path->slots[level] = slot;
4617                         sret = btrfs_find_next_key(root, path, min_key, level,
4618                                                   min_trans);
4619                         if (sret == 0) {
4620                                 btrfs_release_path(path);
4621                                 goto again;
4622                         } else {
4623                                 goto out;
4624                         }
4625                 }
4626                 /* save our key for returning back */
4627                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4628                 path->slots[level] = slot;
4629                 if (level == path->lowest_level) {
4630                         ret = 0;
4631                         goto out;
4632                 }
4633                 cur = btrfs_read_node_slot(cur, slot);
4634                 if (IS_ERR(cur)) {
4635                         ret = PTR_ERR(cur);
4636                         goto out;
4637                 }
4638
4639                 btrfs_tree_read_lock(cur);
4640
4641                 path->locks[level - 1] = BTRFS_READ_LOCK;
4642                 path->nodes[level - 1] = cur;
4643                 unlock_up(path, level, 1, 0, NULL);
4644         }
4645 out:
4646         path->keep_locks = keep_locks;
4647         if (ret == 0) {
4648                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
4649                 memcpy(min_key, &found_key, sizeof(found_key));
4650         }
4651         return ret;
4652 }
4653
4654 /*
4655  * this is similar to btrfs_next_leaf, but does not try to preserve
4656  * and fixup the path.  It looks for and returns the next key in the
4657  * tree based on the current path and the min_trans parameters.
4658  *
4659  * 0 is returned if another key is found, < 0 if there are any errors
4660  * and 1 is returned if there are no higher keys in the tree
4661  *
4662  * path->keep_locks should be set to 1 on the search made before
4663  * calling this function.
4664  */
4665 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4666                         struct btrfs_key *key, int level, u64 min_trans)
4667 {
4668         int slot;
4669         struct extent_buffer *c;
4670
4671         WARN_ON(!path->keep_locks && !path->skip_locking);
4672         while (level < BTRFS_MAX_LEVEL) {
4673                 if (!path->nodes[level])
4674                         return 1;
4675
4676                 slot = path->slots[level] + 1;
4677                 c = path->nodes[level];
4678 next:
4679                 if (slot >= btrfs_header_nritems(c)) {
4680                         int ret;
4681                         int orig_lowest;
4682                         struct btrfs_key cur_key;
4683                         if (level + 1 >= BTRFS_MAX_LEVEL ||
4684                             !path->nodes[level + 1])
4685                                 return 1;
4686
4687                         if (path->locks[level + 1] || path->skip_locking) {
4688                                 level++;
4689                                 continue;
4690                         }
4691
4692                         slot = btrfs_header_nritems(c) - 1;
4693                         if (level == 0)
4694                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
4695                         else
4696                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
4697
4698                         orig_lowest = path->lowest_level;
4699                         btrfs_release_path(path);
4700                         path->lowest_level = level;
4701                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
4702                                                 0, 0);
4703                         path->lowest_level = orig_lowest;
4704                         if (ret < 0)
4705                                 return ret;
4706
4707                         c = path->nodes[level];
4708                         slot = path->slots[level];
4709                         if (ret == 0)
4710                                 slot++;
4711                         goto next;
4712                 }
4713
4714                 if (level == 0)
4715                         btrfs_item_key_to_cpu(c, key, slot);
4716                 else {
4717                         u64 gen = btrfs_node_ptr_generation(c, slot);
4718
4719                         if (gen < min_trans) {
4720                                 slot++;
4721                                 goto next;
4722                         }
4723                         btrfs_node_key_to_cpu(c, key, slot);
4724                 }
4725                 return 0;
4726         }
4727         return 1;
4728 }
4729
4730 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4731                         u64 time_seq)
4732 {
4733         int slot;
4734         int level;
4735         struct extent_buffer *c;
4736         struct extent_buffer *next;
4737         struct btrfs_fs_info *fs_info = root->fs_info;
4738         struct btrfs_key key;
4739         bool need_commit_sem = false;
4740         u32 nritems;
4741         int ret;
4742         int i;
4743
4744         /*
4745          * The nowait semantics are used only for write paths, where we don't
4746          * use the tree mod log and sequence numbers.
4747          */
4748         if (time_seq)
4749                 ASSERT(!path->nowait);
4750
4751         nritems = btrfs_header_nritems(path->nodes[0]);
4752         if (nritems == 0)
4753                 return 1;
4754
4755         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4756 again:
4757         level = 1;
4758         next = NULL;
4759         btrfs_release_path(path);
4760
4761         path->keep_locks = 1;
4762
4763         if (time_seq) {
4764                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4765         } else {
4766                 if (path->need_commit_sem) {
4767                         path->need_commit_sem = 0;
4768                         need_commit_sem = true;
4769                         if (path->nowait) {
4770                                 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4771                                         ret = -EAGAIN;
4772                                         goto done;
4773                                 }
4774                         } else {
4775                                 down_read(&fs_info->commit_root_sem);
4776                         }
4777                 }
4778                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4779         }
4780         path->keep_locks = 0;
4781
4782         if (ret < 0)
4783                 goto done;
4784
4785         nritems = btrfs_header_nritems(path->nodes[0]);
4786         /*
4787          * by releasing the path above we dropped all our locks.  A balance
4788          * could have added more items next to the key that used to be
4789          * at the very end of the block.  So, check again here and
4790          * advance the path if there are now more items available.
4791          */
4792         if (nritems > 0 && path->slots[0] < nritems - 1) {
4793                 if (ret == 0)
4794                         path->slots[0]++;
4795                 ret = 0;
4796                 goto done;
4797         }
4798         /*
4799          * So the above check misses one case:
4800          * - after releasing the path above, someone has removed the item that
4801          *   used to be at the very end of the block, and balance between leafs
4802          *   gets another one with bigger key.offset to replace it.
4803          *
4804          * This one should be returned as well, or we can get leaf corruption
4805          * later(esp. in __btrfs_drop_extents()).
4806          *
4807          * And a bit more explanation about this check,
4808          * with ret > 0, the key isn't found, the path points to the slot
4809          * where it should be inserted, so the path->slots[0] item must be the
4810          * bigger one.
4811          */
4812         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4813                 ret = 0;
4814                 goto done;
4815         }
4816
4817         while (level < BTRFS_MAX_LEVEL) {
4818                 if (!path->nodes[level]) {
4819                         ret = 1;
4820                         goto done;
4821                 }
4822
4823                 slot = path->slots[level] + 1;
4824                 c = path->nodes[level];
4825                 if (slot >= btrfs_header_nritems(c)) {
4826                         level++;
4827                         if (level == BTRFS_MAX_LEVEL) {
4828                                 ret = 1;
4829                                 goto done;
4830                         }
4831                         continue;
4832                 }
4833
4834
4835                 /*
4836                  * Our current level is where we're going to start from, and to
4837                  * make sure lockdep doesn't complain we need to drop our locks
4838                  * and nodes from 0 to our current level.
4839                  */
4840                 for (i = 0; i < level; i++) {
4841                         if (path->locks[level]) {
4842                                 btrfs_tree_read_unlock(path->nodes[i]);
4843                                 path->locks[i] = 0;
4844                         }
4845                         free_extent_buffer(path->nodes[i]);
4846                         path->nodes[i] = NULL;
4847                 }
4848
4849                 next = c;
4850                 ret = read_block_for_search(root, path, &next, level,
4851                                             slot, &key);
4852                 if (ret == -EAGAIN && !path->nowait)
4853                         goto again;
4854
4855                 if (ret < 0) {
4856                         btrfs_release_path(path);
4857                         goto done;
4858                 }
4859
4860                 if (!path->skip_locking) {
4861                         ret = btrfs_try_tree_read_lock(next);
4862                         if (!ret && path->nowait) {
4863                                 ret = -EAGAIN;
4864                                 goto done;
4865                         }
4866                         if (!ret && time_seq) {
4867                                 /*
4868                                  * If we don't get the lock, we may be racing
4869                                  * with push_leaf_left, holding that lock while
4870                                  * itself waiting for the leaf we've currently
4871                                  * locked. To solve this situation, we give up
4872                                  * on our lock and cycle.
4873                                  */
4874                                 free_extent_buffer(next);
4875                                 btrfs_release_path(path);
4876                                 cond_resched();
4877                                 goto again;
4878                         }
4879                         if (!ret)
4880                                 btrfs_tree_read_lock(next);
4881                 }
4882                 break;
4883         }
4884         path->slots[level] = slot;
4885         while (1) {
4886                 level--;
4887                 path->nodes[level] = next;
4888                 path->slots[level] = 0;
4889                 if (!path->skip_locking)
4890                         path->locks[level] = BTRFS_READ_LOCK;
4891                 if (!level)
4892                         break;
4893
4894                 ret = read_block_for_search(root, path, &next, level,
4895                                             0, &key);
4896                 if (ret == -EAGAIN && !path->nowait)
4897                         goto again;
4898
4899                 if (ret < 0) {
4900                         btrfs_release_path(path);
4901                         goto done;
4902                 }
4903
4904                 if (!path->skip_locking) {
4905                         if (path->nowait) {
4906                                 if (!btrfs_try_tree_read_lock(next)) {
4907                                         ret = -EAGAIN;
4908                                         goto done;
4909                                 }
4910                         } else {
4911                                 btrfs_tree_read_lock(next);
4912                         }
4913                 }
4914         }
4915         ret = 0;
4916 done:
4917         unlock_up(path, 0, 1, 0, NULL);
4918         if (need_commit_sem) {
4919                 int ret2;
4920
4921                 path->need_commit_sem = 1;
4922                 ret2 = finish_need_commit_sem_search(path);
4923                 up_read(&fs_info->commit_root_sem);
4924                 if (ret2)
4925                         ret = ret2;
4926         }
4927
4928         return ret;
4929 }
4930
4931 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4932 {
4933         path->slots[0]++;
4934         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4935                 return btrfs_next_old_leaf(root, path, time_seq);
4936         return 0;
4937 }
4938
4939 /*
4940  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4941  * searching until it gets past min_objectid or finds an item of 'type'
4942  *
4943  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4944  */
4945 int btrfs_previous_item(struct btrfs_root *root,
4946                         struct btrfs_path *path, u64 min_objectid,
4947                         int type)
4948 {
4949         struct btrfs_key found_key;
4950         struct extent_buffer *leaf;
4951         u32 nritems;
4952         int ret;
4953
4954         while (1) {
4955                 if (path->slots[0] == 0) {
4956                         ret = btrfs_prev_leaf(root, path);
4957                         if (ret != 0)
4958                                 return ret;
4959                 } else {
4960                         path->slots[0]--;
4961                 }
4962                 leaf = path->nodes[0];
4963                 nritems = btrfs_header_nritems(leaf);
4964                 if (nritems == 0)
4965                         return 1;
4966                 if (path->slots[0] == nritems)
4967                         path->slots[0]--;
4968
4969                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4970                 if (found_key.objectid < min_objectid)
4971                         break;
4972                 if (found_key.type == type)
4973                         return 0;
4974                 if (found_key.objectid == min_objectid &&
4975                     found_key.type < type)
4976                         break;
4977         }
4978         return 1;
4979 }
4980
4981 /*
4982  * search in extent tree to find a previous Metadata/Data extent item with
4983  * min objecitd.
4984  *
4985  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4986  */
4987 int btrfs_previous_extent_item(struct btrfs_root *root,
4988                         struct btrfs_path *path, u64 min_objectid)
4989 {
4990         struct btrfs_key found_key;
4991         struct extent_buffer *leaf;
4992         u32 nritems;
4993         int ret;
4994
4995         while (1) {
4996                 if (path->slots[0] == 0) {
4997                         ret = btrfs_prev_leaf(root, path);
4998                         if (ret != 0)
4999                                 return ret;
5000                 } else {
5001                         path->slots[0]--;
5002                 }
5003                 leaf = path->nodes[0];
5004                 nritems = btrfs_header_nritems(leaf);
5005                 if (nritems == 0)
5006                         return 1;
5007                 if (path->slots[0] == nritems)
5008                         path->slots[0]--;
5009
5010                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5011                 if (found_key.objectid < min_objectid)
5012                         break;
5013                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5014                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5015                         return 0;
5016                 if (found_key.objectid == min_objectid &&
5017                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5018                         break;
5019         }
5020         return 1;
5021 }
5022
5023 int __init btrfs_ctree_init(void)
5024 {
5025         btrfs_path_cachep = kmem_cache_create("btrfs_path",
5026                         sizeof(struct btrfs_path), 0,
5027                         SLAB_MEM_SPREAD, NULL);
5028         if (!btrfs_path_cachep)
5029                 return -ENOMEM;
5030         return 0;
5031 }
5032
5033 void __cold btrfs_ctree_exit(void)
5034 {
5035         kmem_cache_destroy(btrfs_path_cachep);
5036 }