btrfs: add a btrfs_csum_type_size helper
[linux-block.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include <linux/error-injection.h>
11 #include "messages.h"
12 #include "ctree.h"
13 #include "disk-io.h"
14 #include "transaction.h"
15 #include "print-tree.h"
16 #include "locking.h"
17 #include "volumes.h"
18 #include "qgroup.h"
19 #include "tree-mod-log.h"
20 #include "tree-checker.h"
21 #include "fs.h"
22 #include "accessors.h"
23 #include "extent-tree.h"
24 #include "relocation.h"
25 #include "file-item.h"
26
27 static struct kmem_cache *btrfs_path_cachep;
28
29 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
30                       *root, struct btrfs_path *path, int level);
31 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
32                       const struct btrfs_key *ins_key, struct btrfs_path *path,
33                       int data_size, int extend);
34 static int push_node_left(struct btrfs_trans_handle *trans,
35                           struct extent_buffer *dst,
36                           struct extent_buffer *src, int empty);
37 static int balance_node_right(struct btrfs_trans_handle *trans,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42
/*
 * Checksum algorithm table, indexed by the BTRFS_CSUM_TYPE_* on-disk values.
 *
 * @size:   number of bytes of the digest stored on disk
 * @name:   human readable algorithm name
 * @driver: explicit crypto API driver name; when empty, @name doubles as the
 *          driver name (see btrfs_super_csum_driver())
 */
static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};
54
55 /*
56  * The leaf data grows from end-to-front in the node.  this returns the address
57  * of the start of the last item, which is the stop of the leaf data stack.
58  */
59 static unsigned int leaf_data_end(const struct extent_buffer *leaf)
60 {
61         u32 nr = btrfs_header_nritems(leaf);
62
63         if (nr == 0)
64                 return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
65         return btrfs_item_offset(leaf, nr - 1);
66 }
67
/*
 * Move item data within @leaf with memmove semantics (ranges may overlap).
 *
 * @leaf:       leaf that we're doing a memmove on
 * @dst_offset: item data offset we're moving to
 * @src_offset: item data offset we're moving from
 * @len:        length of the data we're moving
 *
 * Item data offsets are relative to the end of the leaf header, so rebase
 * both of them onto the buffer-absolute position of item 0 before handing
 * off to memmove_extent_buffer().
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	const unsigned long base = btrfs_item_nr_offset(leaf, 0);

	memmove_extent_buffer(leaf, base + dst_offset, base + src_offset, len);
}
89
/*
 * Copy item data from @src into @dst at the given offsets.
 *
 * @dst:        destination leaf that we're copying into
 * @src:        source leaf that we're copying from
 * @dst_offset: item data offset we're copying to
 * @src_offset: item data offset we're copying from
 * @len:        length of the data we're copying
 *
 * Item data offsets are relative to the end of the leaf header; rebase both
 * onto the buffer-absolute position of item 0 of the respective leaf before
 * handing off to copy_extent_buffer().
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	const unsigned long dst_base = btrfs_item_nr_offset(dst, 0);
	const unsigned long src_base = btrfs_item_nr_offset(src, 0);

	copy_extent_buffer(dst, src, dst_base + dst_offset,
			   src_base + src_offset, len);
}
112
113 /*
114  * Move items in a @leaf (using memmove).
115  *
116  * @dst:        destination leaf for the items
117  * @dst_item:   the item nr we're copying into
118  * @src_item:   the item nr we're copying from
119  * @nr_items:   the number of items to copy
120  *
121  * Wrapper around memmove_extent_buffer() that does the math to get the
122  * appropriate offsets into the leaf from the item numbers.
123  */
124 static inline void memmove_leaf_items(const struct extent_buffer *leaf,
125                                       int dst_item, int src_item, int nr_items)
126 {
127         memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
128                               btrfs_item_nr_offset(leaf, src_item),
129                               nr_items * sizeof(struct btrfs_item));
130 }
131
132 /*
133  * Copy items from @src into @dst at the given @offset.
134  *
135  * @dst:        destination leaf for the items
136  * @src:        source leaf for the items
137  * @dst_item:   the item nr we're copying into
138  * @src_item:   the item nr we're copying from
139  * @nr_items:   the number of items to copy
140  *
141  * Wrapper around copy_extent_buffer() that does the math to get the
142  * appropriate offsets into the leaf from the item numbers.
143  */
144 static inline void copy_leaf_items(const struct extent_buffer *dst,
145                                    const struct extent_buffer *src,
146                                    int dst_item, int src_item, int nr_items)
147 {
148         copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
149                               btrfs_item_nr_offset(src, src_item),
150                               nr_items * sizeof(struct btrfs_item));
151 }
152
153 /* This exists for btrfs-progs usages. */
154 u16 btrfs_csum_type_size(u16 type)
155 {
156         return btrfs_csums[type].size;
157 }
158
/* Digest size in bytes for the checksum type recorded in super block @s. */
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	/* The csum type was validated at mount time, no range check needed. */
	return btrfs_csum_type_size(btrfs_super_csum_type(s));
}
167
168 const char *btrfs_super_csum_name(u16 csum_type)
169 {
170         /* csum type is validated at mount time */
171         return btrfs_csums[csum_type].name;
172 }
173
174 /*
175  * Return driver name if defined, otherwise the name that's also a valid driver
176  * name
177  */
178 const char *btrfs_super_csum_driver(u16 csum_type)
179 {
180         /* csum type is validated at mount time */
181         return btrfs_csums[csum_type].driver[0] ?
182                 btrfs_csums[csum_type].driver :
183                 btrfs_csums[csum_type].name;
184 }
185
/* Number of entries in the checksum algorithm table. */
size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
190
191 struct btrfs_path *btrfs_alloc_path(void)
192 {
193         might_sleep();
194
195         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
196 }
197
198 /* this also releases the path */
199 void btrfs_free_path(struct btrfs_path *p)
200 {
201         if (!p)
202                 return;
203         btrfs_release_path(p);
204         kmem_cache_free(btrfs_path_cachep, p);
205 }
206
207 /*
208  * path release drops references on the extent buffers in the path
209  * and it drops any locks held by this path
210  *
211  * It is safe to call this on paths that no locks or extent buffers held.
212  */
213 noinline void btrfs_release_path(struct btrfs_path *p)
214 {
215         int i;
216
217         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
218                 p->slots[i] = 0;
219                 if (!p->nodes[i])
220                         continue;
221                 if (p->locks[i]) {
222                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
223                         p->locks[i] = 0;
224                 }
225                 free_extent_buffer(p->nodes[i]);
226                 p->nodes[i] = NULL;
227         }
228 }
229
230 /*
231  * We want the transaction abort to print stack trace only for errors where the
232  * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
233  * caused by external factors.
234  */
235 bool __cold abort_should_print_stack(int errno)
236 {
237         switch (errno) {
238         case -EIO:
239         case -EROFS:
240         case -ENOMEM:
241                 return false;
242         }
243         return true;
244 }
245
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* Got a reference before the buffer could be freed. */
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* Lost the race with a COW; wait for the grace period and retry. */
		synchronize_rcu();
	}
	return eb;
}
279
/*
 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 * just get put onto a simple dirty list.  Transaction walks this list to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* Unlocked fast path: already dirty, or dirty tracking disabled. */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* Re-check under the lock; only the winner of the race moves the root. */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
305
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 *
 * @trans:             the running transaction
 * @root:              root of the tree being copied
 * @buf:               the root node to copy
 * @cow_ret:           on success, set to the new root node (locked, dirty)
 * @new_root_objectid: objectid that will own the copy
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* Shareable roots may only be COWed within their running transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	/* Grab the first key to seed the new block, leaves and nodes differ. */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Duplicate the node, then fix up the copy's header fields. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Add references for everything the copy points to; reloc trees use
	 * full backrefs. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
368
369 /*
370  * check if the tree block can be shared by multiple trees
371  */
372 int btrfs_block_can_be_shared(struct btrfs_root *root,
373                               struct extent_buffer *buf)
374 {
375         /*
376          * Tree blocks not in shareable trees and tree roots are never shared.
377          * If a block was allocated after the last snapshot and the block was
378          * not allocated by tree relocation, we know the block is not shared.
379          */
380         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
381             buf != root->node && buf != root->commit_root &&
382             (btrfs_header_generation(buf) <=
383              btrfs_root_last_snapshot(&root->root_item) ||
384              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
385                 return 1;
386
387         return 0;
388 }
389
/*
 * Update extent backrefs when COWing a tree block.
 *
 * @trans:    the running transaction
 * @root:     root the block belongs to
 * @buf:      the block being COWed
 * @cow:      the freshly made copy of @buf
 * @last_ref: set to 1 when @buf lost its last reference and may be freed
 *
 * Returns 0 on success or a negative errno on failure.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Possibly shared: ask the extent tree for the real refcount. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* A live block with zero refs means extent tree corruption. */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shared, only our tree references it. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Reloc-owned blocks must already carry the full backref flag. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Convert the old block to full backrefs. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				/* Swap the refs: drop normal on buf, full on cow. */
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			/* Add a ref for the new copy, kind matching the tree. */
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			/* Persist the full backref flag on the old extent. */
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		/* Sole reference: move the refs from buf to cow. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}
495
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success with *cow_ret set to the new block, or a negative
 * errno (the transaction is aborted on failure after allocation).
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/* Caller wants the result in place of buf, so unlock buf at the end. */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	/* Shareable roots may only be COWed within their running transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block seeds the allocation, leaves/nodes differ. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Duplicate the block, then fix up the copy's header fields. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* Move/convert extent backrefs from buf to cow. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root node: swing root->node over to the copy. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		/* Extra ref for root->node, logged before the pointer swap. */
		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Interior block: redirect the parent's pointer to the copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
626
/*
 * Decide whether @buf must be COWed before it is modified in @trans.
 * Returns 1 when a COW is required and 0 when the block may be modified
 * in place.
 */
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
656
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 *
 * @trans:       the running transaction
 * @root:        root that @buf belongs to
 * @buf:         the block to COW
 * @parent:      parent node of @buf, or NULL when @buf is the root node
 * @parent_slot: slot in @parent pointing at @buf
 * @cow_ret:     on success, the block to use (may be @buf when no COW needed)
 * @nest:        lock nesting class for the new copy
 *
 * Returns 0 on success or a negative errno.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	/* Sanity: the handle must belong to the currently running transaction. */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		/* Already COWed in this transaction, reuse the block as is. */
		*cow_ret = buf;
		return 0;
	}

	/* Hint the allocator towards the 1G-aligned area containing buf. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also We don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
707
708 /*
709  * helper function for defrag to decide if two blocks pointed to by a
710  * node are actually close by
711  */
712 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
713 {
714         if (blocknr < other && other - (blocknr + blocksize) < 32768)
715                 return 1;
716         if (blocknr > other && blocknr - (other + blocksize) < 32768)
717                 return 1;
718         return 0;
719 }
720
721 #ifdef __LITTLE_ENDIAN
722
/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	/* On LE the on-disk key layout matches the CPU key, cast directly. */
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}
734
735 #else
736
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	/* Big-endian host: convert the on-disk key to CPU byte order first. */
	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
749 #endif
750
751 /*
752  * same as comp_keys only with two btrfs_key's
753  */
754 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
755 {
756         if (k1->objectid > k2->objectid)
757                 return 1;
758         if (k1->objectid < k2->objectid)
759                 return -1;
760         if (k1->type > k2->type)
761                 return 1;
762         if (k1->type < k2->type)
763                 return -1;
764         if (k1->offset > k2->offset)
765                 return 1;
766         if (k1->offset < k2->offset)
767                 return -1;
768         return 0;
769 }
770
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @trans:      transaction handle, must be the currently running transaction
 * @root:       tree root the node belongs to
 * @parent:     node whose children may get relocated
 * @start_slot: first slot in @parent to look at
 * @last_ret:   in/out block number hint where the free space search starts;
 *              updated to the start of the last block that was COWed
 * @progress:   keys smaller than this are skipped, lets defrag resume where
 *              a previous run left off
 *
 * Returns 0 on success, a negative errno on failure.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	/* Defrag must run inside the currently running transaction. */
	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* Nothing to reorder with fewer than two children. */
	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		/* Skip keys a previous defrag pass already handled. */
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/*
		 * Only relocate a block when it is far from both neighbors,
		 * otherwise disk order is already close to key order.
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		/* COW forces the reallocation; the hint steers the new location. */
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
855
856 /*
857  * Search for a key in the given extent_buffer.
858  *
859  * The lower boundary for the search is specified by the slot number @first_slot.
860  * Use a value of 0 to search over the whole extent buffer. Works for both
861  * leaves and nodes.
862  *
863  * The slot in the extent buffer is returned via @slot. If the key exists in the
864  * extent buffer, then @slot will point to the slot where the key is, otherwise
865  * it points to the slot where you would insert the key.
866  *
867  * Slot may point to the total number of items (i.e. one position beyond the last
868  * key) if the key is bigger than the last key in the extent buffer.
869  */
870 int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
871                      const struct btrfs_key *key, int *slot)
872 {
873         unsigned long p;
874         int item_size;
875         /*
876          * Use unsigned types for the low and high slots, so that we get a more
877          * efficient division in the search loop below.
878          */
879         u32 low = first_slot;
880         u32 high = btrfs_header_nritems(eb);
881         int ret;
882         const int key_size = sizeof(struct btrfs_disk_key);
883
884         if (unlikely(low > high)) {
885                 btrfs_err(eb->fs_info,
886                  "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
887                           __func__, low, high, eb->start,
888                           btrfs_header_owner(eb), btrfs_header_level(eb));
889                 return -EINVAL;
890         }
891
892         if (btrfs_header_level(eb) == 0) {
893                 p = offsetof(struct btrfs_leaf, items);
894                 item_size = sizeof(struct btrfs_item);
895         } else {
896                 p = offsetof(struct btrfs_node, ptrs);
897                 item_size = sizeof(struct btrfs_key_ptr);
898         }
899
900         while (low < high) {
901                 unsigned long oip;
902                 unsigned long offset;
903                 struct btrfs_disk_key *tmp;
904                 struct btrfs_disk_key unaligned;
905                 int mid;
906
907                 mid = (low + high) / 2;
908                 offset = p + mid * item_size;
909                 oip = offset_in_page(offset);
910
911                 if (oip + key_size <= PAGE_SIZE) {
912                         const unsigned long idx = get_eb_page_index(offset);
913                         char *kaddr = page_address(eb->pages[idx]);
914
915                         oip = get_eb_offset_in_page(eb, offset);
916                         tmp = (struct btrfs_disk_key *)(kaddr + oip);
917                 } else {
918                         read_extent_buffer(eb, &unaligned, offset, key_size);
919                         tmp = &unaligned;
920                 }
921
922                 ret = comp_keys(tmp, key);
923
924                 if (ret < 0)
925                         low = mid + 1;
926                 else if (ret > 0)
927                         high = mid;
928                 else {
929                         *slot = mid;
930                         return 0;
931                 }
932         }
933         *slot = low;
934         return 1;
935 }
936
937 static void root_add_used(struct btrfs_root *root, u32 size)
938 {
939         spin_lock(&root->accounting_lock);
940         btrfs_set_root_used(&root->root_item,
941                             btrfs_root_used(&root->root_item) + size);
942         spin_unlock(&root->accounting_lock);
943 }
944
945 static void root_sub_used(struct btrfs_root *root, u32 size)
946 {
947         spin_lock(&root->accounting_lock);
948         btrfs_set_root_used(&root->root_item,
949                             btrfs_root_used(&root->root_item) - size);
950         spin_unlock(&root->accounting_lock);
951 }
952
953 /* given a node and slot number, this reads the blocks it points to.  The
954  * extent buffer is returned with a reference taken (but unlocked).
955  */
956 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
957                                            int slot)
958 {
959         int level = btrfs_header_level(parent);
960         struct btrfs_tree_parent_check check = { 0 };
961         struct extent_buffer *eb;
962
963         if (slot < 0 || slot >= btrfs_header_nritems(parent))
964                 return ERR_PTR(-ENOENT);
965
966         ASSERT(level);
967
968         check.level = level - 1;
969         check.transid = btrfs_node_ptr_generation(parent, slot);
970         check.owner_root = btrfs_header_owner(parent);
971         check.has_first_key = true;
972         btrfs_node_key_to_cpu(parent, &check.first_key, slot);
973
974         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
975                              &check);
976         if (IS_ERR(eb))
977                 return eb;
978         if (!extent_buffer_uptodate(eb)) {
979                 free_extent_buffer(eb);
980                 return ERR_PTR(-EIO);
981         }
982
983         return eb;
984 }
985
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * Expects path->nodes[level] to be write locked and already COWed in the
 * current transaction (both conditions are WARNed on below).  On return the
 * path may have been rewritten to point into the left sibling if items were
 * shifted there.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/*
		 * mid is still pinned by the old root-pointer reference that
		 * free_extent_buffer_stale() drops below, so reading mid->len
		 * here is safe.
		 */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* mid is more than a quarter full, no balancing needed. */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* Read, lock and COW the left sibling if there is one. */
	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto enospc;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* Same for the right sibling. */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto enospc;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* The right node is now empty, remove it. */
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* Items moved, refresh right's first key in the parent. */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid was drained into its siblings, remove it. */
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* The slot we were tracking moved into the left sibling. */
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* Historical label name, this is the common exit for all errors. */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
1220
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 when items were pushed to a sibling and the path was updated to
 * follow them, 1 when nothing could be pushed (the caller has to split the
 * node instead), or a negative errno on failure.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* No parent means mid is the root; nothing to push to. */
	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* Left sibling is full too, cannot push there. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* Items moved left, refresh mid's key in the parent. */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* Our slot moved into the left sibling. */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* Right sibling is full too, cannot push there. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* Items moved right, refresh right's key in the parent. */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* Our slot moved into the right sibling. */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
1361
1362 /*
1363  * readahead one full node of leaves, finding things that are close
1364  * to the block in 'slot', and triggering ra on them.
1365  */
1366 static void reada_for_search(struct btrfs_fs_info *fs_info,
1367                              struct btrfs_path *path,
1368                              int level, int slot, u64 objectid)
1369 {
1370         struct extent_buffer *node;
1371         struct btrfs_disk_key disk_key;
1372         u32 nritems;
1373         u64 search;
1374         u64 target;
1375         u64 nread = 0;
1376         u64 nread_max;
1377         u32 nr;
1378         u32 blocksize;
1379         u32 nscan = 0;
1380
1381         if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1382                 return;
1383
1384         if (!path->nodes[level])
1385                 return;
1386
1387         node = path->nodes[level];
1388
1389         /*
1390          * Since the time between visiting leaves is much shorter than the time
1391          * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1392          * much IO at once (possibly random).
1393          */
1394         if (path->reada == READA_FORWARD_ALWAYS) {
1395                 if (level > 1)
1396                         nread_max = node->fs_info->nodesize;
1397                 else
1398                         nread_max = SZ_128K;
1399         } else {
1400                 nread_max = SZ_64K;
1401         }
1402
1403         search = btrfs_node_blockptr(node, slot);
1404         blocksize = fs_info->nodesize;
1405         if (path->reada != READA_FORWARD_ALWAYS) {
1406                 struct extent_buffer *eb;
1407
1408                 eb = find_extent_buffer(fs_info, search);
1409                 if (eb) {
1410                         free_extent_buffer(eb);
1411                         return;
1412                 }
1413         }
1414
1415         target = search;
1416
1417         nritems = btrfs_header_nritems(node);
1418         nr = slot;
1419
1420         while (1) {
1421                 if (path->reada == READA_BACK) {
1422                         if (nr == 0)
1423                                 break;
1424                         nr--;
1425                 } else if (path->reada == READA_FORWARD ||
1426                            path->reada == READA_FORWARD_ALWAYS) {
1427                         nr++;
1428                         if (nr >= nritems)
1429                                 break;
1430                 }
1431                 if (path->reada == READA_BACK && objectid) {
1432                         btrfs_node_key(node, &disk_key, nr);
1433                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1434                                 break;
1435                 }
1436                 search = btrfs_node_blockptr(node, nr);
1437                 if (path->reada == READA_FORWARD_ALWAYS ||
1438                     (search <= target && target - search <= 65536) ||
1439                     (search > target && search - target <= 65536)) {
1440                         btrfs_readahead_node_child(node, nr);
1441                         nread += blocksize;
1442                 }
1443                 nscan++;
1444                 if (nread > nread_max || nscan > 32)
1445                         break;
1446         }
1447 }
1448
1449 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1450 {
1451         struct extent_buffer *parent;
1452         int slot;
1453         int nritems;
1454
1455         parent = path->nodes[level + 1];
1456         if (!parent)
1457                 return;
1458
1459         nritems = btrfs_header_nritems(parent);
1460         slot = path->slots[level + 1];
1461
1462         if (slot > 0)
1463                 btrfs_readahead_node_child(parent, slot - 1);
1464         if (slot + 1 < nritems)
1465                 btrfs_readahead_node_child(parent, slot + 1);
1466 }
1467
1468
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	/*
	 * The slot-0/keep_locks checks only matter until the first level we
	 * actually unlock; everything above that is unlocked unconditionally.
	 */
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			/* Slot 0: a key update could propagate into this parent. */
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				/*
				 * Last slot: keep the lock so the caller can
				 * advance to the next slot in the parent.
				 */
				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			/* Lower the write-lock watermark past what we released. */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
1525
/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 * Returns 0 on success, with *eb_ret pointing to an uptodate extent buffer.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	/* True if we hold locks on any ancestor of the target block. */
	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	/* The parent's key pointer gives the expected first key of the child. */
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* A blocking read would be needed; nowait callers must bail. */
		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		/*
		 * We dropped the locks on the upper nodes above, so the caller
		 * has to restart the whole search.
		 */
		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}
1650
1651 /*
1652  * helper function for btrfs_search_slot.  This does all of the checks
1653  * for node-level blocks and does any balancing required based on
1654  * the ins_len.
1655  *
1656  * If no extra work was required, zero is returned.  If we had to
1657  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1658  * start over
1659  */
1660 static int
1661 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1662                        struct btrfs_root *root, struct btrfs_path *p,
1663                        struct extent_buffer *b, int level, int ins_len,
1664                        int *write_lock_level)
1665 {
1666         struct btrfs_fs_info *fs_info = root->fs_info;
1667         int ret = 0;
1668
1669         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1670             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1671
1672                 if (*write_lock_level < level + 1) {
1673                         *write_lock_level = level + 1;
1674                         btrfs_release_path(p);
1675                         return -EAGAIN;
1676                 }
1677
1678                 reada_for_balance(p, level);
1679                 ret = split_node(trans, root, p, level);
1680
1681                 b = p->nodes[level];
1682         } else if (ins_len < 0 && btrfs_header_nritems(b) <
1683                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1684
1685                 if (*write_lock_level < level + 1) {
1686                         *write_lock_level = level + 1;
1687                         btrfs_release_path(p);
1688                         return -EAGAIN;
1689                 }
1690
1691                 reada_for_balance(p, level);
1692                 ret = balance_level(trans, root, p, level);
1693                 if (ret)
1694                         return ret;
1695
1696                 b = p->nodes[level];
1697                 if (!b) {
1698                         btrfs_release_path(p);
1699                         return -EAGAIN;
1700                 }
1701                 BUG_ON(btrfs_header_nritems(b) == 1);
1702         }
1703         return ret;
1704 }
1705
1706 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1707                 u64 iobjectid, u64 ioff, u8 key_type,
1708                 struct btrfs_key *found_key)
1709 {
1710         int ret;
1711         struct btrfs_key key;
1712         struct extent_buffer *eb;
1713
1714         ASSERT(path);
1715         ASSERT(found_key);
1716
1717         key.type = key_type;
1718         key.objectid = iobjectid;
1719         key.offset = ioff;
1720
1721         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1722         if (ret < 0)
1723                 return ret;
1724
1725         eb = path->nodes[0];
1726         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1727                 ret = btrfs_next_leaf(fs_root, path);
1728                 if (ret)
1729                         return ret;
1730                 eb = path->nodes[0];
1731         }
1732
1733         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1734         if (found_key->type != key.type ||
1735                         found_key->objectid != key.objectid)
1736                 return 1;
1737
1738         return 0;
1739 }
1740
/*
 * Get the root node of @root for a search, locked as required by
 * @write_lock_level, and record it (and its lock, when taken) in the path.
 *
 * Returns the root extent buffer with a reference held, or an ERR_PTR on
 * failure.  Callers are responsible for dropping the reference.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		/* Commit roots are never modified, so no locking is needed. */
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
1822
1823 /*
1824  * Replace the extent buffer at the lowest level of the path with a cloned
1825  * version. The purpose is to be able to use it safely, after releasing the
1826  * commit root semaphore, even if relocation is happening in parallel, the
1827  * transaction used for relocation is committed and the extent buffer is
1828  * reallocated in the next transaction.
1829  *
1830  * This is used in a context where the caller does not prevent transaction
1831  * commits from happening, either by holding a transaction handle or holding
1832  * some lock, while it's doing searches through a commit root.
1833  * At the moment it's only used for send operations.
1834  */
1835 static int finish_need_commit_sem_search(struct btrfs_path *path)
1836 {
1837         const int i = path->lowest_level;
1838         const int slot = path->slots[i];
1839         struct extent_buffer *lowest = path->nodes[i];
1840         struct extent_buffer *clone;
1841
1842         ASSERT(path->need_commit_sem);
1843
1844         if (!lowest)
1845                 return 0;
1846
1847         lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1848
1849         clone = btrfs_clone_extent_buffer(lowest);
1850         if (!clone)
1851                 return -ENOMEM;
1852
1853         btrfs_release_path(path);
1854         path->nodes[i] = clone;
1855         path->slots[i] = slot;
1856
1857         return 0;
1858 }
1859
/*
 * Find the slot for @key in @eb, searching from @search_low_slot onwards.
 * Thin wrapper around btrfs_bin_search() that can skip the search entirely.
 */
static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * Without an exact match at the parent we must do the full binary
	 * search on this node.
	 */
	if (prev_cmp != 0)
		return btrfs_bin_search(eb, search_low_slot, key, slot);

	/*
	 * A previous btrfs_bin_search() on a parent node returned an exact
	 * match (prev_cmp == 0).  Each key pointer (struct btrfs_key_ptr)
	 * holds the lowest key reachable from its subtree, so the target key
	 * must sit at slot 0 of every lower level - no search needed.
	 */
	*slot = 0;
	return 0;
}
1880
/*
 * Search for @key in the leaf at the bottom of @path (path->nodes[0]) and,
 * for insertions (@ins_len > 0), split the leaf first if the new item would
 * not fit.
 *
 * Returns 0 when the key is found, 1 when it is not (path->slots[0] then
 * points at the insertion position), or a negative errno on failure.
 */
static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing the
	 * binary search on the leaf (with search_for_key_slot()), allowing other
	 * tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and it
		 * will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf is
		 * the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is cheap,
			 * taking into account that the first key is very likely
			 * already in a cache line because it immediately follows
			 * the extent buffer's header and we have recently accessed
			 * the header's level field.
			 */
			ret = comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we want
				 * to insert, so we are safe to unlock all upper
				 * nodes and we have to do the binary search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the later does not unlock
				 * nodes with a slot of 0 - we can safely unlock
				 * any node even if its slot is 0 since in this
				 * case the key does not end up at slot 0 of the
				 * leaf and there's no need to split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is >= then the key we want to
				 * insert, so we can skip the binary search as
				 * the target key will be at slot 0.
				 *
				 * We can not unlock upper nodes when the key is
				 * less than the first key, because we will need
				 * to update the key at slot 0 of the parent node
				 * and possibly of other upper nodes too.
				 * If the key matches the first key, then we can
				 * unlock all the upper nodes, using
				 * btrfs_unlock_up_safe() instead of unlock_up()
				 * as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result of
				 * a btrfs_bin_search() call, so there is no need
				 * to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists. In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
		 * item. Only the item size grows, no new btrfs item will be
		 * added. If search_for_extension is not set, ins_len already
		 * accounts the size btrfs_item, deduct it here so leaf space
		 * check will be correct.
		 */
		if (ret == 0 && !path->search_for_extension) {
			ASSERT(ins_len >= sizeof(struct btrfs_item));
			ins_len -= sizeof(struct btrfs_item);
		}

		ASSERT(leaf_free_space >= 0);

		if (leaf_free_space < ins_len) {
			int err;

			/* Not enough room, split the leaf to make space. */
			err = split_leaf(trans, root, key, path, ins_len,
					 (ret == 0));
			ASSERT(err <= 0);
			if (WARN_ON(err > 0))
				err = -EUCLEAN;
			if (err)
				ret = err;
		}
	}

	return ret;
}
2008
/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search:
 *              >0  for inserts it's size of item inserted (*)
 *              <0  for deletions
 *               0  for plain searches, not modifying the tree
 *
 *              (*) If size of item inserted doesn't include
 *              sizeof(struct btrfs_item), then p->search_for_extension must
 *              be set.
 * @cow:	boolean should CoW operations be performed. Must always be 1
 *		when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	might_sleep();

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	/*
	 * For now only allow nowait for read only operations.  There's no
	 * strict reason why we can't, we just only need it for reads so it's
	 * only implemented for reads.
	 */
	ASSERT(!p->nowait || !cow);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

	if (p->need_commit_sem) {
		ASSERT(p->search_commit_root);
		if (p->nowait) {
			if (!down_read_trylock(&fs_info->commit_root_sem))
				return -EAGAIN;
		} else {
			down_read(&fs_info->commit_root_sem);
		}
	}

again:
	/* -1 means "no exact match on a parent yet", forcing binary searches. */
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		if (level == 0) {
			if (ins_len > 0)
				ASSERT(write_lock_level >= 1);

			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}

		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
		if (ret < 0)
			goto done;
		prev_cmp = ret;

		/*
		 * On an inexact match, descend through the previous slot - its
		 * subtree covers the key range the search key falls into.
		 */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);

			btrfs_maybe_reset_lockdep_class(root, b);

			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				if (p->nowait) {
					if (!btrfs_try_tree_read_lock(b)) {
						free_extent_buffer(b);
						ret = -EAGAIN;
						goto done;
					}
				} else {
					btrfs_tree_read_lock(b);
				}
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);

	if (p->need_commit_sem) {
		int ret2;

		ret2 = finish_need_commit_sem_search(p);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2279
/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);
	/* Rewinding through the mod log blocks, so nowait is not supported. */
	ASSERT(!p->nowait);

	if (p->search_commit_root) {
		/*
		 * A commit root is immutable, so a time_seq rewind makes no
		 * sense there; the two options are mutually exclusive.
		 */
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	/* Get the root as it looked at time_seq, already read locked. */
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * We have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, 0, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		/*
		 * No exact match in an internal node: step back one slot so
		 * we descend into the subtree covering the search key.
		 */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			/* Undo the decrement so the slot mirrors bin search. */
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		/* Roll the block back to how it looked at time_seq. */
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
2383
/*
 * Search the tree again to find a leaf with smaller keys.
 * Returns 0 if it found something.
 * Returns 1 if there are no smaller keys.
 * Returns < 0 on error.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_key orig_key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
	orig_key = key;

	/*
	 * Build the largest possible key that is strictly smaller than the
	 * first key of the current leaf, decrementing offset, then type,
	 * then objectid as each field hits zero.
	 */
	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		/* Already at the smallest possible key (0 0 0). */
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret <= 0)
		return ret;

	/*
	 * Previous key not found. Even if we were at slot 0 of the leaf we had
	 * before releasing the path and calling btrfs_search_slot(), we now may
	 * be in a slot pointing to the same original key - this can happen if
	 * after we released the path, one or more items were moved from a
	 * sibling leaf into the front of the leaf we had due to an insertion
	 * (see push_leaf_right()).
	 * If we hit this case and our slot is > 0, just decrement the slot
	 * so that the caller does not process the same key again, which may or
	 * may not break the caller, depending on its logic.
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
		ret = comp_keys(&found_key, &orig_key);
		if (ret == 0) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				return 0;
			}
			/*
			 * At slot 0, same key as before, it means orig_key is
			 * the lowest, leftmost, key in the tree. We're done.
			 */
			return 1;
		}
	}

	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}
2464
/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			/* Slot is past the leaf's end, move to the next leaf. */
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/* Step back from the one-past-end slot. */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}
2541
2542 /*
2543  * Execute search and call btrfs_previous_item to traverse backwards if the item
2544  * was not found.
2545  *
2546  * Return 0 if found, 1 if not found and < 0 if error.
2547  */
2548 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2549                            struct btrfs_path *path)
2550 {
2551         int ret;
2552
2553         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2554         if (ret > 0)
2555                 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2556
2557         if (ret == 0)
2558                 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2559
2560         return ret;
2561 }
2562
2563 /*
2564  * Search for a valid slot for the given path.
2565  *
2566  * @root:       The root node of the tree.
2567  * @key:        Will contain a valid item if found.
2568  * @path:       The starting point to validate the slot.
2569  *
2570  * Return: 0  if the item is valid
2571  *         1  if not found
2572  *         <0 if error.
2573  */
2574 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2575                               struct btrfs_path *path)
2576 {
2577         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2578                 int ret;
2579
2580                 ret = btrfs_next_leaf(root, path);
2581                 if (ret)
2582                         return ret;
2583         }
2584
2585         btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2586         return 0;
2587 }
2588
2589 /*
2590  * adjust the pointers going up the tree, starting at level
2591  * making sure the right key of each node is points to 'key'.
2592  * This is used after shifting pointers to the left, so it stops
2593  * fixing up pointers when a given leaf/node is not in slot 0 of the
2594  * higher levels
2595  *
2596  */
2597 static void fixup_low_keys(struct btrfs_path *path,
2598                            struct btrfs_disk_key *key, int level)
2599 {
2600         int i;
2601         struct extent_buffer *t;
2602         int ret;
2603
2604         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2605                 int tslot = path->slots[i];
2606
2607                 if (!path->nodes[i])
2608                         break;
2609                 t = path->nodes[i];
2610                 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2611                                                     BTRFS_MOD_LOG_KEY_REPLACE);
2612                 BUG_ON(ret < 0);
2613                 btrfs_set_node_key(t, key, tslot);
2614                 btrfs_mark_buffer_dirty(path->nodes[i]);
2615                 if (tslot != 0)
2616                         break;
2617         }
2618 }
2619
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order.
 *
 * The key of the item at path->slots[0] is overwritten with @new_key after
 * verifying the new key still sorts strictly between its left and right
 * neighbours in the leaf; a violation is fatal (BUG()). If slot 0 is
 * changed, the new key is also propagated to the parent nodes via
 * fixup_low_keys().
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		/* The new key must be greater than the previous item's key. */
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_print_leaf(eb);
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			BUG();
		}
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		/* The new key must be smaller than the next item's key. */
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_print_leaf(eb);
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* Changing slot 0 changes the key the parents point at too. */
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
2671
2672 /*
2673  * Check key order of two sibling extent buffers.
2674  *
2675  * Return true if something is wrong.
2676  * Return false if everything is fine.
2677  *
2678  * Tree-checker only works inside one tree block, thus the following
2679  * corruption can not be detected by tree-checker:
2680  *
2681  * Leaf @left                   | Leaf @right
2682  * --------------------------------------------------------------
2683  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2684  *
2685  * Key f6 in leaf @left itself is valid, but not valid when the next
2686  * key in leaf @right is 7.
2687  * This can only be checked at tree block merge time.
2688  * And since tree checker has ensured all key order in each tree block
2689  * is correct, we only need to bother the last key of @left and the first
2690  * key of @right.
2691  */
2692 static bool check_sibling_keys(struct extent_buffer *left,
2693                                struct extent_buffer *right)
2694 {
2695         struct btrfs_key left_last;
2696         struct btrfs_key right_first;
2697         int level = btrfs_header_level(left);
2698         int nr_left = btrfs_header_nritems(left);
2699         int nr_right = btrfs_header_nritems(right);
2700
2701         /* No key to check in one of the tree blocks */
2702         if (!nr_left || !nr_right)
2703                 return false;
2704
2705         if (level) {
2706                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2707                 btrfs_node_key_to_cpu(right, &right_first, 0);
2708         } else {
2709                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2710                 btrfs_item_key_to_cpu(right, &right_first, 0);
2711         }
2712
2713         if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2714                 btrfs_crit(left->fs_info, "left extent buffer:");
2715                 btrfs_print_tree(left, false);
2716                 btrfs_crit(left->fs_info, "right extent buffer:");
2717                 btrfs_print_tree(right, false);
2718                 btrfs_crit(left->fs_info,
2719 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2720                            left_last.objectid, left_last.type,
2721                            left_last.offset, right_first.objectid,
2722                            right_first.type, right_first.offset);
2723                 return true;
2724         }
2725         return false;
2726 }
2727
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * @empty selects the policy: when set, try to move everything out of @src
 * (a merge); otherwise leave at least 8 pointers behind in @src.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free pointer slots in the destination node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both nodes must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* Not merging, and the source is already small: don't bother. */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/*
			 * Leave at least 8 pointers in the node if
			 * we aren't going to empty it.
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Log the copy before doing it, so rewinds can undo it. */
	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst, dst_nritems),
			   btrfs_node_key_ptr_offset(src, 0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
		 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
				      btrfs_node_key_ptr_offset(src, push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2805
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* Both nodes must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free pointer slots in the destination node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* Source is nearly empty already, not worth rebalancing. */
	if (src_nritems < 4)
		return 1;

	/* Push at most half the source's contents ... */
	max_push = src_nritems / 2 + 1;
	/* ... and don't try to empty the node. */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* dst is the right eb, src is the middle eb */
	if (check_sibling_keys(src, dst)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Shift dst's existing pointers right to make room at the front. */
	ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
				      btrfs_node_key_ptr_offset(dst, 0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* Copy the tail of src into the freed slots at the front of dst. */
	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
					 push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst, 0),
			   btrfs_node_key_ptr_offset(src, src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2878
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root.
 *
 * The path must already be positioned with the current root at
 * path->nodes[level - 1]; the new root is placed at path->nodes[level],
 * write locked.
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* The new root's single key is the first key of the old root. */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* Point the new root's only slot at the old root. */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	old = root->node;
	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* Extra ref for the reference the path now holds. */
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
2940
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_write_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* Caller guarantees room, a full node here is a logic bug. */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		/* Mod log tracks internal nodes only, not leaf parents. */
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
					slot, nritems - slot);
			BUG_ON(ret < 0);
		}
		/* Shift existing pointers right to open up @slot. */
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(lower, slot + 1),
			      btrfs_node_key_ptr_offset(lower, slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = btrfs_tree_mod_log_insert_key(lower, slot,
						    BTRFS_MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
2986
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
		 * holding a tree lock on the buffer, which is why we cannot
		 * race with other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* Try pushing into siblings first, a split may be avoidable. */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* Split at the midpoint; the upper half moves to the new block. */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(split, 0),
			   btrfs_node_key_ptr_offset(c, mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new block into the parent, right after c's slot. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	if (path->slots[level] >= mid) {
		/* Our slot moved to the new block, repoint the path there. */
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return 0;
}
3077
3078 /*
3079  * how many bytes are required to store the items in a leaf.  start
3080  * and nr indicate which items in the leaf to check.  This totals up the
3081  * space used both by the item structs and the item data
3082  */
3083 static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3084 {
3085         int data_len;
3086         int nritems = btrfs_header_nritems(l);
3087         int end = min(nritems, start + nr) - 1;
3088
3089         if (!nr)
3090                 return 0;
3091         data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3092         data_len = data_len - btrfs_item_offset(l, end);
3093         data_len += sizeof(struct btrfs_item) * nr;
3094         WARN_ON(data_len < 0);
3095         return data_len;
3096 }
3097
3098 /*
3099  * The space between the end of the leaf items and
3100  * the start of the leaf data.  IOW, how much room
3101  * the leaf has left for both items and data
3102  */
3103 int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3104 {
3105         struct btrfs_fs_info *fs_info = leaf->fs_info;
3106         int nritems = btrfs_header_nritems(leaf);
3107         int ret;
3108
3109         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3110         if (ret < 0) {
3111                 btrfs_crit(fs_info,
3112                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3113                            ret,
3114                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3115                            leaf_space_used(leaf, 0, nritems), nritems);
3116         }
3117         return ret;
3118 }
3119
/*
 * Push items from the path's leaf (path->nodes[0]) into its right sibling.
 *
 * min_slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower.
 *
 * @right is already read and write locked by the caller; this function
 * consumes that lock and reference on every return path (either the path
 * takes ownership of @right, or it is unlocked and released here).
 *
 * Returns 0 if items were pushed, 1 if nothing could be moved.
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	/* When draining the leaf every slot is a push candidate. */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/*
	 * If the caller's target slot is past the last item, the pending
	 * insert would land in the right leaf, so reserve its size up front.
	 */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* Scan from the highest slot down towards nr, collecting items. */
	i = left_nritems - 1;
	while (i >= nr) {
		if (!empty && push_items > 0) {
			/* Don't push past the slot the caller still needs. */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				/*
				 * Heuristic: stop once pushing more would
				 * leave the left leaf tighter than the right.
				 */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* The pending insert travels along with its slot. */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, i);
		/* Each pushed item costs its data plus one item header. */
		if (this_item_size + sizeof(struct btrfs_item) +
		    push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* Bytes of item data being moved (items are the leaf's tail). */
	push_space = btrfs_item_data_end(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_leaf_data(right, data_end - push_space, data_end,
			  BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		       leaf_data_end(left), push_space);

	/* Shift the right leaf's item headers to make room at slot 0. */
	memmove_leaf_items(right, push_items, 0, right_nritems);

	/* copy the items from left to right */
	copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	/* Recompute every data offset in the right leaf from the top down. */
	for (i = 0; i < right_nritems; i++) {
		push_space -= btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	/* A fully drained left leaf no longer needs to be written back. */
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clear_buffer_dirty(trans, left);

	btrfs_mark_buffer_dirty(right);

	/* The right leaf's first key changed; propagate it to the parent. */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		/* The target slot moved into the right leaf; follow it. */
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clear_buffer_dirty(trans, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3251
3252 /*
3253  * push some data in the path leaf to the right, trying to free up at
3254  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3255  *
3256  * returns 1 if the push failed because the other node didn't have enough
3257  * room, 0 if everything worked out and < 0 if there were major errors.
3258  *
3259  * this will push starting from min_slot to the end of the leaf.  It won't
3260  * push any slot lower than min_slot
3261  */
3262 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3263                            *root, struct btrfs_path *path,
3264                            int min_data_size, int data_size,
3265                            int empty, u32 min_slot)
3266 {
3267         struct extent_buffer *left = path->nodes[0];
3268         struct extent_buffer *right;
3269         struct extent_buffer *upper;
3270         int slot;
3271         int free_space;
3272         u32 left_nritems;
3273         int ret;
3274
3275         if (!path->nodes[1])
3276                 return 1;
3277
3278         slot = path->slots[1];
3279         upper = path->nodes[1];
3280         if (slot >= btrfs_header_nritems(upper) - 1)
3281                 return 1;
3282
3283         btrfs_assert_tree_write_locked(path->nodes[1]);
3284
3285         right = btrfs_read_node_slot(upper, slot + 1);
3286         if (IS_ERR(right))
3287                 return PTR_ERR(right);
3288
3289         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3290
3291         free_space = btrfs_leaf_free_space(right);
3292         if (free_space < data_size)
3293                 goto out_unlock;
3294
3295         ret = btrfs_cow_block(trans, root, right, upper,
3296                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3297         if (ret)
3298                 goto out_unlock;
3299
3300         left_nritems = btrfs_header_nritems(left);
3301         if (left_nritems == 0)
3302                 goto out_unlock;
3303
3304         if (check_sibling_keys(left, right)) {
3305                 ret = -EUCLEAN;
3306                 btrfs_abort_transaction(trans, ret);
3307                 btrfs_tree_unlock(right);
3308                 free_extent_buffer(right);
3309                 return ret;
3310         }
3311         if (path->slots[0] == left_nritems && !empty) {
3312                 /* Key greater than all keys in the leaf, right neighbor has
3313                  * enough room for it and we're not emptying our leaf to delete
3314                  * it, therefore use right neighbor to insert the new item and
3315                  * no need to touch/dirty our left leaf. */
3316                 btrfs_tree_unlock(left);
3317                 free_extent_buffer(left);
3318                 path->nodes[0] = right;
3319                 path->slots[0] = 0;
3320                 path->slots[1]++;
3321                 return 0;
3322         }
3323
3324         return __push_leaf_right(trans, path, min_data_size, empty, right,
3325                                  free_space, left_nritems, min_slot);
3326 out_unlock:
3327         btrfs_tree_unlock(right);
3328         free_extent_buffer(right);
3329         return 1;
3330 }
3331
/*
 * Push items from the path's leaf (path->nodes[0]) into its left sibling.
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items.
 *
 * @left is already read and write locked by the caller; this function
 * consumes that lock and reference on every return path (either the path
 * takes ownership of @left, or it is unlocked and released here).
 *
 * Returns 0 if items were pushed, 1 if nothing could be moved.
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	/* Unless draining the leaf, always keep at least the last item. */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* Scan from slot 0 upward, collecting items to push. */
	for (i = 0; i < nr; i++) {
		if (!empty && push_items > 0) {
			/* Don't push past the slot the caller still needs. */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				/*
				 * Heuristic: stop once pushing more would
				 * leave this leaf tighter than the left one.
				 */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* The pending insert travels along with its slot. */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, i);
		/* Each pushed item costs its data plus one item header. */
		if (this_item_size + sizeof(struct btrfs_item) + push_space >
		    free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);

	/* Bytes of item data being moved (slots [0, push_items)). */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset(right, push_items - 1);

	copy_leaf_data(left, right, leaf_data_end(left) - push_space,
		       btrfs_item_offset(right, push_items - 1), push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* Rebase the copied items' offsets relative to the left leaf. */
	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* Compact the remaining data and headers back to the edges. */
		push_space = btrfs_item_offset(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_leaf_data(right,
				  BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				  leaf_data_end(right), push_space);

		memmove_leaf_items(right, 0, push_items,
				   btrfs_header_nritems(right) - push_items);
	}

	/* Recompute every data offset in the right leaf from the top down. */
	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		push_space = push_space - btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	btrfs_mark_buffer_dirty(left);
	/* A fully drained right leaf no longer needs to be written back. */
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clear_buffer_dirty(trans, right);

	/* The right leaf's first key changed; propagate it up the path. */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* The target slot moved into the left leaf; follow it. */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3469
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * Returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* Slot 0 in the parent means there is no sibling to the left. */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* No parent means the leaf is the root: nothing to push into. */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_write_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	if (IS_ERR(left))
		return PTR_ERR(left);

	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* COW the left leaf before modifying it. */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left,
			      BTRFS_NESTING_LEFT_COW);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	/* __push_leaf_left() consumes the lock and reference on 'left'. */
	return __push_leaf_left(trans, path, min_data_size, empty, left,
				free_space, right_nritems, max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3535
/*
 * Second half of a leaf split: move items [mid, nritems) from leaf 'l' into
 * the freshly allocated, empty leaf 'right', link 'right' into the parent
 * node and point the path at whichever of the two leaves now holds 'slot'.
 *
 * 'right' arrives locked with a reference held; this function consumes both
 * (either the path takes ownership, or they are released here).
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* 'right' receives the tail: items mid..nritems-1. */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);

	copy_leaf_items(right, l, 0, mid, nritems);

	/* Item data packs against the end of the new leaf. */
	copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
		       leaf_data_end(l), data_copy_size);

	/* How far every copied data offset shifts in the new leaf. */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);

	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
	}

	/* Truncate the original leaf and insert the new leaf's key/pointer. */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* Keep the path pointing at the leaf that now contains 'slot'. */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
3593
/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves.  If all goes well we can avoid the double split
 * completely.
 *
 * Returns 0 if either push made progress (so the caller should re-evaluate),
 * 1 if neither push moved anything, and < 0 on error.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	/*
	 * If the target slot stays in this leaf, the leaf's current free
	 * space already counts towards the needed room.
	 */
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf.  If
	 * we've done so we're done
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the next leaf */
	slot = path->slots[0];
	space_needed = data_size;
	if (slot > 0)
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	return 1;
}
3657
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * @extend means the caller will grow the item at path->slots[0] in place
 * rather than insert a new item.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	/*
	 * How to split:
	 *   0 - don't split, hand the caller a brand new empty leaf instead
	 *   1 - normal split at 'mid'
	 *   2 - split won't give enough room either side; a second split
	 *       (double split) is required
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* An extended item can never exceed the leaf data area. */
	if (extend && data_size + btrfs_item_size(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		/* The pushes may have swapped the leaf under the path. */
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	/* Splitting the root leaf: grow the tree by one level first. */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/* Pick the split point so the half receiving the insert has room. */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				/* Insert past the end: new empty leaf. */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				/* Insert before everything: new empty leaf. */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* The new leaf's first key: the insert key or the item at 'mid'. */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	/*
	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
	 * subclasses, which is 8 at the time of this patch, and we've maxed it
	 * out.  In the future we could add a
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, 0, l->start, 0,
				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* Hand back the brand new empty leaf, linked at the right spot. */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* Double splits never recurse more than once. */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/*
	 * NOTE(review): the return value is ignored, so an error from the
	 * pushes is swallowed and we fall through to a double split anyway.
	 * Presumably a later operation on the aborted transaction would
	 * surface the failure -- TODO confirm this is intentional.
	 */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
3833
/*
 * Make sure the leaf at path->nodes[0] has at least @ins_len bytes free so
 * that an item in it can be split in place.  If it doesn't, re-search for
 * the item's key (releasing the path) and split the leaf with extend mode.
 *
 * Returns 0 on success, -EAGAIN if the item changed or moved while the path
 * was dropped (caller should retry), or another negative errno on failure.
 * On success the path's upper levels are unlocked and keep_locks is cleared.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* Only file extent and csum items are ever split in place. */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	/* Enough room already, nothing to do. */
	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/*
	 * Remember the item's size (and extent length for file extents) so
	 * we can detect whether it changed while the path was dropped.
	 */
	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone; let the caller retry. */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	/* Split with extend=1: the caller will grow the item in place. */
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
3899
/*
 * Split the item at path->slots[0] in two at byte offset @split_offset.
 *
 * The first @split_offset bytes of data stay in the original slot; the
 * remaining bytes become a new item in the next slot, keyed by @new_key.
 * Only one extra struct btrfs_item header of free space is consumed, the
 * item data itself is not duplicated.
 *
 * Returns 0 on success or -ENOMEM if the temporary copy buffer cannot be
 * allocated.
 */
static noinline int split_item(struct btrfs_path *path,
                               const struct btrfs_key *new_key,
                               unsigned long split_offset)
{
	struct extent_buffer *leaf;
	int orig_slot, slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* the caller (setup_leaf_for_split) must have made room for one item */
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	orig_slot = path->slots[0];
	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
	item_size = btrfs_item_size(leaf, path->slots[0]);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	/* copy the whole item out so we can rewrite it as two pieces */
	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	/* the new (second) item covers the tail of the original data */
	btrfs_set_item_offset(leaf, slot, orig_offset);
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	/* the original item shrinks to its first split_offset bytes */
	btrfs_set_item_offset(leaf, orig_slot,
				 orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
3960
3961 /*
3962  * This function splits a single item into two items,
3963  * giving 'new_key' to the new item and splitting the
3964  * old one at split_offset (from the start of the item).
3965  *
3966  * The path may be released by this operation.  After
3967  * the split, the path is pointing to the old item.  The
3968  * new item is going to be in the same node as the old one.
3969  *
3970  * Note, the item being split must be smaller enough to live alone on
3971  * a tree block with room for one extra struct btrfs_item
3972  *
3973  * This allows us to split the item in place, keeping a lock on the
3974  * leaf the entire time.
3975  */
3976 int btrfs_split_item(struct btrfs_trans_handle *trans,
3977                      struct btrfs_root *root,
3978                      struct btrfs_path *path,
3979                      const struct btrfs_key *new_key,
3980                      unsigned long split_offset)
3981 {
3982         int ret;
3983         ret = setup_leaf_for_split(trans, root, path,
3984                                    sizeof(struct btrfs_item));
3985         if (ret)
3986                 return ret;
3987
3988         ret = split_item(path, new_key, split_offset);
3989         return ret;
3990 }
3991
3992 /*
3993  * make the item pointed to by the path smaller.  new_size indicates
3994  * how small to make it, and from_end tells us if we just chop bytes
3995  * off the end of the item or if we shift the item to chop bytes off
3996  * the front.
3997  */
3998 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3999 {
4000         int slot;
4001         struct extent_buffer *leaf;
4002         u32 nritems;
4003         unsigned int data_end;
4004         unsigned int old_data_start;
4005         unsigned int old_size;
4006         unsigned int size_diff;
4007         int i;
4008         struct btrfs_map_token token;
4009
4010         leaf = path->nodes[0];
4011         slot = path->slots[0];
4012
4013         old_size = btrfs_item_size(leaf, slot);
4014         if (old_size == new_size)
4015                 return;
4016
4017         nritems = btrfs_header_nritems(leaf);
4018         data_end = leaf_data_end(leaf);
4019
4020         old_data_start = btrfs_item_offset(leaf, slot);
4021
4022         size_diff = old_size - new_size;
4023
4024         BUG_ON(slot < 0);
4025         BUG_ON(slot >= nritems);
4026
4027         /*
4028          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4029          */
4030         /* first correct the data pointers */
4031         btrfs_init_map_token(&token, leaf);
4032         for (i = slot; i < nritems; i++) {
4033                 u32 ioff;
4034
4035                 ioff = btrfs_token_item_offset(&token, i);
4036                 btrfs_set_token_item_offset(&token, i, ioff + size_diff);
4037         }
4038
4039         /* shift the data */
4040         if (from_end) {
4041                 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4042                                   old_data_start + new_size - data_end);
4043         } else {
4044                 struct btrfs_disk_key disk_key;
4045                 u64 offset;
4046
4047                 btrfs_item_key(leaf, &disk_key, slot);
4048
4049                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4050                         unsigned long ptr;
4051                         struct btrfs_file_extent_item *fi;
4052
4053                         fi = btrfs_item_ptr(leaf, slot,
4054                                             struct btrfs_file_extent_item);
4055                         fi = (struct btrfs_file_extent_item *)(
4056                              (unsigned long)fi - size_diff);
4057
4058                         if (btrfs_file_extent_type(leaf, fi) ==
4059                             BTRFS_FILE_EXTENT_INLINE) {
4060                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4061                                 memmove_extent_buffer(leaf, ptr,
4062                                       (unsigned long)fi,
4063                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
4064                         }
4065                 }
4066
4067                 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4068                                   old_data_start - data_end);
4069
4070                 offset = btrfs_disk_key_offset(&disk_key);
4071                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4072                 btrfs_set_item_key(leaf, &disk_key, slot);
4073                 if (slot == 0)
4074                         fixup_low_keys(path, &disk_key, 1);
4075         }
4076
4077         btrfs_set_item_size(leaf, slot, new_size);
4078         btrfs_mark_buffer_dirty(leaf);
4079
4080         if (btrfs_leaf_free_space(leaf) < 0) {
4081                 btrfs_print_leaf(leaf);
4082                 BUG();
4083         }
4084 }
4085
4086 /*
4087  * make the item pointed to by the path bigger, data_size is the added size.
4088  */
4089 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4090 {
4091         int slot;
4092         struct extent_buffer *leaf;
4093         u32 nritems;
4094         unsigned int data_end;
4095         unsigned int old_data;
4096         unsigned int old_size;
4097         int i;
4098         struct btrfs_map_token token;
4099
4100         leaf = path->nodes[0];
4101
4102         nritems = btrfs_header_nritems(leaf);
4103         data_end = leaf_data_end(leaf);
4104
4105         if (btrfs_leaf_free_space(leaf) < data_size) {
4106                 btrfs_print_leaf(leaf);
4107                 BUG();
4108         }
4109         slot = path->slots[0];
4110         old_data = btrfs_item_data_end(leaf, slot);
4111
4112         BUG_ON(slot < 0);
4113         if (slot >= nritems) {
4114                 btrfs_print_leaf(leaf);
4115                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4116                            slot, nritems);
4117                 BUG();
4118         }
4119
4120         /*
4121          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4122          */
4123         /* first correct the data pointers */
4124         btrfs_init_map_token(&token, leaf);
4125         for (i = slot; i < nritems; i++) {
4126                 u32 ioff;
4127
4128                 ioff = btrfs_token_item_offset(&token, i);
4129                 btrfs_set_token_item_offset(&token, i, ioff - data_size);
4130         }
4131
4132         /* shift the data */
4133         memmove_leaf_data(leaf, data_end - data_size, data_end,
4134                           old_data - data_end);
4135
4136         data_end = old_data;
4137         old_size = btrfs_item_size(leaf, slot);
4138         btrfs_set_item_size(leaf, slot, old_size + data_size);
4139         btrfs_mark_buffer_dirty(leaf);
4140
4141         if (btrfs_leaf_free_space(leaf) < 0) {
4142                 btrfs_print_leaf(leaf);
4143                 BUG();
4144         }
4145 }
4146
4147 /*
4148  * Make space in the node before inserting one or more items.
4149  *
4150  * @root:       root we are inserting items to
4151  * @path:       points to the leaf/slot where we are going to insert new items
4152  * @batch:      information about the batch of items to insert
4153  *
4154  * Main purpose is to save stack depth by doing the bulk of the work in a
4155  * function that doesn't call btrfs_search_slot
4156  */
4157 static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4158                                    const struct btrfs_item_batch *batch)
4159 {
4160         struct btrfs_fs_info *fs_info = root->fs_info;
4161         int i;
4162         u32 nritems;
4163         unsigned int data_end;
4164         struct btrfs_disk_key disk_key;
4165         struct extent_buffer *leaf;
4166         int slot;
4167         struct btrfs_map_token token;
4168         u32 total_size;
4169
4170         /*
4171          * Before anything else, update keys in the parent and other ancestors
4172          * if needed, then release the write locks on them, so that other tasks
4173          * can use them while we modify the leaf.
4174          */
4175         if (path->slots[0] == 0) {
4176                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4177                 fixup_low_keys(path, &disk_key, 1);
4178         }
4179         btrfs_unlock_up_safe(path, 1);
4180
4181         leaf = path->nodes[0];
4182         slot = path->slots[0];
4183
4184         nritems = btrfs_header_nritems(leaf);
4185         data_end = leaf_data_end(leaf);
4186         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4187
4188         if (btrfs_leaf_free_space(leaf) < total_size) {
4189                 btrfs_print_leaf(leaf);
4190                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4191                            total_size, btrfs_leaf_free_space(leaf));
4192                 BUG();
4193         }
4194
4195         btrfs_init_map_token(&token, leaf);
4196         if (slot != nritems) {
4197                 unsigned int old_data = btrfs_item_data_end(leaf, slot);
4198
4199                 if (old_data < data_end) {
4200                         btrfs_print_leaf(leaf);
4201                         btrfs_crit(fs_info,
4202                 "item at slot %d with data offset %u beyond data end of leaf %u",
4203                                    slot, old_data, data_end);
4204                         BUG();
4205                 }
4206                 /*
4207                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4208                  */
4209                 /* first correct the data pointers */
4210                 for (i = slot; i < nritems; i++) {
4211                         u32 ioff;
4212
4213                         ioff = btrfs_token_item_offset(&token, i);
4214                         btrfs_set_token_item_offset(&token, i,
4215                                                        ioff - batch->total_data_size);
4216                 }
4217                 /* shift the items */
4218                 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4219
4220                 /* shift the data */
4221                 memmove_leaf_data(leaf, data_end - batch->total_data_size,
4222                                   data_end, old_data - data_end);
4223                 data_end = old_data;
4224         }
4225
4226         /* setup the item for the new data */
4227         for (i = 0; i < batch->nr; i++) {
4228                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4229                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4230                 data_end -= batch->data_sizes[i];
4231                 btrfs_set_token_item_offset(&token, slot + i, data_end);
4232                 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
4233         }
4234
4235         btrfs_set_header_nritems(leaf, nritems + batch->nr);
4236         btrfs_mark_buffer_dirty(leaf);
4237
4238         if (btrfs_leaf_free_space(leaf) < 0) {
4239                 btrfs_print_leaf(leaf);
4240                 BUG();
4241         }
4242 }
4243
4244 /*
4245  * Insert a new item into a leaf.
4246  *
4247  * @root:      The root of the btree.
4248  * @path:      A path pointing to the target leaf and slot.
4249  * @key:       The key of the new item.
4250  * @data_size: The size of the data associated with the new key.
4251  */
4252 void btrfs_setup_item_for_insert(struct btrfs_root *root,
4253                                  struct btrfs_path *path,
4254                                  const struct btrfs_key *key,
4255                                  u32 data_size)
4256 {
4257         struct btrfs_item_batch batch;
4258
4259         batch.keys = key;
4260         batch.data_sizes = &data_size;
4261         batch.total_data_size = data_size;
4262         batch.nr = 1;
4263
4264         setup_items_for_insert(root, path, &batch);
4265 }
4266
4267 /*
4268  * Given a key and some data, insert items into the tree.
4269  * This does all the path init required, making room in the tree if needed.
4270  */
4271 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4272                             struct btrfs_root *root,
4273                             struct btrfs_path *path,
4274                             const struct btrfs_item_batch *batch)
4275 {
4276         int ret = 0;
4277         int slot;
4278         u32 total_size;
4279
4280         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4281         ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4282         if (ret == 0)
4283                 return -EEXIST;
4284         if (ret < 0)
4285                 return ret;
4286
4287         slot = path->slots[0];
4288         BUG_ON(slot < 0);
4289
4290         setup_items_for_insert(root, path, batch);
4291         return 0;
4292 }
4293
4294 /*
4295  * Given a key and some data, insert an item into the tree.
4296  * This does all the path init required, making room in the tree if needed.
4297  */
4298 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4299                       const struct btrfs_key *cpu_key, void *data,
4300                       u32 data_size)
4301 {
4302         int ret = 0;
4303         struct btrfs_path *path;
4304         struct extent_buffer *leaf;
4305         unsigned long ptr;
4306
4307         path = btrfs_alloc_path();
4308         if (!path)
4309                 return -ENOMEM;
4310         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4311         if (!ret) {
4312                 leaf = path->nodes[0];
4313                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4314                 write_extent_buffer(leaf, data, ptr, data_size);
4315                 btrfs_mark_buffer_dirty(leaf);
4316         }
4317         btrfs_free_path(path);
4318         return ret;
4319 }
4320
4321 /*
4322  * This function duplicates an item, giving 'new_key' to the new item.
4323  * It guarantees both items live in the same tree leaf and the new item is
4324  * contiguous with the original item.
4325  *
4326  * This allows us to split a file extent in place, keeping a lock on the leaf
4327  * the entire time.
4328  */
4329 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4330                          struct btrfs_root *root,
4331                          struct btrfs_path *path,
4332                          const struct btrfs_key *new_key)
4333 {
4334         struct extent_buffer *leaf;
4335         int ret;
4336         u32 item_size;
4337
4338         leaf = path->nodes[0];
4339         item_size = btrfs_item_size(leaf, path->slots[0]);
4340         ret = setup_leaf_for_split(trans, root, path,
4341                                    item_size + sizeof(struct btrfs_item));
4342         if (ret)
4343                 return ret;
4344
4345         path->slots[0]++;
4346         btrfs_setup_item_for_insert(root, path, new_key, item_size);
4347         leaf = path->nodes[0];
4348         memcpy_extent_buffer(leaf,
4349                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4350                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4351                              item_size);
4352         return 0;
4353 }
4354
4355 /*
4356  * delete the pointer from a given node.
4357  *
4358  * the tree should have been previously balanced so the deletion does not
4359  * empty a node.
4360  */
4361 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4362                     int level, int slot)
4363 {
4364         struct extent_buffer *parent = path->nodes[level];
4365         u32 nritems;
4366         int ret;
4367
4368         nritems = btrfs_header_nritems(parent);
4369         if (slot != nritems - 1) {
4370                 if (level) {
4371                         ret = btrfs_tree_mod_log_insert_move(parent, slot,
4372                                         slot + 1, nritems - slot - 1);
4373                         BUG_ON(ret < 0);
4374                 }
4375                 memmove_extent_buffer(parent,
4376                               btrfs_node_key_ptr_offset(parent, slot),
4377                               btrfs_node_key_ptr_offset(parent, slot + 1),
4378                               sizeof(struct btrfs_key_ptr) *
4379                               (nritems - slot - 1));
4380         } else if (level) {
4381                 ret = btrfs_tree_mod_log_insert_key(parent, slot,
4382                                                     BTRFS_MOD_LOG_KEY_REMOVE);
4383                 BUG_ON(ret < 0);
4384         }
4385
4386         nritems--;
4387         btrfs_set_header_nritems(parent, nritems);
4388         if (nritems == 0 && parent == root->node) {
4389                 BUG_ON(btrfs_header_level(root->node) != 1);
4390                 /* just turn the root into a leaf and break */
4391                 btrfs_set_header_level(root->node, 0);
4392         } else if (slot == 0) {
4393                 struct btrfs_disk_key disk_key;
4394
4395                 btrfs_node_key(parent, &disk_key, 0);
4396                 fixup_low_keys(path, &disk_key, level + 1);
4397         }
4398         btrfs_mark_buffer_dirty(parent);
4399 }
4400
4401 /*
4402  * a helper function to delete the leaf pointed to by path->slots[1] and
4403  * path->nodes[1].
4404  *
4405  * This deletes the pointer in path->nodes[1] and frees the leaf
4406  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4407  *
4408  * The path must have already been setup for deleting the leaf, including
4409  * all the proper balancing.  path->nodes[1] must be locked.
4410  */
4411 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4412                                     struct btrfs_root *root,
4413                                     struct btrfs_path *path,
4414                                     struct extent_buffer *leaf)
4415 {
4416         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4417         del_ptr(root, path, 1, path->slots[1]);
4418
4419         /*
4420          * btrfs_free_extent is expensive, we want to make sure we
4421          * aren't holding any locks when we call it
4422          */
4423         btrfs_unlock_up_safe(path, 0);
4424
4425         root_sub_used(root, leaf->len);
4426
4427         atomic_inc(&leaf->refs);
4428         btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4429         free_extent_buffer_stale(leaf);
4430 }
4431 /*
4432  * delete the item at the leaf level in path.  If that empties
4433  * the leaf, remove it from the tree
4434  */
4435 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4436                     struct btrfs_path *path, int slot, int nr)
4437 {
4438         struct btrfs_fs_info *fs_info = root->fs_info;
4439         struct extent_buffer *leaf;
4440         int ret = 0;
4441         int wret;
4442         u32 nritems;
4443
4444         leaf = path->nodes[0];
4445         nritems = btrfs_header_nritems(leaf);
4446
4447         if (slot + nr != nritems) {
4448                 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4449                 const int data_end = leaf_data_end(leaf);
4450                 struct btrfs_map_token token;
4451                 u32 dsize = 0;
4452                 int i;
4453
4454                 for (i = 0; i < nr; i++)
4455                         dsize += btrfs_item_size(leaf, slot + i);
4456
4457                 memmove_leaf_data(leaf, data_end + dsize, data_end,
4458                                   last_off - data_end);
4459
4460                 btrfs_init_map_token(&token, leaf);
4461                 for (i = slot + nr; i < nritems; i++) {
4462                         u32 ioff;
4463
4464                         ioff = btrfs_token_item_offset(&token, i);
4465                         btrfs_set_token_item_offset(&token, i, ioff + dsize);
4466                 }
4467
4468                 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4469         }
4470         btrfs_set_header_nritems(leaf, nritems - nr);
4471         nritems -= nr;
4472
4473         /* delete the leaf if we've emptied it */
4474         if (nritems == 0) {
4475                 if (leaf == root->node) {
4476                         btrfs_set_header_level(leaf, 0);
4477                 } else {
4478                         btrfs_clear_buffer_dirty(trans, leaf);
4479                         btrfs_del_leaf(trans, root, path, leaf);
4480                 }
4481         } else {
4482                 int used = leaf_space_used(leaf, 0, nritems);
4483                 if (slot == 0) {
4484                         struct btrfs_disk_key disk_key;
4485
4486                         btrfs_item_key(leaf, &disk_key, 0);
4487                         fixup_low_keys(path, &disk_key, 1);
4488                 }
4489
4490                 /*
4491                  * Try to delete the leaf if it is mostly empty. We do this by
4492                  * trying to move all its items into its left and right neighbours.
4493                  * If we can't move all the items, then we don't delete it - it's
4494                  * not ideal, but future insertions might fill the leaf with more
4495                  * items, or items from other leaves might be moved later into our
4496                  * leaf due to deletions on those leaves.
4497                  */
4498                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4499                         u32 min_push_space;
4500
4501                         /* push_leaf_left fixes the path.
4502                          * make sure the path still points to our leaf
4503                          * for possible call to del_ptr below
4504                          */
4505                         slot = path->slots[1];
4506                         atomic_inc(&leaf->refs);
4507                         /*
4508                          * We want to be able to at least push one item to the
4509                          * left neighbour leaf, and that's the first item.
4510                          */
4511                         min_push_space = sizeof(struct btrfs_item) +
4512                                 btrfs_item_size(leaf, 0);
4513                         wret = push_leaf_left(trans, root, path, 0,
4514                                               min_push_space, 1, (u32)-1);
4515                         if (wret < 0 && wret != -ENOSPC)
4516                                 ret = wret;
4517
4518                         if (path->nodes[0] == leaf &&
4519                             btrfs_header_nritems(leaf)) {
4520                                 /*
4521                                  * If we were not able to push all items from our
4522                                  * leaf to its left neighbour, then attempt to
4523                                  * either push all the remaining items to the
4524                                  * right neighbour or none. There's no advantage
4525                                  * in pushing only some items, instead of all, as
4526                                  * it's pointless to end up with a leaf having
4527                                  * too few items while the neighbours can be full
4528                                  * or nearly full.
4529                                  */
4530                                 nritems = btrfs_header_nritems(leaf);
4531                                 min_push_space = leaf_space_used(leaf, 0, nritems);
4532                                 wret = push_leaf_right(trans, root, path, 0,
4533                                                        min_push_space, 1, 0);
4534                                 if (wret < 0 && wret != -ENOSPC)
4535                                         ret = wret;
4536                         }
4537
4538                         if (btrfs_header_nritems(leaf) == 0) {
4539                                 path->slots[1] = slot;
4540                                 btrfs_del_leaf(trans, root, path, leaf);
4541                                 free_extent_buffer(leaf);
4542                                 ret = 0;
4543                         } else {
4544                                 /* if we're still in the path, make sure
4545                                  * we're dirty.  Otherwise, one of the
4546                                  * push_leaf functions must have already
4547                                  * dirtied this buffer
4548                                  */
4549                                 if (path->nodes[0] == leaf)
4550                                         btrfs_mark_buffer_dirty(leaf);
4551                                 free_extent_buffer(leaf);
4552                         }
4553                 } else {
4554                         btrfs_mark_buffer_dirty(leaf);
4555                 }
4556         }
4557         return ret;
4558 }
4559
4560 /*
4561  * A helper function to walk down the tree starting at min_key, and looking
4562  * for nodes or leaves that are have a minimum transaction id.
4563  * This is used by the btree defrag code, and tree logging
4564  *
4565  * This does not cow, but it does stuff the starting key it finds back
4566  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4567  * key and get a writable path.
4568  *
4569  * This honors path->lowest_level to prevent descent past a given level
4570  * of the tree.
4571  *
4572  * min_trans indicates the oldest transaction that you are interested
4573  * in walking through.  Any nodes or leaves older than min_trans are
4574  * skipped over (without reading them).
4575  *
4576  * returns zero if something useful was found, < 0 on error and 1 if there
4577  * was nothing in the tree that matched the search criteria.
4578  */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* This helper can block on locks and IO, nowait callers must not use it. */
	ASSERT(!path->nowait);
	/*
	 * Hold locks on the whole path while walking down; restored to the
	 * caller's setting at "out".
	 */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* The whole tree is older than min_trans, nothing can match. */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, 0, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* No exact match, step back to the pointer covering min_key. */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* Found a new starting key, redo the descent. */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		/* Report the key we found back through min_key. */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
4682
4683 /*
4684  * this is similar to btrfs_next_leaf, but does not try to preserve
4685  * and fixup the path.  It looks for and returns the next key in the
4686  * tree based on the current path and the min_trans parameters.
4687  *
4688  * 0 is returned if another key is found, < 0 if there are any errors
4689  * and 1 is returned if there are no higher keys in the tree
4690  *
4691  * path->keep_locks should be set to 1 on the search made before
4692  * calling this function.
4693  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	/* Without keep_locks (or skip_locking) the upper nodes may be stale. */
	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* Look at the slot to the right of the current one. */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/*
			 * If the parent is locked (or locking is skipped) we
			 * can just move up a level, otherwise we must
			 * re-search from this node's last key below.
			 */
			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			/* Redo the search limited to this level. */
			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match: step past it to get the *next* key. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip node pointers older than min_trans. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
4758
/*
 * Walk to the leaf that follows the one currently pointed to by @path.
 * When @time_seq is non-zero the search goes through the tree mod log
 * (btrfs_search_old_slot()) to see an old version of the tree.
 *
 * Returns 0 on success, 1 when there is no next leaf, and < 0 on error
 * (including -EAGAIN for nowait paths that would have blocked).
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	/*
	 * The nowait semantics are used only for write paths, where we don't
	 * use the tree mod log and sequence numbers.
	 */
	if (time_seq)
		ASSERT(!path->nowait);

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key of the current leaf, we re-search from it. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			/*
			 * Take commit_root_sem ourselves for the whole walk
			 * and clear the flag so the search doesn't retake it;
			 * both are restored at "done".
			 */
			path->need_commit_sem = 0;
			need_commit_sem = true;
			if (path->nowait) {
				if (!down_read_trylock(&fs_info->commit_root_sem)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				down_read(&fs_info->commit_root_sem);
			}
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* Walk up until we find a node with a slot to the right of ours. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}


		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			/*
			 * NOTE(review): the condition tests locks[level] while
			 * the body unlocks nodes[i] and clears locks[i] -
			 * confirm that locks[i] wasn't intended here.
			 */
			if (path->locks[level]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && path->nowait) {
				ret = -EAGAIN;
				goto done;
			}
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	/* Now descend along the leftmost edge down to the next leaf. */
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			if (path->nowait) {
				if (!btrfs_try_tree_read_lock(next)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				btrfs_tree_read_lock(next);
			}
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		/* Restore the flag and finish the deferred commit-root work. */
		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
4959
4960 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4961 {
4962         path->slots[0]++;
4963         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4964                 return btrfs_next_old_leaf(root, path, time_seq);
4965         return 0;
4966 }
4967
4968 /*
4969  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4970  * searching until it gets past min_objectid or finds an item of 'type'
4971  *
4972  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4973  */
4974 int btrfs_previous_item(struct btrfs_root *root,
4975                         struct btrfs_path *path, u64 min_objectid,
4976                         int type)
4977 {
4978         struct btrfs_key found_key;
4979         struct extent_buffer *leaf;
4980         u32 nritems;
4981         int ret;
4982
4983         while (1) {
4984                 if (path->slots[0] == 0) {
4985                         ret = btrfs_prev_leaf(root, path);
4986                         if (ret != 0)
4987                                 return ret;
4988                 } else {
4989                         path->slots[0]--;
4990                 }
4991                 leaf = path->nodes[0];
4992                 nritems = btrfs_header_nritems(leaf);
4993                 if (nritems == 0)
4994                         return 1;
4995                 if (path->slots[0] == nritems)
4996                         path->slots[0]--;
4997
4998                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4999                 if (found_key.objectid < min_objectid)
5000                         break;
5001                 if (found_key.type == type)
5002                         return 0;
5003                 if (found_key.objectid == min_objectid &&
5004                     found_key.type < type)
5005                         break;
5006         }
5007         return 1;
5008 }
5009
5010 /*
5011  * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
5013  *
5014  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5015  */
5016 int btrfs_previous_extent_item(struct btrfs_root *root,
5017                         struct btrfs_path *path, u64 min_objectid)
5018 {
5019         struct btrfs_key found_key;
5020         struct extent_buffer *leaf;
5021         u32 nritems;
5022         int ret;
5023
5024         while (1) {
5025                 if (path->slots[0] == 0) {
5026                         ret = btrfs_prev_leaf(root, path);
5027                         if (ret != 0)
5028                                 return ret;
5029                 } else {
5030                         path->slots[0]--;
5031                 }
5032                 leaf = path->nodes[0];
5033                 nritems = btrfs_header_nritems(leaf);
5034                 if (nritems == 0)
5035                         return 1;
5036                 if (path->slots[0] == nritems)
5037                         path->slots[0]--;
5038
5039                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5040                 if (found_key.objectid < min_objectid)
5041                         break;
5042                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5043                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5044                         return 0;
5045                 if (found_key.objectid == min_objectid &&
5046                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5047                         break;
5048         }
5049         return 1;
5050 }
5051
5052 int __init btrfs_ctree_init(void)
5053 {
5054         btrfs_path_cachep = kmem_cache_create("btrfs_path",
5055                         sizeof(struct btrfs_path), 0,
5056                         SLAB_MEM_SPREAD, NULL);
5057         if (!btrfs_path_cachep)
5058                 return -ENOMEM;
5059         return 0;
5060 }
5061
/* Tear down the btrfs_path slab cache created by btrfs_ctree_init(). */
void __cold btrfs_ctree_exit(void)
{
	kmem_cache_destroy(btrfs_path_cachep);
}