7397579786863c1e4b4fdd0ffe9b374705f14108
[linux-2.6-block.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17
/*
 * Forward declarations for the tree balancing helpers defined later in
 * this file.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
31
/*
 * Table of supported checksum algorithms, indexed by the on-disk csum type.
 * Maps each type to its digest size in bytes and a human readable name.
 */
static const struct btrfs_csums {
	u16		size;
	const char	*name;
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
};
38
39 int btrfs_super_csum_size(const struct btrfs_super_block *s)
40 {
41         u16 t = btrfs_super_csum_type(s);
42         /*
43          * csum type is validated at mount time
44          */
45         return btrfs_csums[t].size;
46 }
47
48 const char *btrfs_super_csum_name(u16 csum_type)
49 {
50         /* csum type is validated at mount time */
51         return btrfs_csums[csum_type].name;
52 }
53
/* Allocate a zeroed btrfs_path from the dedicated slab cache (GFP_NOFS). */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
58
59 /* this also releases the path */
60 void btrfs_free_path(struct btrfs_path *p)
61 {
62         if (!p)
63                 return;
64         btrfs_release_path(p);
65         kmem_cache_free(btrfs_path_cachep, p);
66 }
67
68 /*
69  * path release drops references on the extent buffers in the path
70  * and it drops any locks held by this path
71  *
72  * It is safe to call this on paths that no locks or extent buffers held.
73  */
74 noinline void btrfs_release_path(struct btrfs_path *p)
75 {
76         int i;
77
78         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
79                 p->slots[i] = 0;
80                 if (!p->nodes[i])
81                         continue;
82                 if (p->locks[i]) {
83                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
84                         p->locks[i] = 0;
85                 }
86                 free_extent_buffer(p->nodes[i]);
87                 p->nodes[i] = NULL;
88         }
89 }
90
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* Wait out the grace period so the next read sees a new root. */
		synchronize_rcu();
	}
	return eb;
}
124
125 /* loop around taking references on and locking the root node of the
126  * tree until you end up with a lock on the root.  A locked buffer
127  * is returned, with a reference held.
128  */
129 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
130 {
131         struct extent_buffer *eb;
132
133         while (1) {
134                 eb = btrfs_root_node(root);
135                 btrfs_tree_lock(eb);
136                 if (eb == root->node)
137                         break;
138                 btrfs_tree_unlock(eb);
139                 free_extent_buffer(eb);
140         }
141         return eb;
142 }
143
144 /* loop around taking references on and locking the root node of the
145  * tree until you end up with a lock on the root.  A locked buffer
146  * is returned, with a reference held.
147  */
148 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
149 {
150         struct extent_buffer *eb;
151
152         while (1) {
153                 eb = btrfs_root_node(root);
154                 btrfs_tree_read_lock(eb);
155                 if (eb == root->node)
156                         break;
157                 btrfs_tree_read_unlock(eb);
158                 free_extent_buffer(eb);
159         }
160         return eb;
161 }
162
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* Fast path: already dirty, or this root's dirtiness is not tracked. */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* test_and_set: only the task that flips the bit moves the root. */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
187
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* Warn if a refcounted root is copied outside its own transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	/* The first key is read differently for leaves and internal nodes. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Clone the contents, then rewrite the header for the new block. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Take extent refs on everything the copied root points to. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
246
/* Types of operations recorded in the tree modification log. */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

/* Describes the old root in a MOD_LOG_ROOT_REPLACE record. */
struct tree_mod_root {
	u64 logical;
	u8 level;
};

/* One element of the tree mod log, keyed in the rb tree by (logical, seq). */
struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
287
/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	/* Atomic increment; the returned value is unique per call. */
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
295
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 *
 * NOTE(review): when elem->seq was already non-zero the existing value is
 * returned, not a fresh one — the sentence above only holds for first-time
 * callers; confirm against the callers' expectations.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/* Lock order: tree_mod_log_lock is taken before tree_mod_seq_lock. */
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
318
/*
 * Drop the blocker @elem, then garbage collect all tree mod log elements
 * whose sequence number is at or below the oldest remaining blocker.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* An element that never got a sequence number holds no blocker. */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* Find the smallest sequence number among the remaining blockers. */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Grab the successor before erasing the current node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
368
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	/* The sequence number is assigned only now, under the log lock. */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		/*
		 * The comparisons below keep the tree in descending
		 * (logical, seq) order; __tree_mod_log_search descends
		 * with the same convention.
		 */
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
410
411 /*
412  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
413  * returns zero with the tree_mod_log_lock acquired. The caller must hold
414  * this until all tree mod log insertions are recorded in the rb tree and then
415  * write unlock fs_info::tree_mod_log_lock.
416  */
417 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
418                                     struct extent_buffer *eb) {
419         smp_mb();
420         if (list_empty(&(fs_info)->tree_mod_seq_list))
421                 return 1;
422         if (eb && btrfs_header_level(eb) == 0)
423                 return 1;
424
425         write_lock(&fs_info->tree_mod_log_lock);
426         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
427                 write_unlock(&fs_info->tree_mod_log_lock);
428                 return 1;
429         }
430
431         return 0;
432 }
433
434 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
435 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
436                                     struct extent_buffer *eb)
437 {
438         smp_mb();
439         if (list_empty(&(fs_info)->tree_mod_seq_list))
440                 return 0;
441         if (eb && btrfs_header_level(eb) == 0)
442                 return 0;
443
444         return 1;
445 }
446
/*
 * Allocate and fill a tree mod log element describing operation @op on
 * @slot of @eb.  Returns NULL on allocation failure.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	/* For ADD there is no previous key/blockptr in the slot to record. */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	/* Not linked into the rb tree yet; see __tree_mod_log_insert. */
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
469
/*
 * Record a single key operation @op on @slot of @eb in the tree mod log.
 * Returns 0 on success or when logging is not needed, negative errno on
 * failure.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate before taking tree_mod_log_lock. */
	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* On returning 0 this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
495
/*
 * Record the move of @nr_items keys from @src_slot to @dst_slot within @eb
 * in the tree mod log.  Keys overwritten by the move are logged as removals
 * first.  Returns 0 on success, negative errno on failure.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* All allocations happen before taking tree_mod_log_lock. */
	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On returning 0 this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* Unwind: unlink any elements already inserted into the rb tree. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
568
/*
 * Insert the @nritems elements of @tm_list into the tree mod log, in
 * reverse slot order.  On failure, elements inserted so far are unlinked
 * from the rb tree again; freeing them is left to the caller.  The caller
 * must hold fs_info::tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Roll back the insertions done before the failure. */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
589
/*
 * Record the replacement of root node @old_root by @new_root in the tree
 * mod log.  If @log_removal is set and the old root is not a leaf, the
 * removal of all its pointers is logged first.  Returns 0 on success,
 * negative errno on failure.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* Root replace elements are keyed by the *new* root's address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* On returning 0 this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
658
/*
 * Search the tree mod log for an element of block @start with seq >=
 * @min_seq.  With @smallest set, the matching element with the smallest
 * seq is returned, otherwise the one with the largest.  Returns NULL when
 * nothing matches.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/* Descend using the same ordering as __tree_mod_log_insert. */
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on min_seq; cannot do better. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
700
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	/* smallest == 1 selects the oldest matching element. */
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
712
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	/* smallest == 0 selects the most recent matching element. */
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
723
/*
 * Log the copy of @nr_items pointers from @src (starting at @src_offset)
 * to @dst (starting at @dst_offset): one KEY_REMOVE per source slot and
 * one KEY_ADD per destination slot.  Returns 0 on success, negative errno
 * on failure.
 */
static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf contents are not logged. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array holds both halves: adds first, then removes. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On returning 0 this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unwind: unlink any elements already inserted into the rb tree. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
794
/*
 * Log the freeing of node @eb by recording the removal of all its
 * pointers.  No-op for leaves or when logging is not needed.  Returns 0
 * on success, negative errno on failure.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On returning 0 this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	/* Unlinks already-inserted elements itself on failure. */
	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
840
841 /*
842  * check if the tree block can be shared by multiple trees
843  */
844 int btrfs_block_can_be_shared(struct btrfs_root *root,
845                               struct extent_buffer *buf)
846 {
847         /*
848          * Tree blocks not in reference counted trees and tree roots
849          * are never shared. If a block was allocated after the last
850          * snapshot and the block was not allocated by tree relocation,
851          * we know the block is not shared.
852          */
853         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
854             buf != root->node && buf != root->commit_root &&
855             (btrfs_header_generation(buf) <=
856              btrfs_root_last_snapshot(&root->root_item) ||
857              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
858                 return 1;
859
860         return 0;
861 }
862
/*
 * Fix up extent back references when COWing @buf into @cow.
 *
 * Depending on whether the extent backing @buf is shared (refs > 1) and on
 * its backref mode (normal vs. FULL_BACKREF), issue the ref increments and
 * decrements needed so both the original block and the new copy carry
 * correct extent references.  When @buf loses its last reference, *last_ref
 * is set to 1 so the caller can free the block.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Possibly shared: read the real refcount and flags. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/*
			 * A live tree block must have at least one ref;
			 * zero here means on-disk corruption.
			 */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shareable, so it has exactly our reference. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Relocation-tree blocks must already be in full-backref mode. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* Shared block: switch it to full backrefs if needed. */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			/* Ref the new copy; mode depends on the tree. */
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* Persist the FULL_BACKREF flag on the extent. */
			ret = btrfs_set_disk_extent_flags(trans,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/*
		 * Last reference: the original block goes away once the
		 * caller frees it (signalled via *last_ref).
		 */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}
973
974 static struct extent_buffer *alloc_tree_block_no_bg_flush(
975                                           struct btrfs_trans_handle *trans,
976                                           struct btrfs_root *root,
977                                           u64 parent_start,
978                                           const struct btrfs_disk_key *disk_key,
979                                           int level,
980                                           u64 hint,
981                                           u64 empty_size)
982 {
983         struct btrfs_fs_info *fs_info = root->fs_info;
984         struct extent_buffer *ret;
985
986         /*
987          * If we are COWing a node/leaf from the extent, chunk, device or free
988          * space trees, make sure that we do not finish block group creation of
989          * pending block groups. We do this to avoid a deadlock.
990          * COWing can result in allocation of a new chunk, and flushing pending
991          * block groups (btrfs_create_pending_block_groups()) can be triggered
992          * when finishing allocation of a new chunk. Creation of a pending block
993          * group modifies the extent, chunk, device and free space trees,
994          * therefore we could deadlock with ourselves since we are holding a
995          * lock on an extent buffer that btrfs_create_pending_block_groups() may
996          * try to COW later.
997          * For similar reasons, we also need to delay flushing pending block
998          * groups when splitting a leaf or node, from one of those trees, since
999          * we are holding a write lock on it and its parent or when inserting a
1000          * new root node for one of those trees.
1001          */
1002         if (root == fs_info->extent_root ||
1003             root == fs_info->chunk_root ||
1004             root == fs_info->dev_root ||
1005             root == fs_info->free_space_root)
1006                 trans->can_flush_pending_bgs = false;
1007
1008         ret = btrfs_alloc_tree_block(trans, root, parent_start,
1009                                      root->root_key.objectid, disk_key, level,
1010                                      hint, empty_size);
1011         trans->can_flush_pending_bgs = true;
1012
1013         return ret;
1014 }
1015
1016 /*
1017  * does the dirty work in cow of a single block.  The parent block (if
1018  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1019  * dirty and returned locked.  If you modify the block it needs to be marked
1020  * dirty again.
1021  *
1022  * search_start -- an allocation hint for the new block
1023  *
1024  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1025  * bytes the allocator should try to find free next to the block it returns.
1026  * This is just a hint and may be ignored by the allocator.
1027  */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;	/* set by update_ref_for_cow() when buf dies */
	int unlock_orig = 0;
	u64 parent_start = 0;

	/* Caller asked us to reuse its slot: unlock buf before returning. */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/*
	 * For reference counted trees, COW must happen within the currently
	 * running transaction and the one the root last joined.
	 */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block, used as allocation hint metadata. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Copy contents, then fix up the header for the new location. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* Adjust extent backrefs for the old and new blocks. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/*
		 * COWing the root node: log the replacement and repoint
		 * root->node at the new copy.
		 */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Interior block: update the parent's pointer to point at cow. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* Return the new block, locked and dirty. */
	*cow_ret = cow;
	return 0;
}
1138
1139 /*
1140  * returns the logical address of the oldest predecessor of the given root.
1141  * entries older than time_seq are ignored.
1142  */
1143 static struct tree_mod_elem *__tree_mod_log_oldest_root(
1144                 struct extent_buffer *eb_root, u64 time_seq)
1145 {
1146         struct tree_mod_elem *tm;
1147         struct tree_mod_elem *found = NULL;
1148         u64 root_logical = eb_root->start;
1149         int looped = 0;
1150
1151         if (!time_seq)
1152                 return NULL;
1153
1154         /*
1155          * the very last operation that's logged for a root is the
1156          * replacement operation (if it is replaced at all). this has
1157          * the logical address of the *new* root, making it the very
1158          * first operation that's logged for this root.
1159          */
1160         while (1) {
1161                 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1162                                                 time_seq);
1163                 if (!looped && !tm)
1164                         return NULL;
1165                 /*
1166                  * if there are no tree operation for the oldest root, we simply
1167                  * return it. this should only happen if that (old) root is at
1168                  * level 0.
1169                  */
1170                 if (!tm)
1171                         break;
1172
1173                 /*
1174                  * if there's an operation that's not a root replacement, we
1175                  * found the oldest version of our root. normally, we'll find a
1176                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1177                  */
1178                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1179                         break;
1180
1181                 found = tm;
1182                 root_logical = tm->old_root.logical;
1183                 looped = 1;
1184         }
1185
1186         /* if there's no old root to return, return what we found instead */
1187         if (!found)
1188                 found = tm;
1189
1190         return found;
1191 }
1192
1193 /*
1194  * tm is a pointer to the first operation to rewind within eb. then, all
1195  * previous operations will be rewound (until we reach something older than
1196  * time_seq).
1197  */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;			/* running item count while rewinding */
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	/* Hold the mod log lock while iterating the rbtree of elements. */
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* Undo a removal: restore the key/ptr and grow n. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* Undo a replace: write back the old key/ptr. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* Undo a move: shift the key ptrs back. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Stop once the next element belongs to a different block. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	/* Write back the item count as it was at time_seq. */
	btrfs_set_header_nritems(eb, n);
}
1268
1269 /*
1270  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1271  * is returned. If rewind operations happen, a fresh buffer is returned. The
1272  * returned buffer is always read-locked. If the returned buffer is not the
1273  * input buffer, the lock on the input buffer is released and the input buffer
1274  * is freed (its refcount is decremented).
1275  */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* No time_seq means no rewinding is wanted. */
	if (!time_seq)
		return eb;

	/* Leaves are not rewound via the tree mod log. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	/* Nothing logged for this block at time_seq: nothing to undo. */
	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_read(eb);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed; rebuild it from scratch on a dummy
		 * buffer, copying the header fields from eb.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Otherwise rewind on a private clone of the buffer. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* Hand the lock and reference over from eb to eb_rewin. */
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1328
1329 /*
1330  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1331  * value. If there are no changes, the current root->root_node is returned. If
1332  * anything changed in between, there's a fresh buffer allocated on which the
1333  * rewind operations are done. In any case, the returned buffer is read locked.
1334  * Returns NULL on error (with no locks held).
1335  */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	/* No logged changes: the current root is valid for time_seq. */
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind from the old root block. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The old root block still exists on disk; read it and
		 * rewind on a private clone.
		 */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* Old root was freed; rebuild it on a dummy buffer. */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Rewind directly on a clone of the current root. */
		btrfs_set_lock_blocking_read(eb_root);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	btrfs_tree_read_lock(eb);
	if (old_root) {
		/* Restore the header fields of the replaced root. */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1410
1411 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1412 {
1413         struct tree_mod_elem *tm;
1414         int level;
1415         struct extent_buffer *eb_root = btrfs_root_node(root);
1416
1417         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1418         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1419                 level = tm->old_root.level;
1420         } else {
1421                 level = btrfs_header_level(eb_root);
1422         }
1423         free_extent_buffer(eb_root);
1424
1425         return level;
1426 }
1427
1428 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1429                                    struct btrfs_root *root,
1430                                    struct extent_buffer *buf)
1431 {
1432         if (btrfs_is_testing(root->fs_info))
1433                 return 0;
1434
1435         /* Ensure we can see the FORCE_COW bit */
1436         smp_mb__before_atomic();
1437
1438         /*
1439          * We do not need to cow a block if
1440          * 1) this block is not created or changed in this transaction;
1441          * 2) this block does not belong to TREE_RELOC tree;
1442          * 3) the root is not forced COW.
1443          *
1444          * What is forced COW:
1445          *    when we create snapshot during committing the transaction,
1446          *    after we've finished copying src root, we must COW the shared
1447          *    block to ensure the metadata consistency.
1448          */
1449         if (btrfs_header_generation(buf) == trans->transid &&
1450             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1451             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1452               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1453             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1454                 return 0;
1455         return 1;
1456 }
1457
1458 /*
1459  * cows a single block, see __btrfs_cow_block for the real work.
1460  * This version of it has extra checks so that a block isn't COWed more than
1461  * once per transaction, as long as it hasn't been written yet
1462  */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* COWing a root being deleted is unexpected; warn loudly. */
	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	/* Sanity: the handle must belong to the running transaction. */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	/* Already COWed in this transaction: hand the buffer back as-is. */
	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	/* Hint the allocator towards the same 1GiB-aligned region. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking_write(parent);
	btrfs_set_lock_blocking_write(buf);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also We don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1511
1512 /*
1513  * helper function for defrag to decide if two blocks pointed to by a
1514  * node are actually close by
1515  */
1516 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1517 {
1518         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1519                 return 1;
1520         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1521                 return 1;
1522         return 0;
1523 }
1524
1525 /*
1526  * compare two keys in a memcmp fashion
1527  */
1528 static int comp_keys(const struct btrfs_disk_key *disk,
1529                      const struct btrfs_key *k2)
1530 {
1531         struct btrfs_key k1;
1532
1533         btrfs_disk_key_to_cpu(&k1, disk);
1534
1535         return btrfs_comp_cpu_keys(&k1, k2);
1536 }
1537
1538 /*
1539  * same as comp_keys only with two btrfs_key's
1540  */
1541 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1542 {
1543         if (k1->objectid > k2->objectid)
1544                 return 1;
1545         if (k1->objectid < k2->objectid)
1546                 return -1;
1547         if (k1->type > k2->type)
1548                 return 1;
1549         if (k1->type < k2->type)
1550                 return -1;
1551         if (k1->offset > k2->offset)
1552                 return 1;
1553         if (k1->offset < k2->offset)
1554                 return -1;
1555         return 0;
1556 }
1557
1558 /*
1559  * this is used by the defrag code to go through all the
1560  * leaves pointed to by a node and reallocate them so that
1561  * disk order is close to key order
1562  */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;	/* allocator hint for the next COW */
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	/* This must run inside the currently running transaction. */
	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* Nothing to reorder with zero or one child. */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking_write(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		/* Skip slots whose key sorts before the caller's progress key. */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		/*
		 * Leave the block alone if it is already close on disk to
		 * either of its neighbors.
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* Make sure we have an uptodate in-memory copy of the child. */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1,&first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking_write(cur);
		/*
		 * COW the child near search_start; the last argument caps
		 * the allocation cluster hint at 16 blocks.
		 */
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		/* Remember where the new copy landed for the next iteration. */
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1674
1675 /*
1676  * search for key in the extent_buffer.  The items start at offset p,
1677  * and they are item_size apart.  There are 'max' items in p.
1678  *
1679  * the slot in the array is returned via slot, and it points to
1680  * the place where you would insert key if it is not found in
1681  * the array.
1682  *
1683  * slot may point to max if the key is bigger than all of the keys
1684  */
1685 static noinline int generic_bin_search(struct extent_buffer *eb,
1686                                        unsigned long p, int item_size,
1687                                        const struct btrfs_key *key,
1688                                        int max, int *slot)
1689 {
1690         int low = 0;
1691         int high = max;
1692         int mid;
1693         int ret;
1694         struct btrfs_disk_key *tmp = NULL;
1695         struct btrfs_disk_key unaligned;
1696         unsigned long offset;
1697         char *kaddr = NULL;
1698         unsigned long map_start = 0;
1699         unsigned long map_len = 0;
1700         int err;
1701
1702         if (low > high) {
1703                 btrfs_err(eb->fs_info,
1704                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1705                           __func__, low, high, eb->start,
1706                           btrfs_header_owner(eb), btrfs_header_level(eb));
1707                 return -EINVAL;
1708         }
1709
1710         while (low < high) {
1711                 mid = (low + high) / 2;
1712                 offset = p + mid * item_size;
1713
1714                 if (!kaddr || offset < map_start ||
1715                     (offset + sizeof(struct btrfs_disk_key)) >
1716                     map_start + map_len) {
1717
1718                         err = map_private_extent_buffer(eb, offset,
1719                                                 sizeof(struct btrfs_disk_key),
1720                                                 &kaddr, &map_start, &map_len);
1721
1722                         if (!err) {
1723                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1724                                                         map_start);
1725                         } else if (err == 1) {
1726                                 read_extent_buffer(eb, &unaligned,
1727                                                    offset, sizeof(unaligned));
1728                                 tmp = &unaligned;
1729                         } else {
1730                                 return err;
1731                         }
1732
1733                 } else {
1734                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1735                                                         map_start);
1736                 }
1737                 ret = comp_keys(tmp, key);
1738
1739                 if (ret < 0)
1740                         low = mid + 1;
1741                 else if (ret > 0)
1742                         high = mid;
1743                 else {
1744                         *slot = mid;
1745                         return 0;
1746                 }
1747         }
1748         *slot = low;
1749         return 1;
1750 }
1751
1752 /*
1753  * simple bin_search frontend that does the right thing for
1754  * leaves vs nodes
1755  */
1756 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1757                      int level, int *slot)
1758 {
1759         if (level == 0)
1760                 return generic_bin_search(eb,
1761                                           offsetof(struct btrfs_leaf, items),
1762                                           sizeof(struct btrfs_item),
1763                                           key, btrfs_header_nritems(eb),
1764                                           slot);
1765         else
1766                 return generic_bin_search(eb,
1767                                           offsetof(struct btrfs_node, ptrs),
1768                                           sizeof(struct btrfs_key_ptr),
1769                                           key, btrfs_header_nritems(eb),
1770                                           slot);
1771 }
1772
1773 static void root_add_used(struct btrfs_root *root, u32 size)
1774 {
1775         spin_lock(&root->accounting_lock);
1776         btrfs_set_root_used(&root->root_item,
1777                             btrfs_root_used(&root->root_item) + size);
1778         spin_unlock(&root->accounting_lock);
1779 }
1780
1781 static void root_sub_used(struct btrfs_root *root, u32 size)
1782 {
1783         spin_lock(&root->accounting_lock);
1784         btrfs_set_root_used(&root->root_item,
1785                             btrfs_root_used(&root->root_item) - size);
1786         spin_unlock(&root->accounting_lock);
1787 }
1788
1789 /* given a node and slot number, this reads the blocks it points to.  The
1790  * extent buffer is returned with a reference taken (but unlocked).
1791  */
1792 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1793                                            int slot)
1794 {
1795         int level = btrfs_header_level(parent);
1796         struct extent_buffer *eb;
1797         struct btrfs_key first_key;
1798
1799         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1800                 return ERR_PTR(-ENOENT);
1801
1802         BUG_ON(level == 0);
1803
1804         btrfs_node_key_to_cpu(parent, &first_key, slot);
1805         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1806                              btrfs_node_ptr_generation(parent, slot),
1807                              level - 1, &first_key);
1808         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1809                 free_extent_buffer(eb);
1810                 eb = ERR_PTR(-EIO);
1811         }
1812
1813         return eb;
1814 }
1815
1816 /*
1817  * node level balancing, used to make sure nodes are in proper order for
1818  * item deletion.  We balance from the top down, so we have to make sure
1819  * that a deletion won't leave an node completely empty later on.
1820  */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	/* The node being balanced must be write locked and COWed by us. */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* Remember where the path pointed so we can sanity check at the end. */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking_write(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/* the old root pointer reference still keeps mid alive here */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* More than a quarter full: no balancing needed. */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* A read error just means we balance with one fewer neighbor. */
	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is empty now, remove it from the tree */
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* update the parent key to match right's new first key */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			/* fall back to pushing mid into left instead */
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is empty now, remove it from the tree */
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the slot we were pointing at moved into left */
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2044
2045 /* Node balancing for insertion.  Here we only split or push nodes around
2046  * when they are completely full.  This is also done top down, so we
2047  * have to be pessimistic.
2048  */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* mid is the root, there are no siblings to push into */
	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left is (nearly) full too, cannot push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* we pushed, fix up the parent key for mid */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			/* point the path at whichever node holds our slot now */
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to push from mid into the right sibling
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right is (nearly) full too, cannot push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			/* we pushed, fix up the parent key for right */
			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			/* point the path at whichever node holds our slot now */
			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	/* nothing could be pushed, the caller has to split the node */
	return 1;
}
2185
2186 /*
2187  * readahead one full node of leaves, finding things that are close
2188  * to the block in 'slot', and triggering ra on them.
2189  */
2190 static void reada_for_search(struct btrfs_fs_info *fs_info,
2191                              struct btrfs_path *path,
2192                              int level, int slot, u64 objectid)
2193 {
2194         struct extent_buffer *node;
2195         struct btrfs_disk_key disk_key;
2196         u32 nritems;
2197         u64 search;
2198         u64 target;
2199         u64 nread = 0;
2200         struct extent_buffer *eb;
2201         u32 nr;
2202         u32 blocksize;
2203         u32 nscan = 0;
2204
2205         if (level != 1)
2206                 return;
2207
2208         if (!path->nodes[level])
2209                 return;
2210
2211         node = path->nodes[level];
2212
2213         search = btrfs_node_blockptr(node, slot);
2214         blocksize = fs_info->nodesize;
2215         eb = find_extent_buffer(fs_info, search);
2216         if (eb) {
2217                 free_extent_buffer(eb);
2218                 return;
2219         }
2220
2221         target = search;
2222
2223         nritems = btrfs_header_nritems(node);
2224         nr = slot;
2225
2226         while (1) {
2227                 if (path->reada == READA_BACK) {
2228                         if (nr == 0)
2229                                 break;
2230                         nr--;
2231                 } else if (path->reada == READA_FORWARD) {
2232                         nr++;
2233                         if (nr >= nritems)
2234                                 break;
2235                 }
2236                 if (path->reada == READA_BACK && objectid) {
2237                         btrfs_node_key(node, &disk_key, nr);
2238                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2239                                 break;
2240                 }
2241                 search = btrfs_node_blockptr(node, nr);
2242                 if ((search <= target && target - search <= 65536) ||
2243                     (search > target && search - target <= 65536)) {
2244                         readahead_tree_block(fs_info, search);
2245                         nread += blocksize;
2246                 }
2247                 nscan++;
2248                 if ((nread > 65536 || nscan > 32))
2249                         break;
2250         }
2251 }
2252
2253 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2254                                        struct btrfs_path *path, int level)
2255 {
2256         int slot;
2257         int nritems;
2258         struct extent_buffer *parent;
2259         struct extent_buffer *eb;
2260         u64 gen;
2261         u64 block1 = 0;
2262         u64 block2 = 0;
2263
2264         parent = path->nodes[level + 1];
2265         if (!parent)
2266                 return;
2267
2268         nritems = btrfs_header_nritems(parent);
2269         slot = path->slots[level + 1];
2270
2271         if (slot > 0) {
2272                 block1 = btrfs_node_blockptr(parent, slot - 1);
2273                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2274                 eb = find_extent_buffer(fs_info, block1);
2275                 /*
2276                  * if we get -eagain from btrfs_buffer_uptodate, we
2277                  * don't want to return eagain here.  That will loop
2278                  * forever
2279                  */
2280                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2281                         block1 = 0;
2282                 free_extent_buffer(eb);
2283         }
2284         if (slot + 1 < nritems) {
2285                 block2 = btrfs_node_blockptr(parent, slot + 1);
2286                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2287                 eb = find_extent_buffer(fs_info, block2);
2288                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2289                         block2 = 0;
2290                 free_extent_buffer(eb);
2291         }
2292
2293         if (block1)
2294                 readahead_tree_block(fs_info, block1);
2295         if (block2)
2296                 readahead_tree_block(fs_info, block2);
2297 }
2298
2299
2300 /*
2301  * when we walk down the tree, it is usually safe to unlock the higher layers
2302  * in the tree.  The exceptions are when our path goes through slot 0, because
2303  * operations on the tree might require changing key pointers higher up in the
2304  * tree.
2305  *
2306  * callers might also have set path->keep_locks, which tells this code to keep
2307  * the lock if the path points to the last slot in the block.  This is part of
2308  * walking through the tree, and selecting the next slot in the higher block.
2309  *
2310  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2311  * if lowest_unlock is 1, level 0 won't be unlocked
2312  */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels at or below skip_level are kept locked */
	int skip_level = level;
	/* once set, lower levels no longer force higher ones to stay locked */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/*
		 * slot 0: a change below may need to update this node's key
		 * in its parent, so keep this level (and the next) locked
		 */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/*
		 * keep_locks: hold the lock when sitting on the last slot,
		 * so callers can step to the next slot of the higher block
		 */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* track the highest level still write locked */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2355
/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	/* Location, generation and first key of the child block at @slot. */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	parent_level = btrfs_header_level(b);
	btrfs_node_key_to_cpu(b, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	/* Locks were dropped for the read, so drop the whole path too. */
	btrfs_release_path(p);
	return ret;
}
2453
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/*
	 * Inserting (or splitting) into a nearly full node: split it.
	 * Splitting touches the parent, so a write lock one level up is
	 * required; if we don't have it, raise *write_lock_level and retry.
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		int sret;

		/*
		 * Deleting from a node less than half full: try to balance
		 * it against its neighbors.  This too modifies the parent,
		 * hence the same write-lock requirement.
		 */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			/* Node was merged away entirely; restart the search. */
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2524
/*
 * Find the slot for @key in node @b.  Once an exact match has been seen
 * higher up the path (*prev_cmp == 0), every lower level must descend
 * through slot 0, so the binary search can be skipped entirely.
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		/* An ancestor matched exactly: the key lives under slot 0. */
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2537
2538 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2539                 u64 iobjectid, u64 ioff, u8 key_type,
2540                 struct btrfs_key *found_key)
2541 {
2542         int ret;
2543         struct btrfs_key key;
2544         struct extent_buffer *eb;
2545
2546         ASSERT(path);
2547         ASSERT(found_key);
2548
2549         key.type = key_type;
2550         key.objectid = iobjectid;
2551         key.offset = ioff;
2552
2553         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2554         if (ret < 0)
2555                 return ret;
2556
2557         eb = path->nodes[0];
2558         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2559                 ret = btrfs_next_leaf(fs_root, path);
2560                 if (ret)
2561                         return ret;
2562                 eb = path->nodes[0];
2563         }
2564
2565         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2566         if (found_key->type != key.type ||
2567                         found_key->objectid != key.objectid)
2568                 return 1;
2569
2570         return 0;
2571 }
2572
/*
 * Look up and lock the root node of @root for the search described by @p.
 *
 * Returns the root extent buffer with the chosen lock recorded in
 * @p->locks (unless locking is skipped), or an ERR_PTR on failure.
 * Callers are responsible for dropping the returned buffer's reference.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			atomic_inc(&b->refs);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
2656
2657
/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:      Handle of transaction, used when modifying the tree
 * @p:          Holds all btree nodes along the search path
 * @root:       The root node of the tree
 * @key:        The key we are looking for
 * @ins_len:    Indicates purpose of search, for inserts it is 1, for
 *              deletions it's -1. 0 for plain searches
 * @cow:        boolean should CoW operations be performed. Must always be 1
 *              when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		/* set when we step back one slot for a non-exact match */
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			/* Split the leaf if it can't hold ins_len more bytes. */
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		if (ret && slot > 0) {
			/* no exact match: descend via the preceding pointer */
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* try a non-blocking lock first */
				if (!btrfs_try_tree_write_lock(b)) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
				}
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				if (!btrfs_tree_read_lock_atomic(b)) {
					btrfs_set_path_blocking(p);
					btrfs_tree_read_lock(b);
				}
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2910
/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		/* the commit root is already a past version, no rewinding */
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		/* set when we step back one slot for a non-exact match */
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		if (!btrfs_tree_read_lock_atomic(b)) {
			btrfs_set_path_blocking(p);
			btrfs_tree_read_lock(b);
		}
		/* rewind the block to its state at time_seq via the mod log */
		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3024
/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/* step back from the first free slot if needed */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}
3101
3102 /*
3103  * adjust the pointers going up the tree, starting at level
3104  * making sure the right key of each node is points to 'key'.
3105  * This is used after shifting pointers to the left, so it stops
3106  * fixing up pointers when a given leaf/node is not in slot 0 of the
3107  * higher levels
3108  *
3109  */
3110 static void fixup_low_keys(struct btrfs_path *path,
3111                            struct btrfs_disk_key *key, int level)
3112 {
3113         int i;
3114         struct extent_buffer *t;
3115         int ret;
3116
3117         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3118                 int tslot = path->slots[i];
3119
3120                 if (!path->nodes[i])
3121                         break;
3122                 t = path->nodes[i];
3123                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3124                                 GFP_ATOMIC);
3125                 BUG_ON(ret < 0);
3126                 btrfs_set_node_key(t, key, tslot);
3127                 btrfs_mark_buffer_dirty(path->nodes[i]);
3128                 if (tslot != 0)
3129                         break;
3130         }
3131 }
3132
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	/* the new key must still sort strictly after the previous item ... */
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	/* ... and strictly before the next item in the leaf */
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* changing slot 0 means the parent pointers must be updated too */
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
3184
3185 /*
3186  * try to push data from one node into the next node left in the
3187  * tree.
3188  *
3189  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3190  * error, and > 0 if there was no room in the left hand block.
3191  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free slots in the destination bound how many pointers can move. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both buffers must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* A nearly empty source isn't worth rebalancing unless emptying it. */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		/* Non-empty mode: always keep at least 8 pointers in src. */
		push_items = min(src_nritems - 8, push_items);

	/* Log the key moves in the tree mod log before touching buffers. */
	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Append the first push_items pointers of src after dst's items. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3256
3257 /*
3258  * try to push data from one node into the next node right in the
3259  * tree.
3260  *
3261  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3262  * error, and > 0 if there was no room in the right hand block.
3263  *
3264  * this will  only push up to 1/2 the contents of the left node over
3265  */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* Both buffers must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free slots in the destination bound how many pointers can move. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* Too few items for a balance to be worthwhile. */
	if (src_nritems < 4)
		return 1;

	/* Push at most half of the source (rounded up). */
	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* Log the shift of dst's existing pointers before moving them. */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	/* Shift dst's pointers right to make room at the front. */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* Log, then copy, the tail of src into the front of dst. */
	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3323
3324 /*
3325  * helper function to insert a new root level in the tree.
3326  * A new node is allocated, and a single item is inserted to
3327  * point to the existing root
3328  *
3329  * returns zero on success or < 0 on failure.
3330  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* The caller must be extending the tree right above the old root. */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* The old root's first key becomes the single key of the new root. */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* New root holds exactly one pointer: to the old root. */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	/* The old root should have been COWed in this transaction. */
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* Log the root swap, then publish the new root pointer. */
	old = root->node;
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* Extra ref for the reference the path now holds on 'c'. */
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3384
3385 /*
3386  * worker function to insert a single pointer in a node.
3387  * the node should have enough room for the pointer already
3388  *
3389  * slot and level indicate where you want the key to go, and
3390  * blocknr is the block the key points to.
3391  */
3392 static void insert_ptr(struct btrfs_trans_handle *trans,
3393                        struct btrfs_path *path,
3394                        struct btrfs_disk_key *key, u64 bytenr,
3395                        int slot, int level)
3396 {
3397         struct extent_buffer *lower;
3398         int nritems;
3399         int ret;
3400
3401         BUG_ON(!path->nodes[level]);
3402         btrfs_assert_tree_locked(path->nodes[level]);
3403         lower = path->nodes[level];
3404         nritems = btrfs_header_nritems(lower);
3405         BUG_ON(slot > nritems);
3406         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3407         if (slot != nritems) {
3408                 if (level) {
3409                         ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3410                                         nritems - slot);
3411                         BUG_ON(ret < 0);
3412                 }
3413                 memmove_extent_buffer(lower,
3414                               btrfs_node_key_ptr_offset(slot + 1),
3415                               btrfs_node_key_ptr_offset(slot),
3416                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3417         }
3418         if (level) {
3419                 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3420                                 GFP_NOFS);
3421                 BUG_ON(ret < 0);
3422         }
3423         btrfs_set_node_key(lower, key, slot);
3424         btrfs_set_node_blockptr(lower, slot, bytenr);
3425         WARN_ON(trans->transid == 0);
3426         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3427         btrfs_set_header_nritems(lower, nritems + 1);
3428         btrfs_mark_buffer_dirty(lower);
3429 }
3430
3431 /*
3432  * split the node at the specified level in path in two.
3433  * The path is corrected to point to the appropriate node after the split
3434  *
3435  * Before splitting this tries to make some room in the node by pushing
3436  * left and right, if either one works, it returns right away.
3437  *
3438  * returns 0 on success and < 0 on failure
3439  */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	/* The node must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* Try pushing into siblings first; a split may not be needed. */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* Split point: the upper half moves into the new node. */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* Log the copy of [mid, c_nritems) before modifying the buffers. */
	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new node into the parent, right after 'c'. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* Point the path at whichever half now contains its slot. */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3521
3522 /*
3523  * how many bytes are required to store the items in a leaf.  start
3524  * and nr indicate which items in the leaf to check.  This totals up the
3525  * space used both by the item structs and the item data
3526  */
3527 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3528 {
3529         struct btrfs_item *start_item;
3530         struct btrfs_item *end_item;
3531         struct btrfs_map_token token;
3532         int data_len;
3533         int nritems = btrfs_header_nritems(l);
3534         int end = min(nritems, start + nr) - 1;
3535
3536         if (!nr)
3537                 return 0;
3538         btrfs_init_map_token(&token, l);
3539         start_item = btrfs_item_nr(start);
3540         end_item = btrfs_item_nr(end);
3541         data_len = btrfs_token_item_offset(l, start_item, &token) +
3542                 btrfs_token_item_size(l, start_item, &token);
3543         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3544         data_len += sizeof(struct btrfs_item) * nr;
3545         WARN_ON(data_len < 0);
3546         return data_len;
3547 }
3548
3549 /*
3550  * The space between the end of the leaf items and
3551  * the start of the leaf data.  IOW, how much room
3552  * the leaf has left for both items and data
3553  */
3554 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3555 {
3556         struct btrfs_fs_info *fs_info = leaf->fs_info;
3557         int nritems = btrfs_header_nritems(leaf);
3558         int ret;
3559
3560         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3561         if (ret < 0) {
3562                 btrfs_crit(fs_info,
3563                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3564                            ret,
3565                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3566                            leaf_space_used(leaf, 0, nritems), nritems);
3567         }
3568         return ret;
3569 }
3570
3571 /*
3572  * min slot controls the lowest index we're willing to push to the
3573  * right.  We'll push up to and including min_slot, but no lower
3574  */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	/* 'nr' is the lowest slot we may push; 0 when emptying the leaf. */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* Reserve room for the pending insert when it lands past the end. */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* Walk from the rightmost item down, counting what fits. */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* Never push the slot the caller is about to use. */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				/* Stop once the left leaf has ample room. */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* Account for the pending insert at this slot. */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* Total data bytes of the items being moved. */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	/* Shift right's item headers to make room at the front. */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		/* Left leaf is now empty; it will be freed by the caller. */
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	/* Right leaf's first key changed: fix the parent pointer key. */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3712
3713 /*
3714  * push some data in the path leaf to the right, trying to free up at
3715  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3716  *
3717  * returns 1 if the push failed because the other node didn't have enough
3718  * room, 0 if everything worked out and < 0 if there were major errors.
3719  *
3720  * this will push starting from min_slot to the end of the leaf.  It won't
3721  * push any slot lower than min_slot
3722  */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* No parent means this leaf is the root; nowhere to push. */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* No right sibling under this parent. */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking_write(right);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the leaf; re-check the free space. */
	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right takes ownership of the right leaf's lock/ref. */
	return __push_leaf_right(path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3795
3796 /*
3797  * push some data in the path leaf to the left, trying to free up at
3798  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3799  *
3800  * max_slot can put a limit on how far into the leaf we'll push items.  The
3801  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3802  * items
3803  */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	/* 'nr' bounds how many slots we may push (keep one unless emptying). */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* Walk from slot 0 rightwards, counting what fits in the left leaf. */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* Never push past the slot the caller is using. */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				/* Stop once the right leaf has ample room. */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* Account for the pending insert at this slot. */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* Total data bytes of the items being moved. */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	/* Copy their data to the end of the left leaf's data area. */
	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* Rebase the copied items' offsets into the left leaf's data area. */
	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* Compact the right leaf's remaining data and item headers. */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	/* Recompute the right leaf's item offsets from the end of the area. */
	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		/* Right leaf is now empty; it will be freed by the caller. */
		btrfs_clean_tree_block(right);

	/* Right leaf's first key changed: fix the parent keys upwards. */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3949
3950 /*
3951  * push some data in the path leaf to the left, trying to free up at
3952  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3953  *
3954  * max_slot can put a limit on how far into the leaf we'll push items.  The
3955  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3956  * items
3957  */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* slot of 'right' in its parent; slot 0 has no left sibling */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* no parent at all: 'right' is the root, nowhere to push to */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking_write(left);

	/* quick check before COWing: bail out if the left leaf is too full */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the free space, so check again */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left() takes over our lock and ref on 'left' */
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4022
/*
 * copy the items and data from slot 'mid' onward out of leaf 'l' into the
 * freshly allocated leaf 'right', fixing up item offsets, the parent
 * pointer and the path.  Helper for split_leaf().
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* items [mid, nritems) of 'l' move to the new (empty) leaf 'right' */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

	/* copy the item headers ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	/* the copied data moved toward the leaf end by this many bytes */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	/* rewrite the copied item offsets to match their new location */
	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	/* shrink the original leaf and link 'right' into the parent */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now holds the target slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4086
4087 /*
4088  * double splits happen when we need to insert a big item in the middle
4089  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4090  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4091  *          A                 B                 C
4092  *
4093  * We avoid this by trying to push the items on either side of our target
4094  * into the adjacent leaves.  If all goes well we can avoid the double split
4095  * completely.
4096  */
4097 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4098                                           struct btrfs_root *root,
4099                                           struct btrfs_path *path,
4100                                           int data_size)
4101 {
4102         int ret;
4103         int progress = 0;
4104         int slot;
4105         u32 nritems;
4106         int space_needed = data_size;
4107
4108         slot = path->slots[0];
4109         if (slot < btrfs_header_nritems(path->nodes[0]))
4110                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4111
4112         /*
4113          * try to push all the items after our slot into the
4114          * right leaf
4115          */
4116         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4117         if (ret < 0)
4118                 return ret;
4119
4120         if (ret == 0)
4121                 progress++;
4122
4123         nritems = btrfs_header_nritems(path->nodes[0]);
4124         /*
4125          * our goal is to get our slot at the start or end of a leaf.  If
4126          * we've done so we're done
4127          */
4128         if (path->slots[0] == 0 || path->slots[0] == nritems)
4129                 return 0;
4130
4131         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4132                 return 0;
4133
4134         /* try to push all the items before our slot into the next leaf */
4135         slot = path->slots[0];
4136         space_needed = data_size;
4137         if (slot > 0)
4138                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4139         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4140         if (ret < 0)
4141                 return ret;
4142
4143         if (ret == 0)
4144                 progress++;
4145
4146         if (progress)
4147                 return 0;
4148         return 1;
4149 }
4150
4151 /*
4152  * split the path's leaf in two, making sure there is at least data_size
4153  * available for the resulting leaf level of the path.
4154  *
4155  * returns 0 if all went well and < 0 on failure.
4156  */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never exceed the leaf data area */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	/* splitting the root leaf: grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	/*
	 * split == 1: split at 'mid', copying the tail to the new leaf
	 * split == 0: don't copy anything, just chain in an empty leaf
	 * split == 2: one split is not enough, split a second time
	 */
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		/* the target slot lands in the second half */
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				/* appending past the end: empty leaf will do */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		/* the target slot lands in the first half */
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				/* inserting at slot 0: empty leaf will do */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* first key of the new leaf: the insert key or the key at 'mid' */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		if (mid <= slot) {
			/* link the empty leaf after the current one */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			/* link the empty leaf before the current one */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* a double split can only ever happen once per call */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* try pushing items out of the way so a single split suffices */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4316
/*
 * Make sure the leaf at path->slots[0] has at least ins_len bytes of free
 * space, splitting it if necessary.  Returns 0 on success, -EAGAIN if the
 * item changed (or the leaf gained room) while the path was dropped and the
 * caller should retry, or another negative errno on error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* only file extent and csum items are split in place */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/* remember enough about the item to detect concurrent changes */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	/* re-search with write access so the leaf can be COWed and split */
	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4383
/*
 * Split the item at path->slots[0] in two at 'split_offset' bytes from its
 * start, giving 'new_key' to the second half.  The leaf must already have
 * room for one more struct btrfs_item header.
 */
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* the caller guarantees room for the extra item header */
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data before rearranging the leaf */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new (second half) item goes right after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* second half keeps the original data offset ... */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* ... while the first half shrinks to split_offset bytes */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
4452
4453 /*
4454  * This function splits a single item into two items,
4455  * giving 'new_key' to the new item and splitting the
4456  * old one at split_offset (from the start of the item).
4457  *
4458  * The path may be released by this operation.  After
4459  * the split, the path is pointing to the old item.  The
4460  * new item is going to be in the same node as the old one.
4461  *
 * Note, the item being split must be small enough to live alone on
4463  * a tree block with room for one extra struct btrfs_item
4464  *
4465  * This allows us to split the item in place, keeping a lock on the
4466  * leaf the entire time.
4467  */
4468 int btrfs_split_item(struct btrfs_trans_handle *trans,
4469                      struct btrfs_root *root,
4470                      struct btrfs_path *path,
4471                      const struct btrfs_key *new_key,
4472                      unsigned long split_offset)
4473 {
4474         int ret;
4475         ret = setup_leaf_for_split(trans, root, path,
4476                                    sizeof(struct btrfs_item));
4477         if (ret)
4478                 return ret;
4479
4480         ret = split_item(path, new_key, split_offset);
4481         return ret;
4482 }
4483
/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item
 * is contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	/* make room for a full copy plus one more item header */
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	/* insert the new item directly after the original one */
	path->slots[0]++;
	setup_items_for_insert(root, path, new_key, &item_size,
			       item_size, item_size +
			       sizeof(struct btrfs_item), 1);
	leaf = path->nodes[0];
	/* copy the original item's data into the duplicate */
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
4519
4520 /*
4521  * make the item pointed to by the path smaller.  new_size indicates
4522  * how small to make it, and from_end tells us if we just chop bytes
4523  * off the end of the item or if we shift the item to chop bytes off
4524  * the front.
4525  */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* nothing to do if the item already has the requested size */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* chop from the end: slide the data below the cut point up */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			/*
			 * inline extents keep their header at the start of
			 * the item; move the header along with the data so
			 * it is not chopped off the front
			 */
			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		/* chop from the front: slide the whole item's data up */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* the key offset advances by the amount chopped off */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4619
4620 /*
4621  * make the item pointed to by the path bigger, data_size is the added size.
4622  */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* the caller must have made room in the leaf beforehand */
	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	/* grow the target item into the space just opened up */
	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4685
/*
 * setup_items_for_insert - make room in a leaf and initialize new items
 *
 * This is a helper for btrfs_insert_empty_items(); the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot().
 *
 * @root:       root the leaf belongs to
 * @path:       points at the leaf and slot where the items go
 * @cpu_key:    array of @nr keys, one per new item
 * @data_size:  array of @nr data sizes, one per new item
 * @total_data: sum of all entries in @data_size
 * @total_size: @total_data plus nr * sizeof(struct btrfs_item) of header space
 * @nr:         number of items to insert
 *
 * BUGs if the leaf lacks @total_size bytes of free space; the caller must
 * have reserved it (e.g. btrfs_search_slot() with a non-zero ins_len).
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/*
	 * Inserting at slot 0 changes the leaf's lowest key, so the key
	 * stored in the parent node must be updated to match.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	/* Inserting in the middle: shift existing items/data out of the way */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	/* Negative free space here means the accounting above went wrong */
	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4777
4778 /*
4779  * Given a key and some data, insert items into the tree.
4780  * This does all the path init required, making room in the tree if needed.
4781  */
4782 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4783                             struct btrfs_root *root,
4784                             struct btrfs_path *path,
4785                             const struct btrfs_key *cpu_key, u32 *data_size,
4786                             int nr)
4787 {
4788         int ret = 0;
4789         int slot;
4790         int i;
4791         u32 total_size = 0;
4792         u32 total_data = 0;
4793
4794         for (i = 0; i < nr; i++)
4795                 total_data += data_size[i];
4796
4797         total_size = total_data + (nr * sizeof(struct btrfs_item));
4798         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4799         if (ret == 0)
4800                 return -EEXIST;
4801         if (ret < 0)
4802                 return ret;
4803
4804         slot = path->slots[0];
4805         BUG_ON(slot < 0);
4806
4807         setup_items_for_insert(root, path, cpu_key, data_size,
4808                                total_data, total_size, nr);
4809         return 0;
4810 }
4811
4812 /*
4813  * Given a key and some data, insert an item into the tree.
4814  * This does all the path init required, making room in the tree if needed.
4815  */
4816 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4817                       const struct btrfs_key *cpu_key, void *data,
4818                       u32 data_size)
4819 {
4820         int ret = 0;
4821         struct btrfs_path *path;
4822         struct extent_buffer *leaf;
4823         unsigned long ptr;
4824
4825         path = btrfs_alloc_path();
4826         if (!path)
4827                 return -ENOMEM;
4828         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4829         if (!ret) {
4830                 leaf = path->nodes[0];
4831                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4832                 write_extent_buffer(leaf, data, ptr, data_size);
4833                 btrfs_mark_buffer_dirty(leaf);
4834         }
4835         btrfs_free_path(path);
4836         return ret;
4837 }
4838
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Record the move in the tree mod log BEFORE overwriting the
		 * key pointers; only internal nodes (level > 0) are logged.
		 */
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* Last slot: nothing to move, just log the key removal */
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* Deleting slot 0 changed the low key; propagate it upward */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4884
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/*
	 * Take an extra reference so the buffer stays valid until
	 * free_extent_buffer_stale() drops it below.
	 */
	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 *
 * @slot is the first item to delete and @nr is the number of contiguous
 * items removed.  Returns 0 on success, or an error from the push/balance
 * helpers (other than -ENOSPC, which is expected and swallowed).
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes occupied by the items being deleted */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	/*
	 * Unless we're deleting the tail of the leaf, compact the remaining
	 * data and item headers over the hole left behind.
	 */
	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* data moved up by dsize bytes, fix the surviving offsets */
		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			/* first key changed; update the keys in the parents */
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5029
5030 /*
5031  * search the tree again to find a leaf with lesser keys
5032  * returns 0 if it found something or 1 if there are no lesser leaves.
5033  * returns < 0 on io errors.
5034  *
5035  * This may release the path, and so you may lose any locks held at the
5036  * time you call it.
5037  */
5038 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5039 {
5040         struct btrfs_key key;
5041         struct btrfs_disk_key found_key;
5042         int ret;
5043
5044         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5045
5046         if (key.offset > 0) {
5047                 key.offset--;
5048         } else if (key.type > 0) {
5049                 key.type--;
5050                 key.offset = (u64)-1;
5051         } else if (key.objectid > 0) {
5052                 key.objectid--;
5053                 key.type = (u8)-1;
5054                 key.offset = (u64)-1;
5055         } else {
5056                 return 1;
5057         }
5058
5059         btrfs_release_path(path);
5060         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5061         if (ret < 0)
5062                 return ret;
5063         btrfs_item_key(path->nodes[0], &found_key, 0);
5064         ret = comp_keys(&found_key, &key);
5065         /*
5066          * We might have had an item with the previous key in the tree right
5067          * before we released our path. And after we released our path, that
5068          * item might have been pushed to the first slot (0) of the leaf we
5069          * were holding due to a tree balance. Alternatively, an item with the
5070          * previous key can exist as the only element of a leaf (big fat item).
5071          * Therefore account for these 2 cases, so that our callers (like
5072          * btrfs_previous_item) don't miss an existing item with a key matching
5073          * the previous key we computed above.
5074          */
5075         if (ret <= 0)
5076                 return 0;
5077         return 1;
5078 }
5079
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* keep_locks is saved above and restored at the 'out' label */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* whole tree is older than min_trans, nothing can match */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, level, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		/* lock the child before dropping locks above it */
		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		/* hand the found key back to the caller */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5204
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/*
			 * The parent is still locked (or locking is off), so
			 * we can just climb one level and keep scanning.
			 */
			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			/*
			 * Otherwise the levels above may be stale: remember
			 * the last key of this node and re-search from the
			 * root down to this level.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* exact match found: advance past the old key */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* skip node pointers older than min_trans */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5280
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0: operate on the current tree, not an old snapshot */
	return btrfs_next_old_leaf(root, path, 0);
}
5290
/*
 * Find the next leaf after the one currently in @path, as the tree looked
 * at @time_seq (or the current tree when @time_seq is 0).
 *
 * Returns 0 when the path now points at the next leaf, 1 when there are no
 * greater leaves, and < 0 on error.  The path is released and re-searched,
 * so any locks held at the time of the call may be lost.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* remember the last key of the current leaf so we can re-search */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* walk upwards to find the first ancestor with a next slot */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop the block picked up on a previous loop iteration */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* descend along slot 0 down to the leaf, swapping nodes into path */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5459
5460 /*
5461  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5462  * searching until it gets past min_objectid or finds an item of 'type'
5463  *
5464  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5465  */
5466 int btrfs_previous_item(struct btrfs_root *root,
5467                         struct btrfs_path *path, u64 min_objectid,
5468                         int type)
5469 {
5470         struct btrfs_key found_key;
5471         struct extent_buffer *leaf;
5472         u32 nritems;
5473         int ret;
5474
5475         while (1) {
5476                 if (path->slots[0] == 0) {
5477                         btrfs_set_path_blocking(path);
5478                         ret = btrfs_prev_leaf(root, path);
5479                         if (ret != 0)
5480                                 return ret;
5481                 } else {
5482                         path->slots[0]--;
5483                 }
5484                 leaf = path->nodes[0];
5485                 nritems = btrfs_header_nritems(leaf);
5486                 if (nritems == 0)
5487                         return 1;
5488                 if (path->slots[0] == nritems)
5489                         path->slots[0]--;
5490
5491                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5492                 if (found_key.objectid < min_objectid)
5493                         break;
5494                 if (found_key.type == type)
5495                         return 0;
5496                 if (found_key.objectid == min_objectid &&
5497                     found_key.type < type)
5498                         break;
5499         }
5500         return 1;
5501 }
5502
5503 /*
5504  * search in extent tree to find a previous Metadata/Data extent item with
5505  * min objecitd.
5506  *
5507  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5508  */
5509 int btrfs_previous_extent_item(struct btrfs_root *root,
5510                         struct btrfs_path *path, u64 min_objectid)
5511 {
5512         struct btrfs_key found_key;
5513         struct extent_buffer *leaf;
5514         u32 nritems;
5515         int ret;
5516
5517         while (1) {
5518                 if (path->slots[0] == 0) {
5519                         btrfs_set_path_blocking(path);
5520                         ret = btrfs_prev_leaf(root, path);
5521                         if (ret != 0)
5522                                 return ret;
5523                 } else {
5524                         path->slots[0]--;
5525                 }
5526                 leaf = path->nodes[0];
5527                 nritems = btrfs_header_nritems(leaf);
5528                 if (nritems == 0)
5529                         return 1;
5530                 if (path->slots[0] == nritems)
5531                         path->slots[0]--;
5532
5533                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5534                 if (found_key.objectid < min_objectid)
5535                         break;
5536                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5537                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5538                         return 0;
5539                 if (found_key.objectid == min_objectid &&
5540                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5541                         break;
5542         }
5543         return 1;
5544 }