btrfs: fix deadlock with concurrent chunk allocations involving system chunks
[linux-block.git] / fs / btrfs / ctree.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
6cbd5570 2/*
d352ac68 3 * Copyright (C) 2007,2008 Oracle. All rights reserved.
6cbd5570
CM
4 */
5
a6b6e75e 6#include <linux/sched.h>
5a0e3ad6 7#include <linux/slab.h>
bd989ba3 8#include <linux/rbtree.h>
adf02123 9#include <linux/mm.h>
eb60ceac
CM
10#include "ctree.h"
11#include "disk-io.h"
7f5c1516 12#include "transaction.h"
5f39d397 13#include "print-tree.h"
925baedd 14#include "locking.h"
de37aa51 15#include "volumes.h"
f616f5cd 16#include "qgroup.h"
f3a84ccd 17#include "tree-mod-log.h"
9a8dd150 18
e089f05c
CM
19static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
20 *root, struct btrfs_path *path, int level);
310712b2
OS
21static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
22 const struct btrfs_key *ins_key, struct btrfs_path *path,
23 int data_size, int extend);
5f39d397 24static int push_node_left(struct btrfs_trans_handle *trans,
2ff7e61e 25 struct extent_buffer *dst,
971a1f66 26 struct extent_buffer *src, int empty);
5f39d397 27static int balance_node_right(struct btrfs_trans_handle *trans,
5f39d397
CM
28 struct extent_buffer *dst_buf,
29 struct extent_buffer *src_buf);
afe5fea7
TI
30static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
31 int level, int slot);
d97e63b6 32
af024ed2
JT
33static const struct btrfs_csums {
34 u16 size;
59a0fcdb
DS
35 const char name[10];
36 const char driver[12];
af024ed2
JT
37} btrfs_csums[] = {
38 [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
3951e7f0 39 [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
3831bf00 40 [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
352ae07b
DS
41 [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
42 .driver = "blake2b-256" },
af024ed2
JT
43};
44
45int btrfs_super_csum_size(const struct btrfs_super_block *s)
46{
47 u16 t = btrfs_super_csum_type(s);
48 /*
49 * csum type is validated at mount time
50 */
51 return btrfs_csums[t].size;
52}
53
54const char *btrfs_super_csum_name(u16 csum_type)
55{
56 /* csum type is validated at mount time */
57 return btrfs_csums[csum_type].name;
58}
59
b4e967be
DS
60/*
61 * Return driver name if defined, otherwise the name that's also a valid driver
62 * name
63 */
64const char *btrfs_super_csum_driver(u16 csum_type)
65{
66 /* csum type is validated at mount time */
59a0fcdb
DS
67 return btrfs_csums[csum_type].driver[0] ?
68 btrfs_csums[csum_type].driver :
b4e967be
DS
69 btrfs_csums[csum_type].name;
70}
71
604997b4 72size_t __attribute_const__ btrfs_get_num_csums(void)
f7cea56c
DS
73{
74 return ARRAY_SIZE(btrfs_csums);
75}
76
df24a2b9 77struct btrfs_path *btrfs_alloc_path(void)
2c90e5d6 78{
e2c89907 79 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
2c90e5d6
CM
80}
81
d352ac68 82/* this also releases the path */
df24a2b9 83void btrfs_free_path(struct btrfs_path *p)
be0e5c09 84{
ff175d57
JJ
85 if (!p)
86 return;
b3b4aa74 87 btrfs_release_path(p);
df24a2b9 88 kmem_cache_free(btrfs_path_cachep, p);
be0e5c09
CM
89}
90
d352ac68
CM
91/*
92 * path release drops references on the extent buffers in the path
93 * and it drops any locks held by this path
94 *
95 * It is safe to call this on paths that no locks or extent buffers held.
96 */
b3b4aa74 97noinline void btrfs_release_path(struct btrfs_path *p)
eb60ceac
CM
98{
99 int i;
a2135011 100
234b63a0 101 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3f157a2f 102 p->slots[i] = 0;
eb60ceac 103 if (!p->nodes[i])
925baedd
CM
104 continue;
105 if (p->locks[i]) {
bd681513 106 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
925baedd
CM
107 p->locks[i] = 0;
108 }
5f39d397 109 free_extent_buffer(p->nodes[i]);
3f157a2f 110 p->nodes[i] = NULL;
eb60ceac
CM
111 }
112}
113
d352ac68
CM
114/*
115 * safely gets a reference on the root node of a tree. A lock
116 * is not taken, so a concurrent writer may put a different node
117 * at the root of the tree. See btrfs_lock_root_node for the
118 * looping required.
119 *
120 * The extent buffer returned by this has a reference taken, so
121 * it won't disappear. It may stop being the root of the tree
122 * at any time because there are no locks held.
123 */
925baedd
CM
124struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
125{
126 struct extent_buffer *eb;
240f62c8 127
3083ee2e
JB
128 while (1) {
129 rcu_read_lock();
130 eb = rcu_dereference(root->node);
131
132 /*
133 * RCU really hurts here, we could free up the root node because
01327610 134 * it was COWed but we may not get the new root node yet so do
3083ee2e
JB
135 * the inc_not_zero dance and if it doesn't work then
136 * synchronize_rcu and try again.
137 */
138 if (atomic_inc_not_zero(&eb->refs)) {
139 rcu_read_unlock();
140 break;
141 }
142 rcu_read_unlock();
143 synchronize_rcu();
144 }
925baedd
CM
145 return eb;
146}
147
92a7cc42
QW
148/*
149 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
150 * just get put onto a simple dirty list. Transaction walks this list to make
151 * sure they get properly updated on disk.
d352ac68 152 */
0b86a832
CM
153static void add_root_to_dirty_list(struct btrfs_root *root)
154{
0b246afa
JM
155 struct btrfs_fs_info *fs_info = root->fs_info;
156
e7070be1
JB
157 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
158 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
159 return;
160
0b246afa 161 spin_lock(&fs_info->trans_lock);
e7070be1
JB
162 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
163 /* Want the extent tree to be the last on the list */
4fd786e6 164 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
e7070be1 165 list_move_tail(&root->dirty_list,
0b246afa 166 &fs_info->dirty_cowonly_roots);
e7070be1
JB
167 else
168 list_move(&root->dirty_list,
0b246afa 169 &fs_info->dirty_cowonly_roots);
0b86a832 170 }
0b246afa 171 spin_unlock(&fs_info->trans_lock);
0b86a832
CM
172}
173
d352ac68
CM
174/*
175 * used by snapshot creation to make a copy of a root for a tree with
176 * a given objectid. The buffer with the new root node is returned in
177 * cow_ret, and this func returns zero on success or a negative error code.
178 */
be20aa9d
CM
179int btrfs_copy_root(struct btrfs_trans_handle *trans,
180 struct btrfs_root *root,
181 struct extent_buffer *buf,
182 struct extent_buffer **cow_ret, u64 new_root_objectid)
183{
0b246afa 184 struct btrfs_fs_info *fs_info = root->fs_info;
be20aa9d 185 struct extent_buffer *cow;
be20aa9d
CM
186 int ret = 0;
187 int level;
5d4f98a2 188 struct btrfs_disk_key disk_key;
be20aa9d 189
92a7cc42 190 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
0b246afa 191 trans->transid != fs_info->running_transaction->transid);
92a7cc42 192 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
27cdeb70 193 trans->transid != root->last_trans);
be20aa9d
CM
194
195 level = btrfs_header_level(buf);
5d4f98a2
YZ
196 if (level == 0)
197 btrfs_item_key(buf, &disk_key, 0);
198 else
199 btrfs_node_key(buf, &disk_key, 0);
31840ae1 200
4d75f8a9 201 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
cf6f34aa
JB
202 &disk_key, level, buf->start, 0,
203 BTRFS_NESTING_NEW_ROOT);
5d4f98a2 204 if (IS_ERR(cow))
be20aa9d
CM
205 return PTR_ERR(cow);
206
58e8012c 207 copy_extent_buffer_full(cow, buf);
be20aa9d
CM
208 btrfs_set_header_bytenr(cow, cow->start);
209 btrfs_set_header_generation(cow, trans->transid);
5d4f98a2
YZ
210 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
211 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
212 BTRFS_HEADER_FLAG_RELOC);
213 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
214 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
215 else
216 btrfs_set_header_owner(cow, new_root_objectid);
be20aa9d 217
de37aa51 218 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
2b82032c 219
be20aa9d 220 WARN_ON(btrfs_header_generation(buf) > trans->transid);
5d4f98a2 221 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 222 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 223 else
e339a6b0 224 ret = btrfs_inc_ref(trans, root, cow, 0);
867ed321 225 if (ret) {
72c9925f
FM
226 btrfs_tree_unlock(cow);
227 free_extent_buffer(cow);
867ed321 228 btrfs_abort_transaction(trans, ret);
be20aa9d 229 return ret;
867ed321 230 }
be20aa9d
CM
231
232 btrfs_mark_buffer_dirty(cow);
233 *cow_ret = cow;
234 return 0;
235}
236
5d4f98a2
YZ
237/*
238 * check if the tree block can be shared by multiple trees
239 */
240int btrfs_block_can_be_shared(struct btrfs_root *root,
241 struct extent_buffer *buf)
242{
243 /*
92a7cc42
QW
244 * Tree blocks not in shareable trees and tree roots are never shared.
245 * If a block was allocated after the last snapshot and the block was
246 * not allocated by tree relocation, we know the block is not shared.
5d4f98a2 247 */
92a7cc42 248 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
5d4f98a2
YZ
249 buf != root->node && buf != root->commit_root &&
250 (btrfs_header_generation(buf) <=
251 btrfs_root_last_snapshot(&root->root_item) ||
252 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
253 return 1;
a79865c6 254
5d4f98a2
YZ
255 return 0;
256}
257
258static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct extent_buffer *buf,
f0486c68
YZ
261 struct extent_buffer *cow,
262 int *last_ref)
5d4f98a2 263{
0b246afa 264 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
265 u64 refs;
266 u64 owner;
267 u64 flags;
268 u64 new_flags = 0;
269 int ret;
270
271 /*
272 * Backrefs update rules:
273 *
274 * Always use full backrefs for extent pointers in tree block
275 * allocated by tree relocation.
276 *
277 * If a shared tree block is no longer referenced by its owner
278 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
279 * use full backrefs for extent pointers in tree block.
280 *
281 * If a tree block is been relocating
282 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
283 * use full backrefs for extent pointers in tree block.
284 * The reason for this is some operations (such as drop tree)
285 * are only allowed for blocks use full backrefs.
286 */
287
288 if (btrfs_block_can_be_shared(root, buf)) {
2ff7e61e 289 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
3173a18f
JB
290 btrfs_header_level(buf), 1,
291 &refs, &flags);
be1a5564
MF
292 if (ret)
293 return ret;
e5df9573
MF
294 if (refs == 0) {
295 ret = -EROFS;
0b246afa 296 btrfs_handle_fs_error(fs_info, ret, NULL);
e5df9573
MF
297 return ret;
298 }
5d4f98a2
YZ
299 } else {
300 refs = 1;
301 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
302 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
303 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
304 else
305 flags = 0;
306 }
307
308 owner = btrfs_header_owner(buf);
309 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
310 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
311
312 if (refs > 1) {
313 if ((owner == root->root_key.objectid ||
314 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
315 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
e339a6b0 316 ret = btrfs_inc_ref(trans, root, buf, 1);
692826b2
JM
317 if (ret)
318 return ret;
5d4f98a2
YZ
319
320 if (root->root_key.objectid ==
321 BTRFS_TREE_RELOC_OBJECTID) {
e339a6b0 322 ret = btrfs_dec_ref(trans, root, buf, 0);
692826b2
JM
323 if (ret)
324 return ret;
e339a6b0 325 ret = btrfs_inc_ref(trans, root, cow, 1);
692826b2
JM
326 if (ret)
327 return ret;
5d4f98a2
YZ
328 }
329 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
330 } else {
331
332 if (root->root_key.objectid ==
333 BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 334 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 335 else
e339a6b0 336 ret = btrfs_inc_ref(trans, root, cow, 0);
692826b2
JM
337 if (ret)
338 return ret;
5d4f98a2
YZ
339 }
340 if (new_flags != 0) {
b1c79e09
JB
341 int level = btrfs_header_level(buf);
342
42c9d0b5 343 ret = btrfs_set_disk_extent_flags(trans, buf,
b1c79e09 344 new_flags, level, 0);
be1a5564
MF
345 if (ret)
346 return ret;
5d4f98a2
YZ
347 }
348 } else {
349 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
350 if (root->root_key.objectid ==
351 BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 352 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 353 else
e339a6b0 354 ret = btrfs_inc_ref(trans, root, cow, 0);
692826b2
JM
355 if (ret)
356 return ret;
e339a6b0 357 ret = btrfs_dec_ref(trans, root, buf, 1);
692826b2
JM
358 if (ret)
359 return ret;
5d4f98a2 360 }
6a884d7d 361 btrfs_clean_tree_block(buf);
f0486c68 362 *last_ref = 1;
5d4f98a2
YZ
363 }
364 return 0;
365}
366
a6279470
FM
367static struct extent_buffer *alloc_tree_block_no_bg_flush(
368 struct btrfs_trans_handle *trans,
369 struct btrfs_root *root,
370 u64 parent_start,
371 const struct btrfs_disk_key *disk_key,
372 int level,
373 u64 hint,
9631e4cc
JB
374 u64 empty_size,
375 enum btrfs_lock_nesting nest)
a6279470
FM
376{
377 struct btrfs_fs_info *fs_info = root->fs_info;
378 struct extent_buffer *ret;
379
380 /*
381 * If we are COWing a node/leaf from the extent, chunk, device or free
382 * space trees, make sure that we do not finish block group creation of
383 * pending block groups. We do this to avoid a deadlock.
384 * COWing can result in allocation of a new chunk, and flushing pending
385 * block groups (btrfs_create_pending_block_groups()) can be triggered
386 * when finishing allocation of a new chunk. Creation of a pending block
387 * group modifies the extent, chunk, device and free space trees,
388 * therefore we could deadlock with ourselves since we are holding a
389 * lock on an extent buffer that btrfs_create_pending_block_groups() may
390 * try to COW later.
391 * For similar reasons, we also need to delay flushing pending block
392 * groups when splitting a leaf or node, from one of those trees, since
393 * we are holding a write lock on it and its parent or when inserting a
394 * new root node for one of those trees.
395 */
396 if (root == fs_info->extent_root ||
397 root == fs_info->chunk_root ||
398 root == fs_info->dev_root ||
399 root == fs_info->free_space_root)
400 trans->can_flush_pending_bgs = false;
401
402 ret = btrfs_alloc_tree_block(trans, root, parent_start,
403 root->root_key.objectid, disk_key, level,
9631e4cc 404 hint, empty_size, nest);
a6279470
FM
405 trans->can_flush_pending_bgs = true;
406
407 return ret;
408}
409
d352ac68 410/*
d397712b
CM
411 * does the dirty work in cow of a single block. The parent block (if
412 * supplied) is updated to point to the new cow copy. The new buffer is marked
413 * dirty and returned locked. If you modify the block it needs to be marked
414 * dirty again.
d352ac68
CM
415 *
416 * search_start -- an allocation hint for the new block
417 *
d397712b
CM
418 * empty_size -- a hint that you plan on doing more cow. This is the size in
419 * bytes the allocator should try to find free next to the block it returns.
420 * This is just a hint and may be ignored by the allocator.
d352ac68 421 */
d397712b 422static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
5f39d397
CM
423 struct btrfs_root *root,
424 struct extent_buffer *buf,
425 struct extent_buffer *parent, int parent_slot,
426 struct extent_buffer **cow_ret,
9631e4cc
JB
427 u64 search_start, u64 empty_size,
428 enum btrfs_lock_nesting nest)
02217ed2 429{
0b246afa 430 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2 431 struct btrfs_disk_key disk_key;
5f39d397 432 struct extent_buffer *cow;
be1a5564 433 int level, ret;
f0486c68 434 int last_ref = 0;
925baedd 435 int unlock_orig = 0;
0f5053eb 436 u64 parent_start = 0;
7bb86316 437
925baedd
CM
438 if (*cow_ret == buf)
439 unlock_orig = 1;
440
b9447ef8 441 btrfs_assert_tree_locked(buf);
925baedd 442
92a7cc42 443 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
0b246afa 444 trans->transid != fs_info->running_transaction->transid);
92a7cc42 445 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
27cdeb70 446 trans->transid != root->last_trans);
5f39d397 447
7bb86316 448 level = btrfs_header_level(buf);
31840ae1 449
5d4f98a2
YZ
450 if (level == 0)
451 btrfs_item_key(buf, &disk_key, 0);
452 else
453 btrfs_node_key(buf, &disk_key, 0);
454
0f5053eb
GR
455 if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
456 parent_start = parent->start;
5d4f98a2 457
a6279470 458 cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
9631e4cc 459 level, search_start, empty_size, nest);
54aa1f4d
CM
460 if (IS_ERR(cow))
461 return PTR_ERR(cow);
6702ed49 462
b4ce94de
CM
463 /* cow is set to blocking by btrfs_init_new_buffer */
464
58e8012c 465 copy_extent_buffer_full(cow, buf);
db94535d 466 btrfs_set_header_bytenr(cow, cow->start);
5f39d397 467 btrfs_set_header_generation(cow, trans->transid);
5d4f98a2
YZ
468 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
469 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
470 BTRFS_HEADER_FLAG_RELOC);
471 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
472 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
473 else
474 btrfs_set_header_owner(cow, root->root_key.objectid);
6702ed49 475
de37aa51 476 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
2b82032c 477
be1a5564 478 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
b68dc2a9 479 if (ret) {
572c83ac
JB
480 btrfs_tree_unlock(cow);
481 free_extent_buffer(cow);
66642832 482 btrfs_abort_transaction(trans, ret);
b68dc2a9
MF
483 return ret;
484 }
1a40e23b 485
92a7cc42 486 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
83d4cfd4 487 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
93314e3b 488 if (ret) {
572c83ac
JB
489 btrfs_tree_unlock(cow);
490 free_extent_buffer(cow);
66642832 491 btrfs_abort_transaction(trans, ret);
83d4cfd4 492 return ret;
93314e3b 493 }
83d4cfd4 494 }
3fd0a558 495
02217ed2 496 if (buf == root->node) {
925baedd 497 WARN_ON(parent && parent != buf);
5d4f98a2
YZ
498 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
499 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
500 parent_start = buf->start;
925baedd 501
67439dad 502 atomic_inc(&cow->refs);
406808ab 503 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
d9d19a01 504 BUG_ON(ret < 0);
240f62c8 505 rcu_assign_pointer(root->node, cow);
925baedd 506
f0486c68 507 btrfs_free_tree_block(trans, root, buf, parent_start,
5581a51a 508 last_ref);
5f39d397 509 free_extent_buffer(buf);
0b86a832 510 add_root_to_dirty_list(root);
02217ed2 511 } else {
5d4f98a2 512 WARN_ON(trans->transid != btrfs_header_generation(parent));
f3a84ccd
FM
513 btrfs_tree_mod_log_insert_key(parent, parent_slot,
514 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
5f39d397 515 btrfs_set_node_blockptr(parent, parent_slot,
db94535d 516 cow->start);
74493f7a
CM
517 btrfs_set_node_ptr_generation(parent, parent_slot,
518 trans->transid);
d6025579 519 btrfs_mark_buffer_dirty(parent);
5de865ee 520 if (last_ref) {
f3a84ccd 521 ret = btrfs_tree_mod_log_free_eb(buf);
5de865ee 522 if (ret) {
572c83ac
JB
523 btrfs_tree_unlock(cow);
524 free_extent_buffer(cow);
66642832 525 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
526 return ret;
527 }
528 }
f0486c68 529 btrfs_free_tree_block(trans, root, buf, parent_start,
5581a51a 530 last_ref);
02217ed2 531 }
925baedd
CM
532 if (unlock_orig)
533 btrfs_tree_unlock(buf);
3083ee2e 534 free_extent_buffer_stale(buf);
ccd467d6 535 btrfs_mark_buffer_dirty(cow);
2c90e5d6 536 *cow_ret = cow;
02217ed2
CM
537 return 0;
538}
539
5d4f98a2
YZ
540static inline int should_cow_block(struct btrfs_trans_handle *trans,
541 struct btrfs_root *root,
542 struct extent_buffer *buf)
543{
f5ee5c9a 544 if (btrfs_is_testing(root->fs_info))
faa2dbf0 545 return 0;
fccb84c9 546
d1980131
DS
547 /* Ensure we can see the FORCE_COW bit */
548 smp_mb__before_atomic();
f1ebcc74
LB
549
550 /*
551 * We do not need to cow a block if
552 * 1) this block is not created or changed in this transaction;
553 * 2) this block does not belong to TREE_RELOC tree;
554 * 3) the root is not forced COW.
555 *
556 * What is forced COW:
01327610 557 * when we create snapshot during committing the transaction,
52042d8e 558 * after we've finished copying src root, we must COW the shared
f1ebcc74
LB
559 * block to ensure the metadata consistency.
560 */
5d4f98a2
YZ
561 if (btrfs_header_generation(buf) == trans->transid &&
562 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
563 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
f1ebcc74 564 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
27cdeb70 565 !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
5d4f98a2
YZ
566 return 0;
567 return 1;
568}
569
d352ac68
CM
570/*
571 * cows a single block, see __btrfs_cow_block for the real work.
01327610 572 * This version of it has extra checks so that a block isn't COWed more than
d352ac68
CM
573 * once per transaction, as long as it hasn't been written yet
574 */
d397712b 575noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
5f39d397
CM
576 struct btrfs_root *root, struct extent_buffer *buf,
577 struct extent_buffer *parent, int parent_slot,
9631e4cc
JB
578 struct extent_buffer **cow_ret,
579 enum btrfs_lock_nesting nest)
6702ed49 580{
0b246afa 581 struct btrfs_fs_info *fs_info = root->fs_info;
6702ed49 582 u64 search_start;
f510cfec 583 int ret;
dc17ff8f 584
83354f07
JB
585 if (test_bit(BTRFS_ROOT_DELETING, &root->state))
586 btrfs_err(fs_info,
587 "COW'ing blocks on a fs root that's being dropped");
588
0b246afa 589 if (trans->transaction != fs_info->running_transaction)
31b1a2bd 590 WARN(1, KERN_CRIT "trans %llu running %llu\n",
c1c9ff7c 591 trans->transid,
0b246afa 592 fs_info->running_transaction->transid);
31b1a2bd 593
0b246afa 594 if (trans->transid != fs_info->generation)
31b1a2bd 595 WARN(1, KERN_CRIT "trans %llu running %llu\n",
0b246afa 596 trans->transid, fs_info->generation);
dc17ff8f 597
5d4f98a2 598 if (!should_cow_block(trans, root, buf)) {
6702ed49
CM
599 *cow_ret = buf;
600 return 0;
601 }
c487685d 602
ee22184b 603 search_start = buf->start & ~((u64)SZ_1G - 1);
b4ce94de 604
f616f5cd
QW
605 /*
606 * Before CoWing this block for later modification, check if it's
607 * the subtree root and do the delayed subtree trace if needed.
608 *
609 * Also We don't care about the error, as it's handled internally.
610 */
611 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
f510cfec 612 ret = __btrfs_cow_block(trans, root, buf, parent,
9631e4cc 613 parent_slot, cow_ret, search_start, 0, nest);
1abe9b8a 614
615 trace_btrfs_cow_block(root, buf, *cow_ret);
616
f510cfec 617 return ret;
6702ed49 618}
f75e2b79 619ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
6702ed49 620
d352ac68
CM
621/*
622 * helper function for defrag to decide if two blocks pointed to by a
623 * node are actually close by
624 */
6b80053d 625static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
6702ed49 626{
6b80053d 627 if (blocknr < other && other - (blocknr + blocksize) < 32768)
6702ed49 628 return 1;
6b80053d 629 if (blocknr > other && blocknr - (other + blocksize) < 32768)
6702ed49
CM
630 return 1;
631 return 0;
632}
633
ce6ef5ab
DS
634#ifdef __LITTLE_ENDIAN
635
636/*
637 * Compare two keys, on little-endian the disk order is same as CPU order and
638 * we can avoid the conversion.
639 */
640static int comp_keys(const struct btrfs_disk_key *disk_key,
641 const struct btrfs_key *k2)
642{
643 const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
644
645 return btrfs_comp_cpu_keys(k1, k2);
646}
647
648#else
649
081e9573
CM
650/*
651 * compare two keys in a memcmp fashion
652 */
310712b2
OS
653static int comp_keys(const struct btrfs_disk_key *disk,
654 const struct btrfs_key *k2)
081e9573
CM
655{
656 struct btrfs_key k1;
657
658 btrfs_disk_key_to_cpu(&k1, disk);
659
20736aba 660 return btrfs_comp_cpu_keys(&k1, k2);
081e9573 661}
ce6ef5ab 662#endif
081e9573 663
f3465ca4
JB
664/*
665 * same as comp_keys only with two btrfs_key's
666 */
e1f60a65 667int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
f3465ca4
JB
668{
669 if (k1->objectid > k2->objectid)
670 return 1;
671 if (k1->objectid < k2->objectid)
672 return -1;
673 if (k1->type > k2->type)
674 return 1;
675 if (k1->type < k2->type)
676 return -1;
677 if (k1->offset > k2->offset)
678 return 1;
679 if (k1->offset < k2->offset)
680 return -1;
681 return 0;
682}
081e9573 683
d352ac68
CM
684/*
685 * this is used by the defrag code to go through all the
686 * leaves pointed to by a node and reallocate them so that
687 * disk order is close to key order
688 */
6702ed49 689int btrfs_realloc_node(struct btrfs_trans_handle *trans,
5f39d397 690 struct btrfs_root *root, struct extent_buffer *parent,
de78b51a 691 int start_slot, u64 *last_ret,
a6b6e75e 692 struct btrfs_key *progress)
6702ed49 693{
0b246afa 694 struct btrfs_fs_info *fs_info = root->fs_info;
6b80053d 695 struct extent_buffer *cur;
6702ed49 696 u64 blocknr;
e9d0b13b
CM
697 u64 search_start = *last_ret;
698 u64 last_block = 0;
6702ed49
CM
699 u64 other;
700 u32 parent_nritems;
6702ed49
CM
701 int end_slot;
702 int i;
703 int err = 0;
6b80053d 704 u32 blocksize;
081e9573
CM
705 int progress_passed = 0;
706 struct btrfs_disk_key disk_key;
6702ed49 707
0b246afa
JM
708 WARN_ON(trans->transaction != fs_info->running_transaction);
709 WARN_ON(trans->transid != fs_info->generation);
86479a04 710
6b80053d 711 parent_nritems = btrfs_header_nritems(parent);
0b246afa 712 blocksize = fs_info->nodesize;
5dfe2be7 713 end_slot = parent_nritems - 1;
6702ed49 714
5dfe2be7 715 if (parent_nritems <= 1)
6702ed49
CM
716 return 0;
717
5dfe2be7 718 for (i = start_slot; i <= end_slot; i++) {
6702ed49 719 int close = 1;
a6b6e75e 720
081e9573
CM
721 btrfs_node_key(parent, &disk_key, i);
722 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
723 continue;
724
725 progress_passed = 1;
6b80053d 726 blocknr = btrfs_node_blockptr(parent, i);
e9d0b13b
CM
727 if (last_block == 0)
728 last_block = blocknr;
5708b959 729
6702ed49 730 if (i > 0) {
6b80053d
CM
731 other = btrfs_node_blockptr(parent, i - 1);
732 close = close_blocks(blocknr, other, blocksize);
6702ed49 733 }
5dfe2be7 734 if (!close && i < end_slot) {
6b80053d
CM
735 other = btrfs_node_blockptr(parent, i + 1);
736 close = close_blocks(blocknr, other, blocksize);
6702ed49 737 }
e9d0b13b
CM
738 if (close) {
739 last_block = blocknr;
6702ed49 740 continue;
e9d0b13b 741 }
6702ed49 742
206983b7
JB
743 cur = btrfs_read_node_slot(parent, i);
744 if (IS_ERR(cur))
745 return PTR_ERR(cur);
e9d0b13b 746 if (search_start == 0)
6b80053d 747 search_start = last_block;
e9d0b13b 748
e7a84565 749 btrfs_tree_lock(cur);
6b80053d 750 err = __btrfs_cow_block(trans, root, cur, parent, i,
e7a84565 751 &cur, search_start,
6b80053d 752 min(16 * blocksize,
9631e4cc
JB
753 (end_slot - i) * blocksize),
754 BTRFS_NESTING_COW);
252c38f0 755 if (err) {
e7a84565 756 btrfs_tree_unlock(cur);
6b80053d 757 free_extent_buffer(cur);
6702ed49 758 break;
252c38f0 759 }
e7a84565
CM
760 search_start = cur->start;
761 last_block = cur->start;
f2183bde 762 *last_ret = search_start;
e7a84565
CM
763 btrfs_tree_unlock(cur);
764 free_extent_buffer(cur);
6702ed49
CM
765 }
766 return err;
767}
768
74123bd7 769/*
5f39d397
CM
770 * search for key in the extent_buffer. The items start at offset p,
771 * and they are item_size apart. There are 'max' items in p.
772 *
74123bd7
CM
773 * the slot in the array is returned via slot, and it points to
774 * the place where you would insert key if it is not found in
775 * the array.
776 *
777 * slot may point to max if the key is bigger than all of the keys
778 */
e02119d5 779static noinline int generic_bin_search(struct extent_buffer *eb,
310712b2
OS
780 unsigned long p, int item_size,
781 const struct btrfs_key *key,
e02119d5 782 int max, int *slot)
be0e5c09
CM
783{
784 int low = 0;
785 int high = max;
be0e5c09 786 int ret;
5cd17f34 787 const int key_size = sizeof(struct btrfs_disk_key);
be0e5c09 788
5e24e9af
LB
789 if (low > high) {
790 btrfs_err(eb->fs_info,
791 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
792 __func__, low, high, eb->start,
793 btrfs_header_owner(eb), btrfs_header_level(eb));
794 return -EINVAL;
795 }
796
d397712b 797 while (low < high) {
5cd17f34
DS
798 unsigned long oip;
799 unsigned long offset;
800 struct btrfs_disk_key *tmp;
801 struct btrfs_disk_key unaligned;
802 int mid;
803
be0e5c09 804 mid = (low + high) / 2;
5f39d397 805 offset = p + mid * item_size;
5cd17f34 806 oip = offset_in_page(offset);
5f39d397 807
5cd17f34 808 if (oip + key_size <= PAGE_SIZE) {
884b07d0 809 const unsigned long idx = get_eb_page_index(offset);
5cd17f34 810 char *kaddr = page_address(eb->pages[idx]);
5f39d397 811
884b07d0 812 oip = get_eb_offset_in_page(eb, offset);
5cd17f34 813 tmp = (struct btrfs_disk_key *)(kaddr + oip);
5f39d397 814 } else {
5cd17f34
DS
815 read_extent_buffer(eb, &unaligned, offset, key_size);
816 tmp = &unaligned;
5f39d397 817 }
5cd17f34 818
be0e5c09
CM
819 ret = comp_keys(tmp, key);
820
821 if (ret < 0)
822 low = mid + 1;
823 else if (ret > 0)
824 high = mid;
825 else {
826 *slot = mid;
827 return 0;
828 }
829 }
830 *slot = low;
831 return 1;
832}
833
97571fd0
CM
834/*
835 * simple bin_search frontend that does the right thing for
836 * leaves vs nodes
837 */
a74b35ec 838int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
e3b83361 839 int *slot)
be0e5c09 840{
e3b83361 841 if (btrfs_header_level(eb) == 0)
5f39d397
CM
842 return generic_bin_search(eb,
843 offsetof(struct btrfs_leaf, items),
0783fcfc 844 sizeof(struct btrfs_item),
5f39d397 845 key, btrfs_header_nritems(eb),
7518a238 846 slot);
f775738f 847 else
5f39d397
CM
848 return generic_bin_search(eb,
849 offsetof(struct btrfs_node, ptrs),
123abc88 850 sizeof(struct btrfs_key_ptr),
5f39d397 851 key, btrfs_header_nritems(eb),
7518a238 852 slot);
be0e5c09
CM
853}
854
f0486c68
YZ
855static void root_add_used(struct btrfs_root *root, u32 size)
856{
857 spin_lock(&root->accounting_lock);
858 btrfs_set_root_used(&root->root_item,
859 btrfs_root_used(&root->root_item) + size);
860 spin_unlock(&root->accounting_lock);
861}
862
863static void root_sub_used(struct btrfs_root *root, u32 size)
864{
865 spin_lock(&root->accounting_lock);
866 btrfs_set_root_used(&root->root_item,
867 btrfs_root_used(&root->root_item) - size);
868 spin_unlock(&root->accounting_lock);
869}
870
d352ac68
CM
871/* given a node and slot number, this reads the blocks it points to. The
872 * extent buffer is returned with a reference taken (but unlocked).
d352ac68 873 */
4b231ae4
DS
874struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
875 int slot)
bb803951 876{
ca7a79ad 877 int level = btrfs_header_level(parent);
416bc658 878 struct extent_buffer *eb;
581c1760 879 struct btrfs_key first_key;
416bc658 880
fb770ae4
LB
881 if (slot < 0 || slot >= btrfs_header_nritems(parent))
882 return ERR_PTR(-ENOENT);
ca7a79ad
CM
883
884 BUG_ON(level == 0);
885
581c1760 886 btrfs_node_key_to_cpu(parent, &first_key, slot);
d0d20b0f 887 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1b7ec85e 888 btrfs_header_owner(parent),
581c1760
QW
889 btrfs_node_ptr_generation(parent, slot),
890 level - 1, &first_key);
fb770ae4
LB
891 if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
892 free_extent_buffer(eb);
893 eb = ERR_PTR(-EIO);
416bc658
JB
894 }
895
896 return eb;
bb803951
CM
897}
898
d352ac68
CM
899/*
900 * node level balancing, used to make sure nodes are in proper order for
901 * item deletion. We balance from the top down, so we have to make sure
902 * that a deletion won't leave an node completely empty later on.
903 */
e02119d5 904static noinline int balance_level(struct btrfs_trans_handle *trans,
98ed5174
CM
905 struct btrfs_root *root,
906 struct btrfs_path *path, int level)
bb803951 907{
0b246afa 908 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
909 struct extent_buffer *right = NULL;
910 struct extent_buffer *mid;
911 struct extent_buffer *left = NULL;
912 struct extent_buffer *parent = NULL;
bb803951
CM
913 int ret = 0;
914 int wret;
915 int pslot;
bb803951 916 int orig_slot = path->slots[level];
79f95c82 917 u64 orig_ptr;
bb803951 918
98e6b1eb 919 ASSERT(level > 0);
bb803951 920
5f39d397 921 mid = path->nodes[level];
b4ce94de 922
ac5887c8 923 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
7bb86316
CM
924 WARN_ON(btrfs_header_generation(mid) != trans->transid);
925
1d4f8a0c 926 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
79f95c82 927
a05a9bb1 928 if (level < BTRFS_MAX_LEVEL - 1) {
5f39d397 929 parent = path->nodes[level + 1];
a05a9bb1
LZ
930 pslot = path->slots[level + 1];
931 }
bb803951 932
40689478
CM
933 /*
934 * deal with the case where there is only one pointer in the root
935 * by promoting the node below to a root
936 */
5f39d397
CM
937 if (!parent) {
938 struct extent_buffer *child;
bb803951 939
5f39d397 940 if (btrfs_header_nritems(mid) != 1)
bb803951
CM
941 return 0;
942
943 /* promote the child to a root */
4b231ae4 944 child = btrfs_read_node_slot(mid, 0);
fb770ae4
LB
945 if (IS_ERR(child)) {
946 ret = PTR_ERR(child);
0b246afa 947 btrfs_handle_fs_error(fs_info, ret, NULL);
305a26af
MF
948 goto enospc;
949 }
950
925baedd 951 btrfs_tree_lock(child);
9631e4cc
JB
952 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
953 BTRFS_NESTING_COW);
f0486c68
YZ
954 if (ret) {
955 btrfs_tree_unlock(child);
956 free_extent_buffer(child);
957 goto enospc;
958 }
2f375ab9 959
406808ab 960 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
d9d19a01 961 BUG_ON(ret < 0);
240f62c8 962 rcu_assign_pointer(root->node, child);
925baedd 963
0b86a832 964 add_root_to_dirty_list(root);
925baedd 965 btrfs_tree_unlock(child);
b4ce94de 966
925baedd 967 path->locks[level] = 0;
bb803951 968 path->nodes[level] = NULL;
6a884d7d 969 btrfs_clean_tree_block(mid);
925baedd 970 btrfs_tree_unlock(mid);
bb803951 971 /* once for the path */
5f39d397 972 free_extent_buffer(mid);
f0486c68
YZ
973
974 root_sub_used(root, mid->len);
5581a51a 975 btrfs_free_tree_block(trans, root, mid, 0, 1);
bb803951 976 /* once for the root ptr */
3083ee2e 977 free_extent_buffer_stale(mid);
f0486c68 978 return 0;
bb803951 979 }
5f39d397 980 if (btrfs_header_nritems(mid) >
0b246afa 981 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
bb803951
CM
982 return 0;
983
4b231ae4 984 left = btrfs_read_node_slot(parent, pslot - 1);
fb770ae4
LB
985 if (IS_ERR(left))
986 left = NULL;
987
5f39d397 988 if (left) {
bf77467a 989 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
5f39d397 990 wret = btrfs_cow_block(trans, root, left,
9631e4cc 991 parent, pslot - 1, &left,
bf59a5a2 992 BTRFS_NESTING_LEFT_COW);
54aa1f4d
CM
993 if (wret) {
994 ret = wret;
995 goto enospc;
996 }
2cc58cf2 997 }
fb770ae4 998
4b231ae4 999 right = btrfs_read_node_slot(parent, pslot + 1);
fb770ae4
LB
1000 if (IS_ERR(right))
1001 right = NULL;
1002
5f39d397 1003 if (right) {
bf77467a 1004 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
5f39d397 1005 wret = btrfs_cow_block(trans, root, right,
9631e4cc 1006 parent, pslot + 1, &right,
bf59a5a2 1007 BTRFS_NESTING_RIGHT_COW);
2cc58cf2
CM
1008 if (wret) {
1009 ret = wret;
1010 goto enospc;
1011 }
1012 }
1013
1014 /* first, try to make some room in the middle buffer */
5f39d397
CM
1015 if (left) {
1016 orig_slot += btrfs_header_nritems(left);
d30a668f 1017 wret = push_node_left(trans, left, mid, 1);
79f95c82
CM
1018 if (wret < 0)
1019 ret = wret;
bb803951 1020 }
79f95c82
CM
1021
1022 /*
1023 * then try to empty the right most buffer into the middle
1024 */
5f39d397 1025 if (right) {
d30a668f 1026 wret = push_node_left(trans, mid, right, 1);
54aa1f4d 1027 if (wret < 0 && wret != -ENOSPC)
79f95c82 1028 ret = wret;
5f39d397 1029 if (btrfs_header_nritems(right) == 0) {
6a884d7d 1030 btrfs_clean_tree_block(right);
925baedd 1031 btrfs_tree_unlock(right);
afe5fea7 1032 del_ptr(root, path, level + 1, pslot + 1);
f0486c68 1033 root_sub_used(root, right->len);
5581a51a 1034 btrfs_free_tree_block(trans, root, right, 0, 1);
3083ee2e 1035 free_extent_buffer_stale(right);
f0486c68 1036 right = NULL;
bb803951 1037 } else {
5f39d397
CM
1038 struct btrfs_disk_key right_key;
1039 btrfs_node_key(right, &right_key, 0);
f3a84ccd
FM
1040 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1041 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
0e82bcfe 1042 BUG_ON(ret < 0);
5f39d397
CM
1043 btrfs_set_node_key(parent, &right_key, pslot + 1);
1044 btrfs_mark_buffer_dirty(parent);
bb803951
CM
1045 }
1046 }
5f39d397 1047 if (btrfs_header_nritems(mid) == 1) {
79f95c82
CM
1048 /*
1049 * we're not allowed to leave a node with one item in the
1050 * tree during a delete. A deletion from lower in the tree
1051 * could try to delete the only pointer in this node.
1052 * So, pull some keys from the left.
1053 * There has to be a left pointer at this point because
1054 * otherwise we would have pulled some pointers from the
1055 * right
1056 */
305a26af
MF
1057 if (!left) {
1058 ret = -EROFS;
0b246afa 1059 btrfs_handle_fs_error(fs_info, ret, NULL);
305a26af
MF
1060 goto enospc;
1061 }
55d32ed8 1062 wret = balance_node_right(trans, mid, left);
54aa1f4d 1063 if (wret < 0) {
79f95c82 1064 ret = wret;
54aa1f4d
CM
1065 goto enospc;
1066 }
bce4eae9 1067 if (wret == 1) {
d30a668f 1068 wret = push_node_left(trans, left, mid, 1);
bce4eae9
CM
1069 if (wret < 0)
1070 ret = wret;
1071 }
79f95c82
CM
1072 BUG_ON(wret == 1);
1073 }
5f39d397 1074 if (btrfs_header_nritems(mid) == 0) {
6a884d7d 1075 btrfs_clean_tree_block(mid);
925baedd 1076 btrfs_tree_unlock(mid);
afe5fea7 1077 del_ptr(root, path, level + 1, pslot);
f0486c68 1078 root_sub_used(root, mid->len);
5581a51a 1079 btrfs_free_tree_block(trans, root, mid, 0, 1);
3083ee2e 1080 free_extent_buffer_stale(mid);
f0486c68 1081 mid = NULL;
79f95c82
CM
1082 } else {
1083 /* update the parent key to reflect our changes */
5f39d397
CM
1084 struct btrfs_disk_key mid_key;
1085 btrfs_node_key(mid, &mid_key, 0);
f3a84ccd
FM
1086 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1087 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
0e82bcfe 1088 BUG_ON(ret < 0);
5f39d397
CM
1089 btrfs_set_node_key(parent, &mid_key, pslot);
1090 btrfs_mark_buffer_dirty(parent);
79f95c82 1091 }
bb803951 1092
79f95c82 1093 /* update the path */
5f39d397
CM
1094 if (left) {
1095 if (btrfs_header_nritems(left) > orig_slot) {
67439dad 1096 atomic_inc(&left->refs);
925baedd 1097 /* left was locked after cow */
5f39d397 1098 path->nodes[level] = left;
bb803951
CM
1099 path->slots[level + 1] -= 1;
1100 path->slots[level] = orig_slot;
925baedd
CM
1101 if (mid) {
1102 btrfs_tree_unlock(mid);
5f39d397 1103 free_extent_buffer(mid);
925baedd 1104 }
bb803951 1105 } else {
5f39d397 1106 orig_slot -= btrfs_header_nritems(left);
bb803951
CM
1107 path->slots[level] = orig_slot;
1108 }
1109 }
79f95c82 1110 /* double check we haven't messed things up */
e20d96d6 1111 if (orig_ptr !=
5f39d397 1112 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
79f95c82 1113 BUG();
54aa1f4d 1114enospc:
925baedd
CM
1115 if (right) {
1116 btrfs_tree_unlock(right);
5f39d397 1117 free_extent_buffer(right);
925baedd
CM
1118 }
1119 if (left) {
1120 if (path->nodes[level] != left)
1121 btrfs_tree_unlock(left);
5f39d397 1122 free_extent_buffer(left);
925baedd 1123 }
bb803951
CM
1124 return ret;
1125}
1126
d352ac68
CM
1127/* Node balancing for insertion. Here we only split or push nodes around
1128 * when they are completely full. This is also done top down, so we
1129 * have to be pessimistic.
1130 */
d397712b 1131static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
98ed5174
CM
1132 struct btrfs_root *root,
1133 struct btrfs_path *path, int level)
e66f709b 1134{
0b246afa 1135 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
1136 struct extent_buffer *right = NULL;
1137 struct extent_buffer *mid;
1138 struct extent_buffer *left = NULL;
1139 struct extent_buffer *parent = NULL;
e66f709b
CM
1140 int ret = 0;
1141 int wret;
1142 int pslot;
1143 int orig_slot = path->slots[level];
e66f709b
CM
1144
1145 if (level == 0)
1146 return 1;
1147
5f39d397 1148 mid = path->nodes[level];
7bb86316 1149 WARN_ON(btrfs_header_generation(mid) != trans->transid);
e66f709b 1150
a05a9bb1 1151 if (level < BTRFS_MAX_LEVEL - 1) {
5f39d397 1152 parent = path->nodes[level + 1];
a05a9bb1
LZ
1153 pslot = path->slots[level + 1];
1154 }
e66f709b 1155
5f39d397 1156 if (!parent)
e66f709b 1157 return 1;
e66f709b 1158
4b231ae4 1159 left = btrfs_read_node_slot(parent, pslot - 1);
fb770ae4
LB
1160 if (IS_ERR(left))
1161 left = NULL;
e66f709b
CM
1162
1163 /* first, try to make some room in the middle buffer */
5f39d397 1164 if (left) {
e66f709b 1165 u32 left_nr;
925baedd 1166
bf77467a 1167 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
b4ce94de 1168
5f39d397 1169 left_nr = btrfs_header_nritems(left);
0b246afa 1170 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
33ade1f8
CM
1171 wret = 1;
1172 } else {
5f39d397 1173 ret = btrfs_cow_block(trans, root, left, parent,
9631e4cc 1174 pslot - 1, &left,
bf59a5a2 1175 BTRFS_NESTING_LEFT_COW);
54aa1f4d
CM
1176 if (ret)
1177 wret = 1;
1178 else {
d30a668f 1179 wret = push_node_left(trans, left, mid, 0);
54aa1f4d 1180 }
33ade1f8 1181 }
e66f709b
CM
1182 if (wret < 0)
1183 ret = wret;
1184 if (wret == 0) {
5f39d397 1185 struct btrfs_disk_key disk_key;
e66f709b 1186 orig_slot += left_nr;
5f39d397 1187 btrfs_node_key(mid, &disk_key, 0);
f3a84ccd
FM
1188 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1189 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
0e82bcfe 1190 BUG_ON(ret < 0);
5f39d397
CM
1191 btrfs_set_node_key(parent, &disk_key, pslot);
1192 btrfs_mark_buffer_dirty(parent);
1193 if (btrfs_header_nritems(left) > orig_slot) {
1194 path->nodes[level] = left;
e66f709b
CM
1195 path->slots[level + 1] -= 1;
1196 path->slots[level] = orig_slot;
925baedd 1197 btrfs_tree_unlock(mid);
5f39d397 1198 free_extent_buffer(mid);
e66f709b
CM
1199 } else {
1200 orig_slot -=
5f39d397 1201 btrfs_header_nritems(left);
e66f709b 1202 path->slots[level] = orig_slot;
925baedd 1203 btrfs_tree_unlock(left);
5f39d397 1204 free_extent_buffer(left);
e66f709b 1205 }
e66f709b
CM
1206 return 0;
1207 }
925baedd 1208 btrfs_tree_unlock(left);
5f39d397 1209 free_extent_buffer(left);
e66f709b 1210 }
4b231ae4 1211 right = btrfs_read_node_slot(parent, pslot + 1);
fb770ae4
LB
1212 if (IS_ERR(right))
1213 right = NULL;
e66f709b
CM
1214
1215 /*
1216 * then try to empty the right most buffer into the middle
1217 */
5f39d397 1218 if (right) {
33ade1f8 1219 u32 right_nr;
b4ce94de 1220
bf77467a 1221 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
b4ce94de 1222
5f39d397 1223 right_nr = btrfs_header_nritems(right);
0b246afa 1224 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
33ade1f8
CM
1225 wret = 1;
1226 } else {
5f39d397
CM
1227 ret = btrfs_cow_block(trans, root, right,
1228 parent, pslot + 1,
bf59a5a2 1229 &right, BTRFS_NESTING_RIGHT_COW);
54aa1f4d
CM
1230 if (ret)
1231 wret = 1;
1232 else {
55d32ed8 1233 wret = balance_node_right(trans, right, mid);
54aa1f4d 1234 }
33ade1f8 1235 }
e66f709b
CM
1236 if (wret < 0)
1237 ret = wret;
1238 if (wret == 0) {
5f39d397
CM
1239 struct btrfs_disk_key disk_key;
1240
1241 btrfs_node_key(right, &disk_key, 0);
f3a84ccd
FM
1242 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1243 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
0e82bcfe 1244 BUG_ON(ret < 0);
5f39d397
CM
1245 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1246 btrfs_mark_buffer_dirty(parent);
1247
1248 if (btrfs_header_nritems(mid) <= orig_slot) {
1249 path->nodes[level] = right;
e66f709b
CM
1250 path->slots[level + 1] += 1;
1251 path->slots[level] = orig_slot -
5f39d397 1252 btrfs_header_nritems(mid);
925baedd 1253 btrfs_tree_unlock(mid);
5f39d397 1254 free_extent_buffer(mid);
e66f709b 1255 } else {
925baedd 1256 btrfs_tree_unlock(right);
5f39d397 1257 free_extent_buffer(right);
e66f709b 1258 }
e66f709b
CM
1259 return 0;
1260 }
925baedd 1261 btrfs_tree_unlock(right);
5f39d397 1262 free_extent_buffer(right);
e66f709b 1263 }
e66f709b
CM
1264 return 1;
1265}
1266
3c69faec 1267/*
d352ac68
CM
1268 * readahead one full node of leaves, finding things that are close
1269 * to the block in 'slot', and triggering ra on them.
3c69faec 1270 */
2ff7e61e 1271static void reada_for_search(struct btrfs_fs_info *fs_info,
c8c42864
CM
1272 struct btrfs_path *path,
1273 int level, int slot, u64 objectid)
3c69faec 1274{
5f39d397 1275 struct extent_buffer *node;
01f46658 1276 struct btrfs_disk_key disk_key;
3c69faec 1277 u32 nritems;
3c69faec 1278 u64 search;
a7175319 1279 u64 target;
6b80053d 1280 u64 nread = 0;
ace75066 1281 u64 nread_max;
5f39d397 1282 struct extent_buffer *eb;
6b80053d
CM
1283 u32 nr;
1284 u32 blocksize;
1285 u32 nscan = 0;
db94535d 1286
ace75066 1287 if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
6702ed49
CM
1288 return;
1289
1290 if (!path->nodes[level])
3c69faec
CM
1291 return;
1292
5f39d397 1293 node = path->nodes[level];
925baedd 1294
ace75066
FM
1295 /*
1296 * Since the time between visiting leaves is much shorter than the time
1297 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1298 * much IO at once (possibly random).
1299 */
1300 if (path->reada == READA_FORWARD_ALWAYS) {
1301 if (level > 1)
1302 nread_max = node->fs_info->nodesize;
1303 else
1304 nread_max = SZ_128K;
1305 } else {
1306 nread_max = SZ_64K;
1307 }
1308
3c69faec 1309 search = btrfs_node_blockptr(node, slot);
0b246afa
JM
1310 blocksize = fs_info->nodesize;
1311 eb = find_extent_buffer(fs_info, search);
5f39d397
CM
1312 if (eb) {
1313 free_extent_buffer(eb);
3c69faec
CM
1314 return;
1315 }
1316
a7175319 1317 target = search;
6b80053d 1318
5f39d397 1319 nritems = btrfs_header_nritems(node);
6b80053d 1320 nr = slot;
25b8b936 1321
d397712b 1322 while (1) {
e4058b54 1323 if (path->reada == READA_BACK) {
6b80053d
CM
1324 if (nr == 0)
1325 break;
1326 nr--;
ace75066
FM
1327 } else if (path->reada == READA_FORWARD ||
1328 path->reada == READA_FORWARD_ALWAYS) {
6b80053d
CM
1329 nr++;
1330 if (nr >= nritems)
1331 break;
3c69faec 1332 }
e4058b54 1333 if (path->reada == READA_BACK && objectid) {
01f46658
CM
1334 btrfs_node_key(node, &disk_key, nr);
1335 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1336 break;
1337 }
6b80053d 1338 search = btrfs_node_blockptr(node, nr);
ace75066
FM
1339 if (path->reada == READA_FORWARD_ALWAYS ||
1340 (search <= target && target - search <= 65536) ||
a7175319 1341 (search > target && search - target <= 65536)) {
bfb484d9 1342 btrfs_readahead_node_child(node, nr);
6b80053d
CM
1343 nread += blocksize;
1344 }
1345 nscan++;
ace75066 1346 if (nread > nread_max || nscan > 32)
6b80053d 1347 break;
3c69faec
CM
1348 }
1349}
925baedd 1350
bfb484d9 1351static noinline void reada_for_balance(struct btrfs_path *path, int level)
b4ce94de 1352{
bfb484d9 1353 struct extent_buffer *parent;
b4ce94de
CM
1354 int slot;
1355 int nritems;
b4ce94de 1356
8c594ea8 1357 parent = path->nodes[level + 1];
b4ce94de 1358 if (!parent)
0b08851f 1359 return;
b4ce94de
CM
1360
1361 nritems = btrfs_header_nritems(parent);
8c594ea8 1362 slot = path->slots[level + 1];
b4ce94de 1363
bfb484d9
JB
1364 if (slot > 0)
1365 btrfs_readahead_node_child(parent, slot - 1);
1366 if (slot + 1 < nritems)
1367 btrfs_readahead_node_child(parent, slot + 1);
b4ce94de
CM
1368}
1369
1370
d352ac68 1371/*
d397712b
CM
1372 * when we walk down the tree, it is usually safe to unlock the higher layers
1373 * in the tree. The exceptions are when our path goes through slot 0, because
1374 * operations on the tree might require changing key pointers higher up in the
1375 * tree.
d352ac68 1376 *
d397712b
CM
1377 * callers might also have set path->keep_locks, which tells this code to keep
1378 * the lock if the path points to the last slot in the block. This is part of
1379 * walking through the tree, and selecting the next slot in the higher block.
d352ac68 1380 *
d397712b
CM
1381 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1382 * if lowest_unlock is 1, level 0 won't be unlocked
d352ac68 1383 */
e02119d5 1384static noinline void unlock_up(struct btrfs_path *path, int level,
f7c79f30
CM
1385 int lowest_unlock, int min_write_lock_level,
1386 int *write_lock_level)
925baedd
CM
1387{
1388 int i;
1389 int skip_level = level;
051e1b9f 1390 int no_skips = 0;
925baedd
CM
1391 struct extent_buffer *t;
1392
1393 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1394 if (!path->nodes[i])
1395 break;
1396 if (!path->locks[i])
1397 break;
051e1b9f 1398 if (!no_skips && path->slots[i] == 0) {
925baedd
CM
1399 skip_level = i + 1;
1400 continue;
1401 }
051e1b9f 1402 if (!no_skips && path->keep_locks) {
925baedd
CM
1403 u32 nritems;
1404 t = path->nodes[i];
1405 nritems = btrfs_header_nritems(t);
051e1b9f 1406 if (nritems < 1 || path->slots[i] >= nritems - 1) {
925baedd
CM
1407 skip_level = i + 1;
1408 continue;
1409 }
1410 }
051e1b9f
CM
1411 if (skip_level < i && i >= lowest_unlock)
1412 no_skips = 1;
1413
925baedd 1414 t = path->nodes[i];
d80bb3f9 1415 if (i >= lowest_unlock && i > skip_level) {
bd681513 1416 btrfs_tree_unlock_rw(t, path->locks[i]);
925baedd 1417 path->locks[i] = 0;
f7c79f30
CM
1418 if (write_lock_level &&
1419 i > min_write_lock_level &&
1420 i <= *write_lock_level) {
1421 *write_lock_level = i - 1;
1422 }
925baedd
CM
1423 }
1424 }
1425}
1426
c8c42864
CM
1427/*
1428 * helper function for btrfs_search_slot. The goal is to find a block
1429 * in cache without setting the path to blocking. If we find the block
1430 * we return zero and the path is unchanged.
1431 *
1432 * If we can't find the block, we set the path blocking and do some
1433 * reada. -EAGAIN is returned and the search must be repeated.
1434 */
1435static int
d07b8528
LB
1436read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1437 struct extent_buffer **eb_ret, int level, int slot,
cda79c54 1438 const struct btrfs_key *key)
c8c42864 1439{
0b246afa 1440 struct btrfs_fs_info *fs_info = root->fs_info;
c8c42864
CM
1441 u64 blocknr;
1442 u64 gen;
c8c42864 1443 struct extent_buffer *tmp;
581c1760 1444 struct btrfs_key first_key;
76a05b35 1445 int ret;
581c1760 1446 int parent_level;
c8c42864 1447
213ff4b7
NB
1448 blocknr = btrfs_node_blockptr(*eb_ret, slot);
1449 gen = btrfs_node_ptr_generation(*eb_ret, slot);
1450 parent_level = btrfs_header_level(*eb_ret);
1451 btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
c8c42864 1452
0b246afa 1453 tmp = find_extent_buffer(fs_info, blocknr);
cb44921a 1454 if (tmp) {
ace75066
FM
1455 if (p->reada == READA_FORWARD_ALWAYS)
1456 reada_for_search(fs_info, p, level, slot, key->objectid);
1457
b9fab919 1458 /* first we do an atomic uptodate check */
bdf7c00e 1459 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
448de471
QW
1460 /*
1461 * Do extra check for first_key, eb can be stale due to
1462 * being cached, read from scrub, or have multiple
1463 * parents (shared tree blocks).
1464 */
e064d5e9 1465 if (btrfs_verify_level_key(tmp,
448de471
QW
1466 parent_level - 1, &first_key, gen)) {
1467 free_extent_buffer(tmp);
1468 return -EUCLEAN;
1469 }
bdf7c00e
JB
1470 *eb_ret = tmp;
1471 return 0;
1472 }
1473
bdf7c00e 1474 /* now we're allowed to do a blocking uptodate check */
581c1760 1475 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
bdf7c00e
JB
1476 if (!ret) {
1477 *eb_ret = tmp;
1478 return 0;
cb44921a 1479 }
bdf7c00e
JB
1480 free_extent_buffer(tmp);
1481 btrfs_release_path(p);
1482 return -EIO;
c8c42864
CM
1483 }
1484
1485 /*
1486 * reduce lock contention at high levels
1487 * of the btree by dropping locks before
76a05b35
CM
1488 * we read. Don't release the lock on the current
1489 * level because we need to walk this node to figure
1490 * out which blocks to read.
c8c42864 1491 */
8c594ea8 1492 btrfs_unlock_up_safe(p, level + 1);
8c594ea8 1493
e4058b54 1494 if (p->reada != READA_NONE)
2ff7e61e 1495 reada_for_search(fs_info, p, level, slot, key->objectid);
c8c42864 1496
76a05b35 1497 ret = -EAGAIN;
1b7ec85e
JB
1498 tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
1499 gen, parent_level - 1, &first_key);
64c043de 1500 if (!IS_ERR(tmp)) {
76a05b35
CM
1501 /*
1502 * If the read above didn't mark this buffer up to date,
1503 * it will never end up being up to date. Set ret to EIO now
1504 * and give up so that our caller doesn't loop forever
1505 * on our EAGAINs.
1506 */
e6a1d6fd 1507 if (!extent_buffer_uptodate(tmp))
76a05b35 1508 ret = -EIO;
c8c42864 1509 free_extent_buffer(tmp);
c871b0f2
LB
1510 } else {
1511 ret = PTR_ERR(tmp);
76a05b35 1512 }
02a3307a
LB
1513
1514 btrfs_release_path(p);
76a05b35 1515 return ret;
c8c42864
CM
1516}
1517
1518/*
1519 * helper function for btrfs_search_slot. This does all of the checks
1520 * for node-level blocks and does any balancing required based on
1521 * the ins_len.
1522 *
1523 * If no extra work was required, zero is returned. If we had to
1524 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1525 * start over
1526 */
1527static int
1528setup_nodes_for_search(struct btrfs_trans_handle *trans,
1529 struct btrfs_root *root, struct btrfs_path *p,
bd681513
CM
1530 struct extent_buffer *b, int level, int ins_len,
1531 int *write_lock_level)
c8c42864 1532{
0b246afa 1533 struct btrfs_fs_info *fs_info = root->fs_info;
95b982de 1534 int ret = 0;
0b246afa 1535
c8c42864 1536 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
0b246afa 1537 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
c8c42864 1538
bd681513
CM
1539 if (*write_lock_level < level + 1) {
1540 *write_lock_level = level + 1;
1541 btrfs_release_path(p);
95b982de 1542 return -EAGAIN;
bd681513
CM
1543 }
1544
bfb484d9 1545 reada_for_balance(p, level);
95b982de 1546 ret = split_node(trans, root, p, level);
c8c42864 1547
c8c42864
CM
1548 b = p->nodes[level];
1549 } else if (ins_len < 0 && btrfs_header_nritems(b) <
0b246afa 1550 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
c8c42864 1551
bd681513
CM
1552 if (*write_lock_level < level + 1) {
1553 *write_lock_level = level + 1;
1554 btrfs_release_path(p);
95b982de 1555 return -EAGAIN;
bd681513
CM
1556 }
1557
bfb484d9 1558 reada_for_balance(p, level);
95b982de
NB
1559 ret = balance_level(trans, root, p, level);
1560 if (ret)
1561 return ret;
c8c42864 1562
c8c42864
CM
1563 b = p->nodes[level];
1564 if (!b) {
b3b4aa74 1565 btrfs_release_path(p);
95b982de 1566 return -EAGAIN;
c8c42864
CM
1567 }
1568 BUG_ON(btrfs_header_nritems(b) == 1);
1569 }
c8c42864
CM
1570 return ret;
1571}
1572
381cf658 1573int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
e33d5c3d
KN
1574 u64 iobjectid, u64 ioff, u8 key_type,
1575 struct btrfs_key *found_key)
1576{
1577 int ret;
1578 struct btrfs_key key;
1579 struct extent_buffer *eb;
381cf658
DS
1580
1581 ASSERT(path);
1d4c08e0 1582 ASSERT(found_key);
e33d5c3d
KN
1583
1584 key.type = key_type;
1585 key.objectid = iobjectid;
1586 key.offset = ioff;
1587
1588 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1d4c08e0 1589 if (ret < 0)
e33d5c3d
KN
1590 return ret;
1591
1592 eb = path->nodes[0];
1593 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1594 ret = btrfs_next_leaf(fs_root, path);
1595 if (ret)
1596 return ret;
1597 eb = path->nodes[0];
1598 }
1599
1600 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1601 if (found_key->type != key.type ||
1602 found_key->objectid != key.objectid)
1603 return 1;
1604
1605 return 0;
1606}
1607
1fc28d8e
LB
1608static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1609 struct btrfs_path *p,
1610 int write_lock_level)
1611{
1612 struct btrfs_fs_info *fs_info = root->fs_info;
1613 struct extent_buffer *b;
1614 int root_lock;
1615 int level = 0;
1616
1617 /* We try very hard to do read locks on the root */
1618 root_lock = BTRFS_READ_LOCK;
1619
1620 if (p->search_commit_root) {
be6821f8
FM
1621 /*
1622 * The commit roots are read only so we always do read locks,
1623 * and we always must hold the commit_root_sem when doing
1624 * searches on them, the only exception is send where we don't
1625 * want to block transaction commits for a long time, so
1626 * we need to clone the commit root in order to avoid races
1627 * with transaction commits that create a snapshot of one of
1628 * the roots used by a send operation.
1629 */
1630 if (p->need_commit_sem) {
1fc28d8e 1631 down_read(&fs_info->commit_root_sem);
be6821f8 1632 b = btrfs_clone_extent_buffer(root->commit_root);
1fc28d8e 1633 up_read(&fs_info->commit_root_sem);
be6821f8
FM
1634 if (!b)
1635 return ERR_PTR(-ENOMEM);
1636
1637 } else {
1638 b = root->commit_root;
67439dad 1639 atomic_inc(&b->refs);
be6821f8
FM
1640 }
1641 level = btrfs_header_level(b);
f9ddfd05
LB
1642 /*
1643 * Ensure that all callers have set skip_locking when
1644 * p->search_commit_root = 1.
1645 */
1646 ASSERT(p->skip_locking == 1);
1fc28d8e
LB
1647
1648 goto out;
1649 }
1650
1651 if (p->skip_locking) {
1652 b = btrfs_root_node(root);
1653 level = btrfs_header_level(b);
1654 goto out;
1655 }
1656
1657 /*
662c653b
LB
1658 * If the level is set to maximum, we can skip trying to get the read
1659 * lock.
1fc28d8e 1660 */
662c653b
LB
1661 if (write_lock_level < BTRFS_MAX_LEVEL) {
1662 /*
1663 * We don't know the level of the root node until we actually
1664 * have it read locked
1665 */
1bb96598 1666 b = btrfs_read_lock_root_node(root);
662c653b
LB
1667 level = btrfs_header_level(b);
1668 if (level > write_lock_level)
1669 goto out;
1670
1671 /* Whoops, must trade for write lock */
1672 btrfs_tree_read_unlock(b);
1673 free_extent_buffer(b);
1674 }
1fc28d8e 1675
1fc28d8e
LB
1676 b = btrfs_lock_root_node(root);
1677 root_lock = BTRFS_WRITE_LOCK;
1678
1679 /* The level might have changed, check again */
1680 level = btrfs_header_level(b);
1681
1682out:
1683 p->nodes[level] = b;
1684 if (!p->skip_locking)
1685 p->locks[level] = root_lock;
1686 /*
1687 * Callers are responsible for dropping b's references.
1688 */
1689 return b;
1690}
1691
1692
74123bd7 1693/*
4271ecea
NB
1694 * btrfs_search_slot - look for a key in a tree and perform necessary
1695 * modifications to preserve tree invariants.
74123bd7 1696 *
4271ecea
NB
1697 * @trans: Handle of transaction, used when modifying the tree
1698 * @p: Holds all btree nodes along the search path
1699 * @root: The root node of the tree
1700 * @key: The key we are looking for
9a664971 1701 * @ins_len: Indicates purpose of search:
1702 * >0 for inserts it's size of item inserted (*)
1703 * <0 for deletions
1704 * 0 for plain searches, not modifying the tree
1705 *
1706 * (*) If size of item inserted doesn't include
1707 * sizeof(struct btrfs_item), then p->search_for_extension must
1708 * be set.
4271ecea
NB
1709 * @cow: boolean should CoW operations be performed. Must always be 1
1710 * when modifying the tree.
97571fd0 1711 *
4271ecea
NB
1712 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1713 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1714 *
1715 * If @key is found, 0 is returned and you can find the item in the leaf level
1716 * of the path (level 0)
1717 *
1718 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1719 * points to the slot where it should be inserted
1720 *
1721 * If an error is encountered while searching the tree a negative error number
1722 * is returned
74123bd7 1723 */
310712b2
OS
1724int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1725 const struct btrfs_key *key, struct btrfs_path *p,
1726 int ins_len, int cow)
be0e5c09 1727{
5f39d397 1728 struct extent_buffer *b;
be0e5c09
CM
1729 int slot;
1730 int ret;
33c66f43 1731 int err;
be0e5c09 1732 int level;
925baedd 1733 int lowest_unlock = 1;
bd681513
CM
1734 /* everything at write_lock_level or lower must be write locked */
1735 int write_lock_level = 0;
9f3a7427 1736 u8 lowest_level = 0;
f7c79f30 1737 int min_write_lock_level;
d7396f07 1738 int prev_cmp;
9f3a7427 1739
6702ed49 1740 lowest_level = p->lowest_level;
323ac95b 1741 WARN_ON(lowest_level && ins_len > 0);
22b0ebda 1742 WARN_ON(p->nodes[0] != NULL);
eb653de1 1743 BUG_ON(!cow && ins_len);
25179201 1744
bd681513 1745 if (ins_len < 0) {
925baedd 1746 lowest_unlock = 2;
65b51a00 1747
bd681513
CM
1748 /* when we are removing items, we might have to go up to level
1749 * two as we update tree pointers Make sure we keep write
1750 * for those levels as well
1751 */
1752 write_lock_level = 2;
1753 } else if (ins_len > 0) {
1754 /*
1755 * for inserting items, make sure we have a write lock on
1756 * level 1 so we can update keys
1757 */
1758 write_lock_level = 1;
1759 }
1760
1761 if (!cow)
1762 write_lock_level = -1;
1763
09a2a8f9 1764 if (cow && (p->keep_locks || p->lowest_level))
bd681513
CM
1765 write_lock_level = BTRFS_MAX_LEVEL;
1766
f7c79f30
CM
1767 min_write_lock_level = write_lock_level;
1768
bb803951 1769again:
d7396f07 1770 prev_cmp = -1;
1fc28d8e 1771 b = btrfs_search_slot_get_root(root, p, write_lock_level);
be6821f8
FM
1772 if (IS_ERR(b)) {
1773 ret = PTR_ERR(b);
1774 goto done;
1775 }
925baedd 1776
eb60ceac 1777 while (b) {
f624d976
QW
1778 int dec = 0;
1779
5f39d397 1780 level = btrfs_header_level(b);
65b51a00 1781
02217ed2 1782 if (cow) {
9ea2c7c9
NB
1783 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
1784
c8c42864
CM
1785 /*
1786 * if we don't really need to cow this block
1787 * then we don't want to set the path blocking,
1788 * so we test it here
1789 */
5963ffca 1790 if (!should_cow_block(trans, root, b))
65b51a00 1791 goto cow_done;
5d4f98a2 1792
bd681513
CM
1793 /*
1794 * must have write locks on this node and the
1795 * parent
1796 */
5124e00e
JB
1797 if (level > write_lock_level ||
1798 (level + 1 > write_lock_level &&
1799 level + 1 < BTRFS_MAX_LEVEL &&
1800 p->nodes[level + 1])) {
bd681513
CM
1801 write_lock_level = level + 1;
1802 btrfs_release_path(p);
1803 goto again;
1804 }
1805
9ea2c7c9
NB
1806 if (last_level)
1807 err = btrfs_cow_block(trans, root, b, NULL, 0,
9631e4cc
JB
1808 &b,
1809 BTRFS_NESTING_COW);
9ea2c7c9
NB
1810 else
1811 err = btrfs_cow_block(trans, root, b,
1812 p->nodes[level + 1],
9631e4cc
JB
1813 p->slots[level + 1], &b,
1814 BTRFS_NESTING_COW);
33c66f43 1815 if (err) {
33c66f43 1816 ret = err;
65b51a00 1817 goto done;
54aa1f4d 1818 }
02217ed2 1819 }
65b51a00 1820cow_done:
eb60ceac 1821 p->nodes[level] = b;
52398340
LB
1822 /*
1823 * Leave path with blocking locks to avoid massive
1824 * lock context switch, this is made on purpose.
1825 */
b4ce94de
CM
1826
1827 /*
1828 * we have a lock on b and as long as we aren't changing
1829 * the tree, there is no way to for the items in b to change.
1830 * It is safe to drop the lock on our parent before we
1831 * go through the expensive btree search on b.
1832 *
eb653de1
FDBM
1833 * If we're inserting or deleting (ins_len != 0), then we might
1834 * be changing slot zero, which may require changing the parent.
1835 * So, we can't drop the lock until after we know which slot
1836 * we're operating on.
b4ce94de 1837 */
eb653de1
FDBM
1838 if (!ins_len && !p->keep_locks) {
1839 int u = level + 1;
1840
1841 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
1842 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
1843 p->locks[u] = 0;
1844 }
1845 }
b4ce94de 1846
995e9a16
NB
1847 /*
1848 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
1849 * we can safely assume the target key will always be in slot 0
1850 * on lower levels due to the invariants BTRFS' btree provides,
1851 * namely that a btrfs_key_ptr entry always points to the
1852 * lowest key in the child node, thus we can skip searching
1853 * lower levels
1854 */
1855 if (prev_cmp == 0) {
1856 slot = 0;
1857 ret = 0;
1858 } else {
1859 ret = btrfs_bin_search(b, key, &slot);
1860 prev_cmp = ret;
1861 if (ret < 0)
1862 goto done;
1863 }
b4ce94de 1864
f624d976 1865 if (level == 0) {
be0e5c09 1866 p->slots[level] = slot;
9a664971 1867 /*
1868 * Item key already exists. In this case, if we are
1869 * allowed to insert the item (for example, in dir_item
1870 * case, item key collision is allowed), it will be
1871 * merged with the original item. Only the item size
1872 * grows, no new btrfs item will be added. If
1873 * search_for_extension is not set, ins_len already
1874 * accounts the size btrfs_item, deduct it here so leaf
1875 * space check will be correct.
1876 */
1877 if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
1878 ASSERT(ins_len >= sizeof(struct btrfs_item));
1879 ins_len -= sizeof(struct btrfs_item);
1880 }
87b29b20 1881 if (ins_len > 0 &&
e902baac 1882 btrfs_leaf_free_space(b) < ins_len) {
bd681513
CM
1883 if (write_lock_level < 1) {
1884 write_lock_level = 1;
1885 btrfs_release_path(p);
1886 goto again;
1887 }
1888
33c66f43
YZ
1889 err = split_leaf(trans, root, key,
1890 p, ins_len, ret == 0);
b4ce94de 1891
33c66f43
YZ
1892 BUG_ON(err > 0);
1893 if (err) {
1894 ret = err;
65b51a00
CM
1895 goto done;
1896 }
5c680ed6 1897 }
459931ec 1898 if (!p->search_for_split)
f7c79f30 1899 unlock_up(p, level, lowest_unlock,
4b6f8e96 1900 min_write_lock_level, NULL);
65b51a00 1901 goto done;
be0e5c09 1902 }
f624d976
QW
1903 if (ret && slot > 0) {
1904 dec = 1;
1905 slot--;
1906 }
1907 p->slots[level] = slot;
1908 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
1909 &write_lock_level);
1910 if (err == -EAGAIN)
1911 goto again;
1912 if (err) {
1913 ret = err;
1914 goto done;
1915 }
1916 b = p->nodes[level];
1917 slot = p->slots[level];
1918
1919 /*
1920 * Slot 0 is special, if we change the key we have to update
1921 * the parent pointer which means we must have a write lock on
1922 * the parent
1923 */
1924 if (slot == 0 && ins_len && write_lock_level < level + 1) {
1925 write_lock_level = level + 1;
1926 btrfs_release_path(p);
1927 goto again;
1928 }
1929
1930 unlock_up(p, level, lowest_unlock, min_write_lock_level,
1931 &write_lock_level);
1932
1933 if (level == lowest_level) {
1934 if (dec)
1935 p->slots[level]++;
1936 goto done;
1937 }
1938
1939 err = read_block_for_search(root, p, &b, level, slot, key);
1940 if (err == -EAGAIN)
1941 goto again;
1942 if (err) {
1943 ret = err;
1944 goto done;
1945 }
1946
1947 if (!p->skip_locking) {
1948 level = btrfs_header_level(b);
1949 if (level <= write_lock_level) {
ac5887c8 1950 btrfs_tree_lock(b);
f624d976
QW
1951 p->locks[level] = BTRFS_WRITE_LOCK;
1952 } else {
fe596ca3 1953 btrfs_tree_read_lock(b);
f624d976
QW
1954 p->locks[level] = BTRFS_READ_LOCK;
1955 }
1956 p->nodes[level] = b;
1957 }
be0e5c09 1958 }
65b51a00
CM
1959 ret = 1;
1960done:
5f5bc6b1 1961 if (ret < 0 && !p->skip_release_on_error)
b3b4aa74 1962 btrfs_release_path(p);
65b51a00 1963 return ret;
be0e5c09 1964}
f75e2b79 1965ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
be0e5c09 1966
5d9e75c4
JS
1967/*
1968 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
1969 * current state of the tree together with the operations recorded in the tree
1970 * modification log to search for the key in a previous version of this tree, as
1971 * denoted by the time_seq parameter.
1972 *
1973 * Naturally, there is no support for insert, delete or cow operations.
1974 *
1975 * The resulting path and return value will be set up as if we called
1976 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1977 */
310712b2 1978int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
5d9e75c4
JS
1979 struct btrfs_path *p, u64 time_seq)
1980{
0b246afa 1981 struct btrfs_fs_info *fs_info = root->fs_info;
5d9e75c4
JS
1982 struct extent_buffer *b;
1983 int slot;
1984 int ret;
1985 int err;
1986 int level;
1987 int lowest_unlock = 1;
1988 u8 lowest_level = 0;
1989
1990 lowest_level = p->lowest_level;
1991 WARN_ON(p->nodes[0] != NULL);
1992
1993 if (p->search_commit_root) {
1994 BUG_ON(time_seq);
1995 return btrfs_search_slot(NULL, root, key, p, 0, 0);
1996 }
1997
1998again:
f3a84ccd 1999 b = btrfs_get_old_root(root, time_seq);
315bed43
NB
2000 if (!b) {
2001 ret = -EIO;
2002 goto done;
2003 }
5d9e75c4 2004 level = btrfs_header_level(b);
5d9e75c4
JS
2005 p->locks[level] = BTRFS_READ_LOCK;
2006
2007 while (b) {
abe9339d
QW
2008 int dec = 0;
2009
5d9e75c4
JS
2010 level = btrfs_header_level(b);
2011 p->nodes[level] = b;
5d9e75c4
JS
2012
2013 /*
2014 * we have a lock on b and as long as we aren't changing
2015 * the tree, there is no way to for the items in b to change.
2016 * It is safe to drop the lock on our parent before we
2017 * go through the expensive btree search on b.
2018 */
2019 btrfs_unlock_up_safe(p, level + 1);
2020
995e9a16 2021 ret = btrfs_bin_search(b, key, &slot);
cbca7d59
FM
2022 if (ret < 0)
2023 goto done;
5d9e75c4 2024
abe9339d 2025 if (level == 0) {
5d9e75c4
JS
2026 p->slots[level] = slot;
2027 unlock_up(p, level, lowest_unlock, 0, NULL);
abe9339d
QW
2028 goto done;
2029 }
5d9e75c4 2030
abe9339d
QW
2031 if (ret && slot > 0) {
2032 dec = 1;
2033 slot--;
2034 }
2035 p->slots[level] = slot;
2036 unlock_up(p, level, lowest_unlock, 0, NULL);
5d9e75c4 2037
abe9339d
QW
2038 if (level == lowest_level) {
2039 if (dec)
2040 p->slots[level]++;
2041 goto done;
2042 }
5d9e75c4 2043
abe9339d
QW
2044 err = read_block_for_search(root, p, &b, level, slot, key);
2045 if (err == -EAGAIN)
2046 goto again;
2047 if (err) {
2048 ret = err;
5d9e75c4
JS
2049 goto done;
2050 }
abe9339d
QW
2051
2052 level = btrfs_header_level(b);
ac5887c8 2053 btrfs_tree_read_lock(b);
f3a84ccd 2054 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
abe9339d
QW
2055 if (!b) {
2056 ret = -ENOMEM;
2057 goto done;
2058 }
2059 p->locks[level] = BTRFS_READ_LOCK;
2060 p->nodes[level] = b;
5d9e75c4
JS
2061 }
2062 ret = 1;
2063done:
5d9e75c4
JS
2064 if (ret < 0)
2065 btrfs_release_path(p);
2066
2067 return ret;
2068}
2069
2f38b3e1
AJ
2070/*
2071 * helper to use instead of search slot if no exact match is needed but
2072 * instead the next or previous item should be returned.
2073 * When find_higher is true, the next higher item is returned, the next lower
2074 * otherwise.
2075 * When return_any and find_higher are both true, and no higher item is found,
2076 * return the next lower instead.
2077 * When return_any is true and find_higher is false, and no lower item is found,
2078 * return the next higher instead.
2079 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2080 * < 0 on error
2081 */
2082int btrfs_search_slot_for_read(struct btrfs_root *root,
310712b2
OS
2083 const struct btrfs_key *key,
2084 struct btrfs_path *p, int find_higher,
2085 int return_any)
2f38b3e1
AJ
2086{
2087 int ret;
2088 struct extent_buffer *leaf;
2089
2090again:
2091 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2092 if (ret <= 0)
2093 return ret;
2094 /*
2095 * a return value of 1 means the path is at the position where the
2096 * item should be inserted. Normally this is the next bigger item,
2097 * but in case the previous item is the last in a leaf, path points
2098 * to the first free slot in the previous leaf, i.e. at an invalid
2099 * item.
2100 */
2101 leaf = p->nodes[0];
2102
2103 if (find_higher) {
2104 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2105 ret = btrfs_next_leaf(root, p);
2106 if (ret <= 0)
2107 return ret;
2108 if (!return_any)
2109 return 1;
2110 /*
2111 * no higher item found, return the next
2112 * lower instead
2113 */
2114 return_any = 0;
2115 find_higher = 0;
2116 btrfs_release_path(p);
2117 goto again;
2118 }
2119 } else {
e6793769
AJ
2120 if (p->slots[0] == 0) {
2121 ret = btrfs_prev_leaf(root, p);
2122 if (ret < 0)
2123 return ret;
2124 if (!ret) {
23c6bf6a
FDBM
2125 leaf = p->nodes[0];
2126 if (p->slots[0] == btrfs_header_nritems(leaf))
2127 p->slots[0]--;
e6793769 2128 return 0;
2f38b3e1 2129 }
e6793769
AJ
2130 if (!return_any)
2131 return 1;
2132 /*
2133 * no lower item found, return the next
2134 * higher instead
2135 */
2136 return_any = 0;
2137 find_higher = 1;
2138 btrfs_release_path(p);
2139 goto again;
2140 } else {
2f38b3e1
AJ
2141 --p->slots[0];
2142 }
2143 }
2144 return 0;
2145}
2146
74123bd7
CM
2147/*
2148 * adjust the pointers going up the tree, starting at level
2149 * making sure the right key of each node is points to 'key'.
2150 * This is used after shifting pointers to the left, so it stops
2151 * fixing up pointers when a given leaf/node is not in slot 0 of the
2152 * higher levels
aa5d6bed 2153 *
74123bd7 2154 */
b167fa91 2155static void fixup_low_keys(struct btrfs_path *path,
143bede5 2156 struct btrfs_disk_key *key, int level)
be0e5c09
CM
2157{
2158 int i;
5f39d397 2159 struct extent_buffer *t;
0e82bcfe 2160 int ret;
5f39d397 2161
234b63a0 2162 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
be0e5c09 2163 int tslot = path->slots[i];
0e82bcfe 2164
eb60ceac 2165 if (!path->nodes[i])
be0e5c09 2166 break;
5f39d397 2167 t = path->nodes[i];
f3a84ccd
FM
2168 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2169 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
0e82bcfe 2170 BUG_ON(ret < 0);
5f39d397 2171 btrfs_set_node_key(t, key, tslot);
d6025579 2172 btrfs_mark_buffer_dirty(path->nodes[i]);
be0e5c09
CM
2173 if (tslot != 0)
2174 break;
2175 }
2176}
2177
31840ae1
ZY
2178/*
2179 * update item key.
2180 *
2181 * This function isn't completely safe. It's the caller's responsibility
2182 * that the new key won't break the order
2183 */
b7a0365e
DD
2184void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2185 struct btrfs_path *path,
310712b2 2186 const struct btrfs_key *new_key)
31840ae1
ZY
2187{
2188 struct btrfs_disk_key disk_key;
2189 struct extent_buffer *eb;
2190 int slot;
2191
2192 eb = path->nodes[0];
2193 slot = path->slots[0];
2194 if (slot > 0) {
2195 btrfs_item_key(eb, &disk_key, slot - 1);
7c15d410
QW
2196 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2197 btrfs_crit(fs_info,
2198 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2199 slot, btrfs_disk_key_objectid(&disk_key),
2200 btrfs_disk_key_type(&disk_key),
2201 btrfs_disk_key_offset(&disk_key),
2202 new_key->objectid, new_key->type,
2203 new_key->offset);
2204 btrfs_print_leaf(eb);
2205 BUG();
2206 }
31840ae1
ZY
2207 }
2208 if (slot < btrfs_header_nritems(eb) - 1) {
2209 btrfs_item_key(eb, &disk_key, slot + 1);
7c15d410
QW
2210 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2211 btrfs_crit(fs_info,
2212 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2213 slot, btrfs_disk_key_objectid(&disk_key),
2214 btrfs_disk_key_type(&disk_key),
2215 btrfs_disk_key_offset(&disk_key),
2216 new_key->objectid, new_key->type,
2217 new_key->offset);
2218 btrfs_print_leaf(eb);
2219 BUG();
2220 }
31840ae1
ZY
2221 }
2222
2223 btrfs_cpu_key_to_disk(&disk_key, new_key);
2224 btrfs_set_item_key(eb, &disk_key, slot);
2225 btrfs_mark_buffer_dirty(eb);
2226 if (slot == 0)
b167fa91 2227 fixup_low_keys(path, &disk_key, 1);
31840ae1
ZY
2228}
2229
d16c702f
QW
2230/*
2231 * Check key order of two sibling extent buffers.
2232 *
2233 * Return true if something is wrong.
2234 * Return false if everything is fine.
2235 *
2236 * Tree-checker only works inside one tree block, thus the following
2237 * corruption can not be detected by tree-checker:
2238 *
2239 * Leaf @left | Leaf @right
2240 * --------------------------------------------------------------
2241 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2242 *
2243 * Key f6 in leaf @left itself is valid, but not valid when the next
2244 * key in leaf @right is 7.
2245 * This can only be checked at tree block merge time.
2246 * And since tree checker has ensured all key order in each tree block
2247 * is correct, we only need to bother the last key of @left and the first
2248 * key of @right.
2249 */
2250static bool check_sibling_keys(struct extent_buffer *left,
2251 struct extent_buffer *right)
2252{
2253 struct btrfs_key left_last;
2254 struct btrfs_key right_first;
2255 int level = btrfs_header_level(left);
2256 int nr_left = btrfs_header_nritems(left);
2257 int nr_right = btrfs_header_nritems(right);
2258
2259 /* No key to check in one of the tree blocks */
2260 if (!nr_left || !nr_right)
2261 return false;
2262
2263 if (level) {
2264 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2265 btrfs_node_key_to_cpu(right, &right_first, 0);
2266 } else {
2267 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2268 btrfs_item_key_to_cpu(right, &right_first, 0);
2269 }
2270
2271 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2272 btrfs_crit(left->fs_info,
2273"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2274 left_last.objectid, left_last.type,
2275 left_last.offset, right_first.objectid,
2276 right_first.type, right_first.offset);
2277 return true;
2278 }
2279 return false;
2280}
2281
74123bd7
CM
2282/*
2283 * try to push data from one node into the next node left in the
79f95c82 2284 * tree.
aa5d6bed
CM
2285 *
2286 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2287 * error, and > 0 if there was no room in the left hand block.
74123bd7 2288 */
98ed5174 2289static int push_node_left(struct btrfs_trans_handle *trans,
2ff7e61e 2290 struct extent_buffer *dst,
971a1f66 2291 struct extent_buffer *src, int empty)
be0e5c09 2292{
d30a668f 2293 struct btrfs_fs_info *fs_info = trans->fs_info;
be0e5c09 2294 int push_items = 0;
bb803951
CM
2295 int src_nritems;
2296 int dst_nritems;
aa5d6bed 2297 int ret = 0;
be0e5c09 2298
5f39d397
CM
2299 src_nritems = btrfs_header_nritems(src);
2300 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2301 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
7bb86316
CM
2302 WARN_ON(btrfs_header_generation(src) != trans->transid);
2303 WARN_ON(btrfs_header_generation(dst) != trans->transid);
54aa1f4d 2304
bce4eae9 2305 if (!empty && src_nritems <= 8)
971a1f66
CM
2306 return 1;
2307
d397712b 2308 if (push_items <= 0)
be0e5c09
CM
2309 return 1;
2310
bce4eae9 2311 if (empty) {
971a1f66 2312 push_items = min(src_nritems, push_items);
bce4eae9
CM
2313 if (push_items < src_nritems) {
2314 /* leave at least 8 pointers in the node if
2315 * we aren't going to empty it
2316 */
2317 if (src_nritems - push_items < 8) {
2318 if (push_items <= 8)
2319 return 1;
2320 push_items -= 8;
2321 }
2322 }
2323 } else
2324 push_items = min(src_nritems - 8, push_items);
79f95c82 2325
d16c702f
QW
2326 /* dst is the left eb, src is the middle eb */
2327 if (check_sibling_keys(dst, src)) {
2328 ret = -EUCLEAN;
2329 btrfs_abort_transaction(trans, ret);
2330 return ret;
2331 }
f3a84ccd 2332 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
5de865ee 2333 if (ret) {
66642832 2334 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2335 return ret;
2336 }
5f39d397
CM
2337 copy_extent_buffer(dst, src,
2338 btrfs_node_key_ptr_offset(dst_nritems),
2339 btrfs_node_key_ptr_offset(0),
d397712b 2340 push_items * sizeof(struct btrfs_key_ptr));
5f39d397 2341
bb803951 2342 if (push_items < src_nritems) {
57911b8b 2343 /*
f3a84ccd
FM
2344 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2345 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
57911b8b 2346 */
5f39d397
CM
2347 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2348 btrfs_node_key_ptr_offset(push_items),
2349 (src_nritems - push_items) *
2350 sizeof(struct btrfs_key_ptr));
2351 }
2352 btrfs_set_header_nritems(src, src_nritems - push_items);
2353 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2354 btrfs_mark_buffer_dirty(src);
2355 btrfs_mark_buffer_dirty(dst);
31840ae1 2356
79f95c82
CM
2357 return ret;
2358}
2359
2360/*
2361 * try to push data from one node into the next node right in the
2362 * tree.
2363 *
2364 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2365 * error, and > 0 if there was no room in the right hand block.
2366 *
2367 * this will only push up to 1/2 the contents of the left node over
2368 */
5f39d397 2369static int balance_node_right(struct btrfs_trans_handle *trans,
5f39d397
CM
2370 struct extent_buffer *dst,
2371 struct extent_buffer *src)
79f95c82 2372{
55d32ed8 2373 struct btrfs_fs_info *fs_info = trans->fs_info;
79f95c82
CM
2374 int push_items = 0;
2375 int max_push;
2376 int src_nritems;
2377 int dst_nritems;
2378 int ret = 0;
79f95c82 2379
7bb86316
CM
2380 WARN_ON(btrfs_header_generation(src) != trans->transid);
2381 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2382
5f39d397
CM
2383 src_nritems = btrfs_header_nritems(src);
2384 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2385 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
d397712b 2386 if (push_items <= 0)
79f95c82 2387 return 1;
bce4eae9 2388
d397712b 2389 if (src_nritems < 4)
bce4eae9 2390 return 1;
79f95c82
CM
2391
2392 max_push = src_nritems / 2 + 1;
2393 /* don't try to empty the node */
d397712b 2394 if (max_push >= src_nritems)
79f95c82 2395 return 1;
252c38f0 2396
79f95c82
CM
2397 if (max_push < push_items)
2398 push_items = max_push;
2399
d16c702f
QW
2400 /* dst is the right eb, src is the middle eb */
2401 if (check_sibling_keys(src, dst)) {
2402 ret = -EUCLEAN;
2403 btrfs_abort_transaction(trans, ret);
2404 return ret;
2405 }
f3a84ccd 2406 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
bf1d3425 2407 BUG_ON(ret < 0);
5f39d397
CM
2408 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2409 btrfs_node_key_ptr_offset(0),
2410 (dst_nritems) *
2411 sizeof(struct btrfs_key_ptr));
d6025579 2412
f3a84ccd
FM
2413 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2414 push_items);
5de865ee 2415 if (ret) {
66642832 2416 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2417 return ret;
2418 }
5f39d397
CM
2419 copy_extent_buffer(dst, src,
2420 btrfs_node_key_ptr_offset(0),
2421 btrfs_node_key_ptr_offset(src_nritems - push_items),
d397712b 2422 push_items * sizeof(struct btrfs_key_ptr));
79f95c82 2423
5f39d397
CM
2424 btrfs_set_header_nritems(src, src_nritems - push_items);
2425 btrfs_set_header_nritems(dst, dst_nritems + push_items);
79f95c82 2426
5f39d397
CM
2427 btrfs_mark_buffer_dirty(src);
2428 btrfs_mark_buffer_dirty(dst);
31840ae1 2429
aa5d6bed 2430 return ret;
be0e5c09
CM
2431}
2432
97571fd0
CM
2433/*
2434 * helper function to insert a new root level in the tree.
2435 * A new node is allocated, and a single item is inserted to
2436 * point to the existing root
aa5d6bed
CM
2437 *
2438 * returns zero on success or < 0 on failure.
97571fd0 2439 */
d397712b 2440static noinline int insert_new_root(struct btrfs_trans_handle *trans,
5f39d397 2441 struct btrfs_root *root,
fdd99c72 2442 struct btrfs_path *path, int level)
5c680ed6 2443{
0b246afa 2444 struct btrfs_fs_info *fs_info = root->fs_info;
7bb86316 2445 u64 lower_gen;
5f39d397
CM
2446 struct extent_buffer *lower;
2447 struct extent_buffer *c;
925baedd 2448 struct extent_buffer *old;
5f39d397 2449 struct btrfs_disk_key lower_key;
d9d19a01 2450 int ret;
5c680ed6
CM
2451
2452 BUG_ON(path->nodes[level]);
2453 BUG_ON(path->nodes[level-1] != root->node);
2454
7bb86316
CM
2455 lower = path->nodes[level-1];
2456 if (level == 1)
2457 btrfs_item_key(lower, &lower_key, 0);
2458 else
2459 btrfs_node_key(lower, &lower_key, 0);
2460
a6279470 2461 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
9631e4cc 2462 root->node->start, 0,
cf6f34aa 2463 BTRFS_NESTING_NEW_ROOT);
5f39d397
CM
2464 if (IS_ERR(c))
2465 return PTR_ERR(c);
925baedd 2466
0b246afa 2467 root_add_used(root, fs_info->nodesize);
f0486c68 2468
5f39d397 2469 btrfs_set_header_nritems(c, 1);
5f39d397 2470 btrfs_set_node_key(c, &lower_key, 0);
db94535d 2471 btrfs_set_node_blockptr(c, 0, lower->start);
7bb86316 2472 lower_gen = btrfs_header_generation(lower);
31840ae1 2473 WARN_ON(lower_gen != trans->transid);
7bb86316
CM
2474
2475 btrfs_set_node_ptr_generation(c, 0, lower_gen);
d5719762 2476
5f39d397 2477 btrfs_mark_buffer_dirty(c);
d5719762 2478
925baedd 2479 old = root->node;
406808ab 2480 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
d9d19a01 2481 BUG_ON(ret < 0);
240f62c8 2482 rcu_assign_pointer(root->node, c);
925baedd
CM
2483
2484 /* the super has an extra ref to root->node */
2485 free_extent_buffer(old);
2486
0b86a832 2487 add_root_to_dirty_list(root);
67439dad 2488 atomic_inc(&c->refs);
5f39d397 2489 path->nodes[level] = c;
ac5887c8 2490 path->locks[level] = BTRFS_WRITE_LOCK;
5c680ed6
CM
2491 path->slots[level] = 0;
2492 return 0;
2493}
2494
74123bd7
CM
2495/*
2496 * worker function to insert a single pointer in a node.
2497 * the node should have enough room for the pointer already
97571fd0 2498 *
74123bd7
CM
2499 * slot and level indicate where you want the key to go, and
2500 * blocknr is the block the key points to.
2501 */
143bede5 2502static void insert_ptr(struct btrfs_trans_handle *trans,
6ad3cf6d 2503 struct btrfs_path *path,
143bede5 2504 struct btrfs_disk_key *key, u64 bytenr,
c3e06965 2505 int slot, int level)
74123bd7 2506{
5f39d397 2507 struct extent_buffer *lower;
74123bd7 2508 int nritems;
f3ea38da 2509 int ret;
5c680ed6
CM
2510
2511 BUG_ON(!path->nodes[level]);
f0486c68 2512 btrfs_assert_tree_locked(path->nodes[level]);
5f39d397
CM
2513 lower = path->nodes[level];
2514 nritems = btrfs_header_nritems(lower);
c293498b 2515 BUG_ON(slot > nritems);
6ad3cf6d 2516 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
74123bd7 2517 if (slot != nritems) {
bf1d3425 2518 if (level) {
f3a84ccd
FM
2519 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2520 slot, nritems - slot);
bf1d3425
DS
2521 BUG_ON(ret < 0);
2522 }
5f39d397
CM
2523 memmove_extent_buffer(lower,
2524 btrfs_node_key_ptr_offset(slot + 1),
2525 btrfs_node_key_ptr_offset(slot),
d6025579 2526 (nritems - slot) * sizeof(struct btrfs_key_ptr));
74123bd7 2527 }
c3e06965 2528 if (level) {
f3a84ccd
FM
2529 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2530 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
f3ea38da
JS
2531 BUG_ON(ret < 0);
2532 }
5f39d397 2533 btrfs_set_node_key(lower, key, slot);
db94535d 2534 btrfs_set_node_blockptr(lower, slot, bytenr);
74493f7a
CM
2535 WARN_ON(trans->transid == 0);
2536 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
5f39d397
CM
2537 btrfs_set_header_nritems(lower, nritems + 1);
2538 btrfs_mark_buffer_dirty(lower);
74123bd7
CM
2539}
2540
97571fd0
CM
2541/*
2542 * split the node at the specified level in path in two.
2543 * The path is corrected to point to the appropriate node after the split
2544 *
2545 * Before splitting this tries to make some room in the node by pushing
2546 * left and right, if either one works, it returns right away.
aa5d6bed
CM
2547 *
2548 * returns 0 on success and < 0 on failure
97571fd0 2549 */
e02119d5
CM
2550static noinline int split_node(struct btrfs_trans_handle *trans,
2551 struct btrfs_root *root,
2552 struct btrfs_path *path, int level)
be0e5c09 2553{
0b246afa 2554 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
2555 struct extent_buffer *c;
2556 struct extent_buffer *split;
2557 struct btrfs_disk_key disk_key;
be0e5c09 2558 int mid;
5c680ed6 2559 int ret;
7518a238 2560 u32 c_nritems;
eb60ceac 2561
5f39d397 2562 c = path->nodes[level];
7bb86316 2563 WARN_ON(btrfs_header_generation(c) != trans->transid);
5f39d397 2564 if (c == root->node) {
d9abbf1c 2565 /*
90f8d62e
JS
2566 * trying to split the root, lets make a new one
2567 *
fdd99c72 2568 * tree mod log: We don't log_removal old root in
90f8d62e
JS
2569 * insert_new_root, because that root buffer will be kept as a
2570 * normal node. We are going to log removal of half of the
f3a84ccd
FM
2571 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2572 * holding a tree lock on the buffer, which is why we cannot
2573 * race with other tree_mod_log users.
d9abbf1c 2574 */
fdd99c72 2575 ret = insert_new_root(trans, root, path, level + 1);
5c680ed6
CM
2576 if (ret)
2577 return ret;
b3612421 2578 } else {
e66f709b 2579 ret = push_nodes_for_insert(trans, root, path, level);
5f39d397
CM
2580 c = path->nodes[level];
2581 if (!ret && btrfs_header_nritems(c) <
0b246afa 2582 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
e66f709b 2583 return 0;
54aa1f4d
CM
2584 if (ret < 0)
2585 return ret;
be0e5c09 2586 }
e66f709b 2587
5f39d397 2588 c_nritems = btrfs_header_nritems(c);
5d4f98a2
YZ
2589 mid = (c_nritems + 1) / 2;
2590 btrfs_node_key(c, &disk_key, mid);
7bb86316 2591
a6279470 2592 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
4dff97e6 2593 c->start, 0, BTRFS_NESTING_SPLIT);
5f39d397
CM
2594 if (IS_ERR(split))
2595 return PTR_ERR(split);
2596
0b246afa 2597 root_add_used(root, fs_info->nodesize);
bc877d28 2598 ASSERT(btrfs_header_level(c) == level);
54aa1f4d 2599
f3a84ccd 2600 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
5de865ee 2601 if (ret) {
66642832 2602 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2603 return ret;
2604 }
5f39d397
CM
2605 copy_extent_buffer(split, c,
2606 btrfs_node_key_ptr_offset(0),
2607 btrfs_node_key_ptr_offset(mid),
2608 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2609 btrfs_set_header_nritems(split, c_nritems - mid);
2610 btrfs_set_header_nritems(c, mid);
aa5d6bed 2611
5f39d397
CM
2612 btrfs_mark_buffer_dirty(c);
2613 btrfs_mark_buffer_dirty(split);
2614
6ad3cf6d 2615 insert_ptr(trans, path, &disk_key, split->start,
c3e06965 2616 path->slots[level + 1] + 1, level + 1);
aa5d6bed 2617
5de08d7d 2618 if (path->slots[level] >= mid) {
5c680ed6 2619 path->slots[level] -= mid;
925baedd 2620 btrfs_tree_unlock(c);
5f39d397
CM
2621 free_extent_buffer(c);
2622 path->nodes[level] = split;
5c680ed6
CM
2623 path->slots[level + 1] += 1;
2624 } else {
925baedd 2625 btrfs_tree_unlock(split);
5f39d397 2626 free_extent_buffer(split);
be0e5c09 2627 }
d5286a92 2628 return 0;
be0e5c09
CM
2629}
2630
74123bd7
CM
2631/*
2632 * how many bytes are required to store the items in a leaf. start
2633 * and nr indicate which items in the leaf to check. This totals up the
2634 * space used both by the item structs and the item data
2635 */
5f39d397 2636static int leaf_space_used(struct extent_buffer *l, int start, int nr)
be0e5c09 2637{
41be1f3b
JB
2638 struct btrfs_item *start_item;
2639 struct btrfs_item *end_item;
be0e5c09 2640 int data_len;
5f39d397 2641 int nritems = btrfs_header_nritems(l);
d4dbff95 2642 int end = min(nritems, start + nr) - 1;
be0e5c09
CM
2643
2644 if (!nr)
2645 return 0;
dd3cc16b
RK
2646 start_item = btrfs_item_nr(start);
2647 end_item = btrfs_item_nr(end);
a31356b9
DS
2648 data_len = btrfs_item_offset(l, start_item) +
2649 btrfs_item_size(l, start_item);
2650 data_len = data_len - btrfs_item_offset(l, end_item);
0783fcfc 2651 data_len += sizeof(struct btrfs_item) * nr;
d4dbff95 2652 WARN_ON(data_len < 0);
be0e5c09
CM
2653 return data_len;
2654}
2655
d4dbff95
CM
2656/*
2657 * The space between the end of the leaf items and
2658 * the start of the leaf data. IOW, how much room
2659 * the leaf has left for both items and data
2660 */
e902baac 2661noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
d4dbff95 2662{
e902baac 2663 struct btrfs_fs_info *fs_info = leaf->fs_info;
5f39d397
CM
2664 int nritems = btrfs_header_nritems(leaf);
2665 int ret;
0b246afa
JM
2666
2667 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
5f39d397 2668 if (ret < 0) {
0b246afa
JM
2669 btrfs_crit(fs_info,
2670 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2671 ret,
2672 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2673 leaf_space_used(leaf, 0, nritems), nritems);
5f39d397
CM
2674 }
2675 return ret;
d4dbff95
CM
2676}
2677
99d8f83c
CM
2678/*
2679 * min slot controls the lowest index we're willing to push to the
2680 * right. We'll push up to and including min_slot, but no lower
2681 */
f72f0010 2682static noinline int __push_leaf_right(struct btrfs_path *path,
44871b1b
CM
2683 int data_size, int empty,
2684 struct extent_buffer *right,
99d8f83c
CM
2685 int free_space, u32 left_nritems,
2686 u32 min_slot)
00ec4c51 2687{
f72f0010 2688 struct btrfs_fs_info *fs_info = right->fs_info;
5f39d397 2689 struct extent_buffer *left = path->nodes[0];
44871b1b 2690 struct extent_buffer *upper = path->nodes[1];
cfed81a0 2691 struct btrfs_map_token token;
5f39d397 2692 struct btrfs_disk_key disk_key;
00ec4c51 2693 int slot;
34a38218 2694 u32 i;
00ec4c51
CM
2695 int push_space = 0;
2696 int push_items = 0;
0783fcfc 2697 struct btrfs_item *item;
34a38218 2698 u32 nr;
7518a238 2699 u32 right_nritems;
5f39d397 2700 u32 data_end;
db94535d 2701 u32 this_item_size;
00ec4c51 2702
34a38218
CM
2703 if (empty)
2704 nr = 0;
2705 else
99d8f83c 2706 nr = max_t(u32, 1, min_slot);
34a38218 2707
31840ae1 2708 if (path->slots[0] >= left_nritems)
87b29b20 2709 push_space += data_size;
31840ae1 2710
44871b1b 2711 slot = path->slots[1];
34a38218
CM
2712 i = left_nritems - 1;
2713 while (i >= nr) {
dd3cc16b 2714 item = btrfs_item_nr(i);
db94535d 2715
31840ae1
ZY
2716 if (!empty && push_items > 0) {
2717 if (path->slots[0] > i)
2718 break;
2719 if (path->slots[0] == i) {
e902baac
DS
2720 int space = btrfs_leaf_free_space(left);
2721
31840ae1
ZY
2722 if (space + push_space * 2 > free_space)
2723 break;
2724 }
2725 }
2726
00ec4c51 2727 if (path->slots[0] == i)
87b29b20 2728 push_space += data_size;
db94535d 2729
db94535d
CM
2730 this_item_size = btrfs_item_size(left, item);
2731 if (this_item_size + sizeof(*item) + push_space > free_space)
00ec4c51 2732 break;
31840ae1 2733
00ec4c51 2734 push_items++;
db94535d 2735 push_space += this_item_size + sizeof(*item);
34a38218
CM
2736 if (i == 0)
2737 break;
2738 i--;
db94535d 2739 }
5f39d397 2740
925baedd
CM
2741 if (push_items == 0)
2742 goto out_unlock;
5f39d397 2743
6c1500f2 2744 WARN_ON(!empty && push_items == left_nritems);
5f39d397 2745
00ec4c51 2746 /* push left to right */
5f39d397 2747 right_nritems = btrfs_header_nritems(right);
34a38218 2748
5f39d397 2749 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
8f881e8c 2750 push_space -= leaf_data_end(left);
5f39d397 2751
00ec4c51 2752 /* make room in the right data area */
8f881e8c 2753 data_end = leaf_data_end(right);
5f39d397 2754 memmove_extent_buffer(right,
3d9ec8c4
NB
2755 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
2756 BTRFS_LEAF_DATA_OFFSET + data_end,
0b246afa 2757 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
5f39d397 2758
00ec4c51 2759 /* copy from the left data area */
3d9ec8c4 2760 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
0b246afa 2761 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
8f881e8c 2762 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
d6025579 2763 push_space);
5f39d397
CM
2764
2765 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2766 btrfs_item_nr_offset(0),
2767 right_nritems * sizeof(struct btrfs_item));
2768
00ec4c51 2769 /* copy the items from left to right */
5f39d397
CM
2770 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2771 btrfs_item_nr_offset(left_nritems - push_items),
2772 push_items * sizeof(struct btrfs_item));
00ec4c51
CM
2773
2774 /* update the item pointers */
c82f823c 2775 btrfs_init_map_token(&token, right);
7518a238 2776 right_nritems += push_items;
5f39d397 2777 btrfs_set_header_nritems(right, right_nritems);
0b246afa 2778 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
7518a238 2779 for (i = 0; i < right_nritems; i++) {
dd3cc16b 2780 item = btrfs_item_nr(i);
cc4c13d5
DS
2781 push_space -= btrfs_token_item_size(&token, item);
2782 btrfs_set_token_item_offset(&token, item, push_space);
db94535d
CM
2783 }
2784
7518a238 2785 left_nritems -= push_items;
5f39d397 2786 btrfs_set_header_nritems(left, left_nritems);
00ec4c51 2787
34a38218
CM
2788 if (left_nritems)
2789 btrfs_mark_buffer_dirty(left);
f0486c68 2790 else
6a884d7d 2791 btrfs_clean_tree_block(left);
f0486c68 2792
5f39d397 2793 btrfs_mark_buffer_dirty(right);
a429e513 2794
5f39d397
CM
2795 btrfs_item_key(right, &disk_key, 0);
2796 btrfs_set_node_key(upper, &disk_key, slot + 1);
d6025579 2797 btrfs_mark_buffer_dirty(upper);
02217ed2 2798
00ec4c51 2799 /* then fixup the leaf pointer in the path */
7518a238
CM
2800 if (path->slots[0] >= left_nritems) {
2801 path->slots[0] -= left_nritems;
925baedd 2802 if (btrfs_header_nritems(path->nodes[0]) == 0)
6a884d7d 2803 btrfs_clean_tree_block(path->nodes[0]);
925baedd 2804 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
2805 free_extent_buffer(path->nodes[0]);
2806 path->nodes[0] = right;
00ec4c51
CM
2807 path->slots[1] += 1;
2808 } else {
925baedd 2809 btrfs_tree_unlock(right);
5f39d397 2810 free_extent_buffer(right);
00ec4c51
CM
2811 }
2812 return 0;
925baedd
CM
2813
2814out_unlock:
2815 btrfs_tree_unlock(right);
2816 free_extent_buffer(right);
2817 return 1;
00ec4c51 2818}
925baedd 2819
44871b1b
CM
2820/*
2821 * push some data in the path leaf to the right, trying to free up at
2822 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2823 *
2824 * returns 1 if the push failed because the other node didn't have enough
2825 * room, 0 if everything worked out and < 0 if there were major errors.
99d8f83c
CM
2826 *
2827 * this will push starting from min_slot to the end of the leaf. It won't
2828 * push any slot lower than min_slot
44871b1b
CM
2829 */
2830static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
2831 *root, struct btrfs_path *path,
2832 int min_data_size, int data_size,
2833 int empty, u32 min_slot)
44871b1b
CM
2834{
2835 struct extent_buffer *left = path->nodes[0];
2836 struct extent_buffer *right;
2837 struct extent_buffer *upper;
2838 int slot;
2839 int free_space;
2840 u32 left_nritems;
2841 int ret;
2842
2843 if (!path->nodes[1])
2844 return 1;
2845
2846 slot = path->slots[1];
2847 upper = path->nodes[1];
2848 if (slot >= btrfs_header_nritems(upper) - 1)
2849 return 1;
2850
2851 btrfs_assert_tree_locked(path->nodes[1]);
2852
4b231ae4 2853 right = btrfs_read_node_slot(upper, slot + 1);
fb770ae4
LB
2854 /*
2855 * slot + 1 is not valid or we fail to read the right node,
2856 * no big deal, just return.
2857 */
2858 if (IS_ERR(right))
91ca338d
TI
2859 return 1;
2860
bf77467a 2861 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
44871b1b 2862
e902baac 2863 free_space = btrfs_leaf_free_space(right);
44871b1b
CM
2864 if (free_space < data_size)
2865 goto out_unlock;
2866
2867 /* cow and double check */
2868 ret = btrfs_cow_block(trans, root, right, upper,
bf59a5a2 2869 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
44871b1b
CM
2870 if (ret)
2871 goto out_unlock;
2872
e902baac 2873 free_space = btrfs_leaf_free_space(right);
44871b1b
CM
2874 if (free_space < data_size)
2875 goto out_unlock;
2876
2877 left_nritems = btrfs_header_nritems(left);
2878 if (left_nritems == 0)
2879 goto out_unlock;
2880
d16c702f
QW
2881 if (check_sibling_keys(left, right)) {
2882 ret = -EUCLEAN;
2883 btrfs_tree_unlock(right);
2884 free_extent_buffer(right);
2885 return ret;
2886 }
2ef1fed2
FDBM
2887 if (path->slots[0] == left_nritems && !empty) {
2888 /* Key greater than all keys in the leaf, right neighbor has
2889 * enough room for it and we're not emptying our leaf to delete
2890 * it, therefore use right neighbor to insert the new item and
52042d8e 2891 * no need to touch/dirty our left leaf. */
2ef1fed2
FDBM
2892 btrfs_tree_unlock(left);
2893 free_extent_buffer(left);
2894 path->nodes[0] = right;
2895 path->slots[0] = 0;
2896 path->slots[1]++;
2897 return 0;
2898 }
2899
f72f0010 2900 return __push_leaf_right(path, min_data_size, empty,
99d8f83c 2901 right, free_space, left_nritems, min_slot);
44871b1b
CM
2902out_unlock:
2903 btrfs_tree_unlock(right);
2904 free_extent_buffer(right);
2905 return 1;
2906}
2907
74123bd7
CM
2908/*
2909 * push some data in the path leaf to the left, trying to free up at
2910 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
2911 *
2912 * max_slot can put a limit on how far into the leaf we'll push items. The
2913 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
2914 * items
74123bd7 2915 */
8087c193 2916static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
44871b1b 2917 int empty, struct extent_buffer *left,
99d8f83c
CM
2918 int free_space, u32 right_nritems,
2919 u32 max_slot)
be0e5c09 2920{
8087c193 2921 struct btrfs_fs_info *fs_info = left->fs_info;
5f39d397
CM
2922 struct btrfs_disk_key disk_key;
2923 struct extent_buffer *right = path->nodes[0];
be0e5c09 2924 int i;
be0e5c09
CM
2925 int push_space = 0;
2926 int push_items = 0;
0783fcfc 2927 struct btrfs_item *item;
7518a238 2928 u32 old_left_nritems;
34a38218 2929 u32 nr;
aa5d6bed 2930 int ret = 0;
db94535d
CM
2931 u32 this_item_size;
2932 u32 old_left_item_size;
cfed81a0
CM
2933 struct btrfs_map_token token;
2934
34a38218 2935 if (empty)
99d8f83c 2936 nr = min(right_nritems, max_slot);
34a38218 2937 else
99d8f83c 2938 nr = min(right_nritems - 1, max_slot);
34a38218
CM
2939
2940 for (i = 0; i < nr; i++) {
dd3cc16b 2941 item = btrfs_item_nr(i);
db94535d 2942
31840ae1
ZY
2943 if (!empty && push_items > 0) {
2944 if (path->slots[0] < i)
2945 break;
2946 if (path->slots[0] == i) {
e902baac
DS
2947 int space = btrfs_leaf_free_space(right);
2948
31840ae1
ZY
2949 if (space + push_space * 2 > free_space)
2950 break;
2951 }
2952 }
2953
be0e5c09 2954 if (path->slots[0] == i)
87b29b20 2955 push_space += data_size;
db94535d
CM
2956
2957 this_item_size = btrfs_item_size(right, item);
2958 if (this_item_size + sizeof(*item) + push_space > free_space)
be0e5c09 2959 break;
db94535d 2960
be0e5c09 2961 push_items++;
db94535d
CM
2962 push_space += this_item_size + sizeof(*item);
2963 }
2964
be0e5c09 2965 if (push_items == 0) {
925baedd
CM
2966 ret = 1;
2967 goto out;
be0e5c09 2968 }
fae7f21c 2969 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
5f39d397 2970
be0e5c09 2971 /* push data from right to left */
5f39d397
CM
2972 copy_extent_buffer(left, right,
2973 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2974 btrfs_item_nr_offset(0),
2975 push_items * sizeof(struct btrfs_item));
2976
0b246afa 2977 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
d397712b 2978 btrfs_item_offset_nr(right, push_items - 1);
5f39d397 2979
3d9ec8c4 2980 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
8f881e8c 2981 leaf_data_end(left) - push_space,
3d9ec8c4 2982 BTRFS_LEAF_DATA_OFFSET +
5f39d397 2983 btrfs_item_offset_nr(right, push_items - 1),
d6025579 2984 push_space);
5f39d397 2985 old_left_nritems = btrfs_header_nritems(left);
87b29b20 2986 BUG_ON(old_left_nritems <= 0);
eb60ceac 2987
c82f823c 2988 btrfs_init_map_token(&token, left);
db94535d 2989 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
0783fcfc 2990 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
5f39d397 2991 u32 ioff;
db94535d 2992
dd3cc16b 2993 item = btrfs_item_nr(i);
db94535d 2994
cc4c13d5
DS
2995 ioff = btrfs_token_item_offset(&token, item);
2996 btrfs_set_token_item_offset(&token, item,
2997 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
be0e5c09 2998 }
5f39d397 2999 btrfs_set_header_nritems(left, old_left_nritems + push_items);
be0e5c09
CM
3000
3001 /* fixup right node */
31b1a2bd
JL
3002 if (push_items > right_nritems)
3003 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
d397712b 3004 right_nritems);
34a38218
CM
3005
3006 if (push_items < right_nritems) {
3007 push_space = btrfs_item_offset_nr(right, push_items - 1) -
8f881e8c 3008 leaf_data_end(right);
3d9ec8c4 3009 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
0b246afa 3010 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3d9ec8c4 3011 BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3012 leaf_data_end(right), push_space);
34a38218
CM
3013
3014 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
5f39d397
CM
3015 btrfs_item_nr_offset(push_items),
3016 (btrfs_header_nritems(right) - push_items) *
3017 sizeof(struct btrfs_item));
34a38218 3018 }
c82f823c
DS
3019
3020 btrfs_init_map_token(&token, right);
eef1c494
Y
3021 right_nritems -= push_items;
3022 btrfs_set_header_nritems(right, right_nritems);
0b246afa 3023 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
5f39d397 3024 for (i = 0; i < right_nritems; i++) {
dd3cc16b 3025 item = btrfs_item_nr(i);
db94535d 3026
cc4c13d5
DS
3027 push_space = push_space - btrfs_token_item_size(&token, item);
3028 btrfs_set_token_item_offset(&token, item, push_space);
db94535d 3029 }
eb60ceac 3030
5f39d397 3031 btrfs_mark_buffer_dirty(left);
34a38218
CM
3032 if (right_nritems)
3033 btrfs_mark_buffer_dirty(right);
f0486c68 3034 else
6a884d7d 3035 btrfs_clean_tree_block(right);
098f59c2 3036
5f39d397 3037 btrfs_item_key(right, &disk_key, 0);
b167fa91 3038 fixup_low_keys(path, &disk_key, 1);
be0e5c09
CM
3039
3040 /* then fixup the leaf pointer in the path */
3041 if (path->slots[0] < push_items) {
3042 path->slots[0] += old_left_nritems;
925baedd 3043 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
3044 free_extent_buffer(path->nodes[0]);
3045 path->nodes[0] = left;
be0e5c09
CM
3046 path->slots[1] -= 1;
3047 } else {
925baedd 3048 btrfs_tree_unlock(left);
5f39d397 3049 free_extent_buffer(left);
be0e5c09
CM
3050 path->slots[0] -= push_items;
3051 }
eb60ceac 3052 BUG_ON(path->slots[0] < 0);
aa5d6bed 3053 return ret;
925baedd
CM
3054out:
3055 btrfs_tree_unlock(left);
3056 free_extent_buffer(left);
3057 return ret;
be0e5c09
CM
3058}
3059
44871b1b
CM
3060/*
3061 * push some data in the path leaf to the left, trying to free up at
3062 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
3063 *
3064 * max_slot can put a limit on how far into the leaf we'll push items. The
3065 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3066 * items
44871b1b
CM
3067 */
3068static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
3069 *root, struct btrfs_path *path, int min_data_size,
3070 int data_size, int empty, u32 max_slot)
44871b1b
CM
3071{
3072 struct extent_buffer *right = path->nodes[0];
3073 struct extent_buffer *left;
3074 int slot;
3075 int free_space;
3076 u32 right_nritems;
3077 int ret = 0;
3078
3079 slot = path->slots[1];
3080 if (slot == 0)
3081 return 1;
3082 if (!path->nodes[1])
3083 return 1;
3084
3085 right_nritems = btrfs_header_nritems(right);
3086 if (right_nritems == 0)
3087 return 1;
3088
3089 btrfs_assert_tree_locked(path->nodes[1]);
3090
4b231ae4 3091 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
fb770ae4
LB
3092 /*
3093 * slot - 1 is not valid or we fail to read the left node,
3094 * no big deal, just return.
3095 */
3096 if (IS_ERR(left))
91ca338d
TI
3097 return 1;
3098
bf77467a 3099 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
44871b1b 3100
e902baac 3101 free_space = btrfs_leaf_free_space(left);
44871b1b
CM
3102 if (free_space < data_size) {
3103 ret = 1;
3104 goto out;
3105 }
3106
3107 /* cow and double check */
3108 ret = btrfs_cow_block(trans, root, left,
9631e4cc 3109 path->nodes[1], slot - 1, &left,
bf59a5a2 3110 BTRFS_NESTING_LEFT_COW);
44871b1b
CM
3111 if (ret) {
3112 /* we hit -ENOSPC, but it isn't fatal here */
79787eaa
JM
3113 if (ret == -ENOSPC)
3114 ret = 1;
44871b1b
CM
3115 goto out;
3116 }
3117
e902baac 3118 free_space = btrfs_leaf_free_space(left);
44871b1b
CM
3119 if (free_space < data_size) {
3120 ret = 1;
3121 goto out;
3122 }
3123
d16c702f
QW
3124 if (check_sibling_keys(left, right)) {
3125 ret = -EUCLEAN;
3126 goto out;
3127 }
8087c193 3128 return __push_leaf_left(path, min_data_size,
99d8f83c
CM
3129 empty, left, free_space, right_nritems,
3130 max_slot);
44871b1b
CM
3131out:
3132 btrfs_tree_unlock(left);
3133 free_extent_buffer(left);
3134 return ret;
3135}
3136
3137/*
3138 * split the path's leaf in two, making sure there is at least data_size
3139 * available for the resulting leaf level of the path.
44871b1b 3140 */
143bede5 3141static noinline void copy_for_split(struct btrfs_trans_handle *trans,
143bede5
JM
3142 struct btrfs_path *path,
3143 struct extent_buffer *l,
3144 struct extent_buffer *right,
3145 int slot, int mid, int nritems)
44871b1b 3146{
94f94ad9 3147 struct btrfs_fs_info *fs_info = trans->fs_info;
44871b1b
CM
3148 int data_copy_size;
3149 int rt_data_off;
3150 int i;
44871b1b 3151 struct btrfs_disk_key disk_key;
cfed81a0
CM
3152 struct btrfs_map_token token;
3153
44871b1b
CM
3154 nritems = nritems - mid;
3155 btrfs_set_header_nritems(right, nritems);
8f881e8c 3156 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
44871b1b
CM
3157
3158 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3159 btrfs_item_nr_offset(mid),
3160 nritems * sizeof(struct btrfs_item));
3161
3162 copy_extent_buffer(right, l,
3d9ec8c4
NB
3163 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3164 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3165 leaf_data_end(l), data_copy_size);
44871b1b 3166
0b246afa 3167 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
44871b1b 3168
c82f823c 3169 btrfs_init_map_token(&token, right);
44871b1b 3170 for (i = 0; i < nritems; i++) {
dd3cc16b 3171 struct btrfs_item *item = btrfs_item_nr(i);
44871b1b
CM
3172 u32 ioff;
3173
cc4c13d5
DS
3174 ioff = btrfs_token_item_offset(&token, item);
3175 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
44871b1b
CM
3176 }
3177
44871b1b 3178 btrfs_set_header_nritems(l, mid);
44871b1b 3179 btrfs_item_key(right, &disk_key, 0);
6ad3cf6d 3180 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
44871b1b
CM
3181
3182 btrfs_mark_buffer_dirty(right);
3183 btrfs_mark_buffer_dirty(l);
3184 BUG_ON(path->slots[0] != slot);
3185
44871b1b
CM
3186 if (mid <= slot) {
3187 btrfs_tree_unlock(path->nodes[0]);
3188 free_extent_buffer(path->nodes[0]);
3189 path->nodes[0] = right;
3190 path->slots[0] -= mid;
3191 path->slots[1] += 1;
3192 } else {
3193 btrfs_tree_unlock(right);
3194 free_extent_buffer(right);
3195 }
3196
3197 BUG_ON(path->slots[0] < 0);
44871b1b
CM
3198}
3199
99d8f83c
CM
3200/*
3201 * double splits happen when we need to insert a big item in the middle
3202 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3203 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3204 * A B C
3205 *
3206 * We avoid this by trying to push the items on either side of our target
3207 * into the adjacent leaves. If all goes well we can avoid the double split
3208 * completely.
3209 */
3210static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3211 struct btrfs_root *root,
3212 struct btrfs_path *path,
3213 int data_size)
3214{
3215 int ret;
3216 int progress = 0;
3217 int slot;
3218 u32 nritems;
5a4267ca 3219 int space_needed = data_size;
99d8f83c
CM
3220
3221 slot = path->slots[0];
5a4267ca 3222 if (slot < btrfs_header_nritems(path->nodes[0]))
e902baac 3223 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
99d8f83c
CM
3224
3225 /*
3226 * try to push all the items after our slot into the
3227 * right leaf
3228 */
5a4267ca 3229 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3230 if (ret < 0)
3231 return ret;
3232
3233 if (ret == 0)
3234 progress++;
3235
3236 nritems = btrfs_header_nritems(path->nodes[0]);
3237 /*
3238 * our goal is to get our slot at the start or end of a leaf. If
3239 * we've done so we're done
3240 */
3241 if (path->slots[0] == 0 || path->slots[0] == nritems)
3242 return 0;
3243
e902baac 3244 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3245 return 0;
3246
3247 /* try to push all the items before our slot into the next leaf */
3248 slot = path->slots[0];
263d3995
FM
3249 space_needed = data_size;
3250 if (slot > 0)
e902baac 3251 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
5a4267ca 3252 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3253 if (ret < 0)
3254 return ret;
3255
3256 if (ret == 0)
3257 progress++;
3258
3259 if (progress)
3260 return 0;
3261 return 1;
3262}
3263
74123bd7
CM
3264/*
3265 * split the path's leaf in two, making sure there is at least data_size
3266 * available for the resulting leaf level of the path.
aa5d6bed
CM
3267 *
3268 * returns 0 if all went well and < 0 on failure.
74123bd7 3269 */
e02119d5
CM
3270static noinline int split_leaf(struct btrfs_trans_handle *trans,
3271 struct btrfs_root *root,
310712b2 3272 const struct btrfs_key *ins_key,
e02119d5
CM
3273 struct btrfs_path *path, int data_size,
3274 int extend)
be0e5c09 3275{
5d4f98a2 3276 struct btrfs_disk_key disk_key;
5f39d397 3277 struct extent_buffer *l;
7518a238 3278 u32 nritems;
eb60ceac
CM
3279 int mid;
3280 int slot;
5f39d397 3281 struct extent_buffer *right;
b7a0365e 3282 struct btrfs_fs_info *fs_info = root->fs_info;
d4dbff95 3283 int ret = 0;
aa5d6bed 3284 int wret;
5d4f98a2 3285 int split;
cc0c5538 3286 int num_doubles = 0;
99d8f83c 3287 int tried_avoid_double = 0;
aa5d6bed 3288
a5719521
YZ
3289 l = path->nodes[0];
3290 slot = path->slots[0];
3291 if (extend && data_size + btrfs_item_size_nr(l, slot) +
0b246afa 3292 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
a5719521
YZ
3293 return -EOVERFLOW;
3294
40689478 3295 /* first try to make some room by pushing left and right */
33157e05 3296 if (data_size && path->nodes[1]) {
5a4267ca
FDBM
3297 int space_needed = data_size;
3298
3299 if (slot < btrfs_header_nritems(l))
e902baac 3300 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3301
3302 wret = push_leaf_right(trans, root, path, space_needed,
3303 space_needed, 0, 0);
d397712b 3304 if (wret < 0)
eaee50e8 3305 return wret;
3685f791 3306 if (wret) {
263d3995
FM
3307 space_needed = data_size;
3308 if (slot > 0)
e902baac 3309 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3310 wret = push_leaf_left(trans, root, path, space_needed,
3311 space_needed, 0, (u32)-1);
3685f791
CM
3312 if (wret < 0)
3313 return wret;
3314 }
3315 l = path->nodes[0];
aa5d6bed 3316
3685f791 3317 /* did the pushes work? */
e902baac 3318 if (btrfs_leaf_free_space(l) >= data_size)
3685f791 3319 return 0;
3326d1b0 3320 }
aa5d6bed 3321
5c680ed6 3322 if (!path->nodes[1]) {
fdd99c72 3323 ret = insert_new_root(trans, root, path, 1);
5c680ed6
CM
3324 if (ret)
3325 return ret;
3326 }
cc0c5538 3327again:
5d4f98a2 3328 split = 1;
cc0c5538 3329 l = path->nodes[0];
eb60ceac 3330 slot = path->slots[0];
5f39d397 3331 nritems = btrfs_header_nritems(l);
d397712b 3332 mid = (nritems + 1) / 2;
54aa1f4d 3333
5d4f98a2
YZ
3334 if (mid <= slot) {
3335 if (nritems == 1 ||
3336 leaf_space_used(l, mid, nritems - mid) + data_size >
0b246afa 3337 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3338 if (slot >= nritems) {
3339 split = 0;
3340 } else {
3341 mid = slot;
3342 if (mid != nritems &&
3343 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3344 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3345 if (data_size && !tried_avoid_double)
3346 goto push_for_double;
5d4f98a2
YZ
3347 split = 2;
3348 }
3349 }
3350 }
3351 } else {
3352 if (leaf_space_used(l, 0, mid) + data_size >
0b246afa 3353 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3354 if (!extend && data_size && slot == 0) {
3355 split = 0;
3356 } else if ((extend || !data_size) && slot == 0) {
3357 mid = 1;
3358 } else {
3359 mid = slot;
3360 if (mid != nritems &&
3361 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3362 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3363 if (data_size && !tried_avoid_double)
3364 goto push_for_double;
67871254 3365 split = 2;
5d4f98a2
YZ
3366 }
3367 }
3368 }
3369 }
3370
3371 if (split == 0)
3372 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3373 else
3374 btrfs_item_key(l, &disk_key, mid);
3375
ca9d473a
JB
3376 /*
3377 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3378 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3379 * subclasses, which is 8 at the time of this patch, and we've maxed it
3380 * out. In the future we could add a
3381 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3382 * use BTRFS_NESTING_NEW_ROOT.
3383 */
a6279470 3384 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
ca9d473a
JB
3385 l->start, 0, num_doubles ?
3386 BTRFS_NESTING_NEW_ROOT :
3387 BTRFS_NESTING_SPLIT);
f0486c68 3388 if (IS_ERR(right))
5f39d397 3389 return PTR_ERR(right);
f0486c68 3390
0b246afa 3391 root_add_used(root, fs_info->nodesize);
5f39d397 3392
5d4f98a2
YZ
3393 if (split == 0) {
3394 if (mid <= slot) {
3395 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3396 insert_ptr(trans, path, &disk_key,
2ff7e61e 3397 right->start, path->slots[1] + 1, 1);
5d4f98a2
YZ
3398 btrfs_tree_unlock(path->nodes[0]);
3399 free_extent_buffer(path->nodes[0]);
3400 path->nodes[0] = right;
3401 path->slots[0] = 0;
3402 path->slots[1] += 1;
3403 } else {
3404 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3405 insert_ptr(trans, path, &disk_key,
2ff7e61e 3406 right->start, path->slots[1], 1);
5d4f98a2
YZ
3407 btrfs_tree_unlock(path->nodes[0]);
3408 free_extent_buffer(path->nodes[0]);
3409 path->nodes[0] = right;
3410 path->slots[0] = 0;
143bede5 3411 if (path->slots[1] == 0)
b167fa91 3412 fixup_low_keys(path, &disk_key, 1);
d4dbff95 3413 }
196e0249
LB
3414 /*
3415 * We create a new leaf 'right' for the required ins_len and
3416 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3417 * the content of ins_len to 'right'.
3418 */
5d4f98a2 3419 return ret;
d4dbff95 3420 }
74123bd7 3421
94f94ad9 3422 copy_for_split(trans, path, l, right, slot, mid, nritems);
31840ae1 3423
5d4f98a2 3424 if (split == 2) {
cc0c5538
CM
3425 BUG_ON(num_doubles != 0);
3426 num_doubles++;
3427 goto again;
a429e513 3428 }
44871b1b 3429
143bede5 3430 return 0;
99d8f83c
CM
3431
3432push_for_double:
3433 push_for_double_split(trans, root, path, data_size);
3434 tried_avoid_double = 1;
e902baac 3435 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3436 return 0;
3437 goto again;
be0e5c09
CM
3438}
3439
ad48fd75
YZ
3440static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3441 struct btrfs_root *root,
3442 struct btrfs_path *path, int ins_len)
459931ec 3443{
ad48fd75 3444 struct btrfs_key key;
459931ec 3445 struct extent_buffer *leaf;
ad48fd75
YZ
3446 struct btrfs_file_extent_item *fi;
3447 u64 extent_len = 0;
3448 u32 item_size;
3449 int ret;
459931ec
CM
3450
3451 leaf = path->nodes[0];
ad48fd75
YZ
3452 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3453
3454 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3455 key.type != BTRFS_EXTENT_CSUM_KEY);
3456
e902baac 3457 if (btrfs_leaf_free_space(leaf) >= ins_len)
ad48fd75 3458 return 0;
459931ec
CM
3459
3460 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ad48fd75
YZ
3461 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3462 fi = btrfs_item_ptr(leaf, path->slots[0],
3463 struct btrfs_file_extent_item);
3464 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3465 }
b3b4aa74 3466 btrfs_release_path(path);
459931ec 3467
459931ec 3468 path->keep_locks = 1;
ad48fd75
YZ
3469 path->search_for_split = 1;
3470 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
459931ec 3471 path->search_for_split = 0;
a8df6fe6
FM
3472 if (ret > 0)
3473 ret = -EAGAIN;
ad48fd75
YZ
3474 if (ret < 0)
3475 goto err;
459931ec 3476
ad48fd75
YZ
3477 ret = -EAGAIN;
3478 leaf = path->nodes[0];
a8df6fe6
FM
3479 /* if our item isn't there, return now */
3480 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
ad48fd75
YZ
3481 goto err;
3482
109f6aef 3483 /* the leaf has changed, it now has room. return now */
e902baac 3484 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
109f6aef
CM
3485 goto err;
3486
ad48fd75
YZ
3487 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3488 fi = btrfs_item_ptr(leaf, path->slots[0],
3489 struct btrfs_file_extent_item);
3490 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3491 goto err;
459931ec
CM
3492 }
3493
ad48fd75 3494 ret = split_leaf(trans, root, &key, path, ins_len, 1);
f0486c68
YZ
3495 if (ret)
3496 goto err;
459931ec 3497
ad48fd75 3498 path->keep_locks = 0;
b9473439 3499 btrfs_unlock_up_safe(path, 1);
ad48fd75
YZ
3500 return 0;
3501err:
3502 path->keep_locks = 0;
3503 return ret;
3504}
3505
25263cd7 3506static noinline int split_item(struct btrfs_path *path,
310712b2 3507 const struct btrfs_key *new_key,
ad48fd75
YZ
3508 unsigned long split_offset)
3509{
3510 struct extent_buffer *leaf;
3511 struct btrfs_item *item;
3512 struct btrfs_item *new_item;
3513 int slot;
3514 char *buf;
3515 u32 nritems;
3516 u32 item_size;
3517 u32 orig_offset;
3518 struct btrfs_disk_key disk_key;
3519
b9473439 3520 leaf = path->nodes[0];
e902baac 3521 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
b9473439 3522
dd3cc16b 3523 item = btrfs_item_nr(path->slots[0]);
459931ec
CM
3524 orig_offset = btrfs_item_offset(leaf, item);
3525 item_size = btrfs_item_size(leaf, item);
3526
459931ec 3527 buf = kmalloc(item_size, GFP_NOFS);
ad48fd75
YZ
3528 if (!buf)
3529 return -ENOMEM;
3530
459931ec
CM
3531 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3532 path->slots[0]), item_size);
459931ec 3533
ad48fd75 3534 slot = path->slots[0] + 1;
459931ec 3535 nritems = btrfs_header_nritems(leaf);
459931ec
CM
3536 if (slot != nritems) {
3537 /* shift the items */
3538 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
ad48fd75
YZ
3539 btrfs_item_nr_offset(slot),
3540 (nritems - slot) * sizeof(struct btrfs_item));
459931ec
CM
3541 }
3542
3543 btrfs_cpu_key_to_disk(&disk_key, new_key);
3544 btrfs_set_item_key(leaf, &disk_key, slot);
3545
dd3cc16b 3546 new_item = btrfs_item_nr(slot);
459931ec
CM
3547
3548 btrfs_set_item_offset(leaf, new_item, orig_offset);
3549 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3550
3551 btrfs_set_item_offset(leaf, item,
3552 orig_offset + item_size - split_offset);
3553 btrfs_set_item_size(leaf, item, split_offset);
3554
3555 btrfs_set_header_nritems(leaf, nritems + 1);
3556
3557 /* write the data for the start of the original item */
3558 write_extent_buffer(leaf, buf,
3559 btrfs_item_ptr_offset(leaf, path->slots[0]),
3560 split_offset);
3561
3562 /* write the data for the new item */
3563 write_extent_buffer(leaf, buf + split_offset,
3564 btrfs_item_ptr_offset(leaf, slot),
3565 item_size - split_offset);
3566 btrfs_mark_buffer_dirty(leaf);
3567
e902baac 3568 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
459931ec 3569 kfree(buf);
ad48fd75
YZ
3570 return 0;
3571}
3572
3573/*
3574 * This function splits a single item into two items,
3575 * giving 'new_key' to the new item and splitting the
3576 * old one at split_offset (from the start of the item).
3577 *
3578 * The path may be released by this operation. After
3579 * the split, the path is pointing to the old item. The
3580 * new item is going to be in the same node as the old one.
3581 *
3582 * Note, the item being split must be smaller enough to live alone on
3583 * a tree block with room for one extra struct btrfs_item
3584 *
3585 * This allows us to split the item in place, keeping a lock on the
3586 * leaf the entire time.
3587 */
3588int btrfs_split_item(struct btrfs_trans_handle *trans,
3589 struct btrfs_root *root,
3590 struct btrfs_path *path,
310712b2 3591 const struct btrfs_key *new_key,
ad48fd75
YZ
3592 unsigned long split_offset)
3593{
3594 int ret;
3595 ret = setup_leaf_for_split(trans, root, path,
3596 sizeof(struct btrfs_item));
3597 if (ret)
3598 return ret;
3599
25263cd7 3600 ret = split_item(path, new_key, split_offset);
459931ec
CM
3601 return ret;
3602}
3603
ad48fd75
YZ
3604/*
3605 * This function duplicate a item, giving 'new_key' to the new item.
3606 * It guarantees both items live in the same tree leaf and the new item
3607 * is contiguous with the original item.
3608 *
3609 * This allows us to split file extent in place, keeping a lock on the
3610 * leaf the entire time.
3611 */
3612int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3613 struct btrfs_root *root,
3614 struct btrfs_path *path,
310712b2 3615 const struct btrfs_key *new_key)
ad48fd75
YZ
3616{
3617 struct extent_buffer *leaf;
3618 int ret;
3619 u32 item_size;
3620
3621 leaf = path->nodes[0];
3622 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3623 ret = setup_leaf_for_split(trans, root, path,
3624 item_size + sizeof(struct btrfs_item));
3625 if (ret)
3626 return ret;
3627
3628 path->slots[0]++;
fc0d82e1 3629 setup_items_for_insert(root, path, new_key, &item_size, 1);
ad48fd75
YZ
3630 leaf = path->nodes[0];
3631 memcpy_extent_buffer(leaf,
3632 btrfs_item_ptr_offset(leaf, path->slots[0]),
3633 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3634 item_size);
3635 return 0;
3636}
3637
d352ac68
CM
3638/*
3639 * make the item pointed to by the path smaller. new_size indicates
3640 * how small to make it, and from_end tells us if we just chop bytes
3641 * off the end of the item or if we shift the item to chop bytes off
3642 * the front.
3643 */
78ac4f9e 3644void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
b18c6685 3645{
b18c6685 3646 int slot;
5f39d397
CM
3647 struct extent_buffer *leaf;
3648 struct btrfs_item *item;
b18c6685
CM
3649 u32 nritems;
3650 unsigned int data_end;
3651 unsigned int old_data_start;
3652 unsigned int old_size;
3653 unsigned int size_diff;
3654 int i;
cfed81a0
CM
3655 struct btrfs_map_token token;
3656
5f39d397 3657 leaf = path->nodes[0];
179e29e4
CM
3658 slot = path->slots[0];
3659
3660 old_size = btrfs_item_size_nr(leaf, slot);
3661 if (old_size == new_size)
143bede5 3662 return;
b18c6685 3663
5f39d397 3664 nritems = btrfs_header_nritems(leaf);
8f881e8c 3665 data_end = leaf_data_end(leaf);
b18c6685 3666
5f39d397 3667 old_data_start = btrfs_item_offset_nr(leaf, slot);
179e29e4 3668
b18c6685
CM
3669 size_diff = old_size - new_size;
3670
3671 BUG_ON(slot < 0);
3672 BUG_ON(slot >= nritems);
3673
3674 /*
3675 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3676 */
3677 /* first correct the data pointers */
c82f823c 3678 btrfs_init_map_token(&token, leaf);
b18c6685 3679 for (i = slot; i < nritems; i++) {
5f39d397 3680 u32 ioff;
dd3cc16b 3681 item = btrfs_item_nr(i);
db94535d 3682
cc4c13d5
DS
3683 ioff = btrfs_token_item_offset(&token, item);
3684 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
b18c6685 3685 }
db94535d 3686
b18c6685 3687 /* shift the data */
179e29e4 3688 if (from_end) {
3d9ec8c4
NB
3689 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3690 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
179e29e4
CM
3691 data_end, old_data_start + new_size - data_end);
3692 } else {
3693 struct btrfs_disk_key disk_key;
3694 u64 offset;
3695
3696 btrfs_item_key(leaf, &disk_key, slot);
3697
3698 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3699 unsigned long ptr;
3700 struct btrfs_file_extent_item *fi;
3701
3702 fi = btrfs_item_ptr(leaf, slot,
3703 struct btrfs_file_extent_item);
3704 fi = (struct btrfs_file_extent_item *)(
3705 (unsigned long)fi - size_diff);
3706
3707 if (btrfs_file_extent_type(leaf, fi) ==
3708 BTRFS_FILE_EXTENT_INLINE) {
3709 ptr = btrfs_item_ptr_offset(leaf, slot);
3710 memmove_extent_buffer(leaf, ptr,
d397712b 3711 (unsigned long)fi,
7ec20afb 3712 BTRFS_FILE_EXTENT_INLINE_DATA_START);
179e29e4
CM
3713 }
3714 }
3715
3d9ec8c4
NB
3716 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3717 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
179e29e4
CM
3718 data_end, old_data_start - data_end);
3719
3720 offset = btrfs_disk_key_offset(&disk_key);
3721 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3722 btrfs_set_item_key(leaf, &disk_key, slot);
3723 if (slot == 0)
b167fa91 3724 fixup_low_keys(path, &disk_key, 1);
179e29e4 3725 }
5f39d397 3726
dd3cc16b 3727 item = btrfs_item_nr(slot);
5f39d397
CM
3728 btrfs_set_item_size(leaf, item, new_size);
3729 btrfs_mark_buffer_dirty(leaf);
b18c6685 3730
e902baac 3731 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3732 btrfs_print_leaf(leaf);
b18c6685 3733 BUG();
5f39d397 3734 }
b18c6685
CM
3735}
3736
d352ac68 3737/*
8f69dbd2 3738 * make the item pointed to by the path bigger, data_size is the added size.
d352ac68 3739 */
c71dd880 3740void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
6567e837 3741{
6567e837 3742 int slot;
5f39d397
CM
3743 struct extent_buffer *leaf;
3744 struct btrfs_item *item;
6567e837
CM
3745 u32 nritems;
3746 unsigned int data_end;
3747 unsigned int old_data;
3748 unsigned int old_size;
3749 int i;
cfed81a0
CM
3750 struct btrfs_map_token token;
3751
5f39d397 3752 leaf = path->nodes[0];
6567e837 3753
5f39d397 3754 nritems = btrfs_header_nritems(leaf);
8f881e8c 3755 data_end = leaf_data_end(leaf);
6567e837 3756
e902baac 3757 if (btrfs_leaf_free_space(leaf) < data_size) {
a4f78750 3758 btrfs_print_leaf(leaf);
6567e837 3759 BUG();
5f39d397 3760 }
6567e837 3761 slot = path->slots[0];
5f39d397 3762 old_data = btrfs_item_end_nr(leaf, slot);
6567e837
CM
3763
3764 BUG_ON(slot < 0);
3326d1b0 3765 if (slot >= nritems) {
a4f78750 3766 btrfs_print_leaf(leaf);
c71dd880 3767 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
0b246afa 3768 slot, nritems);
290342f6 3769 BUG();
3326d1b0 3770 }
6567e837
CM
3771
3772 /*
3773 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3774 */
3775 /* first correct the data pointers */
c82f823c 3776 btrfs_init_map_token(&token, leaf);
6567e837 3777 for (i = slot; i < nritems; i++) {
5f39d397 3778 u32 ioff;
dd3cc16b 3779 item = btrfs_item_nr(i);
db94535d 3780
cc4c13d5
DS
3781 ioff = btrfs_token_item_offset(&token, item);
3782 btrfs_set_token_item_offset(&token, item, ioff - data_size);
6567e837 3783 }
5f39d397 3784
6567e837 3785 /* shift the data */
3d9ec8c4
NB
3786 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3787 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
6567e837 3788 data_end, old_data - data_end);
5f39d397 3789
6567e837 3790 data_end = old_data;
5f39d397 3791 old_size = btrfs_item_size_nr(leaf, slot);
dd3cc16b 3792 item = btrfs_item_nr(slot);
5f39d397
CM
3793 btrfs_set_item_size(leaf, item, old_size + data_size);
3794 btrfs_mark_buffer_dirty(leaf);
6567e837 3795
e902baac 3796 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3797 btrfs_print_leaf(leaf);
6567e837 3798 BUG();
5f39d397 3799 }
6567e837
CM
3800}
3801
da9ffb24
NB
3802/**
3803 * setup_items_for_insert - Helper called before inserting one or more items
3804 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3805 * in a function that doesn't call btrfs_search_slot
3806 *
3807 * @root: root we are inserting items to
3808 * @path: points to the leaf/slot where we are going to insert new items
3809 * @cpu_key: array of keys for items to be inserted
3810 * @data_size: size of the body of each item we are going to insert
3811 * @nr: size of @cpu_key/@data_size arrays
74123bd7 3812 */
afe5fea7 3813void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
310712b2 3814 const struct btrfs_key *cpu_key, u32 *data_size,
fc0d82e1 3815 int nr)
be0e5c09 3816{
0b246afa 3817 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397 3818 struct btrfs_item *item;
9c58309d 3819 int i;
7518a238 3820 u32 nritems;
be0e5c09 3821 unsigned int data_end;
e2fa7227 3822 struct btrfs_disk_key disk_key;
44871b1b
CM
3823 struct extent_buffer *leaf;
3824 int slot;
cfed81a0 3825 struct btrfs_map_token token;
fc0d82e1
NB
3826 u32 total_size;
3827 u32 total_data = 0;
3828
3829 for (i = 0; i < nr; i++)
3830 total_data += data_size[i];
3831 total_size = total_data + (nr * sizeof(struct btrfs_item));
cfed81a0 3832
24cdc847
FM
3833 if (path->slots[0] == 0) {
3834 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
b167fa91 3835 fixup_low_keys(path, &disk_key, 1);
24cdc847
FM
3836 }
3837 btrfs_unlock_up_safe(path, 1);
3838
5f39d397 3839 leaf = path->nodes[0];
44871b1b 3840 slot = path->slots[0];
74123bd7 3841
5f39d397 3842 nritems = btrfs_header_nritems(leaf);
8f881e8c 3843 data_end = leaf_data_end(leaf);
eb60ceac 3844
e902baac 3845 if (btrfs_leaf_free_space(leaf) < total_size) {
a4f78750 3846 btrfs_print_leaf(leaf);
0b246afa 3847 btrfs_crit(fs_info, "not enough freespace need %u have %d",
e902baac 3848 total_size, btrfs_leaf_free_space(leaf));
be0e5c09 3849 BUG();
d4dbff95 3850 }
5f39d397 3851
c82f823c 3852 btrfs_init_map_token(&token, leaf);
be0e5c09 3853 if (slot != nritems) {
5f39d397 3854 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
be0e5c09 3855
5f39d397 3856 if (old_data < data_end) {
a4f78750 3857 btrfs_print_leaf(leaf);
7269ddd2
NB
3858 btrfs_crit(fs_info,
3859 "item at slot %d with data offset %u beyond data end of leaf %u",
5d163e0e 3860 slot, old_data, data_end);
290342f6 3861 BUG();
5f39d397 3862 }
be0e5c09
CM
3863 /*
3864 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3865 */
3866 /* first correct the data pointers */
0783fcfc 3867 for (i = slot; i < nritems; i++) {
5f39d397 3868 u32 ioff;
db94535d 3869
62e85577 3870 item = btrfs_item_nr(i);
cc4c13d5
DS
3871 ioff = btrfs_token_item_offset(&token, item);
3872 btrfs_set_token_item_offset(&token, item,
3873 ioff - total_data);
0783fcfc 3874 }
be0e5c09 3875 /* shift the items */
9c58309d 3876 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
5f39d397 3877 btrfs_item_nr_offset(slot),
d6025579 3878 (nritems - slot) * sizeof(struct btrfs_item));
be0e5c09
CM
3879
3880 /* shift the data */
3d9ec8c4
NB
3881 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3882 data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
d6025579 3883 data_end, old_data - data_end);
be0e5c09
CM
3884 data_end = old_data;
3885 }
5f39d397 3886
62e2749e 3887 /* setup the item for the new data */
9c58309d
CM
3888 for (i = 0; i < nr; i++) {
3889 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3890 btrfs_set_item_key(leaf, &disk_key, slot + i);
dd3cc16b 3891 item = btrfs_item_nr(slot + i);
9c58309d 3892 data_end -= data_size[i];
fc0716c2 3893 btrfs_set_token_item_offset(&token, item, data_end);
cc4c13d5 3894 btrfs_set_token_item_size(&token, item, data_size[i]);
9c58309d 3895 }
44871b1b 3896
9c58309d 3897 btrfs_set_header_nritems(leaf, nritems + nr);
b9473439 3898 btrfs_mark_buffer_dirty(leaf);
aa5d6bed 3899
e902baac 3900 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3901 btrfs_print_leaf(leaf);
be0e5c09 3902 BUG();
5f39d397 3903 }
44871b1b
CM
3904}
3905
3906/*
3907 * Given a key and some data, insert items into the tree.
3908 * This does all the path init required, making room in the tree if needed.
3909 */
3910int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3911 struct btrfs_root *root,
3912 struct btrfs_path *path,
310712b2 3913 const struct btrfs_key *cpu_key, u32 *data_size,
44871b1b
CM
3914 int nr)
3915{
44871b1b
CM
3916 int ret = 0;
3917 int slot;
3918 int i;
3919 u32 total_size = 0;
3920 u32 total_data = 0;
3921
3922 for (i = 0; i < nr; i++)
3923 total_data += data_size[i];
3924
3925 total_size = total_data + (nr * sizeof(struct btrfs_item));
3926 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3927 if (ret == 0)
3928 return -EEXIST;
3929 if (ret < 0)
143bede5 3930 return ret;
44871b1b 3931
44871b1b
CM
3932 slot = path->slots[0];
3933 BUG_ON(slot < 0);
3934
fc0d82e1 3935 setup_items_for_insert(root, path, cpu_key, data_size, nr);
143bede5 3936 return 0;
62e2749e
CM
3937}
3938
3939/*
3940 * Given a key and some data, insert an item into the tree.
3941 * This does all the path init required, making room in the tree if needed.
3942 */
310712b2
OS
3943int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3944 const struct btrfs_key *cpu_key, void *data,
3945 u32 data_size)
62e2749e
CM
3946{
3947 int ret = 0;
2c90e5d6 3948 struct btrfs_path *path;
5f39d397
CM
3949 struct extent_buffer *leaf;
3950 unsigned long ptr;
62e2749e 3951
2c90e5d6 3952 path = btrfs_alloc_path();
db5b493a
TI
3953 if (!path)
3954 return -ENOMEM;
2c90e5d6 3955 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
62e2749e 3956 if (!ret) {
5f39d397
CM
3957 leaf = path->nodes[0];
3958 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3959 write_extent_buffer(leaf, data, ptr, data_size);
3960 btrfs_mark_buffer_dirty(leaf);
62e2749e 3961 }
2c90e5d6 3962 btrfs_free_path(path);
aa5d6bed 3963 return ret;
be0e5c09
CM
3964}
3965
74123bd7 3966/*
5de08d7d 3967 * delete the pointer from a given node.
74123bd7 3968 *
d352ac68
CM
3969 * the tree should have been previously balanced so the deletion does not
3970 * empty a node.
74123bd7 3971 */
afe5fea7
TI
3972static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
3973 int level, int slot)
be0e5c09 3974{
5f39d397 3975 struct extent_buffer *parent = path->nodes[level];
7518a238 3976 u32 nritems;
f3ea38da 3977 int ret;
be0e5c09 3978
5f39d397 3979 nritems = btrfs_header_nritems(parent);
d397712b 3980 if (slot != nritems - 1) {
bf1d3425 3981 if (level) {
f3a84ccd
FM
3982 ret = btrfs_tree_mod_log_insert_move(parent, slot,
3983 slot + 1, nritems - slot - 1);
bf1d3425
DS
3984 BUG_ON(ret < 0);
3985 }
5f39d397
CM
3986 memmove_extent_buffer(parent,
3987 btrfs_node_key_ptr_offset(slot),
3988 btrfs_node_key_ptr_offset(slot + 1),
d6025579
CM
3989 sizeof(struct btrfs_key_ptr) *
3990 (nritems - slot - 1));
57ba86c0 3991 } else if (level) {
f3a84ccd
FM
3992 ret = btrfs_tree_mod_log_insert_key(parent, slot,
3993 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
57ba86c0 3994 BUG_ON(ret < 0);
bb803951 3995 }
f3ea38da 3996
7518a238 3997 nritems--;
5f39d397 3998 btrfs_set_header_nritems(parent, nritems);
7518a238 3999 if (nritems == 0 && parent == root->node) {
5f39d397 4000 BUG_ON(btrfs_header_level(root->node) != 1);
bb803951 4001 /* just turn the root into a leaf and break */
5f39d397 4002 btrfs_set_header_level(root->node, 0);
bb803951 4003 } else if (slot == 0) {
5f39d397
CM
4004 struct btrfs_disk_key disk_key;
4005
4006 btrfs_node_key(parent, &disk_key, 0);
b167fa91 4007 fixup_low_keys(path, &disk_key, level + 1);
be0e5c09 4008 }
d6025579 4009 btrfs_mark_buffer_dirty(parent);
be0e5c09
CM
4010}
4011
323ac95b
CM
4012/*
4013 * a helper function to delete the leaf pointed to by path->slots[1] and
5d4f98a2 4014 * path->nodes[1].
323ac95b
CM
4015 *
4016 * This deletes the pointer in path->nodes[1] and frees the leaf
4017 * block extent. zero is returned if it all worked out, < 0 otherwise.
4018 *
4019 * The path must have already been setup for deleting the leaf, including
4020 * all the proper balancing. path->nodes[1] must be locked.
4021 */
143bede5
JM
4022static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4023 struct btrfs_root *root,
4024 struct btrfs_path *path,
4025 struct extent_buffer *leaf)
323ac95b 4026{
5d4f98a2 4027 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
afe5fea7 4028 del_ptr(root, path, 1, path->slots[1]);
323ac95b 4029
4d081c41
CM
4030 /*
4031 * btrfs_free_extent is expensive, we want to make sure we
4032 * aren't holding any locks when we call it
4033 */
4034 btrfs_unlock_up_safe(path, 0);
4035
f0486c68
YZ
4036 root_sub_used(root, leaf->len);
4037
67439dad 4038 atomic_inc(&leaf->refs);
5581a51a 4039 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3083ee2e 4040 free_extent_buffer_stale(leaf);
323ac95b 4041}
74123bd7
CM
4042/*
4043 * delete the item at the leaf level in path. If that empties
4044 * the leaf, remove it from the tree
4045 */
85e21bac
CM
4046int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4047 struct btrfs_path *path, int slot, int nr)
be0e5c09 4048{
0b246afa 4049 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
4050 struct extent_buffer *leaf;
4051 struct btrfs_item *item;
ce0eac2a
AM
4052 u32 last_off;
4053 u32 dsize = 0;
aa5d6bed
CM
4054 int ret = 0;
4055 int wret;
85e21bac 4056 int i;
7518a238 4057 u32 nritems;
be0e5c09 4058
5f39d397 4059 leaf = path->nodes[0];
85e21bac
CM
4060 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4061
4062 for (i = 0; i < nr; i++)
4063 dsize += btrfs_item_size_nr(leaf, slot + i);
4064
5f39d397 4065 nritems = btrfs_header_nritems(leaf);
be0e5c09 4066
85e21bac 4067 if (slot + nr != nritems) {
8f881e8c 4068 int data_end = leaf_data_end(leaf);
c82f823c 4069 struct btrfs_map_token token;
5f39d397 4070
3d9ec8c4 4071 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
d6025579 4072 data_end + dsize,
3d9ec8c4 4073 BTRFS_LEAF_DATA_OFFSET + data_end,
85e21bac 4074 last_off - data_end);
5f39d397 4075
c82f823c 4076 btrfs_init_map_token(&token, leaf);
85e21bac 4077 for (i = slot + nr; i < nritems; i++) {
5f39d397 4078 u32 ioff;
db94535d 4079
dd3cc16b 4080 item = btrfs_item_nr(i);
cc4c13d5
DS
4081 ioff = btrfs_token_item_offset(&token, item);
4082 btrfs_set_token_item_offset(&token, item, ioff + dsize);
0783fcfc 4083 }
db94535d 4084
5f39d397 4085 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
85e21bac 4086 btrfs_item_nr_offset(slot + nr),
d6025579 4087 sizeof(struct btrfs_item) *
85e21bac 4088 (nritems - slot - nr));
be0e5c09 4089 }
85e21bac
CM
4090 btrfs_set_header_nritems(leaf, nritems - nr);
4091 nritems -= nr;
5f39d397 4092
74123bd7 4093 /* delete the leaf if we've emptied it */
7518a238 4094 if (nritems == 0) {
5f39d397
CM
4095 if (leaf == root->node) {
4096 btrfs_set_header_level(leaf, 0);
9a8dd150 4097 } else {
6a884d7d 4098 btrfs_clean_tree_block(leaf);
143bede5 4099 btrfs_del_leaf(trans, root, path, leaf);
9a8dd150 4100 }
be0e5c09 4101 } else {
7518a238 4102 int used = leaf_space_used(leaf, 0, nritems);
aa5d6bed 4103 if (slot == 0) {
5f39d397
CM
4104 struct btrfs_disk_key disk_key;
4105
4106 btrfs_item_key(leaf, &disk_key, 0);
b167fa91 4107 fixup_low_keys(path, &disk_key, 1);
aa5d6bed 4108 }
aa5d6bed 4109
74123bd7 4110 /* delete the leaf if it is mostly empty */
0b246afa 4111 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
be0e5c09
CM
4112 /* push_leaf_left fixes the path.
4113 * make sure the path still points to our leaf
4114 * for possible call to del_ptr below
4115 */
4920c9ac 4116 slot = path->slots[1];
67439dad 4117 atomic_inc(&leaf->refs);
5f39d397 4118
99d8f83c
CM
4119 wret = push_leaf_left(trans, root, path, 1, 1,
4120 1, (u32)-1);
54aa1f4d 4121 if (wret < 0 && wret != -ENOSPC)
aa5d6bed 4122 ret = wret;
5f39d397
CM
4123
4124 if (path->nodes[0] == leaf &&
4125 btrfs_header_nritems(leaf)) {
99d8f83c
CM
4126 wret = push_leaf_right(trans, root, path, 1,
4127 1, 1, 0);
54aa1f4d 4128 if (wret < 0 && wret != -ENOSPC)
aa5d6bed
CM
4129 ret = wret;
4130 }
5f39d397
CM
4131
4132 if (btrfs_header_nritems(leaf) == 0) {
323ac95b 4133 path->slots[1] = slot;
143bede5 4134 btrfs_del_leaf(trans, root, path, leaf);
5f39d397 4135 free_extent_buffer(leaf);
143bede5 4136 ret = 0;
5de08d7d 4137 } else {
925baedd
CM
4138 /* if we're still in the path, make sure
4139 * we're dirty. Otherwise, one of the
4140 * push_leaf functions must have already
4141 * dirtied this buffer
4142 */
4143 if (path->nodes[0] == leaf)
4144 btrfs_mark_buffer_dirty(leaf);
5f39d397 4145 free_extent_buffer(leaf);
be0e5c09 4146 }
d5719762 4147 } else {
5f39d397 4148 btrfs_mark_buffer_dirty(leaf);
be0e5c09
CM
4149 }
4150 }
aa5d6bed 4151 return ret;
be0e5c09
CM
4152}
4153
7bb86316 4154/*
925baedd 4155 * search the tree again to find a leaf with lesser keys
7bb86316
CM
4156 * returns 0 if it found something or 1 if there are no lesser leaves.
4157 * returns < 0 on io errors.
d352ac68
CM
4158 *
4159 * This may release the path, and so you may lose any locks held at the
4160 * time you call it.
7bb86316 4161 */
16e7549f 4162int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
7bb86316 4163{
925baedd
CM
4164 struct btrfs_key key;
4165 struct btrfs_disk_key found_key;
4166 int ret;
7bb86316 4167
925baedd 4168 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
7bb86316 4169
e8b0d724 4170 if (key.offset > 0) {
925baedd 4171 key.offset--;
e8b0d724 4172 } else if (key.type > 0) {
925baedd 4173 key.type--;
e8b0d724
FDBM
4174 key.offset = (u64)-1;
4175 } else if (key.objectid > 0) {
925baedd 4176 key.objectid--;
e8b0d724
FDBM
4177 key.type = (u8)-1;
4178 key.offset = (u64)-1;
4179 } else {
925baedd 4180 return 1;
e8b0d724 4181 }
7bb86316 4182
b3b4aa74 4183 btrfs_release_path(path);
925baedd
CM
4184 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4185 if (ret < 0)
4186 return ret;
4187 btrfs_item_key(path->nodes[0], &found_key, 0);
4188 ret = comp_keys(&found_key, &key);
337c6f68
FM
4189 /*
4190 * We might have had an item with the previous key in the tree right
4191 * before we released our path. And after we released our path, that
4192 * item might have been pushed to the first slot (0) of the leaf we
4193 * were holding due to a tree balance. Alternatively, an item with the
4194 * previous key can exist as the only element of a leaf (big fat item).
4195 * Therefore account for these 2 cases, so that our callers (like
4196 * btrfs_previous_item) don't miss an existing item with a key matching
4197 * the previous key we computed above.
4198 */
4199 if (ret <= 0)
925baedd
CM
4200 return 0;
4201 return 1;
7bb86316
CM
4202}
4203
3f157a2f
CM
4204/*
4205 * A helper function to walk down the tree starting at min_key, and looking
de78b51a
ES
4206 * for nodes or leaves that are have a minimum transaction id.
4207 * This is used by the btree defrag code, and tree logging
3f157a2f
CM
4208 *
4209 * This does not cow, but it does stuff the starting key it finds back
4210 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4211 * key and get a writable path.
4212 *
3f157a2f
CM
4213 * This honors path->lowest_level to prevent descent past a given level
4214 * of the tree.
4215 *
d352ac68
CM
4216 * min_trans indicates the oldest transaction that you are interested
4217 * in walking through. Any nodes or leaves older than min_trans are
4218 * skipped over (without reading them).
4219 *
3f157a2f
CM
4220 * returns zero if something useful was found, < 0 on error and 1 if there
4221 * was nothing in the tree that matched the search criteria.
4222 */
4223int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
de78b51a 4224 struct btrfs_path *path,
3f157a2f
CM
4225 u64 min_trans)
4226{
4227 struct extent_buffer *cur;
4228 struct btrfs_key found_key;
4229 int slot;
9652480b 4230 int sret;
3f157a2f
CM
4231 u32 nritems;
4232 int level;
4233 int ret = 1;
f98de9b9 4234 int keep_locks = path->keep_locks;
3f157a2f 4235
f98de9b9 4236 path->keep_locks = 1;
3f157a2f 4237again:
bd681513 4238 cur = btrfs_read_lock_root_node(root);
3f157a2f 4239 level = btrfs_header_level(cur);
e02119d5 4240 WARN_ON(path->nodes[level]);
3f157a2f 4241 path->nodes[level] = cur;
bd681513 4242 path->locks[level] = BTRFS_READ_LOCK;
3f157a2f
CM
4243
4244 if (btrfs_header_generation(cur) < min_trans) {
4245 ret = 1;
4246 goto out;
4247 }
d397712b 4248 while (1) {
3f157a2f
CM
4249 nritems = btrfs_header_nritems(cur);
4250 level = btrfs_header_level(cur);
e3b83361 4251 sret = btrfs_bin_search(cur, min_key, &slot);
cbca7d59
FM
4252 if (sret < 0) {
4253 ret = sret;
4254 goto out;
4255 }
3f157a2f 4256
323ac95b
CM
4257 /* at the lowest level, we're done, setup the path and exit */
4258 if (level == path->lowest_level) {
e02119d5
CM
4259 if (slot >= nritems)
4260 goto find_next_key;
3f157a2f
CM
4261 ret = 0;
4262 path->slots[level] = slot;
4263 btrfs_item_key_to_cpu(cur, &found_key, slot);
4264 goto out;
4265 }
9652480b
Y
4266 if (sret && slot > 0)
4267 slot--;
3f157a2f 4268 /*
de78b51a 4269 * check this node pointer against the min_trans parameters.
260db43c 4270 * If it is too old, skip to the next one.
3f157a2f 4271 */
d397712b 4272 while (slot < nritems) {
3f157a2f 4273 u64 gen;
e02119d5 4274
3f157a2f
CM
4275 gen = btrfs_node_ptr_generation(cur, slot);
4276 if (gen < min_trans) {
4277 slot++;
4278 continue;
4279 }
de78b51a 4280 break;
3f157a2f 4281 }
e02119d5 4282find_next_key:
3f157a2f
CM
4283 /*
4284 * we didn't find a candidate key in this node, walk forward
4285 * and find another one
4286 */
4287 if (slot >= nritems) {
e02119d5
CM
4288 path->slots[level] = slot;
4289 sret = btrfs_find_next_key(root, path, min_key, level,
de78b51a 4290 min_trans);
e02119d5 4291 if (sret == 0) {
b3b4aa74 4292 btrfs_release_path(path);
3f157a2f
CM
4293 goto again;
4294 } else {
4295 goto out;
4296 }
4297 }
4298 /* save our key for returning back */
4299 btrfs_node_key_to_cpu(cur, &found_key, slot);
4300 path->slots[level] = slot;
4301 if (level == path->lowest_level) {
4302 ret = 0;
3f157a2f
CM
4303 goto out;
4304 }
4b231ae4 4305 cur = btrfs_read_node_slot(cur, slot);
fb770ae4
LB
4306 if (IS_ERR(cur)) {
4307 ret = PTR_ERR(cur);
4308 goto out;
4309 }
3f157a2f 4310
bd681513 4311 btrfs_tree_read_lock(cur);
b4ce94de 4312
bd681513 4313 path->locks[level - 1] = BTRFS_READ_LOCK;
3f157a2f 4314 path->nodes[level - 1] = cur;
f7c79f30 4315 unlock_up(path, level, 1, 0, NULL);
3f157a2f
CM
4316 }
4317out:
f98de9b9
FM
4318 path->keep_locks = keep_locks;
4319 if (ret == 0) {
4320 btrfs_unlock_up_safe(path, path->lowest_level + 1);
3f157a2f 4321 memcpy(min_key, &found_key, sizeof(found_key));
f98de9b9 4322 }
3f157a2f
CM
4323 return ret;
4324}
4325
4326/*
4327 * this is similar to btrfs_next_leaf, but does not try to preserve
4328 * and fixup the path. It looks for and returns the next key in the
de78b51a 4329 * tree based on the current path and the min_trans parameters.
3f157a2f
CM
4330 *
4331 * 0 is returned if another key is found, < 0 if there are any errors
4332 * and 1 is returned if there are no higher keys in the tree
4333 *
4334 * path->keep_locks should be set to 1 on the search made before
4335 * calling this function.
4336 */
e7a84565 4337int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
de78b51a 4338 struct btrfs_key *key, int level, u64 min_trans)
e7a84565 4339{
e7a84565
CM
4340 int slot;
4341 struct extent_buffer *c;
4342
6a9fb468 4343 WARN_ON(!path->keep_locks && !path->skip_locking);
d397712b 4344 while (level < BTRFS_MAX_LEVEL) {
e7a84565
CM
4345 if (!path->nodes[level])
4346 return 1;
4347
4348 slot = path->slots[level] + 1;
4349 c = path->nodes[level];
3f157a2f 4350next:
e7a84565 4351 if (slot >= btrfs_header_nritems(c)) {
33c66f43
YZ
4352 int ret;
4353 int orig_lowest;
4354 struct btrfs_key cur_key;
4355 if (level + 1 >= BTRFS_MAX_LEVEL ||
4356 !path->nodes[level + 1])
e7a84565 4357 return 1;
33c66f43 4358
6a9fb468 4359 if (path->locks[level + 1] || path->skip_locking) {
33c66f43
YZ
4360 level++;
4361 continue;
4362 }
4363
4364 slot = btrfs_header_nritems(c) - 1;
4365 if (level == 0)
4366 btrfs_item_key_to_cpu(c, &cur_key, slot);
4367 else
4368 btrfs_node_key_to_cpu(c, &cur_key, slot);
4369
4370 orig_lowest = path->lowest_level;
b3b4aa74 4371 btrfs_release_path(path);
33c66f43
YZ
4372 path->lowest_level = level;
4373 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4374 0, 0);
4375 path->lowest_level = orig_lowest;
4376 if (ret < 0)
4377 return ret;
4378
4379 c = path->nodes[level];
4380 slot = path->slots[level];
4381 if (ret == 0)
4382 slot++;
4383 goto next;
e7a84565 4384 }
33c66f43 4385
e7a84565
CM
4386 if (level == 0)
4387 btrfs_item_key_to_cpu(c, key, slot);
3f157a2f 4388 else {
3f157a2f
CM
4389 u64 gen = btrfs_node_ptr_generation(c, slot);
4390
3f157a2f
CM
4391 if (gen < min_trans) {
4392 slot++;
4393 goto next;
4394 }
e7a84565 4395 btrfs_node_key_to_cpu(c, key, slot);
3f157a2f 4396 }
e7a84565
CM
4397 return 0;
4398 }
4399 return 1;
4400}
4401
97571fd0 4402/*
925baedd 4403 * search the tree again to find a leaf with greater keys
0f70abe2
CM
4404 * returns 0 if it found something or 1 if there are no greater leaves.
4405 * returns < 0 on io errors.
97571fd0 4406 */
234b63a0 4407int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3d7806ec
JS
4408{
4409 return btrfs_next_old_leaf(root, path, 0);
4410}
4411
4412int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4413 u64 time_seq)
d97e63b6
CM
4414{
4415 int slot;
8e73f275 4416 int level;
5f39d397 4417 struct extent_buffer *c;
8e73f275 4418 struct extent_buffer *next;
925baedd
CM
4419 struct btrfs_key key;
4420 u32 nritems;
4421 int ret;
0e46318d 4422 int i;
925baedd
CM
4423
4424 nritems = btrfs_header_nritems(path->nodes[0]);
d397712b 4425 if (nritems == 0)
925baedd 4426 return 1;
925baedd 4427
8e73f275
CM
4428 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4429again:
4430 level = 1;
4431 next = NULL;
b3b4aa74 4432 btrfs_release_path(path);
8e73f275 4433
a2135011 4434 path->keep_locks = 1;
8e73f275 4435
3d7806ec
JS
4436 if (time_seq)
4437 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4438 else
4439 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
925baedd
CM
4440 path->keep_locks = 0;
4441
4442 if (ret < 0)
4443 return ret;
4444
a2135011 4445 nritems = btrfs_header_nritems(path->nodes[0]);
168fd7d2
CM
4446 /*
4447 * by releasing the path above we dropped all our locks. A balance
4448 * could have added more items next to the key that used to be
4449 * at the very end of the block. So, check again here and
4450 * advance the path if there are now more items available.
4451 */
a2135011 4452 if (nritems > 0 && path->slots[0] < nritems - 1) {
e457afec
YZ
4453 if (ret == 0)
4454 path->slots[0]++;
8e73f275 4455 ret = 0;
925baedd
CM
4456 goto done;
4457 }
0b43e04f
LB
4458 /*
4459 * So the above check misses one case:
4460 * - after releasing the path above, someone has removed the item that
4461 * used to be at the very end of the block, and balance between leafs
4462 * gets another one with bigger key.offset to replace it.
4463 *
4464 * This one should be returned as well, or we can get leaf corruption
4465 * later(esp. in __btrfs_drop_extents()).
4466 *
4467 * And a bit more explanation about this check,
4468 * with ret > 0, the key isn't found, the path points to the slot
4469 * where it should be inserted, so the path->slots[0] item must be the
4470 * bigger one.
4471 */
4472 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4473 ret = 0;
4474 goto done;
4475 }
d97e63b6 4476
d397712b 4477 while (level < BTRFS_MAX_LEVEL) {
8e73f275
CM
4478 if (!path->nodes[level]) {
4479 ret = 1;
4480 goto done;
4481 }
5f39d397 4482
d97e63b6
CM
4483 slot = path->slots[level] + 1;
4484 c = path->nodes[level];
5f39d397 4485 if (slot >= btrfs_header_nritems(c)) {
d97e63b6 4486 level++;
8e73f275
CM
4487 if (level == BTRFS_MAX_LEVEL) {
4488 ret = 1;
4489 goto done;
4490 }
d97e63b6
CM
4491 continue;
4492 }
5f39d397 4493
0e46318d
JB
4494
4495 /*
4496 * Our current level is where we're going to start from, and to
4497 * make sure lockdep doesn't complain we need to drop our locks
4498 * and nodes from 0 to our current level.
4499 */
4500 for (i = 0; i < level; i++) {
4501 if (path->locks[level]) {
4502 btrfs_tree_read_unlock(path->nodes[i]);
4503 path->locks[i] = 0;
4504 }
4505 free_extent_buffer(path->nodes[i]);
4506 path->nodes[i] = NULL;
925baedd 4507 }
5f39d397 4508
8e73f275 4509 next = c;
d07b8528 4510 ret = read_block_for_search(root, path, &next, level,
cda79c54 4511 slot, &key);
8e73f275
CM
4512 if (ret == -EAGAIN)
4513 goto again;
5f39d397 4514
76a05b35 4515 if (ret < 0) {
b3b4aa74 4516 btrfs_release_path(path);
76a05b35
CM
4517 goto done;
4518 }
4519
5cd57b2c 4520 if (!path->skip_locking) {
bd681513 4521 ret = btrfs_try_tree_read_lock(next);
d42244a0
JS
4522 if (!ret && time_seq) {
4523 /*
4524 * If we don't get the lock, we may be racing
4525 * with push_leaf_left, holding that lock while
4526 * itself waiting for the leaf we've currently
4527 * locked. To solve this situation, we give up
4528 * on our lock and cycle.
4529 */
cf538830 4530 free_extent_buffer(next);
d42244a0
JS
4531 btrfs_release_path(path);
4532 cond_resched();
4533 goto again;
4534 }
0e46318d
JB
4535 if (!ret)
4536 btrfs_tree_read_lock(next);
5cd57b2c 4537 }
d97e63b6
CM
4538 break;
4539 }
4540 path->slots[level] = slot;
d397712b 4541 while (1) {
d97e63b6 4542 level--;
d97e63b6
CM
4543 path->nodes[level] = next;
4544 path->slots[level] = 0;
a74a4b97 4545 if (!path->skip_locking)
ffeb03cf 4546 path->locks[level] = BTRFS_READ_LOCK;
d97e63b6
CM
4547 if (!level)
4548 break;
b4ce94de 4549
d07b8528 4550 ret = read_block_for_search(root, path, &next, level,
cda79c54 4551 0, &key);
8e73f275
CM
4552 if (ret == -EAGAIN)
4553 goto again;
4554
76a05b35 4555 if (ret < 0) {
b3b4aa74 4556 btrfs_release_path(path);
76a05b35
CM
4557 goto done;
4558 }
4559
ffeb03cf 4560 if (!path->skip_locking)
0e46318d 4561 btrfs_tree_read_lock(next);
d97e63b6 4562 }
8e73f275 4563 ret = 0;
925baedd 4564done:
f7c79f30 4565 unlock_up(path, 0, 1, 0, NULL);
8e73f275
CM
4566
4567 return ret;
d97e63b6 4568}
0b86a832 4569
3f157a2f
CM
4570/*
4571 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4572 * searching until it gets past min_objectid or finds an item of 'type'
4573 *
4574 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4575 */
0b86a832
CM
4576int btrfs_previous_item(struct btrfs_root *root,
4577 struct btrfs_path *path, u64 min_objectid,
4578 int type)
4579{
4580 struct btrfs_key found_key;
4581 struct extent_buffer *leaf;
e02119d5 4582 u32 nritems;
0b86a832
CM
4583 int ret;
4584
d397712b 4585 while (1) {
0b86a832
CM
4586 if (path->slots[0] == 0) {
4587 ret = btrfs_prev_leaf(root, path);
4588 if (ret != 0)
4589 return ret;
4590 } else {
4591 path->slots[0]--;
4592 }
4593 leaf = path->nodes[0];
e02119d5
CM
4594 nritems = btrfs_header_nritems(leaf);
4595 if (nritems == 0)
4596 return 1;
4597 if (path->slots[0] == nritems)
4598 path->slots[0]--;
4599
0b86a832 4600 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
e02119d5
CM
4601 if (found_key.objectid < min_objectid)
4602 break;
0a4eefbb
YZ
4603 if (found_key.type == type)
4604 return 0;
e02119d5
CM
4605 if (found_key.objectid == min_objectid &&
4606 found_key.type < type)
4607 break;
0b86a832
CM
4608 }
4609 return 1;
4610}
ade2e0b3
WS
4611
4612/*
4613 * search in extent tree to find a previous Metadata/Data extent item with
4614 * min objecitd.
4615 *
4616 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4617 */
4618int btrfs_previous_extent_item(struct btrfs_root *root,
4619 struct btrfs_path *path, u64 min_objectid)
4620{
4621 struct btrfs_key found_key;
4622 struct extent_buffer *leaf;
4623 u32 nritems;
4624 int ret;
4625
4626 while (1) {
4627 if (path->slots[0] == 0) {
ade2e0b3
WS
4628 ret = btrfs_prev_leaf(root, path);
4629 if (ret != 0)
4630 return ret;
4631 } else {
4632 path->slots[0]--;
4633 }
4634 leaf = path->nodes[0];
4635 nritems = btrfs_header_nritems(leaf);
4636 if (nritems == 0)
4637 return 1;
4638 if (path->slots[0] == nritems)
4639 path->slots[0]--;
4640
4641 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4642 if (found_key.objectid < min_objectid)
4643 break;
4644 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4645 found_key.type == BTRFS_METADATA_ITEM_KEY)
4646 return 0;
4647 if (found_key.objectid == min_objectid &&
4648 found_key.type < BTRFS_EXTENT_ITEM_KEY)
4649 break;
4650 }
4651 return 1;
4652}