Merge tag 'i2c-for-6.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa...
[linux-block.git] / fs / btrfs / ctree.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
6cbd5570 2/*
d352ac68 3 * Copyright (C) 2007,2008 Oracle. All rights reserved.
6cbd5570
CM
4 */
5
a6b6e75e 6#include <linux/sched.h>
5a0e3ad6 7#include <linux/slab.h>
bd989ba3 8#include <linux/rbtree.h>
adf02123 9#include <linux/mm.h>
e41d12f5 10#include <linux/error-injection.h>
9b569ea0 11#include "messages.h"
eb60ceac
CM
12#include "ctree.h"
13#include "disk-io.h"
7f5c1516 14#include "transaction.h"
5f39d397 15#include "print-tree.h"
925baedd 16#include "locking.h"
de37aa51 17#include "volumes.h"
f616f5cd 18#include "qgroup.h"
f3a84ccd 19#include "tree-mod-log.h"
88c602ab 20#include "tree-checker.h"
ec8eb376 21#include "fs.h"
ad1ac501 22#include "accessors.h"
a0231804 23#include "extent-tree.h"
67707479 24#include "relocation.h"
6bfd0ffa 25#include "file-item.h"
9a8dd150 26
226463d7
JB
27static struct kmem_cache *btrfs_path_cachep;
28
e089f05c
CM
29static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
30 *root, struct btrfs_path *path, int level);
310712b2
OS
31static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
32 const struct btrfs_key *ins_key, struct btrfs_path *path,
33 int data_size, int extend);
5f39d397 34static int push_node_left(struct btrfs_trans_handle *trans,
2ff7e61e 35 struct extent_buffer *dst,
971a1f66 36 struct extent_buffer *src, int empty);
5f39d397 37static int balance_node_right(struct btrfs_trans_handle *trans,
5f39d397
CM
38 struct extent_buffer *dst_buf,
39 struct extent_buffer *src_buf);
afe5fea7
TI
40static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41 int level, int slot);
d97e63b6 42
af024ed2
JT
43static const struct btrfs_csums {
44 u16 size;
59a0fcdb
DS
45 const char name[10];
46 const char driver[12];
af024ed2
JT
47} btrfs_csums[] = {
48 [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
3951e7f0 49 [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
3831bf00 50 [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
352ae07b
DS
51 [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
52 .driver = "blake2b-256" },
af024ed2
JT
53};
54
3a3178c7
JB
55/*
56 * The leaf data grows from end-to-front in the node. this returns the address
57 * of the start of the last item, which is the stop of the leaf data stack.
58 */
59static unsigned int leaf_data_end(const struct extent_buffer *leaf)
60{
61 u32 nr = btrfs_header_nritems(leaf);
62
63 if (nr == 0)
64 return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
65 return btrfs_item_offset(leaf, nr - 1);
66}
67
637e3b48
JB
68/*
69 * Move data in a @leaf (using memmove, safe for overlapping ranges).
70 *
71 * @leaf: leaf that we're doing a memmove on
72 * @dst_offset: item data offset we're moving to
73 * @src_offset: item data offset were' moving from
74 * @len: length of the data we're moving
75 *
76 * Wrapper around memmove_extent_buffer() that takes into account the header on
77 * the leaf. The btrfs_item offset's start directly after the header, so we
78 * have to adjust any offsets to account for the header in the leaf. This
79 * handles that math to simplify the callers.
80 */
81static inline void memmove_leaf_data(const struct extent_buffer *leaf,
82 unsigned long dst_offset,
83 unsigned long src_offset,
84 unsigned long len)
85{
8009adf3
JB
86 memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
87 btrfs_item_nr_offset(leaf, 0) + src_offset, len);
637e3b48
JB
88}
89
90/*
91 * Copy item data from @src into @dst at the given @offset.
92 *
93 * @dst: destination leaf that we're copying into
94 * @src: source leaf that we're copying from
95 * @dst_offset: item data offset we're copying to
96 * @src_offset: item data offset were' copying from
97 * @len: length of the data we're copying
98 *
99 * Wrapper around copy_extent_buffer() that takes into account the header on
100 * the leaf. The btrfs_item offset's start directly after the header, so we
101 * have to adjust any offsets to account for the header in the leaf. This
102 * handles that math to simplify the callers.
103 */
104static inline void copy_leaf_data(const struct extent_buffer *dst,
105 const struct extent_buffer *src,
106 unsigned long dst_offset,
107 unsigned long src_offset, unsigned long len)
108{
8009adf3
JB
109 copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
110 btrfs_item_nr_offset(src, 0) + src_offset, len);
637e3b48
JB
111}
112
113/*
114 * Move items in a @leaf (using memmove).
115 *
116 * @dst: destination leaf for the items
117 * @dst_item: the item nr we're copying into
118 * @src_item: the item nr we're copying from
119 * @nr_items: the number of items to copy
120 *
121 * Wrapper around memmove_extent_buffer() that does the math to get the
122 * appropriate offsets into the leaf from the item numbers.
123 */
124static inline void memmove_leaf_items(const struct extent_buffer *leaf,
125 int dst_item, int src_item, int nr_items)
126{
127 memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
128 btrfs_item_nr_offset(leaf, src_item),
129 nr_items * sizeof(struct btrfs_item));
130}
131
132/*
133 * Copy items from @src into @dst at the given @offset.
134 *
135 * @dst: destination leaf for the items
136 * @src: source leaf for the items
137 * @dst_item: the item nr we're copying into
138 * @src_item: the item nr we're copying from
139 * @nr_items: the number of items to copy
140 *
141 * Wrapper around copy_extent_buffer() that does the math to get the
142 * appropriate offsets into the leaf from the item numbers.
143 */
144static inline void copy_leaf_items(const struct extent_buffer *dst,
145 const struct extent_buffer *src,
146 int dst_item, int src_item, int nr_items)
147{
148 copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
149 btrfs_item_nr_offset(src, src_item),
150 nr_items * sizeof(struct btrfs_item));
151}
152
af024ed2
JT
153int btrfs_super_csum_size(const struct btrfs_super_block *s)
154{
155 u16 t = btrfs_super_csum_type(s);
156 /*
157 * csum type is validated at mount time
158 */
159 return btrfs_csums[t].size;
160}
161
162const char *btrfs_super_csum_name(u16 csum_type)
163{
164 /* csum type is validated at mount time */
165 return btrfs_csums[csum_type].name;
166}
167
b4e967be
DS
168/*
169 * Return driver name if defined, otherwise the name that's also a valid driver
170 * name
171 */
172const char *btrfs_super_csum_driver(u16 csum_type)
173{
174 /* csum type is validated at mount time */
59a0fcdb
DS
175 return btrfs_csums[csum_type].driver[0] ?
176 btrfs_csums[csum_type].driver :
b4e967be
DS
177 btrfs_csums[csum_type].name;
178}
179
604997b4 180size_t __attribute_const__ btrfs_get_num_csums(void)
f7cea56c
DS
181{
182 return ARRAY_SIZE(btrfs_csums);
183}
184
df24a2b9 185struct btrfs_path *btrfs_alloc_path(void)
2c90e5d6 186{
a4c853af
C
187 might_sleep();
188
e2c89907 189 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
2c90e5d6
CM
190}
191
d352ac68 192/* this also releases the path */
df24a2b9 193void btrfs_free_path(struct btrfs_path *p)
be0e5c09 194{
ff175d57
JJ
195 if (!p)
196 return;
b3b4aa74 197 btrfs_release_path(p);
df24a2b9 198 kmem_cache_free(btrfs_path_cachep, p);
be0e5c09
CM
199}
200
d352ac68
CM
201/*
202 * path release drops references on the extent buffers in the path
203 * and it drops any locks held by this path
204 *
205 * It is safe to call this on paths that no locks or extent buffers held.
206 */
b3b4aa74 207noinline void btrfs_release_path(struct btrfs_path *p)
eb60ceac
CM
208{
209 int i;
a2135011 210
234b63a0 211 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3f157a2f 212 p->slots[i] = 0;
eb60ceac 213 if (!p->nodes[i])
925baedd
CM
214 continue;
215 if (p->locks[i]) {
bd681513 216 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
925baedd
CM
217 p->locks[i] = 0;
218 }
5f39d397 219 free_extent_buffer(p->nodes[i]);
3f157a2f 220 p->nodes[i] = NULL;
eb60ceac
CM
221 }
222}
223
8bb808c6
DS
224/*
225 * We want the transaction abort to print stack trace only for errors where the
226 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
227 * caused by external factors.
228 */
229bool __cold abort_should_print_stack(int errno)
230{
231 switch (errno) {
232 case -EIO:
233 case -EROFS:
234 case -ENOMEM:
235 return false;
236 }
237 return true;
238}
239
d352ac68
CM
240/*
241 * safely gets a reference on the root node of a tree. A lock
242 * is not taken, so a concurrent writer may put a different node
243 * at the root of the tree. See btrfs_lock_root_node for the
244 * looping required.
245 *
246 * The extent buffer returned by this has a reference taken, so
247 * it won't disappear. It may stop being the root of the tree
248 * at any time because there are no locks held.
249 */
925baedd
CM
250struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
251{
252 struct extent_buffer *eb;
240f62c8 253
3083ee2e
JB
254 while (1) {
255 rcu_read_lock();
256 eb = rcu_dereference(root->node);
257
258 /*
259 * RCU really hurts here, we could free up the root node because
01327610 260 * it was COWed but we may not get the new root node yet so do
3083ee2e
JB
261 * the inc_not_zero dance and if it doesn't work then
262 * synchronize_rcu and try again.
263 */
264 if (atomic_inc_not_zero(&eb->refs)) {
265 rcu_read_unlock();
266 break;
267 }
268 rcu_read_unlock();
269 synchronize_rcu();
270 }
925baedd
CM
271 return eb;
272}
273
92a7cc42
QW
274/*
275 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
276 * just get put onto a simple dirty list. Transaction walks this list to make
277 * sure they get properly updated on disk.
d352ac68 278 */
0b86a832
CM
279static void add_root_to_dirty_list(struct btrfs_root *root)
280{
0b246afa
JM
281 struct btrfs_fs_info *fs_info = root->fs_info;
282
e7070be1
JB
283 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
284 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
285 return;
286
0b246afa 287 spin_lock(&fs_info->trans_lock);
e7070be1
JB
288 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
289 /* Want the extent tree to be the last on the list */
4fd786e6 290 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
e7070be1 291 list_move_tail(&root->dirty_list,
0b246afa 292 &fs_info->dirty_cowonly_roots);
e7070be1
JB
293 else
294 list_move(&root->dirty_list,
0b246afa 295 &fs_info->dirty_cowonly_roots);
0b86a832 296 }
0b246afa 297 spin_unlock(&fs_info->trans_lock);
0b86a832
CM
298}
299
d352ac68
CM
300/*
301 * used by snapshot creation to make a copy of a root for a tree with
302 * a given objectid. The buffer with the new root node is returned in
303 * cow_ret, and this func returns zero on success or a negative error code.
304 */
be20aa9d
CM
305int btrfs_copy_root(struct btrfs_trans_handle *trans,
306 struct btrfs_root *root,
307 struct extent_buffer *buf,
308 struct extent_buffer **cow_ret, u64 new_root_objectid)
309{
0b246afa 310 struct btrfs_fs_info *fs_info = root->fs_info;
be20aa9d 311 struct extent_buffer *cow;
be20aa9d
CM
312 int ret = 0;
313 int level;
5d4f98a2 314 struct btrfs_disk_key disk_key;
be20aa9d 315
92a7cc42 316 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
0b246afa 317 trans->transid != fs_info->running_transaction->transid);
92a7cc42 318 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
27cdeb70 319 trans->transid != root->last_trans);
be20aa9d
CM
320
321 level = btrfs_header_level(buf);
5d4f98a2
YZ
322 if (level == 0)
323 btrfs_item_key(buf, &disk_key, 0);
324 else
325 btrfs_node_key(buf, &disk_key, 0);
31840ae1 326
4d75f8a9 327 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
cf6f34aa
JB
328 &disk_key, level, buf->start, 0,
329 BTRFS_NESTING_NEW_ROOT);
5d4f98a2 330 if (IS_ERR(cow))
be20aa9d
CM
331 return PTR_ERR(cow);
332
58e8012c 333 copy_extent_buffer_full(cow, buf);
be20aa9d
CM
334 btrfs_set_header_bytenr(cow, cow->start);
335 btrfs_set_header_generation(cow, trans->transid);
5d4f98a2
YZ
336 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
337 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
338 BTRFS_HEADER_FLAG_RELOC);
339 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
340 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
341 else
342 btrfs_set_header_owner(cow, new_root_objectid);
be20aa9d 343
de37aa51 344 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
2b82032c 345
be20aa9d 346 WARN_ON(btrfs_header_generation(buf) > trans->transid);
5d4f98a2 347 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 348 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 349 else
e339a6b0 350 ret = btrfs_inc_ref(trans, root, cow, 0);
867ed321 351 if (ret) {
72c9925f
FM
352 btrfs_tree_unlock(cow);
353 free_extent_buffer(cow);
867ed321 354 btrfs_abort_transaction(trans, ret);
be20aa9d 355 return ret;
867ed321 356 }
be20aa9d
CM
357
358 btrfs_mark_buffer_dirty(cow);
359 *cow_ret = cow;
360 return 0;
361}
362
5d4f98a2
YZ
363/*
364 * check if the tree block can be shared by multiple trees
365 */
366int btrfs_block_can_be_shared(struct btrfs_root *root,
367 struct extent_buffer *buf)
368{
369 /*
92a7cc42
QW
370 * Tree blocks not in shareable trees and tree roots are never shared.
371 * If a block was allocated after the last snapshot and the block was
372 * not allocated by tree relocation, we know the block is not shared.
5d4f98a2 373 */
92a7cc42 374 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
5d4f98a2
YZ
375 buf != root->node && buf != root->commit_root &&
376 (btrfs_header_generation(buf) <=
377 btrfs_root_last_snapshot(&root->root_item) ||
378 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
379 return 1;
a79865c6 380
5d4f98a2
YZ
381 return 0;
382}
383
384static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
385 struct btrfs_root *root,
386 struct extent_buffer *buf,
f0486c68
YZ
387 struct extent_buffer *cow,
388 int *last_ref)
5d4f98a2 389{
0b246afa 390 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
391 u64 refs;
392 u64 owner;
393 u64 flags;
394 u64 new_flags = 0;
395 int ret;
396
397 /*
398 * Backrefs update rules:
399 *
400 * Always use full backrefs for extent pointers in tree block
401 * allocated by tree relocation.
402 *
403 * If a shared tree block is no longer referenced by its owner
404 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
405 * use full backrefs for extent pointers in tree block.
406 *
407 * If a tree block is been relocating
408 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
409 * use full backrefs for extent pointers in tree block.
410 * The reason for this is some operations (such as drop tree)
411 * are only allowed for blocks use full backrefs.
412 */
413
414 if (btrfs_block_can_be_shared(root, buf)) {
2ff7e61e 415 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
3173a18f
JB
416 btrfs_header_level(buf), 1,
417 &refs, &flags);
be1a5564
MF
418 if (ret)
419 return ret;
e5df9573
MF
420 if (refs == 0) {
421 ret = -EROFS;
0b246afa 422 btrfs_handle_fs_error(fs_info, ret, NULL);
e5df9573
MF
423 return ret;
424 }
5d4f98a2
YZ
425 } else {
426 refs = 1;
427 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
428 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
429 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
430 else
431 flags = 0;
432 }
433
434 owner = btrfs_header_owner(buf);
435 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
436 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
437
438 if (refs > 1) {
439 if ((owner == root->root_key.objectid ||
440 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
441 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
e339a6b0 442 ret = btrfs_inc_ref(trans, root, buf, 1);
692826b2
JM
443 if (ret)
444 return ret;
5d4f98a2
YZ
445
446 if (root->root_key.objectid ==
447 BTRFS_TREE_RELOC_OBJECTID) {
e339a6b0 448 ret = btrfs_dec_ref(trans, root, buf, 0);
692826b2
JM
449 if (ret)
450 return ret;
e339a6b0 451 ret = btrfs_inc_ref(trans, root, cow, 1);
692826b2
JM
452 if (ret)
453 return ret;
5d4f98a2
YZ
454 }
455 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
456 } else {
457
458 if (root->root_key.objectid ==
459 BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 460 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 461 else
e339a6b0 462 ret = btrfs_inc_ref(trans, root, cow, 0);
692826b2
JM
463 if (ret)
464 return ret;
5d4f98a2
YZ
465 }
466 if (new_flags != 0) {
b1c79e09
JB
467 int level = btrfs_header_level(buf);
468
42c9d0b5 469 ret = btrfs_set_disk_extent_flags(trans, buf,
2fe6a5a1 470 new_flags, level);
be1a5564
MF
471 if (ret)
472 return ret;
5d4f98a2
YZ
473 }
474 } else {
475 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
476 if (root->root_key.objectid ==
477 BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 478 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 479 else
e339a6b0 480 ret = btrfs_inc_ref(trans, root, cow, 0);
692826b2
JM
481 if (ret)
482 return ret;
e339a6b0 483 ret = btrfs_dec_ref(trans, root, buf, 1);
692826b2
JM
484 if (ret)
485 return ret;
5d4f98a2 486 }
190a8339 487 btrfs_clear_buffer_dirty(trans, buf);
f0486c68 488 *last_ref = 1;
5d4f98a2
YZ
489 }
490 return 0;
491}
492
d352ac68 493/*
d397712b
CM
494 * does the dirty work in cow of a single block. The parent block (if
495 * supplied) is updated to point to the new cow copy. The new buffer is marked
496 * dirty and returned locked. If you modify the block it needs to be marked
497 * dirty again.
d352ac68
CM
498 *
499 * search_start -- an allocation hint for the new block
500 *
d397712b
CM
501 * empty_size -- a hint that you plan on doing more cow. This is the size in
502 * bytes the allocator should try to find free next to the block it returns.
503 * This is just a hint and may be ignored by the allocator.
d352ac68 504 */
d397712b 505static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
5f39d397
CM
506 struct btrfs_root *root,
507 struct extent_buffer *buf,
508 struct extent_buffer *parent, int parent_slot,
509 struct extent_buffer **cow_ret,
9631e4cc
JB
510 u64 search_start, u64 empty_size,
511 enum btrfs_lock_nesting nest)
02217ed2 512{
0b246afa 513 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2 514 struct btrfs_disk_key disk_key;
5f39d397 515 struct extent_buffer *cow;
be1a5564 516 int level, ret;
f0486c68 517 int last_ref = 0;
925baedd 518 int unlock_orig = 0;
0f5053eb 519 u64 parent_start = 0;
7bb86316 520
925baedd
CM
521 if (*cow_ret == buf)
522 unlock_orig = 1;
523
49d0c642 524 btrfs_assert_tree_write_locked(buf);
925baedd 525
92a7cc42 526 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
0b246afa 527 trans->transid != fs_info->running_transaction->transid);
92a7cc42 528 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
27cdeb70 529 trans->transid != root->last_trans);
5f39d397 530
7bb86316 531 level = btrfs_header_level(buf);
31840ae1 532
5d4f98a2
YZ
533 if (level == 0)
534 btrfs_item_key(buf, &disk_key, 0);
535 else
536 btrfs_node_key(buf, &disk_key, 0);
537
0f5053eb
GR
538 if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
539 parent_start = parent->start;
5d4f98a2 540
79bd3712
FM
541 cow = btrfs_alloc_tree_block(trans, root, parent_start,
542 root->root_key.objectid, &disk_key, level,
543 search_start, empty_size, nest);
54aa1f4d
CM
544 if (IS_ERR(cow))
545 return PTR_ERR(cow);
6702ed49 546
b4ce94de
CM
547 /* cow is set to blocking by btrfs_init_new_buffer */
548
58e8012c 549 copy_extent_buffer_full(cow, buf);
db94535d 550 btrfs_set_header_bytenr(cow, cow->start);
5f39d397 551 btrfs_set_header_generation(cow, trans->transid);
5d4f98a2
YZ
552 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
553 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
554 BTRFS_HEADER_FLAG_RELOC);
555 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
556 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
557 else
558 btrfs_set_header_owner(cow, root->root_key.objectid);
6702ed49 559
de37aa51 560 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
2b82032c 561
be1a5564 562 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
b68dc2a9 563 if (ret) {
572c83ac
JB
564 btrfs_tree_unlock(cow);
565 free_extent_buffer(cow);
66642832 566 btrfs_abort_transaction(trans, ret);
b68dc2a9
MF
567 return ret;
568 }
1a40e23b 569
92a7cc42 570 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
83d4cfd4 571 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
93314e3b 572 if (ret) {
572c83ac
JB
573 btrfs_tree_unlock(cow);
574 free_extent_buffer(cow);
66642832 575 btrfs_abort_transaction(trans, ret);
83d4cfd4 576 return ret;
93314e3b 577 }
83d4cfd4 578 }
3fd0a558 579
02217ed2 580 if (buf == root->node) {
925baedd 581 WARN_ON(parent && parent != buf);
5d4f98a2
YZ
582 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
583 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
584 parent_start = buf->start;
925baedd 585
67439dad 586 atomic_inc(&cow->refs);
406808ab 587 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
d9d19a01 588 BUG_ON(ret < 0);
240f62c8 589 rcu_assign_pointer(root->node, cow);
925baedd 590
7a163608
FM
591 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
592 parent_start, last_ref);
5f39d397 593 free_extent_buffer(buf);
0b86a832 594 add_root_to_dirty_list(root);
02217ed2 595 } else {
5d4f98a2 596 WARN_ON(trans->transid != btrfs_header_generation(parent));
f3a84ccd 597 btrfs_tree_mod_log_insert_key(parent, parent_slot,
33cff222 598 BTRFS_MOD_LOG_KEY_REPLACE);
5f39d397 599 btrfs_set_node_blockptr(parent, parent_slot,
db94535d 600 cow->start);
74493f7a
CM
601 btrfs_set_node_ptr_generation(parent, parent_slot,
602 trans->transid);
d6025579 603 btrfs_mark_buffer_dirty(parent);
5de865ee 604 if (last_ref) {
f3a84ccd 605 ret = btrfs_tree_mod_log_free_eb(buf);
5de865ee 606 if (ret) {
572c83ac
JB
607 btrfs_tree_unlock(cow);
608 free_extent_buffer(cow);
66642832 609 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
610 return ret;
611 }
612 }
7a163608
FM
613 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
614 parent_start, last_ref);
02217ed2 615 }
925baedd
CM
616 if (unlock_orig)
617 btrfs_tree_unlock(buf);
3083ee2e 618 free_extent_buffer_stale(buf);
ccd467d6 619 btrfs_mark_buffer_dirty(cow);
2c90e5d6 620 *cow_ret = cow;
02217ed2
CM
621 return 0;
622}
623
5d4f98a2
YZ
624static inline int should_cow_block(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root,
626 struct extent_buffer *buf)
627{
f5ee5c9a 628 if (btrfs_is_testing(root->fs_info))
faa2dbf0 629 return 0;
fccb84c9 630
d1980131
DS
631 /* Ensure we can see the FORCE_COW bit */
632 smp_mb__before_atomic();
f1ebcc74
LB
633
634 /*
635 * We do not need to cow a block if
636 * 1) this block is not created or changed in this transaction;
637 * 2) this block does not belong to TREE_RELOC tree;
638 * 3) the root is not forced COW.
639 *
640 * What is forced COW:
01327610 641 * when we create snapshot during committing the transaction,
52042d8e 642 * after we've finished copying src root, we must COW the shared
f1ebcc74
LB
643 * block to ensure the metadata consistency.
644 */
5d4f98a2
YZ
645 if (btrfs_header_generation(buf) == trans->transid &&
646 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
647 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
f1ebcc74 648 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
27cdeb70 649 !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
5d4f98a2
YZ
650 return 0;
651 return 1;
652}
653
d352ac68
CM
654/*
655 * cows a single block, see __btrfs_cow_block for the real work.
01327610 656 * This version of it has extra checks so that a block isn't COWed more than
d352ac68
CM
657 * once per transaction, as long as it hasn't been written yet
658 */
d397712b 659noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
5f39d397
CM
660 struct btrfs_root *root, struct extent_buffer *buf,
661 struct extent_buffer *parent, int parent_slot,
9631e4cc
JB
662 struct extent_buffer **cow_ret,
663 enum btrfs_lock_nesting nest)
6702ed49 664{
0b246afa 665 struct btrfs_fs_info *fs_info = root->fs_info;
6702ed49 666 u64 search_start;
f510cfec 667 int ret;
dc17ff8f 668
83354f07
JB
669 if (test_bit(BTRFS_ROOT_DELETING, &root->state))
670 btrfs_err(fs_info,
671 "COW'ing blocks on a fs root that's being dropped");
672
0b246afa 673 if (trans->transaction != fs_info->running_transaction)
31b1a2bd 674 WARN(1, KERN_CRIT "trans %llu running %llu\n",
c1c9ff7c 675 trans->transid,
0b246afa 676 fs_info->running_transaction->transid);
31b1a2bd 677
0b246afa 678 if (trans->transid != fs_info->generation)
31b1a2bd 679 WARN(1, KERN_CRIT "trans %llu running %llu\n",
0b246afa 680 trans->transid, fs_info->generation);
dc17ff8f 681
5d4f98a2 682 if (!should_cow_block(trans, root, buf)) {
6702ed49
CM
683 *cow_ret = buf;
684 return 0;
685 }
c487685d 686
ee22184b 687 search_start = buf->start & ~((u64)SZ_1G - 1);
b4ce94de 688
f616f5cd
QW
689 /*
690 * Before CoWing this block for later modification, check if it's
691 * the subtree root and do the delayed subtree trace if needed.
692 *
693 * Also We don't care about the error, as it's handled internally.
694 */
695 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
f510cfec 696 ret = __btrfs_cow_block(trans, root, buf, parent,
9631e4cc 697 parent_slot, cow_ret, search_start, 0, nest);
1abe9b8a 698
699 trace_btrfs_cow_block(root, buf, *cow_ret);
700
f510cfec 701 return ret;
6702ed49 702}
f75e2b79 703ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
6702ed49 704
d352ac68
CM
705/*
706 * helper function for defrag to decide if two blocks pointed to by a
707 * node are actually close by
708 */
6b80053d 709static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
6702ed49 710{
6b80053d 711 if (blocknr < other && other - (blocknr + blocksize) < 32768)
6702ed49 712 return 1;
6b80053d 713 if (blocknr > other && blocknr - (other + blocksize) < 32768)
6702ed49
CM
714 return 1;
715 return 0;
716}
717
ce6ef5ab
DS
718#ifdef __LITTLE_ENDIAN
719
720/*
721 * Compare two keys, on little-endian the disk order is same as CPU order and
722 * we can avoid the conversion.
723 */
724static int comp_keys(const struct btrfs_disk_key *disk_key,
725 const struct btrfs_key *k2)
726{
727 const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
728
729 return btrfs_comp_cpu_keys(k1, k2);
730}
731
732#else
733
081e9573
CM
734/*
735 * compare two keys in a memcmp fashion
736 */
310712b2
OS
737static int comp_keys(const struct btrfs_disk_key *disk,
738 const struct btrfs_key *k2)
081e9573
CM
739{
740 struct btrfs_key k1;
741
742 btrfs_disk_key_to_cpu(&k1, disk);
743
20736aba 744 return btrfs_comp_cpu_keys(&k1, k2);
081e9573 745}
ce6ef5ab 746#endif
081e9573 747
f3465ca4
JB
748/*
749 * same as comp_keys only with two btrfs_key's
750 */
e1f60a65 751int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
f3465ca4
JB
752{
753 if (k1->objectid > k2->objectid)
754 return 1;
755 if (k1->objectid < k2->objectid)
756 return -1;
757 if (k1->type > k2->type)
758 return 1;
759 if (k1->type < k2->type)
760 return -1;
761 if (k1->offset > k2->offset)
762 return 1;
763 if (k1->offset < k2->offset)
764 return -1;
765 return 0;
766}
081e9573 767
d352ac68
CM
768/*
769 * this is used by the defrag code to go through all the
770 * leaves pointed to by a node and reallocate them so that
771 * disk order is close to key order
772 */
6702ed49 773int btrfs_realloc_node(struct btrfs_trans_handle *trans,
5f39d397 774 struct btrfs_root *root, struct extent_buffer *parent,
de78b51a 775 int start_slot, u64 *last_ret,
a6b6e75e 776 struct btrfs_key *progress)
6702ed49 777{
0b246afa 778 struct btrfs_fs_info *fs_info = root->fs_info;
6b80053d 779 struct extent_buffer *cur;
6702ed49 780 u64 blocknr;
e9d0b13b
CM
781 u64 search_start = *last_ret;
782 u64 last_block = 0;
6702ed49
CM
783 u64 other;
784 u32 parent_nritems;
6702ed49
CM
785 int end_slot;
786 int i;
787 int err = 0;
6b80053d 788 u32 blocksize;
081e9573
CM
789 int progress_passed = 0;
790 struct btrfs_disk_key disk_key;
6702ed49 791
0b246afa
JM
792 WARN_ON(trans->transaction != fs_info->running_transaction);
793 WARN_ON(trans->transid != fs_info->generation);
86479a04 794
6b80053d 795 parent_nritems = btrfs_header_nritems(parent);
0b246afa 796 blocksize = fs_info->nodesize;
5dfe2be7 797 end_slot = parent_nritems - 1;
6702ed49 798
5dfe2be7 799 if (parent_nritems <= 1)
6702ed49
CM
800 return 0;
801
5dfe2be7 802 for (i = start_slot; i <= end_slot; i++) {
6702ed49 803 int close = 1;
a6b6e75e 804
081e9573
CM
805 btrfs_node_key(parent, &disk_key, i);
806 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
807 continue;
808
809 progress_passed = 1;
6b80053d 810 blocknr = btrfs_node_blockptr(parent, i);
e9d0b13b
CM
811 if (last_block == 0)
812 last_block = blocknr;
5708b959 813
6702ed49 814 if (i > 0) {
6b80053d
CM
815 other = btrfs_node_blockptr(parent, i - 1);
816 close = close_blocks(blocknr, other, blocksize);
6702ed49 817 }
5dfe2be7 818 if (!close && i < end_slot) {
6b80053d
CM
819 other = btrfs_node_blockptr(parent, i + 1);
820 close = close_blocks(blocknr, other, blocksize);
6702ed49 821 }
e9d0b13b
CM
822 if (close) {
823 last_block = blocknr;
6702ed49 824 continue;
e9d0b13b 825 }
6702ed49 826
206983b7
JB
827 cur = btrfs_read_node_slot(parent, i);
828 if (IS_ERR(cur))
829 return PTR_ERR(cur);
e9d0b13b 830 if (search_start == 0)
6b80053d 831 search_start = last_block;
e9d0b13b 832
e7a84565 833 btrfs_tree_lock(cur);
6b80053d 834 err = __btrfs_cow_block(trans, root, cur, parent, i,
e7a84565 835 &cur, search_start,
6b80053d 836 min(16 * blocksize,
9631e4cc
JB
837 (end_slot - i) * blocksize),
838 BTRFS_NESTING_COW);
252c38f0 839 if (err) {
e7a84565 840 btrfs_tree_unlock(cur);
6b80053d 841 free_extent_buffer(cur);
6702ed49 842 break;
252c38f0 843 }
e7a84565
CM
844 search_start = cur->start;
845 last_block = cur->start;
f2183bde 846 *last_ret = search_start;
e7a84565
CM
847 btrfs_tree_unlock(cur);
848 free_extent_buffer(cur);
6702ed49
CM
849 }
850 return err;
851}
852
74123bd7 853/*
fb81212c 854 * Search for a key in the given extent_buffer.
5f39d397 855 *
a724f313 856 * The lower boundary for the search is specified by the slot number @first_slot.
fdf8d595
AJ
857 * Use a value of 0 to search over the whole extent buffer. Works for both
858 * leaves and nodes.
74123bd7 859 *
fb81212c
FM
860 * The slot in the extent buffer is returned via @slot. If the key exists in the
861 * extent buffer, then @slot will point to the slot where the key is, otherwise
862 * it points to the slot where you would insert the key.
863 *
864 * Slot may point to the total number of items (i.e. one position beyond the last
865 * key) if the key is bigger than the last key in the extent buffer.
74123bd7 866 */
fdf8d595
AJ
867int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
868 const struct btrfs_key *key, int *slot)
be0e5c09 869{
fb81212c
FM
870 unsigned long p;
871 int item_size;
a724f313
FM
872 /*
873 * Use unsigned types for the low and high slots, so that we get a more
874 * efficient division in the search loop below.
875 */
876 u32 low = first_slot;
877 u32 high = btrfs_header_nritems(eb);
be0e5c09 878 int ret;
5cd17f34 879 const int key_size = sizeof(struct btrfs_disk_key);
be0e5c09 880
a724f313 881 if (unlikely(low > high)) {
5e24e9af 882 btrfs_err(eb->fs_info,
a724f313 883 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
5e24e9af
LB
884 __func__, low, high, eb->start,
885 btrfs_header_owner(eb), btrfs_header_level(eb));
886 return -EINVAL;
887 }
888
fb81212c
FM
889 if (btrfs_header_level(eb) == 0) {
890 p = offsetof(struct btrfs_leaf, items);
891 item_size = sizeof(struct btrfs_item);
892 } else {
893 p = offsetof(struct btrfs_node, ptrs);
894 item_size = sizeof(struct btrfs_key_ptr);
895 }
896
d397712b 897 while (low < high) {
5cd17f34
DS
898 unsigned long oip;
899 unsigned long offset;
900 struct btrfs_disk_key *tmp;
901 struct btrfs_disk_key unaligned;
902 int mid;
903
be0e5c09 904 mid = (low + high) / 2;
5f39d397 905 offset = p + mid * item_size;
5cd17f34 906 oip = offset_in_page(offset);
5f39d397 907
5cd17f34 908 if (oip + key_size <= PAGE_SIZE) {
884b07d0 909 const unsigned long idx = get_eb_page_index(offset);
5cd17f34 910 char *kaddr = page_address(eb->pages[idx]);
5f39d397 911
884b07d0 912 oip = get_eb_offset_in_page(eb, offset);
5cd17f34 913 tmp = (struct btrfs_disk_key *)(kaddr + oip);
5f39d397 914 } else {
5cd17f34
DS
915 read_extent_buffer(eb, &unaligned, offset, key_size);
916 tmp = &unaligned;
5f39d397 917 }
5cd17f34 918
be0e5c09
CM
919 ret = comp_keys(tmp, key);
920
921 if (ret < 0)
922 low = mid + 1;
923 else if (ret > 0)
924 high = mid;
925 else {
926 *slot = mid;
927 return 0;
928 }
929 }
930 *slot = low;
931 return 1;
932}
933
f0486c68
YZ
934static void root_add_used(struct btrfs_root *root, u32 size)
935{
936 spin_lock(&root->accounting_lock);
937 btrfs_set_root_used(&root->root_item,
938 btrfs_root_used(&root->root_item) + size);
939 spin_unlock(&root->accounting_lock);
940}
941
942static void root_sub_used(struct btrfs_root *root, u32 size)
943{
944 spin_lock(&root->accounting_lock);
945 btrfs_set_root_used(&root->root_item,
946 btrfs_root_used(&root->root_item) - size);
947 spin_unlock(&root->accounting_lock);
948}
949
d352ac68
CM
950/* given a node and slot number, this reads the blocks it points to. The
951 * extent buffer is returned with a reference taken (but unlocked).
d352ac68 952 */
4b231ae4
DS
953struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
954 int slot)
bb803951 955{
ca7a79ad 956 int level = btrfs_header_level(parent);
789d6a3a 957 struct btrfs_tree_parent_check check = { 0 };
416bc658
JB
958 struct extent_buffer *eb;
959
fb770ae4
LB
960 if (slot < 0 || slot >= btrfs_header_nritems(parent))
961 return ERR_PTR(-ENOENT);
ca7a79ad 962
d4694728 963 ASSERT(level);
ca7a79ad 964
789d6a3a
QW
965 check.level = level - 1;
966 check.transid = btrfs_node_ptr_generation(parent, slot);
967 check.owner_root = btrfs_header_owner(parent);
968 check.has_first_key = true;
969 btrfs_node_key_to_cpu(parent, &check.first_key, slot);
970
d0d20b0f 971 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
789d6a3a 972 &check);
4eb150d6
QW
973 if (IS_ERR(eb))
974 return eb;
975 if (!extent_buffer_uptodate(eb)) {
fb770ae4 976 free_extent_buffer(eb);
4eb150d6 977 return ERR_PTR(-EIO);
416bc658
JB
978 }
979
980 return eb;
bb803951
CM
981}
982
d352ac68
CM
983/*
984 * node level balancing, used to make sure nodes are in proper order for
985 * item deletion. We balance from the top down, so we have to make sure
986 * that a deletion won't leave an node completely empty later on.
987 */
e02119d5 988static noinline int balance_level(struct btrfs_trans_handle *trans,
98ed5174
CM
989 struct btrfs_root *root,
990 struct btrfs_path *path, int level)
bb803951 991{
0b246afa 992 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
993 struct extent_buffer *right = NULL;
994 struct extent_buffer *mid;
995 struct extent_buffer *left = NULL;
996 struct extent_buffer *parent = NULL;
bb803951
CM
997 int ret = 0;
998 int wret;
999 int pslot;
bb803951 1000 int orig_slot = path->slots[level];
79f95c82 1001 u64 orig_ptr;
bb803951 1002
98e6b1eb 1003 ASSERT(level > 0);
bb803951 1004
5f39d397 1005 mid = path->nodes[level];
b4ce94de 1006
ac5887c8 1007 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
7bb86316
CM
1008 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1009
1d4f8a0c 1010 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
79f95c82 1011
a05a9bb1 1012 if (level < BTRFS_MAX_LEVEL - 1) {
5f39d397 1013 parent = path->nodes[level + 1];
a05a9bb1
LZ
1014 pslot = path->slots[level + 1];
1015 }
bb803951 1016
40689478
CM
1017 /*
1018 * deal with the case where there is only one pointer in the root
1019 * by promoting the node below to a root
1020 */
5f39d397
CM
1021 if (!parent) {
1022 struct extent_buffer *child;
bb803951 1023
5f39d397 1024 if (btrfs_header_nritems(mid) != 1)
bb803951
CM
1025 return 0;
1026
1027 /* promote the child to a root */
4b231ae4 1028 child = btrfs_read_node_slot(mid, 0);
fb770ae4
LB
1029 if (IS_ERR(child)) {
1030 ret = PTR_ERR(child);
0b246afa 1031 btrfs_handle_fs_error(fs_info, ret, NULL);
305a26af
MF
1032 goto enospc;
1033 }
1034
925baedd 1035 btrfs_tree_lock(child);
9631e4cc
JB
1036 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1037 BTRFS_NESTING_COW);
f0486c68
YZ
1038 if (ret) {
1039 btrfs_tree_unlock(child);
1040 free_extent_buffer(child);
1041 goto enospc;
1042 }
2f375ab9 1043
406808ab 1044 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
d9d19a01 1045 BUG_ON(ret < 0);
240f62c8 1046 rcu_assign_pointer(root->node, child);
925baedd 1047
0b86a832 1048 add_root_to_dirty_list(root);
925baedd 1049 btrfs_tree_unlock(child);
b4ce94de 1050
925baedd 1051 path->locks[level] = 0;
bb803951 1052 path->nodes[level] = NULL;
190a8339 1053 btrfs_clear_buffer_dirty(trans, mid);
925baedd 1054 btrfs_tree_unlock(mid);
bb803951 1055 /* once for the path */
5f39d397 1056 free_extent_buffer(mid);
f0486c68
YZ
1057
1058 root_sub_used(root, mid->len);
7a163608 1059 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
bb803951 1060 /* once for the root ptr */
3083ee2e 1061 free_extent_buffer_stale(mid);
f0486c68 1062 return 0;
bb803951 1063 }
5f39d397 1064 if (btrfs_header_nritems(mid) >
0b246afa 1065 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
bb803951
CM
1066 return 0;
1067
9cf14029
JB
1068 if (pslot) {
1069 left = btrfs_read_node_slot(parent, pslot - 1);
1070 if (IS_ERR(left)) {
1071 ret = PTR_ERR(left);
1072 left = NULL;
1073 goto enospc;
1074 }
fb770ae4 1075
bf77467a 1076 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
5f39d397 1077 wret = btrfs_cow_block(trans, root, left,
9631e4cc 1078 parent, pslot - 1, &left,
bf59a5a2 1079 BTRFS_NESTING_LEFT_COW);
54aa1f4d
CM
1080 if (wret) {
1081 ret = wret;
1082 goto enospc;
1083 }
2cc58cf2 1084 }
fb770ae4 1085
9cf14029
JB
1086 if (pslot + 1 < btrfs_header_nritems(parent)) {
1087 right = btrfs_read_node_slot(parent, pslot + 1);
1088 if (IS_ERR(right)) {
1089 ret = PTR_ERR(right);
1090 right = NULL;
1091 goto enospc;
1092 }
fb770ae4 1093
bf77467a 1094 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
5f39d397 1095 wret = btrfs_cow_block(trans, root, right,
9631e4cc 1096 parent, pslot + 1, &right,
bf59a5a2 1097 BTRFS_NESTING_RIGHT_COW);
2cc58cf2
CM
1098 if (wret) {
1099 ret = wret;
1100 goto enospc;
1101 }
1102 }
1103
1104 /* first, try to make some room in the middle buffer */
5f39d397
CM
1105 if (left) {
1106 orig_slot += btrfs_header_nritems(left);
d30a668f 1107 wret = push_node_left(trans, left, mid, 1);
79f95c82
CM
1108 if (wret < 0)
1109 ret = wret;
bb803951 1110 }
79f95c82
CM
1111
1112 /*
1113 * then try to empty the right most buffer into the middle
1114 */
5f39d397 1115 if (right) {
d30a668f 1116 wret = push_node_left(trans, mid, right, 1);
54aa1f4d 1117 if (wret < 0 && wret != -ENOSPC)
79f95c82 1118 ret = wret;
5f39d397 1119 if (btrfs_header_nritems(right) == 0) {
190a8339 1120 btrfs_clear_buffer_dirty(trans, right);
925baedd 1121 btrfs_tree_unlock(right);
afe5fea7 1122 del_ptr(root, path, level + 1, pslot + 1);
f0486c68 1123 root_sub_used(root, right->len);
7a163608
FM
1124 btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1125 0, 1);
3083ee2e 1126 free_extent_buffer_stale(right);
f0486c68 1127 right = NULL;
bb803951 1128 } else {
5f39d397
CM
1129 struct btrfs_disk_key right_key;
1130 btrfs_node_key(right, &right_key, 0);
f3a84ccd 1131 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
33cff222 1132 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1133 BUG_ON(ret < 0);
5f39d397
CM
1134 btrfs_set_node_key(parent, &right_key, pslot + 1);
1135 btrfs_mark_buffer_dirty(parent);
bb803951
CM
1136 }
1137 }
5f39d397 1138 if (btrfs_header_nritems(mid) == 1) {
79f95c82
CM
1139 /*
1140 * we're not allowed to leave a node with one item in the
1141 * tree during a delete. A deletion from lower in the tree
1142 * could try to delete the only pointer in this node.
1143 * So, pull some keys from the left.
1144 * There has to be a left pointer at this point because
1145 * otherwise we would have pulled some pointers from the
1146 * right
1147 */
305a26af
MF
1148 if (!left) {
1149 ret = -EROFS;
0b246afa 1150 btrfs_handle_fs_error(fs_info, ret, NULL);
305a26af
MF
1151 goto enospc;
1152 }
55d32ed8 1153 wret = balance_node_right(trans, mid, left);
54aa1f4d 1154 if (wret < 0) {
79f95c82 1155 ret = wret;
54aa1f4d
CM
1156 goto enospc;
1157 }
bce4eae9 1158 if (wret == 1) {
d30a668f 1159 wret = push_node_left(trans, left, mid, 1);
bce4eae9
CM
1160 if (wret < 0)
1161 ret = wret;
1162 }
79f95c82
CM
1163 BUG_ON(wret == 1);
1164 }
5f39d397 1165 if (btrfs_header_nritems(mid) == 0) {
190a8339 1166 btrfs_clear_buffer_dirty(trans, mid);
925baedd 1167 btrfs_tree_unlock(mid);
afe5fea7 1168 del_ptr(root, path, level + 1, pslot);
f0486c68 1169 root_sub_used(root, mid->len);
7a163608 1170 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
3083ee2e 1171 free_extent_buffer_stale(mid);
f0486c68 1172 mid = NULL;
79f95c82
CM
1173 } else {
1174 /* update the parent key to reflect our changes */
5f39d397
CM
1175 struct btrfs_disk_key mid_key;
1176 btrfs_node_key(mid, &mid_key, 0);
f3a84ccd 1177 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
33cff222 1178 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1179 BUG_ON(ret < 0);
5f39d397
CM
1180 btrfs_set_node_key(parent, &mid_key, pslot);
1181 btrfs_mark_buffer_dirty(parent);
79f95c82 1182 }
bb803951 1183
79f95c82 1184 /* update the path */
5f39d397
CM
1185 if (left) {
1186 if (btrfs_header_nritems(left) > orig_slot) {
67439dad 1187 atomic_inc(&left->refs);
925baedd 1188 /* left was locked after cow */
5f39d397 1189 path->nodes[level] = left;
bb803951
CM
1190 path->slots[level + 1] -= 1;
1191 path->slots[level] = orig_slot;
925baedd
CM
1192 if (mid) {
1193 btrfs_tree_unlock(mid);
5f39d397 1194 free_extent_buffer(mid);
925baedd 1195 }
bb803951 1196 } else {
5f39d397 1197 orig_slot -= btrfs_header_nritems(left);
bb803951
CM
1198 path->slots[level] = orig_slot;
1199 }
1200 }
79f95c82 1201 /* double check we haven't messed things up */
e20d96d6 1202 if (orig_ptr !=
5f39d397 1203 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
79f95c82 1204 BUG();
54aa1f4d 1205enospc:
925baedd
CM
1206 if (right) {
1207 btrfs_tree_unlock(right);
5f39d397 1208 free_extent_buffer(right);
925baedd
CM
1209 }
1210 if (left) {
1211 if (path->nodes[level] != left)
1212 btrfs_tree_unlock(left);
5f39d397 1213 free_extent_buffer(left);
925baedd 1214 }
bb803951
CM
1215 return ret;
1216}
1217
d352ac68
CM
1218/* Node balancing for insertion. Here we only split or push nodes around
1219 * when they are completely full. This is also done top down, so we
1220 * have to be pessimistic.
1221 */
d397712b 1222static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
98ed5174
CM
1223 struct btrfs_root *root,
1224 struct btrfs_path *path, int level)
e66f709b 1225{
0b246afa 1226 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
1227 struct extent_buffer *right = NULL;
1228 struct extent_buffer *mid;
1229 struct extent_buffer *left = NULL;
1230 struct extent_buffer *parent = NULL;
e66f709b
CM
1231 int ret = 0;
1232 int wret;
1233 int pslot;
1234 int orig_slot = path->slots[level];
e66f709b
CM
1235
1236 if (level == 0)
1237 return 1;
1238
5f39d397 1239 mid = path->nodes[level];
7bb86316 1240 WARN_ON(btrfs_header_generation(mid) != trans->transid);
e66f709b 1241
a05a9bb1 1242 if (level < BTRFS_MAX_LEVEL - 1) {
5f39d397 1243 parent = path->nodes[level + 1];
a05a9bb1
LZ
1244 pslot = path->slots[level + 1];
1245 }
e66f709b 1246
5f39d397 1247 if (!parent)
e66f709b 1248 return 1;
e66f709b 1249
e66f709b 1250 /* first, try to make some room in the middle buffer */
9cf14029 1251 if (pslot) {
e66f709b 1252 u32 left_nr;
925baedd 1253
9cf14029
JB
1254 left = btrfs_read_node_slot(parent, pslot - 1);
1255 if (IS_ERR(left))
1256 return PTR_ERR(left);
1257
bf77467a 1258 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
b4ce94de 1259
5f39d397 1260 left_nr = btrfs_header_nritems(left);
0b246afa 1261 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
33ade1f8
CM
1262 wret = 1;
1263 } else {
5f39d397 1264 ret = btrfs_cow_block(trans, root, left, parent,
9631e4cc 1265 pslot - 1, &left,
bf59a5a2 1266 BTRFS_NESTING_LEFT_COW);
54aa1f4d
CM
1267 if (ret)
1268 wret = 1;
1269 else {
d30a668f 1270 wret = push_node_left(trans, left, mid, 0);
54aa1f4d 1271 }
33ade1f8 1272 }
e66f709b
CM
1273 if (wret < 0)
1274 ret = wret;
1275 if (wret == 0) {
5f39d397 1276 struct btrfs_disk_key disk_key;
e66f709b 1277 orig_slot += left_nr;
5f39d397 1278 btrfs_node_key(mid, &disk_key, 0);
f3a84ccd 1279 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
33cff222 1280 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1281 BUG_ON(ret < 0);
5f39d397
CM
1282 btrfs_set_node_key(parent, &disk_key, pslot);
1283 btrfs_mark_buffer_dirty(parent);
1284 if (btrfs_header_nritems(left) > orig_slot) {
1285 path->nodes[level] = left;
e66f709b
CM
1286 path->slots[level + 1] -= 1;
1287 path->slots[level] = orig_slot;
925baedd 1288 btrfs_tree_unlock(mid);
5f39d397 1289 free_extent_buffer(mid);
e66f709b
CM
1290 } else {
1291 orig_slot -=
5f39d397 1292 btrfs_header_nritems(left);
e66f709b 1293 path->slots[level] = orig_slot;
925baedd 1294 btrfs_tree_unlock(left);
5f39d397 1295 free_extent_buffer(left);
e66f709b 1296 }
e66f709b
CM
1297 return 0;
1298 }
925baedd 1299 btrfs_tree_unlock(left);
5f39d397 1300 free_extent_buffer(left);
e66f709b 1301 }
e66f709b
CM
1302
1303 /*
1304 * then try to empty the right most buffer into the middle
1305 */
9cf14029 1306 if (pslot + 1 < btrfs_header_nritems(parent)) {
33ade1f8 1307 u32 right_nr;
b4ce94de 1308
9cf14029
JB
1309 right = btrfs_read_node_slot(parent, pslot + 1);
1310 if (IS_ERR(right))
1311 return PTR_ERR(right);
1312
bf77467a 1313 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
b4ce94de 1314
5f39d397 1315 right_nr = btrfs_header_nritems(right);
0b246afa 1316 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
33ade1f8
CM
1317 wret = 1;
1318 } else {
5f39d397
CM
1319 ret = btrfs_cow_block(trans, root, right,
1320 parent, pslot + 1,
bf59a5a2 1321 &right, BTRFS_NESTING_RIGHT_COW);
54aa1f4d
CM
1322 if (ret)
1323 wret = 1;
1324 else {
55d32ed8 1325 wret = balance_node_right(trans, right, mid);
54aa1f4d 1326 }
33ade1f8 1327 }
e66f709b
CM
1328 if (wret < 0)
1329 ret = wret;
1330 if (wret == 0) {
5f39d397
CM
1331 struct btrfs_disk_key disk_key;
1332
1333 btrfs_node_key(right, &disk_key, 0);
f3a84ccd 1334 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
33cff222 1335 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1336 BUG_ON(ret < 0);
5f39d397
CM
1337 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1338 btrfs_mark_buffer_dirty(parent);
1339
1340 if (btrfs_header_nritems(mid) <= orig_slot) {
1341 path->nodes[level] = right;
e66f709b
CM
1342 path->slots[level + 1] += 1;
1343 path->slots[level] = orig_slot -
5f39d397 1344 btrfs_header_nritems(mid);
925baedd 1345 btrfs_tree_unlock(mid);
5f39d397 1346 free_extent_buffer(mid);
e66f709b 1347 } else {
925baedd 1348 btrfs_tree_unlock(right);
5f39d397 1349 free_extent_buffer(right);
e66f709b 1350 }
e66f709b
CM
1351 return 0;
1352 }
925baedd 1353 btrfs_tree_unlock(right);
5f39d397 1354 free_extent_buffer(right);
e66f709b 1355 }
e66f709b
CM
1356 return 1;
1357}
1358
3c69faec 1359/*
d352ac68
CM
1360 * readahead one full node of leaves, finding things that are close
1361 * to the block in 'slot', and triggering ra on them.
3c69faec 1362 */
2ff7e61e 1363static void reada_for_search(struct btrfs_fs_info *fs_info,
c8c42864
CM
1364 struct btrfs_path *path,
1365 int level, int slot, u64 objectid)
3c69faec 1366{
5f39d397 1367 struct extent_buffer *node;
01f46658 1368 struct btrfs_disk_key disk_key;
3c69faec 1369 u32 nritems;
3c69faec 1370 u64 search;
a7175319 1371 u64 target;
6b80053d 1372 u64 nread = 0;
ace75066 1373 u64 nread_max;
6b80053d
CM
1374 u32 nr;
1375 u32 blocksize;
1376 u32 nscan = 0;
db94535d 1377
ace75066 1378 if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
6702ed49
CM
1379 return;
1380
1381 if (!path->nodes[level])
3c69faec
CM
1382 return;
1383
5f39d397 1384 node = path->nodes[level];
925baedd 1385
ace75066
FM
1386 /*
1387 * Since the time between visiting leaves is much shorter than the time
1388 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1389 * much IO at once (possibly random).
1390 */
1391 if (path->reada == READA_FORWARD_ALWAYS) {
1392 if (level > 1)
1393 nread_max = node->fs_info->nodesize;
1394 else
1395 nread_max = SZ_128K;
1396 } else {
1397 nread_max = SZ_64K;
1398 }
1399
3c69faec 1400 search = btrfs_node_blockptr(node, slot);
0b246afa 1401 blocksize = fs_info->nodesize;
069a2e37
FM
1402 if (path->reada != READA_FORWARD_ALWAYS) {
1403 struct extent_buffer *eb;
1404
1405 eb = find_extent_buffer(fs_info, search);
1406 if (eb) {
1407 free_extent_buffer(eb);
1408 return;
1409 }
3c69faec
CM
1410 }
1411
a7175319 1412 target = search;
6b80053d 1413
5f39d397 1414 nritems = btrfs_header_nritems(node);
6b80053d 1415 nr = slot;
25b8b936 1416
d397712b 1417 while (1) {
e4058b54 1418 if (path->reada == READA_BACK) {
6b80053d
CM
1419 if (nr == 0)
1420 break;
1421 nr--;
ace75066
FM
1422 } else if (path->reada == READA_FORWARD ||
1423 path->reada == READA_FORWARD_ALWAYS) {
6b80053d
CM
1424 nr++;
1425 if (nr >= nritems)
1426 break;
3c69faec 1427 }
e4058b54 1428 if (path->reada == READA_BACK && objectid) {
01f46658
CM
1429 btrfs_node_key(node, &disk_key, nr);
1430 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1431 break;
1432 }
6b80053d 1433 search = btrfs_node_blockptr(node, nr);
ace75066
FM
1434 if (path->reada == READA_FORWARD_ALWAYS ||
1435 (search <= target && target - search <= 65536) ||
a7175319 1436 (search > target && search - target <= 65536)) {
bfb484d9 1437 btrfs_readahead_node_child(node, nr);
6b80053d
CM
1438 nread += blocksize;
1439 }
1440 nscan++;
ace75066 1441 if (nread > nread_max || nscan > 32)
6b80053d 1442 break;
3c69faec
CM
1443 }
1444}
925baedd 1445
bfb484d9 1446static noinline void reada_for_balance(struct btrfs_path *path, int level)
b4ce94de 1447{
bfb484d9 1448 struct extent_buffer *parent;
b4ce94de
CM
1449 int slot;
1450 int nritems;
b4ce94de 1451
8c594ea8 1452 parent = path->nodes[level + 1];
b4ce94de 1453 if (!parent)
0b08851f 1454 return;
b4ce94de
CM
1455
1456 nritems = btrfs_header_nritems(parent);
8c594ea8 1457 slot = path->slots[level + 1];
b4ce94de 1458
bfb484d9
JB
1459 if (slot > 0)
1460 btrfs_readahead_node_child(parent, slot - 1);
1461 if (slot + 1 < nritems)
1462 btrfs_readahead_node_child(parent, slot + 1);
b4ce94de
CM
1463}
1464
1465
d352ac68 1466/*
d397712b
CM
1467 * when we walk down the tree, it is usually safe to unlock the higher layers
1468 * in the tree. The exceptions are when our path goes through slot 0, because
1469 * operations on the tree might require changing key pointers higher up in the
1470 * tree.
d352ac68 1471 *
d397712b
CM
1472 * callers might also have set path->keep_locks, which tells this code to keep
1473 * the lock if the path points to the last slot in the block. This is part of
1474 * walking through the tree, and selecting the next slot in the higher block.
d352ac68 1475 *
d397712b
CM
1476 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1477 * if lowest_unlock is 1, level 0 won't be unlocked
d352ac68 1478 */
e02119d5 1479static noinline void unlock_up(struct btrfs_path *path, int level,
f7c79f30
CM
1480 int lowest_unlock, int min_write_lock_level,
1481 int *write_lock_level)
925baedd
CM
1482{
1483 int i;
1484 int skip_level = level;
c1227996 1485 bool check_skip = true;
925baedd
CM
1486
1487 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1488 if (!path->nodes[i])
1489 break;
1490 if (!path->locks[i])
1491 break;
c1227996
NB
1492
1493 if (check_skip) {
1494 if (path->slots[i] == 0) {
925baedd
CM
1495 skip_level = i + 1;
1496 continue;
1497 }
c1227996
NB
1498
1499 if (path->keep_locks) {
1500 u32 nritems;
1501
1502 nritems = btrfs_header_nritems(path->nodes[i]);
1503 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1504 skip_level = i + 1;
1505 continue;
1506 }
1507 }
925baedd 1508 }
051e1b9f 1509
d80bb3f9 1510 if (i >= lowest_unlock && i > skip_level) {
c1227996
NB
1511 check_skip = false;
1512 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
925baedd 1513 path->locks[i] = 0;
f7c79f30
CM
1514 if (write_lock_level &&
1515 i > min_write_lock_level &&
1516 i <= *write_lock_level) {
1517 *write_lock_level = i - 1;
1518 }
925baedd
CM
1519 }
1520 }
1521}
1522
c8c42864 1523/*
376a21d7
FM
1524 * Helper function for btrfs_search_slot() and other functions that do a search
1525 * on a btree. The goal is to find a tree block in the cache (the radix tree at
1526 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1527 * its pages from disk.
c8c42864 1528 *
376a21d7
FM
1529 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1530 * whole btree search, starting again from the current root node.
c8c42864
CM
1531 */
1532static int
d07b8528
LB
1533read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1534 struct extent_buffer **eb_ret, int level, int slot,
cda79c54 1535 const struct btrfs_key *key)
c8c42864 1536{
0b246afa 1537 struct btrfs_fs_info *fs_info = root->fs_info;
789d6a3a 1538 struct btrfs_tree_parent_check check = { 0 };
c8c42864
CM
1539 u64 blocknr;
1540 u64 gen;
c8c42864 1541 struct extent_buffer *tmp;
76a05b35 1542 int ret;
581c1760 1543 int parent_level;
b246666e 1544 bool unlock_up;
c8c42864 1545
b246666e 1546 unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
213ff4b7
NB
1547 blocknr = btrfs_node_blockptr(*eb_ret, slot);
1548 gen = btrfs_node_ptr_generation(*eb_ret, slot);
1549 parent_level = btrfs_header_level(*eb_ret);
789d6a3a
QW
1550 btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
1551 check.has_first_key = true;
1552 check.level = parent_level - 1;
1553 check.transid = gen;
1554 check.owner_root = root->root_key.objectid;
c8c42864 1555
b246666e
FM
1556 /*
1557 * If we need to read an extent buffer from disk and we are holding locks
1558 * on upper level nodes, we unlock all the upper nodes before reading the
1559 * extent buffer, and then return -EAGAIN to the caller as it needs to
1560 * restart the search. We don't release the lock on the current level
1561 * because we need to walk this node to figure out which blocks to read.
1562 */
0b246afa 1563 tmp = find_extent_buffer(fs_info, blocknr);
cb44921a 1564 if (tmp) {
ace75066
FM
1565 if (p->reada == READA_FORWARD_ALWAYS)
1566 reada_for_search(fs_info, p, level, slot, key->objectid);
1567
b9fab919 1568 /* first we do an atomic uptodate check */
bdf7c00e 1569 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
448de471
QW
1570 /*
1571 * Do extra check for first_key, eb can be stale due to
1572 * being cached, read from scrub, or have multiple
1573 * parents (shared tree blocks).
1574 */
e064d5e9 1575 if (btrfs_verify_level_key(tmp,
789d6a3a 1576 parent_level - 1, &check.first_key, gen)) {
448de471
QW
1577 free_extent_buffer(tmp);
1578 return -EUCLEAN;
1579 }
bdf7c00e
JB
1580 *eb_ret = tmp;
1581 return 0;
1582 }
1583
857bc13f
JB
1584 if (p->nowait) {
1585 free_extent_buffer(tmp);
1586 return -EAGAIN;
1587 }
1588
b246666e
FM
1589 if (unlock_up)
1590 btrfs_unlock_up_safe(p, level + 1);
1591
bdf7c00e 1592 /* now we're allowed to do a blocking uptodate check */
789d6a3a 1593 ret = btrfs_read_extent_buffer(tmp, &check);
9a4ffa1b
QW
1594 if (ret) {
1595 free_extent_buffer(tmp);
1596 btrfs_release_path(p);
1597 return -EIO;
cb44921a 1598 }
88c602ab
QW
1599 if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1600 free_extent_buffer(tmp);
1601 btrfs_release_path(p);
1602 return -EUCLEAN;
1603 }
b246666e
FM
1604
1605 if (unlock_up)
1606 ret = -EAGAIN;
1607
1608 goto out;
857bc13f
JB
1609 } else if (p->nowait) {
1610 return -EAGAIN;
c8c42864
CM
1611 }
1612
b246666e 1613 if (unlock_up) {
4bb59055
FM
1614 btrfs_unlock_up_safe(p, level + 1);
1615 ret = -EAGAIN;
1616 } else {
1617 ret = 0;
1618 }
8c594ea8 1619
e4058b54 1620 if (p->reada != READA_NONE)
2ff7e61e 1621 reada_for_search(fs_info, p, level, slot, key->objectid);
c8c42864 1622
789d6a3a 1623 tmp = read_tree_block(fs_info, blocknr, &check);
4eb150d6
QW
1624 if (IS_ERR(tmp)) {
1625 btrfs_release_path(p);
1626 return PTR_ERR(tmp);
76a05b35 1627 }
4eb150d6
QW
1628 /*
1629 * If the read above didn't mark this buffer up to date,
1630 * it will never end up being up to date. Set ret to EIO now
1631 * and give up so that our caller doesn't loop forever
1632 * on our EAGAINs.
1633 */
1634 if (!extent_buffer_uptodate(tmp))
1635 ret = -EIO;
02a3307a 1636
b246666e 1637out:
4bb59055
FM
1638 if (ret == 0) {
1639 *eb_ret = tmp;
1640 } else {
1641 free_extent_buffer(tmp);
1642 btrfs_release_path(p);
1643 }
1644
76a05b35 1645 return ret;
c8c42864
CM
1646}
1647
1648/*
1649 * helper function for btrfs_search_slot. This does all of the checks
1650 * for node-level blocks and does any balancing required based on
1651 * the ins_len.
1652 *
1653 * If no extra work was required, zero is returned. If we had to
1654 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1655 * start over
1656 */
1657static int
1658setup_nodes_for_search(struct btrfs_trans_handle *trans,
1659 struct btrfs_root *root, struct btrfs_path *p,
bd681513
CM
1660 struct extent_buffer *b, int level, int ins_len,
1661 int *write_lock_level)
c8c42864 1662{
0b246afa 1663 struct btrfs_fs_info *fs_info = root->fs_info;
95b982de 1664 int ret = 0;
0b246afa 1665
c8c42864 1666 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
0b246afa 1667 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
c8c42864 1668
bd681513
CM
1669 if (*write_lock_level < level + 1) {
1670 *write_lock_level = level + 1;
1671 btrfs_release_path(p);
95b982de 1672 return -EAGAIN;
bd681513
CM
1673 }
1674
bfb484d9 1675 reada_for_balance(p, level);
95b982de 1676 ret = split_node(trans, root, p, level);
c8c42864 1677
c8c42864
CM
1678 b = p->nodes[level];
1679 } else if (ins_len < 0 && btrfs_header_nritems(b) <
0b246afa 1680 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
c8c42864 1681
bd681513
CM
1682 if (*write_lock_level < level + 1) {
1683 *write_lock_level = level + 1;
1684 btrfs_release_path(p);
95b982de 1685 return -EAGAIN;
bd681513
CM
1686 }
1687
bfb484d9 1688 reada_for_balance(p, level);
95b982de
NB
1689 ret = balance_level(trans, root, p, level);
1690 if (ret)
1691 return ret;
c8c42864 1692
c8c42864
CM
1693 b = p->nodes[level];
1694 if (!b) {
b3b4aa74 1695 btrfs_release_path(p);
95b982de 1696 return -EAGAIN;
c8c42864
CM
1697 }
1698 BUG_ON(btrfs_header_nritems(b) == 1);
1699 }
c8c42864
CM
1700 return ret;
1701}
1702
381cf658 1703int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
e33d5c3d
KN
1704 u64 iobjectid, u64 ioff, u8 key_type,
1705 struct btrfs_key *found_key)
1706{
1707 int ret;
1708 struct btrfs_key key;
1709 struct extent_buffer *eb;
381cf658
DS
1710
1711 ASSERT(path);
1d4c08e0 1712 ASSERT(found_key);
e33d5c3d
KN
1713
1714 key.type = key_type;
1715 key.objectid = iobjectid;
1716 key.offset = ioff;
1717
1718 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1d4c08e0 1719 if (ret < 0)
e33d5c3d
KN
1720 return ret;
1721
1722 eb = path->nodes[0];
1723 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1724 ret = btrfs_next_leaf(fs_root, path);
1725 if (ret)
1726 return ret;
1727 eb = path->nodes[0];
1728 }
1729
1730 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1731 if (found_key->type != key.type ||
1732 found_key->objectid != key.objectid)
1733 return 1;
1734
1735 return 0;
1736}
1737
1fc28d8e
LB
1738static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1739 struct btrfs_path *p,
1740 int write_lock_level)
1741{
1fc28d8e 1742 struct extent_buffer *b;
120de408 1743 int root_lock = 0;
1fc28d8e
LB
1744 int level = 0;
1745
1fc28d8e 1746 if (p->search_commit_root) {
d96b3424
FM
1747 b = root->commit_root;
1748 atomic_inc(&b->refs);
be6821f8 1749 level = btrfs_header_level(b);
f9ddfd05
LB
1750 /*
1751 * Ensure that all callers have set skip_locking when
1752 * p->search_commit_root = 1.
1753 */
1754 ASSERT(p->skip_locking == 1);
1fc28d8e
LB
1755
1756 goto out;
1757 }
1758
1759 if (p->skip_locking) {
1760 b = btrfs_root_node(root);
1761 level = btrfs_header_level(b);
1762 goto out;
1763 }
1764
120de408
JB
1765 /* We try very hard to do read locks on the root */
1766 root_lock = BTRFS_READ_LOCK;
1767
1fc28d8e 1768 /*
662c653b
LB
1769 * If the level is set to maximum, we can skip trying to get the read
1770 * lock.
1fc28d8e 1771 */
662c653b
LB
1772 if (write_lock_level < BTRFS_MAX_LEVEL) {
1773 /*
1774 * We don't know the level of the root node until we actually
1775 * have it read locked
1776 */
857bc13f
JB
1777 if (p->nowait) {
1778 b = btrfs_try_read_lock_root_node(root);
1779 if (IS_ERR(b))
1780 return b;
1781 } else {
1782 b = btrfs_read_lock_root_node(root);
1783 }
662c653b
LB
1784 level = btrfs_header_level(b);
1785 if (level > write_lock_level)
1786 goto out;
1787
1788 /* Whoops, must trade for write lock */
1789 btrfs_tree_read_unlock(b);
1790 free_extent_buffer(b);
1791 }
1fc28d8e 1792
1fc28d8e
LB
1793 b = btrfs_lock_root_node(root);
1794 root_lock = BTRFS_WRITE_LOCK;
1795
1796 /* The level might have changed, check again */
1797 level = btrfs_header_level(b);
1798
1799out:
120de408
JB
1800 /*
1801 * The root may have failed to write out at some point, and thus is no
1802 * longer valid, return an error in this case.
1803 */
1804 if (!extent_buffer_uptodate(b)) {
1805 if (root_lock)
1806 btrfs_tree_unlock_rw(b, root_lock);
1807 free_extent_buffer(b);
1808 return ERR_PTR(-EIO);
1809 }
1810
1fc28d8e
LB
1811 p->nodes[level] = b;
1812 if (!p->skip_locking)
1813 p->locks[level] = root_lock;
1814 /*
1815 * Callers are responsible for dropping b's references.
1816 */
1817 return b;
1818}
1819
d96b3424
FM
1820/*
1821 * Replace the extent buffer at the lowest level of the path with a cloned
1822 * version. The purpose is to be able to use it safely, after releasing the
1823 * commit root semaphore, even if relocation is happening in parallel, the
1824 * transaction used for relocation is committed and the extent buffer is
1825 * reallocated in the next transaction.
1826 *
1827 * This is used in a context where the caller does not prevent transaction
1828 * commits from happening, either by holding a transaction handle or holding
1829 * some lock, while it's doing searches through a commit root.
1830 * At the moment it's only used for send operations.
1831 */
1832static int finish_need_commit_sem_search(struct btrfs_path *path)
1833{
1834 const int i = path->lowest_level;
1835 const int slot = path->slots[i];
1836 struct extent_buffer *lowest = path->nodes[i];
1837 struct extent_buffer *clone;
1838
1839 ASSERT(path->need_commit_sem);
1840
1841 if (!lowest)
1842 return 0;
1843
1844 lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1845
1846 clone = btrfs_clone_extent_buffer(lowest);
1847 if (!clone)
1848 return -ENOMEM;
1849
1850 btrfs_release_path(path);
1851 path->nodes[i] = clone;
1852 path->slots[i] = slot;
1853
1854 return 0;
1855}
1fc28d8e 1856
e2e58d0f
FM
1857static inline int search_for_key_slot(struct extent_buffer *eb,
1858 int search_low_slot,
1859 const struct btrfs_key *key,
1860 int prev_cmp,
1861 int *slot)
1862{
1863 /*
1864 * If a previous call to btrfs_bin_search() on a parent node returned an
1865 * exact match (prev_cmp == 0), we can safely assume the target key will
1866 * always be at slot 0 on lower levels, since each key pointer
1867 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1868 * subtree it points to. Thus we can skip searching lower levels.
1869 */
1870 if (prev_cmp == 0) {
1871 *slot = 0;
1872 return 0;
1873 }
1874
fdf8d595 1875 return btrfs_bin_search(eb, search_low_slot, key, slot);
e2e58d0f
FM
1876}
1877
109324cf
FM
1878static int search_leaf(struct btrfs_trans_handle *trans,
1879 struct btrfs_root *root,
1880 const struct btrfs_key *key,
1881 struct btrfs_path *path,
1882 int ins_len,
1883 int prev_cmp)
1884{
1885 struct extent_buffer *leaf = path->nodes[0];
1886 int leaf_free_space = -1;
1887 int search_low_slot = 0;
1888 int ret;
1889 bool do_bin_search = true;
1890
1891 /*
1892 * If we are doing an insertion, the leaf has enough free space and the
1893 * destination slot for the key is not slot 0, then we can unlock our
1894 * write lock on the parent, and any other upper nodes, before doing the
1895 * binary search on the leaf (with search_for_key_slot()), allowing other
1896 * tasks to lock the parent and any other upper nodes.
1897 */
1898 if (ins_len > 0) {
1899 /*
1900 * Cache the leaf free space, since we will need it later and it
1901 * will not change until then.
1902 */
1903 leaf_free_space = btrfs_leaf_free_space(leaf);
1904
1905 /*
1906 * !path->locks[1] means we have a single node tree, the leaf is
1907 * the root of the tree.
1908 */
1909 if (path->locks[1] && leaf_free_space >= ins_len) {
1910 struct btrfs_disk_key first_key;
1911
1912 ASSERT(btrfs_header_nritems(leaf) > 0);
1913 btrfs_item_key(leaf, &first_key, 0);
1914
1915 /*
1916 * Doing the extra comparison with the first key is cheap,
1917 * taking into account that the first key is very likely
1918 * already in a cache line because it immediately follows
1919 * the extent buffer's header and we have recently accessed
1920 * the header's level field.
1921 */
1922 ret = comp_keys(&first_key, key);
1923 if (ret < 0) {
1924 /*
1925 * The first key is smaller than the key we want
1926 * to insert, so we are safe to unlock all upper
1927 * nodes and we have to do the binary search.
1928 *
1929 * We do use btrfs_unlock_up_safe() and not
1930 * unlock_up() because the later does not unlock
1931 * nodes with a slot of 0 - we can safely unlock
1932 * any node even if its slot is 0 since in this
1933 * case the key does not end up at slot 0 of the
1934 * leaf and there's no need to split the leaf.
1935 */
1936 btrfs_unlock_up_safe(path, 1);
1937 search_low_slot = 1;
1938 } else {
1939 /*
1940 * The first key is >= then the key we want to
1941 * insert, so we can skip the binary search as
1942 * the target key will be at slot 0.
1943 *
1944 * We can not unlock upper nodes when the key is
1945 * less than the first key, because we will need
1946 * to update the key at slot 0 of the parent node
1947 * and possibly of other upper nodes too.
1948 * If the key matches the first key, then we can
1949 * unlock all the upper nodes, using
1950 * btrfs_unlock_up_safe() instead of unlock_up()
1951 * as stated above.
1952 */
1953 if (ret == 0)
1954 btrfs_unlock_up_safe(path, 1);
1955 /*
1956 * ret is already 0 or 1, matching the result of
1957 * a btrfs_bin_search() call, so there is no need
1958 * to adjust it.
1959 */
1960 do_bin_search = false;
1961 path->slots[0] = 0;
1962 }
1963 }
1964 }
1965
1966 if (do_bin_search) {
1967 ret = search_for_key_slot(leaf, search_low_slot, key,
1968 prev_cmp, &path->slots[0]);
1969 if (ret < 0)
1970 return ret;
1971 }
1972
1973 if (ins_len > 0) {
1974 /*
1975 * Item key already exists. In this case, if we are allowed to
1976 * insert the item (for example, in dir_item case, item key
1977 * collision is allowed), it will be merged with the original
1978 * item. Only the item size grows, no new btrfs item will be
1979 * added. If search_for_extension is not set, ins_len already
1980 * accounts the size btrfs_item, deduct it here so leaf space
1981 * check will be correct.
1982 */
1983 if (ret == 0 && !path->search_for_extension) {
1984 ASSERT(ins_len >= sizeof(struct btrfs_item));
1985 ins_len -= sizeof(struct btrfs_item);
1986 }
1987
1988 ASSERT(leaf_free_space >= 0);
1989
1990 if (leaf_free_space < ins_len) {
1991 int err;
1992
1993 err = split_leaf(trans, root, key, path, ins_len,
1994 (ret == 0));
bb8e9a60
FM
1995 ASSERT(err <= 0);
1996 if (WARN_ON(err > 0))
1997 err = -EUCLEAN;
109324cf
FM
1998 if (err)
1999 ret = err;
2000 }
2001 }
2002
2003 return ret;
2004}
2005
74123bd7 2006/*
4271ecea
NB
2007 * btrfs_search_slot - look for a key in a tree and perform necessary
2008 * modifications to preserve tree invariants.
74123bd7 2009 *
4271ecea
NB
2010 * @trans: Handle of transaction, used when modifying the tree
2011 * @p: Holds all btree nodes along the search path
2012 * @root: The root node of the tree
2013 * @key: The key we are looking for
9a664971 2014 * @ins_len: Indicates purpose of search:
2015 * >0 for inserts it's size of item inserted (*)
2016 * <0 for deletions
2017 * 0 for plain searches, not modifying the tree
2018 *
2019 * (*) If size of item inserted doesn't include
2020 * sizeof(struct btrfs_item), then p->search_for_extension must
2021 * be set.
4271ecea
NB
2022 * @cow: boolean should CoW operations be performed. Must always be 1
2023 * when modifying the tree.
97571fd0 2024 *
4271ecea
NB
2025 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2026 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2027 *
2028 * If @key is found, 0 is returned and you can find the item in the leaf level
2029 * of the path (level 0)
2030 *
2031 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2032 * points to the slot where it should be inserted
2033 *
2034 * If an error is encountered while searching the tree a negative error number
2035 * is returned
74123bd7 2036 */
310712b2
OS
2037int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2038 const struct btrfs_key *key, struct btrfs_path *p,
2039 int ins_len, int cow)
be0e5c09 2040{
d96b3424 2041 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397 2042 struct extent_buffer *b;
be0e5c09
CM
2043 int slot;
2044 int ret;
33c66f43 2045 int err;
be0e5c09 2046 int level;
925baedd 2047 int lowest_unlock = 1;
bd681513
CM
2048 /* everything at write_lock_level or lower must be write locked */
2049 int write_lock_level = 0;
9f3a7427 2050 u8 lowest_level = 0;
f7c79f30 2051 int min_write_lock_level;
d7396f07 2052 int prev_cmp;
9f3a7427 2053
a4c853af
C
2054 might_sleep();
2055
6702ed49 2056 lowest_level = p->lowest_level;
323ac95b 2057 WARN_ON(lowest_level && ins_len > 0);
22b0ebda 2058 WARN_ON(p->nodes[0] != NULL);
eb653de1 2059 BUG_ON(!cow && ins_len);
25179201 2060
857bc13f
JB
2061 /*
2062 * For now only allow nowait for read only operations. There's no
2063 * strict reason why we can't, we just only need it for reads so it's
2064 * only implemented for reads.
2065 */
2066 ASSERT(!p->nowait || !cow);
2067
bd681513 2068 if (ins_len < 0) {
925baedd 2069 lowest_unlock = 2;
65b51a00 2070
bd681513
CM
2071 /* when we are removing items, we might have to go up to level
2072 * two as we update tree pointers Make sure we keep write
2073 * for those levels as well
2074 */
2075 write_lock_level = 2;
2076 } else if (ins_len > 0) {
2077 /*
2078 * for inserting items, make sure we have a write lock on
2079 * level 1 so we can update keys
2080 */
2081 write_lock_level = 1;
2082 }
2083
2084 if (!cow)
2085 write_lock_level = -1;
2086
09a2a8f9 2087 if (cow && (p->keep_locks || p->lowest_level))
bd681513
CM
2088 write_lock_level = BTRFS_MAX_LEVEL;
2089
f7c79f30
CM
2090 min_write_lock_level = write_lock_level;
2091
d96b3424
FM
2092 if (p->need_commit_sem) {
2093 ASSERT(p->search_commit_root);
857bc13f
JB
2094 if (p->nowait) {
2095 if (!down_read_trylock(&fs_info->commit_root_sem))
2096 return -EAGAIN;
2097 } else {
2098 down_read(&fs_info->commit_root_sem);
2099 }
d96b3424
FM
2100 }
2101
bb803951 2102again:
d7396f07 2103 prev_cmp = -1;
1fc28d8e 2104 b = btrfs_search_slot_get_root(root, p, write_lock_level);
be6821f8
FM
2105 if (IS_ERR(b)) {
2106 ret = PTR_ERR(b);
2107 goto done;
2108 }
925baedd 2109
eb60ceac 2110 while (b) {
f624d976
QW
2111 int dec = 0;
2112
5f39d397 2113 level = btrfs_header_level(b);
65b51a00 2114
02217ed2 2115 if (cow) {
9ea2c7c9
NB
2116 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2117
c8c42864
CM
2118 /*
2119 * if we don't really need to cow this block
2120 * then we don't want to set the path blocking,
2121 * so we test it here
2122 */
5963ffca 2123 if (!should_cow_block(trans, root, b))
65b51a00 2124 goto cow_done;
5d4f98a2 2125
bd681513
CM
2126 /*
2127 * must have write locks on this node and the
2128 * parent
2129 */
5124e00e
JB
2130 if (level > write_lock_level ||
2131 (level + 1 > write_lock_level &&
2132 level + 1 < BTRFS_MAX_LEVEL &&
2133 p->nodes[level + 1])) {
bd681513
CM
2134 write_lock_level = level + 1;
2135 btrfs_release_path(p);
2136 goto again;
2137 }
2138
9ea2c7c9
NB
2139 if (last_level)
2140 err = btrfs_cow_block(trans, root, b, NULL, 0,
9631e4cc
JB
2141 &b,
2142 BTRFS_NESTING_COW);
9ea2c7c9
NB
2143 else
2144 err = btrfs_cow_block(trans, root, b,
2145 p->nodes[level + 1],
9631e4cc
JB
2146 p->slots[level + 1], &b,
2147 BTRFS_NESTING_COW);
33c66f43 2148 if (err) {
33c66f43 2149 ret = err;
65b51a00 2150 goto done;
54aa1f4d 2151 }
02217ed2 2152 }
65b51a00 2153cow_done:
eb60ceac 2154 p->nodes[level] = b;
b4ce94de
CM
2155
2156 /*
2157 * we have a lock on b and as long as we aren't changing
2158 * the tree, there is no way to for the items in b to change.
2159 * It is safe to drop the lock on our parent before we
2160 * go through the expensive btree search on b.
2161 *
eb653de1
FDBM
2162 * If we're inserting or deleting (ins_len != 0), then we might
2163 * be changing slot zero, which may require changing the parent.
2164 * So, we can't drop the lock until after we know which slot
2165 * we're operating on.
b4ce94de 2166 */
eb653de1
FDBM
2167 if (!ins_len && !p->keep_locks) {
2168 int u = level + 1;
2169
2170 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2171 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2172 p->locks[u] = 0;
2173 }
2174 }
b4ce94de 2175
e2e58d0f 2176 if (level == 0) {
109324cf 2177 if (ins_len > 0)
e5e1c174 2178 ASSERT(write_lock_level >= 1);
bd681513 2179
109324cf 2180 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
459931ec 2181 if (!p->search_for_split)
f7c79f30 2182 unlock_up(p, level, lowest_unlock,
4b6f8e96 2183 min_write_lock_level, NULL);
65b51a00 2184 goto done;
be0e5c09 2185 }
e2e58d0f
FM
2186
2187 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2188 if (ret < 0)
2189 goto done;
2190 prev_cmp = ret;
2191
f624d976
QW
2192 if (ret && slot > 0) {
2193 dec = 1;
2194 slot--;
2195 }
2196 p->slots[level] = slot;
2197 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2198 &write_lock_level);
2199 if (err == -EAGAIN)
2200 goto again;
2201 if (err) {
2202 ret = err;
2203 goto done;
2204 }
2205 b = p->nodes[level];
2206 slot = p->slots[level];
2207
2208 /*
2209 * Slot 0 is special, if we change the key we have to update
2210 * the parent pointer which means we must have a write lock on
2211 * the parent
2212 */
2213 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2214 write_lock_level = level + 1;
2215 btrfs_release_path(p);
2216 goto again;
2217 }
2218
2219 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2220 &write_lock_level);
2221
2222 if (level == lowest_level) {
2223 if (dec)
2224 p->slots[level]++;
2225 goto done;
2226 }
2227
2228 err = read_block_for_search(root, p, &b, level, slot, key);
2229 if (err == -EAGAIN)
2230 goto again;
2231 if (err) {
2232 ret = err;
2233 goto done;
2234 }
2235
2236 if (!p->skip_locking) {
2237 level = btrfs_header_level(b);
b40130b2
JB
2238
2239 btrfs_maybe_reset_lockdep_class(root, b);
2240
f624d976 2241 if (level <= write_lock_level) {
ac5887c8 2242 btrfs_tree_lock(b);
f624d976
QW
2243 p->locks[level] = BTRFS_WRITE_LOCK;
2244 } else {
857bc13f
JB
2245 if (p->nowait) {
2246 if (!btrfs_try_tree_read_lock(b)) {
2247 free_extent_buffer(b);
2248 ret = -EAGAIN;
2249 goto done;
2250 }
2251 } else {
2252 btrfs_tree_read_lock(b);
2253 }
f624d976
QW
2254 p->locks[level] = BTRFS_READ_LOCK;
2255 }
2256 p->nodes[level] = b;
2257 }
be0e5c09 2258 }
65b51a00
CM
2259 ret = 1;
2260done:
5f5bc6b1 2261 if (ret < 0 && !p->skip_release_on_error)
b3b4aa74 2262 btrfs_release_path(p);
d96b3424
FM
2263
2264 if (p->need_commit_sem) {
2265 int ret2;
2266
2267 ret2 = finish_need_commit_sem_search(p);
2268 up_read(&fs_info->commit_root_sem);
2269 if (ret2)
2270 ret = ret2;
2271 }
2272
65b51a00 2273 return ret;
be0e5c09 2274}
f75e2b79 2275ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
be0e5c09 2276
5d9e75c4
JS
2277/*
2278 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2279 * current state of the tree together with the operations recorded in the tree
2280 * modification log to search for the key in a previous version of this tree, as
2281 * denoted by the time_seq parameter.
2282 *
2283 * Naturally, there is no support for insert, delete or cow operations.
2284 *
2285 * The resulting path and return value will be set up as if we called
2286 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2287 */
310712b2 2288int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
5d9e75c4
JS
2289 struct btrfs_path *p, u64 time_seq)
2290{
0b246afa 2291 struct btrfs_fs_info *fs_info = root->fs_info;
5d9e75c4
JS
2292 struct extent_buffer *b;
2293 int slot;
2294 int ret;
2295 int err;
2296 int level;
2297 int lowest_unlock = 1;
2298 u8 lowest_level = 0;
2299
2300 lowest_level = p->lowest_level;
2301 WARN_ON(p->nodes[0] != NULL);
c922b016 2302 ASSERT(!p->nowait);
5d9e75c4
JS
2303
2304 if (p->search_commit_root) {
2305 BUG_ON(time_seq);
2306 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2307 }
2308
2309again:
f3a84ccd 2310 b = btrfs_get_old_root(root, time_seq);
315bed43
NB
2311 if (!b) {
2312 ret = -EIO;
2313 goto done;
2314 }
5d9e75c4 2315 level = btrfs_header_level(b);
5d9e75c4
JS
2316 p->locks[level] = BTRFS_READ_LOCK;
2317
2318 while (b) {
abe9339d
QW
2319 int dec = 0;
2320
5d9e75c4
JS
2321 level = btrfs_header_level(b);
2322 p->nodes[level] = b;
5d9e75c4
JS
2323
2324 /*
2325 * we have a lock on b and as long as we aren't changing
2326 * the tree, there is no way to for the items in b to change.
2327 * It is safe to drop the lock on our parent before we
2328 * go through the expensive btree search on b.
2329 */
2330 btrfs_unlock_up_safe(p, level + 1);
2331
fdf8d595 2332 ret = btrfs_bin_search(b, 0, key, &slot);
cbca7d59
FM
2333 if (ret < 0)
2334 goto done;
5d9e75c4 2335
abe9339d 2336 if (level == 0) {
5d9e75c4
JS
2337 p->slots[level] = slot;
2338 unlock_up(p, level, lowest_unlock, 0, NULL);
abe9339d
QW
2339 goto done;
2340 }
5d9e75c4 2341
abe9339d
QW
2342 if (ret && slot > 0) {
2343 dec = 1;
2344 slot--;
2345 }
2346 p->slots[level] = slot;
2347 unlock_up(p, level, lowest_unlock, 0, NULL);
5d9e75c4 2348
abe9339d
QW
2349 if (level == lowest_level) {
2350 if (dec)
2351 p->slots[level]++;
2352 goto done;
2353 }
5d9e75c4 2354
abe9339d
QW
2355 err = read_block_for_search(root, p, &b, level, slot, key);
2356 if (err == -EAGAIN)
2357 goto again;
2358 if (err) {
2359 ret = err;
5d9e75c4
JS
2360 goto done;
2361 }
abe9339d
QW
2362
2363 level = btrfs_header_level(b);
ac5887c8 2364 btrfs_tree_read_lock(b);
f3a84ccd 2365 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
abe9339d
QW
2366 if (!b) {
2367 ret = -ENOMEM;
2368 goto done;
2369 }
2370 p->locks[level] = BTRFS_READ_LOCK;
2371 p->nodes[level] = b;
5d9e75c4
JS
2372 }
2373 ret = 1;
2374done:
5d9e75c4
JS
2375 if (ret < 0)
2376 btrfs_release_path(p);
2377
2378 return ret;
2379}
2380
2f38b3e1
AJ
2381/*
2382 * helper to use instead of search slot if no exact match is needed but
2383 * instead the next or previous item should be returned.
2384 * When find_higher is true, the next higher item is returned, the next lower
2385 * otherwise.
2386 * When return_any and find_higher are both true, and no higher item is found,
2387 * return the next lower instead.
2388 * When return_any is true and find_higher is false, and no lower item is found,
2389 * return the next higher instead.
2390 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2391 * < 0 on error
2392 */
2393int btrfs_search_slot_for_read(struct btrfs_root *root,
310712b2
OS
2394 const struct btrfs_key *key,
2395 struct btrfs_path *p, int find_higher,
2396 int return_any)
2f38b3e1
AJ
2397{
2398 int ret;
2399 struct extent_buffer *leaf;
2400
2401again:
2402 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2403 if (ret <= 0)
2404 return ret;
2405 /*
2406 * a return value of 1 means the path is at the position where the
2407 * item should be inserted. Normally this is the next bigger item,
2408 * but in case the previous item is the last in a leaf, path points
2409 * to the first free slot in the previous leaf, i.e. at an invalid
2410 * item.
2411 */
2412 leaf = p->nodes[0];
2413
2414 if (find_higher) {
2415 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2416 ret = btrfs_next_leaf(root, p);
2417 if (ret <= 0)
2418 return ret;
2419 if (!return_any)
2420 return 1;
2421 /*
2422 * no higher item found, return the next
2423 * lower instead
2424 */
2425 return_any = 0;
2426 find_higher = 0;
2427 btrfs_release_path(p);
2428 goto again;
2429 }
2430 } else {
e6793769
AJ
2431 if (p->slots[0] == 0) {
2432 ret = btrfs_prev_leaf(root, p);
2433 if (ret < 0)
2434 return ret;
2435 if (!ret) {
23c6bf6a
FDBM
2436 leaf = p->nodes[0];
2437 if (p->slots[0] == btrfs_header_nritems(leaf))
2438 p->slots[0]--;
e6793769 2439 return 0;
2f38b3e1 2440 }
e6793769
AJ
2441 if (!return_any)
2442 return 1;
2443 /*
2444 * no lower item found, return the next
2445 * higher instead
2446 */
2447 return_any = 0;
2448 find_higher = 1;
2449 btrfs_release_path(p);
2450 goto again;
2451 } else {
2f38b3e1
AJ
2452 --p->slots[0];
2453 }
2454 }
2455 return 0;
2456}
2457
0ff40a91
MPS
2458/*
2459 * Execute search and call btrfs_previous_item to traverse backwards if the item
2460 * was not found.
2461 *
2462 * Return 0 if found, 1 if not found and < 0 if error.
2463 */
2464int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2465 struct btrfs_path *path)
2466{
2467 int ret;
2468
2469 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2470 if (ret > 0)
2471 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2472
2473 if (ret == 0)
2474 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2475
2476 return ret;
2477}
2478
43dd529a 2479/*
62142be3
GN
2480 * Search for a valid slot for the given path.
2481 *
2482 * @root: The root node of the tree.
2483 * @key: Will contain a valid item if found.
2484 * @path: The starting point to validate the slot.
2485 *
2486 * Return: 0 if the item is valid
2487 * 1 if not found
2488 * <0 if error.
2489 */
2490int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2491 struct btrfs_path *path)
2492{
524f14bb 2493 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
62142be3 2494 int ret;
62142be3 2495
524f14bb
FM
2496 ret = btrfs_next_leaf(root, path);
2497 if (ret)
2498 return ret;
62142be3 2499 }
524f14bb
FM
2500
2501 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
62142be3
GN
2502 return 0;
2503}
2504
74123bd7
CM
2505/*
2506 * adjust the pointers going up the tree, starting at level
2507 * making sure the right key of each node is points to 'key'.
2508 * This is used after shifting pointers to the left, so it stops
2509 * fixing up pointers when a given leaf/node is not in slot 0 of the
2510 * higher levels
aa5d6bed 2511 *
74123bd7 2512 */
b167fa91 2513static void fixup_low_keys(struct btrfs_path *path,
143bede5 2514 struct btrfs_disk_key *key, int level)
be0e5c09
CM
2515{
2516 int i;
5f39d397 2517 struct extent_buffer *t;
0e82bcfe 2518 int ret;
5f39d397 2519
234b63a0 2520 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
be0e5c09 2521 int tslot = path->slots[i];
0e82bcfe 2522
eb60ceac 2523 if (!path->nodes[i])
be0e5c09 2524 break;
5f39d397 2525 t = path->nodes[i];
f3a84ccd 2526 ret = btrfs_tree_mod_log_insert_key(t, tslot,
33cff222 2527 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 2528 BUG_ON(ret < 0);
5f39d397 2529 btrfs_set_node_key(t, key, tslot);
d6025579 2530 btrfs_mark_buffer_dirty(path->nodes[i]);
be0e5c09
CM
2531 if (tslot != 0)
2532 break;
2533 }
2534}
2535
31840ae1
ZY
2536/*
2537 * update item key.
2538 *
2539 * This function isn't completely safe. It's the caller's responsibility
2540 * that the new key won't break the order
2541 */
b7a0365e
DD
2542void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2543 struct btrfs_path *path,
310712b2 2544 const struct btrfs_key *new_key)
31840ae1
ZY
2545{
2546 struct btrfs_disk_key disk_key;
2547 struct extent_buffer *eb;
2548 int slot;
2549
2550 eb = path->nodes[0];
2551 slot = path->slots[0];
2552 if (slot > 0) {
2553 btrfs_item_key(eb, &disk_key, slot - 1);
7c15d410
QW
2554 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2555 btrfs_crit(fs_info,
2556 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2557 slot, btrfs_disk_key_objectid(&disk_key),
2558 btrfs_disk_key_type(&disk_key),
2559 btrfs_disk_key_offset(&disk_key),
2560 new_key->objectid, new_key->type,
2561 new_key->offset);
2562 btrfs_print_leaf(eb);
2563 BUG();
2564 }
31840ae1
ZY
2565 }
2566 if (slot < btrfs_header_nritems(eb) - 1) {
2567 btrfs_item_key(eb, &disk_key, slot + 1);
7c15d410
QW
2568 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2569 btrfs_crit(fs_info,
2570 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2571 slot, btrfs_disk_key_objectid(&disk_key),
2572 btrfs_disk_key_type(&disk_key),
2573 btrfs_disk_key_offset(&disk_key),
2574 new_key->objectid, new_key->type,
2575 new_key->offset);
2576 btrfs_print_leaf(eb);
2577 BUG();
2578 }
31840ae1
ZY
2579 }
2580
2581 btrfs_cpu_key_to_disk(&disk_key, new_key);
2582 btrfs_set_item_key(eb, &disk_key, slot);
2583 btrfs_mark_buffer_dirty(eb);
2584 if (slot == 0)
b167fa91 2585 fixup_low_keys(path, &disk_key, 1);
31840ae1
ZY
2586}
2587
d16c702f
QW
2588/*
2589 * Check key order of two sibling extent buffers.
2590 *
2591 * Return true if something is wrong.
2592 * Return false if everything is fine.
2593 *
2594 * Tree-checker only works inside one tree block, thus the following
2595 * corruption can not be detected by tree-checker:
2596 *
2597 * Leaf @left | Leaf @right
2598 * --------------------------------------------------------------
2599 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2600 *
2601 * Key f6 in leaf @left itself is valid, but not valid when the next
2602 * key in leaf @right is 7.
2603 * This can only be checked at tree block merge time.
2604 * And since tree checker has ensured all key order in each tree block
2605 * is correct, we only need to bother the last key of @left and the first
2606 * key of @right.
2607 */
2608static bool check_sibling_keys(struct extent_buffer *left,
2609 struct extent_buffer *right)
2610{
2611 struct btrfs_key left_last;
2612 struct btrfs_key right_first;
2613 int level = btrfs_header_level(left);
2614 int nr_left = btrfs_header_nritems(left);
2615 int nr_right = btrfs_header_nritems(right);
2616
2617 /* No key to check in one of the tree blocks */
2618 if (!nr_left || !nr_right)
2619 return false;
2620
2621 if (level) {
2622 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2623 btrfs_node_key_to_cpu(right, &right_first, 0);
2624 } else {
2625 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2626 btrfs_item_key_to_cpu(right, &right_first, 0);
2627 }
2628
2629 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2630 btrfs_crit(left->fs_info,
2631"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2632 left_last.objectid, left_last.type,
2633 left_last.offset, right_first.objectid,
2634 right_first.type, right_first.offset);
2635 return true;
2636 }
2637 return false;
2638}
2639
74123bd7
CM
2640/*
2641 * try to push data from one node into the next node left in the
79f95c82 2642 * tree.
aa5d6bed
CM
2643 *
2644 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2645 * error, and > 0 if there was no room in the left hand block.
74123bd7 2646 */
98ed5174 2647static int push_node_left(struct btrfs_trans_handle *trans,
2ff7e61e 2648 struct extent_buffer *dst,
971a1f66 2649 struct extent_buffer *src, int empty)
be0e5c09 2650{
d30a668f 2651 struct btrfs_fs_info *fs_info = trans->fs_info;
be0e5c09 2652 int push_items = 0;
bb803951
CM
2653 int src_nritems;
2654 int dst_nritems;
aa5d6bed 2655 int ret = 0;
be0e5c09 2656
5f39d397
CM
2657 src_nritems = btrfs_header_nritems(src);
2658 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2659 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
7bb86316
CM
2660 WARN_ON(btrfs_header_generation(src) != trans->transid);
2661 WARN_ON(btrfs_header_generation(dst) != trans->transid);
54aa1f4d 2662
bce4eae9 2663 if (!empty && src_nritems <= 8)
971a1f66
CM
2664 return 1;
2665
d397712b 2666 if (push_items <= 0)
be0e5c09
CM
2667 return 1;
2668
bce4eae9 2669 if (empty) {
971a1f66 2670 push_items = min(src_nritems, push_items);
bce4eae9
CM
2671 if (push_items < src_nritems) {
2672 /* leave at least 8 pointers in the node if
2673 * we aren't going to empty it
2674 */
2675 if (src_nritems - push_items < 8) {
2676 if (push_items <= 8)
2677 return 1;
2678 push_items -= 8;
2679 }
2680 }
2681 } else
2682 push_items = min(src_nritems - 8, push_items);
79f95c82 2683
d16c702f
QW
2684 /* dst is the left eb, src is the middle eb */
2685 if (check_sibling_keys(dst, src)) {
2686 ret = -EUCLEAN;
2687 btrfs_abort_transaction(trans, ret);
2688 return ret;
2689 }
f3a84ccd 2690 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
5de865ee 2691 if (ret) {
66642832 2692 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2693 return ret;
2694 }
5f39d397 2695 copy_extent_buffer(dst, src,
e23efd8e
JB
2696 btrfs_node_key_ptr_offset(dst, dst_nritems),
2697 btrfs_node_key_ptr_offset(src, 0),
d397712b 2698 push_items * sizeof(struct btrfs_key_ptr));
5f39d397 2699
bb803951 2700 if (push_items < src_nritems) {
57911b8b 2701 /*
f3a84ccd
FM
2702 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2703 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
57911b8b 2704 */
e23efd8e
JB
2705 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2706 btrfs_node_key_ptr_offset(src, push_items),
5f39d397
CM
2707 (src_nritems - push_items) *
2708 sizeof(struct btrfs_key_ptr));
2709 }
2710 btrfs_set_header_nritems(src, src_nritems - push_items);
2711 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2712 btrfs_mark_buffer_dirty(src);
2713 btrfs_mark_buffer_dirty(dst);
31840ae1 2714
79f95c82
CM
2715 return ret;
2716}
2717
2718/*
2719 * try to push data from one node into the next node right in the
2720 * tree.
2721 *
2722 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2723 * error, and > 0 if there was no room in the right hand block.
2724 *
2725 * this will only push up to 1/2 the contents of the left node over
2726 */
5f39d397 2727static int balance_node_right(struct btrfs_trans_handle *trans,
5f39d397
CM
2728 struct extent_buffer *dst,
2729 struct extent_buffer *src)
79f95c82 2730{
55d32ed8 2731 struct btrfs_fs_info *fs_info = trans->fs_info;
79f95c82
CM
2732 int push_items = 0;
2733 int max_push;
2734 int src_nritems;
2735 int dst_nritems;
2736 int ret = 0;
79f95c82 2737
7bb86316
CM
2738 WARN_ON(btrfs_header_generation(src) != trans->transid);
2739 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2740
5f39d397
CM
2741 src_nritems = btrfs_header_nritems(src);
2742 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2743 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
d397712b 2744 if (push_items <= 0)
79f95c82 2745 return 1;
bce4eae9 2746
d397712b 2747 if (src_nritems < 4)
bce4eae9 2748 return 1;
79f95c82
CM
2749
2750 max_push = src_nritems / 2 + 1;
2751 /* don't try to empty the node */
d397712b 2752 if (max_push >= src_nritems)
79f95c82 2753 return 1;
252c38f0 2754
79f95c82
CM
2755 if (max_push < push_items)
2756 push_items = max_push;
2757
d16c702f
QW
2758 /* dst is the right eb, src is the middle eb */
2759 if (check_sibling_keys(src, dst)) {
2760 ret = -EUCLEAN;
2761 btrfs_abort_transaction(trans, ret);
2762 return ret;
2763 }
f3a84ccd 2764 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
bf1d3425 2765 BUG_ON(ret < 0);
e23efd8e
JB
2766 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2767 btrfs_node_key_ptr_offset(dst, 0),
5f39d397
CM
2768 (dst_nritems) *
2769 sizeof(struct btrfs_key_ptr));
d6025579 2770
f3a84ccd
FM
2771 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2772 push_items);
5de865ee 2773 if (ret) {
66642832 2774 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2775 return ret;
2776 }
5f39d397 2777 copy_extent_buffer(dst, src,
e23efd8e
JB
2778 btrfs_node_key_ptr_offset(dst, 0),
2779 btrfs_node_key_ptr_offset(src, src_nritems - push_items),
d397712b 2780 push_items * sizeof(struct btrfs_key_ptr));
79f95c82 2781
5f39d397
CM
2782 btrfs_set_header_nritems(src, src_nritems - push_items);
2783 btrfs_set_header_nritems(dst, dst_nritems + push_items);
79f95c82 2784
5f39d397
CM
2785 btrfs_mark_buffer_dirty(src);
2786 btrfs_mark_buffer_dirty(dst);
31840ae1 2787
aa5d6bed 2788 return ret;
be0e5c09
CM
2789}
2790
97571fd0
CM
2791/*
2792 * helper function to insert a new root level in the tree.
2793 * A new node is allocated, and a single item is inserted to
2794 * point to the existing root
aa5d6bed
CM
2795 *
2796 * returns zero on success or < 0 on failure.
97571fd0 2797 */
d397712b 2798static noinline int insert_new_root(struct btrfs_trans_handle *trans,
5f39d397 2799 struct btrfs_root *root,
fdd99c72 2800 struct btrfs_path *path, int level)
5c680ed6 2801{
0b246afa 2802 struct btrfs_fs_info *fs_info = root->fs_info;
7bb86316 2803 u64 lower_gen;
5f39d397
CM
2804 struct extent_buffer *lower;
2805 struct extent_buffer *c;
925baedd 2806 struct extent_buffer *old;
5f39d397 2807 struct btrfs_disk_key lower_key;
d9d19a01 2808 int ret;
5c680ed6
CM
2809
2810 BUG_ON(path->nodes[level]);
2811 BUG_ON(path->nodes[level-1] != root->node);
2812
7bb86316
CM
2813 lower = path->nodes[level-1];
2814 if (level == 1)
2815 btrfs_item_key(lower, &lower_key, 0);
2816 else
2817 btrfs_node_key(lower, &lower_key, 0);
2818
79bd3712
FM
2819 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2820 &lower_key, level, root->node->start, 0,
2821 BTRFS_NESTING_NEW_ROOT);
5f39d397
CM
2822 if (IS_ERR(c))
2823 return PTR_ERR(c);
925baedd 2824
0b246afa 2825 root_add_used(root, fs_info->nodesize);
f0486c68 2826
5f39d397 2827 btrfs_set_header_nritems(c, 1);
5f39d397 2828 btrfs_set_node_key(c, &lower_key, 0);
db94535d 2829 btrfs_set_node_blockptr(c, 0, lower->start);
7bb86316 2830 lower_gen = btrfs_header_generation(lower);
31840ae1 2831 WARN_ON(lower_gen != trans->transid);
7bb86316
CM
2832
2833 btrfs_set_node_ptr_generation(c, 0, lower_gen);
d5719762 2834
5f39d397 2835 btrfs_mark_buffer_dirty(c);
d5719762 2836
925baedd 2837 old = root->node;
406808ab 2838 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
d9d19a01 2839 BUG_ON(ret < 0);
240f62c8 2840 rcu_assign_pointer(root->node, c);
925baedd
CM
2841
2842 /* the super has an extra ref to root->node */
2843 free_extent_buffer(old);
2844
0b86a832 2845 add_root_to_dirty_list(root);
67439dad 2846 atomic_inc(&c->refs);
5f39d397 2847 path->nodes[level] = c;
ac5887c8 2848 path->locks[level] = BTRFS_WRITE_LOCK;
5c680ed6
CM
2849 path->slots[level] = 0;
2850 return 0;
2851}
2852
74123bd7
CM
2853/*
2854 * worker function to insert a single pointer in a node.
2855 * the node should have enough room for the pointer already
97571fd0 2856 *
74123bd7
CM
2857 * slot and level indicate where you want the key to go, and
2858 * blocknr is the block the key points to.
2859 */
143bede5 2860static void insert_ptr(struct btrfs_trans_handle *trans,
6ad3cf6d 2861 struct btrfs_path *path,
143bede5 2862 struct btrfs_disk_key *key, u64 bytenr,
c3e06965 2863 int slot, int level)
74123bd7 2864{
5f39d397 2865 struct extent_buffer *lower;
74123bd7 2866 int nritems;
f3ea38da 2867 int ret;
5c680ed6
CM
2868
2869 BUG_ON(!path->nodes[level]);
49d0c642 2870 btrfs_assert_tree_write_locked(path->nodes[level]);
5f39d397
CM
2871 lower = path->nodes[level];
2872 nritems = btrfs_header_nritems(lower);
c293498b 2873 BUG_ON(slot > nritems);
6ad3cf6d 2874 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
74123bd7 2875 if (slot != nritems) {
bf1d3425 2876 if (level) {
f3a84ccd
FM
2877 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2878 slot, nritems - slot);
bf1d3425
DS
2879 BUG_ON(ret < 0);
2880 }
5f39d397 2881 memmove_extent_buffer(lower,
e23efd8e
JB
2882 btrfs_node_key_ptr_offset(lower, slot + 1),
2883 btrfs_node_key_ptr_offset(lower, slot),
d6025579 2884 (nritems - slot) * sizeof(struct btrfs_key_ptr));
74123bd7 2885 }
c3e06965 2886 if (level) {
f3a84ccd 2887 ret = btrfs_tree_mod_log_insert_key(lower, slot,
33cff222 2888 BTRFS_MOD_LOG_KEY_ADD);
f3ea38da
JS
2889 BUG_ON(ret < 0);
2890 }
5f39d397 2891 btrfs_set_node_key(lower, key, slot);
db94535d 2892 btrfs_set_node_blockptr(lower, slot, bytenr);
74493f7a
CM
2893 WARN_ON(trans->transid == 0);
2894 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
5f39d397
CM
2895 btrfs_set_header_nritems(lower, nritems + 1);
2896 btrfs_mark_buffer_dirty(lower);
74123bd7
CM
2897}
2898
97571fd0
CM
2899/*
2900 * split the node at the specified level in path in two.
2901 * The path is corrected to point to the appropriate node after the split
2902 *
2903 * Before splitting this tries to make some room in the node by pushing
2904 * left and right, if either one works, it returns right away.
aa5d6bed
CM
2905 *
2906 * returns 0 on success and < 0 on failure
97571fd0 2907 */
e02119d5
CM
2908static noinline int split_node(struct btrfs_trans_handle *trans,
2909 struct btrfs_root *root,
2910 struct btrfs_path *path, int level)
be0e5c09 2911{
0b246afa 2912 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
2913 struct extent_buffer *c;
2914 struct extent_buffer *split;
2915 struct btrfs_disk_key disk_key;
be0e5c09 2916 int mid;
5c680ed6 2917 int ret;
7518a238 2918 u32 c_nritems;
eb60ceac 2919
5f39d397 2920 c = path->nodes[level];
7bb86316 2921 WARN_ON(btrfs_header_generation(c) != trans->transid);
5f39d397 2922 if (c == root->node) {
d9abbf1c 2923 /*
90f8d62e
JS
2924 * trying to split the root, lets make a new one
2925 *
fdd99c72 2926 * tree mod log: We don't log_removal old root in
90f8d62e
JS
2927 * insert_new_root, because that root buffer will be kept as a
2928 * normal node. We are going to log removal of half of the
f3a84ccd
FM
2929 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2930 * holding a tree lock on the buffer, which is why we cannot
2931 * race with other tree_mod_log users.
d9abbf1c 2932 */
fdd99c72 2933 ret = insert_new_root(trans, root, path, level + 1);
5c680ed6
CM
2934 if (ret)
2935 return ret;
b3612421 2936 } else {
e66f709b 2937 ret = push_nodes_for_insert(trans, root, path, level);
5f39d397
CM
2938 c = path->nodes[level];
2939 if (!ret && btrfs_header_nritems(c) <
0b246afa 2940 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
e66f709b 2941 return 0;
54aa1f4d
CM
2942 if (ret < 0)
2943 return ret;
be0e5c09 2944 }
e66f709b 2945
5f39d397 2946 c_nritems = btrfs_header_nritems(c);
5d4f98a2
YZ
2947 mid = (c_nritems + 1) / 2;
2948 btrfs_node_key(c, &disk_key, mid);
7bb86316 2949
79bd3712
FM
2950 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2951 &disk_key, level, c->start, 0,
2952 BTRFS_NESTING_SPLIT);
5f39d397
CM
2953 if (IS_ERR(split))
2954 return PTR_ERR(split);
2955
0b246afa 2956 root_add_used(root, fs_info->nodesize);
bc877d28 2957 ASSERT(btrfs_header_level(c) == level);
54aa1f4d 2958
f3a84ccd 2959 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
5de865ee 2960 if (ret) {
66642832 2961 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2962 return ret;
2963 }
5f39d397 2964 copy_extent_buffer(split, c,
e23efd8e
JB
2965 btrfs_node_key_ptr_offset(split, 0),
2966 btrfs_node_key_ptr_offset(c, mid),
5f39d397
CM
2967 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2968 btrfs_set_header_nritems(split, c_nritems - mid);
2969 btrfs_set_header_nritems(c, mid);
aa5d6bed 2970
5f39d397
CM
2971 btrfs_mark_buffer_dirty(c);
2972 btrfs_mark_buffer_dirty(split);
2973
6ad3cf6d 2974 insert_ptr(trans, path, &disk_key, split->start,
c3e06965 2975 path->slots[level + 1] + 1, level + 1);
aa5d6bed 2976
5de08d7d 2977 if (path->slots[level] >= mid) {
5c680ed6 2978 path->slots[level] -= mid;
925baedd 2979 btrfs_tree_unlock(c);
5f39d397
CM
2980 free_extent_buffer(c);
2981 path->nodes[level] = split;
5c680ed6
CM
2982 path->slots[level + 1] += 1;
2983 } else {
925baedd 2984 btrfs_tree_unlock(split);
5f39d397 2985 free_extent_buffer(split);
be0e5c09 2986 }
d5286a92 2987 return 0;
be0e5c09
CM
2988}
2989
74123bd7
CM
2990/*
2991 * how many bytes are required to store the items in a leaf. start
2992 * and nr indicate which items in the leaf to check. This totals up the
2993 * space used both by the item structs and the item data
2994 */
5f39d397 2995static int leaf_space_used(struct extent_buffer *l, int start, int nr)
be0e5c09
CM
2996{
2997 int data_len;
5f39d397 2998 int nritems = btrfs_header_nritems(l);
d4dbff95 2999 int end = min(nritems, start + nr) - 1;
be0e5c09
CM
3000
3001 if (!nr)
3002 return 0;
3212fa14
JB
3003 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3004 data_len = data_len - btrfs_item_offset(l, end);
0783fcfc 3005 data_len += sizeof(struct btrfs_item) * nr;
d4dbff95 3006 WARN_ON(data_len < 0);
be0e5c09
CM
3007 return data_len;
3008}
3009
d4dbff95
CM
3010/*
3011 * The space between the end of the leaf items and
3012 * the start of the leaf data. IOW, how much room
3013 * the leaf has left for both items and data
3014 */
e902baac 3015noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
d4dbff95 3016{
e902baac 3017 struct btrfs_fs_info *fs_info = leaf->fs_info;
5f39d397
CM
3018 int nritems = btrfs_header_nritems(leaf);
3019 int ret;
0b246afa
JM
3020
3021 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
5f39d397 3022 if (ret < 0) {
0b246afa
JM
3023 btrfs_crit(fs_info,
3024 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3025 ret,
3026 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3027 leaf_space_used(leaf, 0, nritems), nritems);
5f39d397
CM
3028 }
3029 return ret;
d4dbff95
CM
3030}
3031
99d8f83c
CM
3032/*
3033 * min slot controls the lowest index we're willing to push to the
3034 * right. We'll push up to and including min_slot, but no lower
3035 */
ed25dab3
JB
3036static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3037 struct btrfs_path *path,
44871b1b
CM
3038 int data_size, int empty,
3039 struct extent_buffer *right,
99d8f83c
CM
3040 int free_space, u32 left_nritems,
3041 u32 min_slot)
00ec4c51 3042{
f72f0010 3043 struct btrfs_fs_info *fs_info = right->fs_info;
5f39d397 3044 struct extent_buffer *left = path->nodes[0];
44871b1b 3045 struct extent_buffer *upper = path->nodes[1];
cfed81a0 3046 struct btrfs_map_token token;
5f39d397 3047 struct btrfs_disk_key disk_key;
00ec4c51 3048 int slot;
34a38218 3049 u32 i;
00ec4c51
CM
3050 int push_space = 0;
3051 int push_items = 0;
34a38218 3052 u32 nr;
7518a238 3053 u32 right_nritems;
5f39d397 3054 u32 data_end;
db94535d 3055 u32 this_item_size;
00ec4c51 3056
34a38218
CM
3057 if (empty)
3058 nr = 0;
3059 else
99d8f83c 3060 nr = max_t(u32, 1, min_slot);
34a38218 3061
31840ae1 3062 if (path->slots[0] >= left_nritems)
87b29b20 3063 push_space += data_size;
31840ae1 3064
44871b1b 3065 slot = path->slots[1];
34a38218
CM
3066 i = left_nritems - 1;
3067 while (i >= nr) {
31840ae1
ZY
3068 if (!empty && push_items > 0) {
3069 if (path->slots[0] > i)
3070 break;
3071 if (path->slots[0] == i) {
e902baac
DS
3072 int space = btrfs_leaf_free_space(left);
3073
31840ae1
ZY
3074 if (space + push_space * 2 > free_space)
3075 break;
3076 }
3077 }
3078
00ec4c51 3079 if (path->slots[0] == i)
87b29b20 3080 push_space += data_size;
db94535d 3081
3212fa14 3082 this_item_size = btrfs_item_size(left, i);
74794207
JB
3083 if (this_item_size + sizeof(struct btrfs_item) +
3084 push_space > free_space)
00ec4c51 3085 break;
31840ae1 3086
00ec4c51 3087 push_items++;
74794207 3088 push_space += this_item_size + sizeof(struct btrfs_item);
34a38218
CM
3089 if (i == 0)
3090 break;
3091 i--;
db94535d 3092 }
5f39d397 3093
925baedd
CM
3094 if (push_items == 0)
3095 goto out_unlock;
5f39d397 3096
6c1500f2 3097 WARN_ON(!empty && push_items == left_nritems);
5f39d397 3098
00ec4c51 3099 /* push left to right */
5f39d397 3100 right_nritems = btrfs_header_nritems(right);
34a38218 3101
dc2e724e 3102 push_space = btrfs_item_data_end(left, left_nritems - push_items);
8f881e8c 3103 push_space -= leaf_data_end(left);
5f39d397 3104
00ec4c51 3105 /* make room in the right data area */
8f881e8c 3106 data_end = leaf_data_end(right);
637e3b48
JB
3107 memmove_leaf_data(right, data_end - push_space, data_end,
3108 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
5f39d397 3109
00ec4c51 3110 /* copy from the left data area */
637e3b48
JB
3111 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3112 leaf_data_end(left), push_space);
5f39d397 3113
637e3b48 3114 memmove_leaf_items(right, push_items, 0, right_nritems);
5f39d397 3115
00ec4c51 3116 /* copy the items from left to right */
637e3b48 3117 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
00ec4c51
CM
3118
3119 /* update the item pointers */
c82f823c 3120 btrfs_init_map_token(&token, right);
7518a238 3121 right_nritems += push_items;
5f39d397 3122 btrfs_set_header_nritems(right, right_nritems);
0b246afa 3123 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
7518a238 3124 for (i = 0; i < right_nritems; i++) {
3212fa14
JB
3125 push_space -= btrfs_token_item_size(&token, i);
3126 btrfs_set_token_item_offset(&token, i, push_space);
db94535d
CM
3127 }
3128
7518a238 3129 left_nritems -= push_items;
5f39d397 3130 btrfs_set_header_nritems(left, left_nritems);
00ec4c51 3131
34a38218
CM
3132 if (left_nritems)
3133 btrfs_mark_buffer_dirty(left);
f0486c68 3134 else
190a8339 3135 btrfs_clear_buffer_dirty(trans, left);
f0486c68 3136
5f39d397 3137 btrfs_mark_buffer_dirty(right);
a429e513 3138
5f39d397
CM
3139 btrfs_item_key(right, &disk_key, 0);
3140 btrfs_set_node_key(upper, &disk_key, slot + 1);
d6025579 3141 btrfs_mark_buffer_dirty(upper);
02217ed2 3142
00ec4c51 3143 /* then fixup the leaf pointer in the path */
7518a238
CM
3144 if (path->slots[0] >= left_nritems) {
3145 path->slots[0] -= left_nritems;
925baedd 3146 if (btrfs_header_nritems(path->nodes[0]) == 0)
190a8339 3147 btrfs_clear_buffer_dirty(trans, path->nodes[0]);
925baedd 3148 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
3149 free_extent_buffer(path->nodes[0]);
3150 path->nodes[0] = right;
00ec4c51
CM
3151 path->slots[1] += 1;
3152 } else {
925baedd 3153 btrfs_tree_unlock(right);
5f39d397 3154 free_extent_buffer(right);
00ec4c51
CM
3155 }
3156 return 0;
925baedd
CM
3157
3158out_unlock:
3159 btrfs_tree_unlock(right);
3160 free_extent_buffer(right);
3161 return 1;
00ec4c51 3162}
925baedd 3163
44871b1b
CM
3164/*
3165 * push some data in the path leaf to the right, trying to free up at
3166 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3167 *
3168 * returns 1 if the push failed because the other node didn't have enough
3169 * room, 0 if everything worked out and < 0 if there were major errors.
99d8f83c
CM
3170 *
3171 * this will push starting from min_slot to the end of the leaf. It won't
3172 * push any slot lower than min_slot
44871b1b
CM
3173 */
3174static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
3175 *root, struct btrfs_path *path,
3176 int min_data_size, int data_size,
3177 int empty, u32 min_slot)
44871b1b
CM
3178{
3179 struct extent_buffer *left = path->nodes[0];
3180 struct extent_buffer *right;
3181 struct extent_buffer *upper;
3182 int slot;
3183 int free_space;
3184 u32 left_nritems;
3185 int ret;
3186
3187 if (!path->nodes[1])
3188 return 1;
3189
3190 slot = path->slots[1];
3191 upper = path->nodes[1];
3192 if (slot >= btrfs_header_nritems(upper) - 1)
3193 return 1;
3194
49d0c642 3195 btrfs_assert_tree_write_locked(path->nodes[1]);
44871b1b 3196
4b231ae4 3197 right = btrfs_read_node_slot(upper, slot + 1);
fb770ae4 3198 if (IS_ERR(right))
9cf14029 3199 return PTR_ERR(right);
91ca338d 3200
bf77467a 3201 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
44871b1b 3202
e902baac 3203 free_space = btrfs_leaf_free_space(right);
44871b1b
CM
3204 if (free_space < data_size)
3205 goto out_unlock;
3206
44871b1b 3207 ret = btrfs_cow_block(trans, root, right, upper,
bf59a5a2 3208 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
44871b1b
CM
3209 if (ret)
3210 goto out_unlock;
3211
44871b1b
CM
3212 left_nritems = btrfs_header_nritems(left);
3213 if (left_nritems == 0)
3214 goto out_unlock;
3215
d16c702f
QW
3216 if (check_sibling_keys(left, right)) {
3217 ret = -EUCLEAN;
3218 btrfs_tree_unlock(right);
3219 free_extent_buffer(right);
3220 return ret;
3221 }
2ef1fed2
FDBM
3222 if (path->slots[0] == left_nritems && !empty) {
3223 /* Key greater than all keys in the leaf, right neighbor has
3224 * enough room for it and we're not emptying our leaf to delete
3225 * it, therefore use right neighbor to insert the new item and
52042d8e 3226 * no need to touch/dirty our left leaf. */
2ef1fed2
FDBM
3227 btrfs_tree_unlock(left);
3228 free_extent_buffer(left);
3229 path->nodes[0] = right;
3230 path->slots[0] = 0;
3231 path->slots[1]++;
3232 return 0;
3233 }
3234
ed25dab3
JB
3235 return __push_leaf_right(trans, path, min_data_size, empty, right,
3236 free_space, left_nritems, min_slot);
44871b1b
CM
3237out_unlock:
3238 btrfs_tree_unlock(right);
3239 free_extent_buffer(right);
3240 return 1;
3241}
3242
74123bd7
CM
3243/*
3244 * push some data in the path leaf to the left, trying to free up at
3245 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
3246 *
3247 * max_slot can put a limit on how far into the leaf we'll push items. The
3248 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3249 * items
74123bd7 3250 */
ed25dab3
JB
3251static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3252 struct btrfs_path *path, int data_size,
44871b1b 3253 int empty, struct extent_buffer *left,
99d8f83c
CM
3254 int free_space, u32 right_nritems,
3255 u32 max_slot)
be0e5c09 3256{
8087c193 3257 struct btrfs_fs_info *fs_info = left->fs_info;
5f39d397
CM
3258 struct btrfs_disk_key disk_key;
3259 struct extent_buffer *right = path->nodes[0];
be0e5c09 3260 int i;
be0e5c09
CM
3261 int push_space = 0;
3262 int push_items = 0;
7518a238 3263 u32 old_left_nritems;
34a38218 3264 u32 nr;
aa5d6bed 3265 int ret = 0;
db94535d
CM
3266 u32 this_item_size;
3267 u32 old_left_item_size;
cfed81a0
CM
3268 struct btrfs_map_token token;
3269
34a38218 3270 if (empty)
99d8f83c 3271 nr = min(right_nritems, max_slot);
34a38218 3272 else
99d8f83c 3273 nr = min(right_nritems - 1, max_slot);
34a38218
CM
3274
3275 for (i = 0; i < nr; i++) {
31840ae1
ZY
3276 if (!empty && push_items > 0) {
3277 if (path->slots[0] < i)
3278 break;
3279 if (path->slots[0] == i) {
e902baac
DS
3280 int space = btrfs_leaf_free_space(right);
3281
31840ae1
ZY
3282 if (space + push_space * 2 > free_space)
3283 break;
3284 }
3285 }
3286
be0e5c09 3287 if (path->slots[0] == i)
87b29b20 3288 push_space += data_size;
db94535d 3289
3212fa14 3290 this_item_size = btrfs_item_size(right, i);
74794207
JB
3291 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3292 free_space)
be0e5c09 3293 break;
db94535d 3294
be0e5c09 3295 push_items++;
74794207 3296 push_space += this_item_size + sizeof(struct btrfs_item);
db94535d
CM
3297 }
3298
be0e5c09 3299 if (push_items == 0) {
925baedd
CM
3300 ret = 1;
3301 goto out;
be0e5c09 3302 }
fae7f21c 3303 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
5f39d397 3304
be0e5c09 3305 /* push data from right to left */
637e3b48 3306 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
5f39d397 3307
0b246afa 3308 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3212fa14 3309 btrfs_item_offset(right, push_items - 1);
5f39d397 3310
637e3b48
JB
3311 copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3312 btrfs_item_offset(right, push_items - 1), push_space);
5f39d397 3313 old_left_nritems = btrfs_header_nritems(left);
87b29b20 3314 BUG_ON(old_left_nritems <= 0);
eb60ceac 3315
c82f823c 3316 btrfs_init_map_token(&token, left);
3212fa14 3317 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
0783fcfc 3318 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
5f39d397 3319 u32 ioff;
db94535d 3320
3212fa14
JB
3321 ioff = btrfs_token_item_offset(&token, i);
3322 btrfs_set_token_item_offset(&token, i,
cc4c13d5 3323 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
be0e5c09 3324 }
5f39d397 3325 btrfs_set_header_nritems(left, old_left_nritems + push_items);
be0e5c09
CM
3326
3327 /* fixup right node */
31b1a2bd
JL
3328 if (push_items > right_nritems)
3329 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
d397712b 3330 right_nritems);
34a38218
CM
3331
3332 if (push_items < right_nritems) {
3212fa14 3333 push_space = btrfs_item_offset(right, push_items - 1) -
8f881e8c 3334 leaf_data_end(right);
637e3b48
JB
3335 memmove_leaf_data(right,
3336 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3337 leaf_data_end(right), push_space);
3338
3339 memmove_leaf_items(right, 0, push_items,
3340 btrfs_header_nritems(right) - push_items);
34a38218 3341 }
c82f823c
DS
3342
3343 btrfs_init_map_token(&token, right);
eef1c494
Y
3344 right_nritems -= push_items;
3345 btrfs_set_header_nritems(right, right_nritems);
0b246afa 3346 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
5f39d397 3347 for (i = 0; i < right_nritems; i++) {
3212fa14
JB
3348 push_space = push_space - btrfs_token_item_size(&token, i);
3349 btrfs_set_token_item_offset(&token, i, push_space);
db94535d 3350 }
eb60ceac 3351
5f39d397 3352 btrfs_mark_buffer_dirty(left);
34a38218
CM
3353 if (right_nritems)
3354 btrfs_mark_buffer_dirty(right);
f0486c68 3355 else
190a8339 3356 btrfs_clear_buffer_dirty(trans, right);
098f59c2 3357
5f39d397 3358 btrfs_item_key(right, &disk_key, 0);
b167fa91 3359 fixup_low_keys(path, &disk_key, 1);
be0e5c09
CM
3360
3361 /* then fixup the leaf pointer in the path */
3362 if (path->slots[0] < push_items) {
3363 path->slots[0] += old_left_nritems;
925baedd 3364 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
3365 free_extent_buffer(path->nodes[0]);
3366 path->nodes[0] = left;
be0e5c09
CM
3367 path->slots[1] -= 1;
3368 } else {
925baedd 3369 btrfs_tree_unlock(left);
5f39d397 3370 free_extent_buffer(left);
be0e5c09
CM
3371 path->slots[0] -= push_items;
3372 }
eb60ceac 3373 BUG_ON(path->slots[0] < 0);
aa5d6bed 3374 return ret;
925baedd
CM
3375out:
3376 btrfs_tree_unlock(left);
3377 free_extent_buffer(left);
3378 return ret;
be0e5c09
CM
3379}
3380
44871b1b
CM
3381/*
3382 * push some data in the path leaf to the left, trying to free up at
3383 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
3384 *
3385 * max_slot can put a limit on how far into the leaf we'll push items. The
3386 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3387 * items
44871b1b
CM
3388 */
3389static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
3390 *root, struct btrfs_path *path, int min_data_size,
3391 int data_size, int empty, u32 max_slot)
44871b1b
CM
3392{
3393 struct extent_buffer *right = path->nodes[0];
3394 struct extent_buffer *left;
3395 int slot;
3396 int free_space;
3397 u32 right_nritems;
3398 int ret = 0;
3399
3400 slot = path->slots[1];
3401 if (slot == 0)
3402 return 1;
3403 if (!path->nodes[1])
3404 return 1;
3405
3406 right_nritems = btrfs_header_nritems(right);
3407 if (right_nritems == 0)
3408 return 1;
3409
49d0c642 3410 btrfs_assert_tree_write_locked(path->nodes[1]);
44871b1b 3411
4b231ae4 3412 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
fb770ae4 3413 if (IS_ERR(left))
9cf14029 3414 return PTR_ERR(left);
91ca338d 3415
bf77467a 3416 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
44871b1b 3417
e902baac 3418 free_space = btrfs_leaf_free_space(left);
44871b1b
CM
3419 if (free_space < data_size) {
3420 ret = 1;
3421 goto out;
3422 }
3423
44871b1b 3424 ret = btrfs_cow_block(trans, root, left,
9631e4cc 3425 path->nodes[1], slot - 1, &left,
bf59a5a2 3426 BTRFS_NESTING_LEFT_COW);
44871b1b
CM
3427 if (ret) {
3428 /* we hit -ENOSPC, but it isn't fatal here */
79787eaa
JM
3429 if (ret == -ENOSPC)
3430 ret = 1;
44871b1b
CM
3431 goto out;
3432 }
3433
d16c702f
QW
3434 if (check_sibling_keys(left, right)) {
3435 ret = -EUCLEAN;
3436 goto out;
3437 }
ed25dab3
JB
3438 return __push_leaf_left(trans, path, min_data_size, empty, left,
3439 free_space, right_nritems, max_slot);
44871b1b
CM
3440out:
3441 btrfs_tree_unlock(left);
3442 free_extent_buffer(left);
3443 return ret;
3444}
3445
3446/*
3447 * split the path's leaf in two, making sure there is at least data_size
3448 * available for the resulting leaf level of the path.
44871b1b 3449 */
143bede5 3450static noinline void copy_for_split(struct btrfs_trans_handle *trans,
143bede5
JM
3451 struct btrfs_path *path,
3452 struct extent_buffer *l,
3453 struct extent_buffer *right,
3454 int slot, int mid, int nritems)
44871b1b 3455{
94f94ad9 3456 struct btrfs_fs_info *fs_info = trans->fs_info;
44871b1b
CM
3457 int data_copy_size;
3458 int rt_data_off;
3459 int i;
44871b1b 3460 struct btrfs_disk_key disk_key;
cfed81a0
CM
3461 struct btrfs_map_token token;
3462
44871b1b
CM
3463 nritems = nritems - mid;
3464 btrfs_set_header_nritems(right, nritems);
dc2e724e 3465 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
44871b1b 3466
637e3b48 3467 copy_leaf_items(right, l, 0, mid, nritems);
44871b1b 3468
637e3b48
JB
3469 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3470 leaf_data_end(l), data_copy_size);
44871b1b 3471
dc2e724e 3472 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
44871b1b 3473
c82f823c 3474 btrfs_init_map_token(&token, right);
44871b1b 3475 for (i = 0; i < nritems; i++) {
44871b1b
CM
3476 u32 ioff;
3477
3212fa14
JB
3478 ioff = btrfs_token_item_offset(&token, i);
3479 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
44871b1b
CM
3480 }
3481
44871b1b 3482 btrfs_set_header_nritems(l, mid);
44871b1b 3483 btrfs_item_key(right, &disk_key, 0);
6ad3cf6d 3484 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
44871b1b
CM
3485
3486 btrfs_mark_buffer_dirty(right);
3487 btrfs_mark_buffer_dirty(l);
3488 BUG_ON(path->slots[0] != slot);
3489
44871b1b
CM
3490 if (mid <= slot) {
3491 btrfs_tree_unlock(path->nodes[0]);
3492 free_extent_buffer(path->nodes[0]);
3493 path->nodes[0] = right;
3494 path->slots[0] -= mid;
3495 path->slots[1] += 1;
3496 } else {
3497 btrfs_tree_unlock(right);
3498 free_extent_buffer(right);
3499 }
3500
3501 BUG_ON(path->slots[0] < 0);
44871b1b
CM
3502}
3503
99d8f83c
CM
3504/*
3505 * double splits happen when we need to insert a big item in the middle
3506 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3507 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3508 * A B C
3509 *
3510 * We avoid this by trying to push the items on either side of our target
3511 * into the adjacent leaves. If all goes well we can avoid the double split
3512 * completely.
3513 */
3514static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3515 struct btrfs_root *root,
3516 struct btrfs_path *path,
3517 int data_size)
3518{
3519 int ret;
3520 int progress = 0;
3521 int slot;
3522 u32 nritems;
5a4267ca 3523 int space_needed = data_size;
99d8f83c
CM
3524
3525 slot = path->slots[0];
5a4267ca 3526 if (slot < btrfs_header_nritems(path->nodes[0]))
e902baac 3527 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
99d8f83c
CM
3528
3529 /*
3530 * try to push all the items after our slot into the
3531 * right leaf
3532 */
5a4267ca 3533 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3534 if (ret < 0)
3535 return ret;
3536
3537 if (ret == 0)
3538 progress++;
3539
3540 nritems = btrfs_header_nritems(path->nodes[0]);
3541 /*
3542 * our goal is to get our slot at the start or end of a leaf. If
3543 * we've done so we're done
3544 */
3545 if (path->slots[0] == 0 || path->slots[0] == nritems)
3546 return 0;
3547
e902baac 3548 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3549 return 0;
3550
3551 /* try to push all the items before our slot into the next leaf */
3552 slot = path->slots[0];
263d3995
FM
3553 space_needed = data_size;
3554 if (slot > 0)
e902baac 3555 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
5a4267ca 3556 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3557 if (ret < 0)
3558 return ret;
3559
3560 if (ret == 0)
3561 progress++;
3562
3563 if (progress)
3564 return 0;
3565 return 1;
3566}
3567
74123bd7
CM
3568/*
3569 * split the path's leaf in two, making sure there is at least data_size
3570 * available for the resulting leaf level of the path.
aa5d6bed
CM
3571 *
3572 * returns 0 if all went well and < 0 on failure.
74123bd7 3573 */
e02119d5
CM
3574static noinline int split_leaf(struct btrfs_trans_handle *trans,
3575 struct btrfs_root *root,
310712b2 3576 const struct btrfs_key *ins_key,
e02119d5
CM
3577 struct btrfs_path *path, int data_size,
3578 int extend)
be0e5c09 3579{
5d4f98a2 3580 struct btrfs_disk_key disk_key;
5f39d397 3581 struct extent_buffer *l;
7518a238 3582 u32 nritems;
eb60ceac
CM
3583 int mid;
3584 int slot;
5f39d397 3585 struct extent_buffer *right;
b7a0365e 3586 struct btrfs_fs_info *fs_info = root->fs_info;
d4dbff95 3587 int ret = 0;
aa5d6bed 3588 int wret;
5d4f98a2 3589 int split;
cc0c5538 3590 int num_doubles = 0;
99d8f83c 3591 int tried_avoid_double = 0;
aa5d6bed 3592
a5719521
YZ
3593 l = path->nodes[0];
3594 slot = path->slots[0];
3212fa14 3595 if (extend && data_size + btrfs_item_size(l, slot) +
0b246afa 3596 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
a5719521
YZ
3597 return -EOVERFLOW;
3598
40689478 3599 /* first try to make some room by pushing left and right */
33157e05 3600 if (data_size && path->nodes[1]) {
5a4267ca
FDBM
3601 int space_needed = data_size;
3602
3603 if (slot < btrfs_header_nritems(l))
e902baac 3604 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3605
3606 wret = push_leaf_right(trans, root, path, space_needed,
3607 space_needed, 0, 0);
d397712b 3608 if (wret < 0)
eaee50e8 3609 return wret;
3685f791 3610 if (wret) {
263d3995
FM
3611 space_needed = data_size;
3612 if (slot > 0)
e902baac 3613 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3614 wret = push_leaf_left(trans, root, path, space_needed,
3615 space_needed, 0, (u32)-1);
3685f791
CM
3616 if (wret < 0)
3617 return wret;
3618 }
3619 l = path->nodes[0];
aa5d6bed 3620
3685f791 3621 /* did the pushes work? */
e902baac 3622 if (btrfs_leaf_free_space(l) >= data_size)
3685f791 3623 return 0;
3326d1b0 3624 }
aa5d6bed 3625
5c680ed6 3626 if (!path->nodes[1]) {
fdd99c72 3627 ret = insert_new_root(trans, root, path, 1);
5c680ed6
CM
3628 if (ret)
3629 return ret;
3630 }
cc0c5538 3631again:
5d4f98a2 3632 split = 1;
cc0c5538 3633 l = path->nodes[0];
eb60ceac 3634 slot = path->slots[0];
5f39d397 3635 nritems = btrfs_header_nritems(l);
d397712b 3636 mid = (nritems + 1) / 2;
54aa1f4d 3637
5d4f98a2
YZ
3638 if (mid <= slot) {
3639 if (nritems == 1 ||
3640 leaf_space_used(l, mid, nritems - mid) + data_size >
0b246afa 3641 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3642 if (slot >= nritems) {
3643 split = 0;
3644 } else {
3645 mid = slot;
3646 if (mid != nritems &&
3647 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3648 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3649 if (data_size && !tried_avoid_double)
3650 goto push_for_double;
5d4f98a2
YZ
3651 split = 2;
3652 }
3653 }
3654 }
3655 } else {
3656 if (leaf_space_used(l, 0, mid) + data_size >
0b246afa 3657 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3658 if (!extend && data_size && slot == 0) {
3659 split = 0;
3660 } else if ((extend || !data_size) && slot == 0) {
3661 mid = 1;
3662 } else {
3663 mid = slot;
3664 if (mid != nritems &&
3665 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3666 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3667 if (data_size && !tried_avoid_double)
3668 goto push_for_double;
67871254 3669 split = 2;
5d4f98a2
YZ
3670 }
3671 }
3672 }
3673 }
3674
3675 if (split == 0)
3676 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3677 else
3678 btrfs_item_key(l, &disk_key, mid);
3679
ca9d473a
JB
3680 /*
3681 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3682 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3683 * subclasses, which is 8 at the time of this patch, and we've maxed it
3684 * out. In the future we could add a
3685 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3686 * use BTRFS_NESTING_NEW_ROOT.
3687 */
79bd3712
FM
3688 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3689 &disk_key, 0, l->start, 0,
3690 num_doubles ? BTRFS_NESTING_NEW_ROOT :
3691 BTRFS_NESTING_SPLIT);
f0486c68 3692 if (IS_ERR(right))
5f39d397 3693 return PTR_ERR(right);
f0486c68 3694
0b246afa 3695 root_add_used(root, fs_info->nodesize);
5f39d397 3696
5d4f98a2
YZ
3697 if (split == 0) {
3698 if (mid <= slot) {
3699 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3700 insert_ptr(trans, path, &disk_key,
2ff7e61e 3701 right->start, path->slots[1] + 1, 1);
5d4f98a2
YZ
3702 btrfs_tree_unlock(path->nodes[0]);
3703 free_extent_buffer(path->nodes[0]);
3704 path->nodes[0] = right;
3705 path->slots[0] = 0;
3706 path->slots[1] += 1;
3707 } else {
3708 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3709 insert_ptr(trans, path, &disk_key,
2ff7e61e 3710 right->start, path->slots[1], 1);
5d4f98a2
YZ
3711 btrfs_tree_unlock(path->nodes[0]);
3712 free_extent_buffer(path->nodes[0]);
3713 path->nodes[0] = right;
3714 path->slots[0] = 0;
143bede5 3715 if (path->slots[1] == 0)
b167fa91 3716 fixup_low_keys(path, &disk_key, 1);
d4dbff95 3717 }
196e0249
LB
3718 /*
3719 * We create a new leaf 'right' for the required ins_len and
3720 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3721 * the content of ins_len to 'right'.
3722 */
5d4f98a2 3723 return ret;
d4dbff95 3724 }
74123bd7 3725
94f94ad9 3726 copy_for_split(trans, path, l, right, slot, mid, nritems);
31840ae1 3727
5d4f98a2 3728 if (split == 2) {
cc0c5538
CM
3729 BUG_ON(num_doubles != 0);
3730 num_doubles++;
3731 goto again;
a429e513 3732 }
44871b1b 3733
143bede5 3734 return 0;
99d8f83c
CM
3735
3736push_for_double:
3737 push_for_double_split(trans, root, path, data_size);
3738 tried_avoid_double = 1;
e902baac 3739 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3740 return 0;
3741 goto again;
be0e5c09
CM
3742}
3743
ad48fd75
YZ
3744static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root,
3746 struct btrfs_path *path, int ins_len)
459931ec 3747{
ad48fd75 3748 struct btrfs_key key;
459931ec 3749 struct extent_buffer *leaf;
ad48fd75
YZ
3750 struct btrfs_file_extent_item *fi;
3751 u64 extent_len = 0;
3752 u32 item_size;
3753 int ret;
459931ec
CM
3754
3755 leaf = path->nodes[0];
ad48fd75
YZ
3756 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3757
3758 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3759 key.type != BTRFS_EXTENT_CSUM_KEY);
3760
e902baac 3761 if (btrfs_leaf_free_space(leaf) >= ins_len)
ad48fd75 3762 return 0;
459931ec 3763
3212fa14 3764 item_size = btrfs_item_size(leaf, path->slots[0]);
ad48fd75
YZ
3765 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3766 fi = btrfs_item_ptr(leaf, path->slots[0],
3767 struct btrfs_file_extent_item);
3768 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3769 }
b3b4aa74 3770 btrfs_release_path(path);
459931ec 3771
459931ec 3772 path->keep_locks = 1;
ad48fd75
YZ
3773 path->search_for_split = 1;
3774 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
459931ec 3775 path->search_for_split = 0;
a8df6fe6
FM
3776 if (ret > 0)
3777 ret = -EAGAIN;
ad48fd75
YZ
3778 if (ret < 0)
3779 goto err;
459931ec 3780
ad48fd75
YZ
3781 ret = -EAGAIN;
3782 leaf = path->nodes[0];
a8df6fe6 3783 /* if our item isn't there, return now */
3212fa14 3784 if (item_size != btrfs_item_size(leaf, path->slots[0]))
ad48fd75
YZ
3785 goto err;
3786
109f6aef 3787 /* the leaf has changed, it now has room. return now */
e902baac 3788 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
109f6aef
CM
3789 goto err;
3790
ad48fd75
YZ
3791 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3792 fi = btrfs_item_ptr(leaf, path->slots[0],
3793 struct btrfs_file_extent_item);
3794 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3795 goto err;
459931ec
CM
3796 }
3797
ad48fd75 3798 ret = split_leaf(trans, root, &key, path, ins_len, 1);
f0486c68
YZ
3799 if (ret)
3800 goto err;
459931ec 3801
ad48fd75 3802 path->keep_locks = 0;
b9473439 3803 btrfs_unlock_up_safe(path, 1);
ad48fd75
YZ
3804 return 0;
3805err:
3806 path->keep_locks = 0;
3807 return ret;
3808}
3809
25263cd7 3810static noinline int split_item(struct btrfs_path *path,
310712b2 3811 const struct btrfs_key *new_key,
ad48fd75
YZ
3812 unsigned long split_offset)
3813{
3814 struct extent_buffer *leaf;
c91666b1 3815 int orig_slot, slot;
ad48fd75
YZ
3816 char *buf;
3817 u32 nritems;
3818 u32 item_size;
3819 u32 orig_offset;
3820 struct btrfs_disk_key disk_key;
3821
b9473439 3822 leaf = path->nodes[0];
e902baac 3823 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
b9473439 3824
c91666b1 3825 orig_slot = path->slots[0];
3212fa14
JB
3826 orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3827 item_size = btrfs_item_size(leaf, path->slots[0]);
459931ec 3828
459931ec 3829 buf = kmalloc(item_size, GFP_NOFS);
ad48fd75
YZ
3830 if (!buf)
3831 return -ENOMEM;
3832
459931ec
CM
3833 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3834 path->slots[0]), item_size);
459931ec 3835
ad48fd75 3836 slot = path->slots[0] + 1;
459931ec 3837 nritems = btrfs_header_nritems(leaf);
459931ec
CM
3838 if (slot != nritems) {
3839 /* shift the items */
637e3b48 3840 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
459931ec
CM
3841 }
3842
3843 btrfs_cpu_key_to_disk(&disk_key, new_key);
3844 btrfs_set_item_key(leaf, &disk_key, slot);
3845
3212fa14
JB
3846 btrfs_set_item_offset(leaf, slot, orig_offset);
3847 btrfs_set_item_size(leaf, slot, item_size - split_offset);
459931ec 3848
3212fa14 3849 btrfs_set_item_offset(leaf, orig_slot,
c91666b1 3850 orig_offset + item_size - split_offset);
3212fa14 3851 btrfs_set_item_size(leaf, orig_slot, split_offset);
459931ec
CM
3852
3853 btrfs_set_header_nritems(leaf, nritems + 1);
3854
3855 /* write the data for the start of the original item */
3856 write_extent_buffer(leaf, buf,
3857 btrfs_item_ptr_offset(leaf, path->slots[0]),
3858 split_offset);
3859
3860 /* write the data for the new item */
3861 write_extent_buffer(leaf, buf + split_offset,
3862 btrfs_item_ptr_offset(leaf, slot),
3863 item_size - split_offset);
3864 btrfs_mark_buffer_dirty(leaf);
3865
e902baac 3866 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
459931ec 3867 kfree(buf);
ad48fd75
YZ
3868 return 0;
3869}
3870
3871/*
3872 * This function splits a single item into two items,
3873 * giving 'new_key' to the new item and splitting the
3874 * old one at split_offset (from the start of the item).
3875 *
3876 * The path may be released by this operation. After
3877 * the split, the path is pointing to the old item. The
3878 * new item is going to be in the same node as the old one.
3879 *
3880 * Note, the item being split must be smaller enough to live alone on
3881 * a tree block with room for one extra struct btrfs_item
3882 *
3883 * This allows us to split the item in place, keeping a lock on the
3884 * leaf the entire time.
3885 */
3886int btrfs_split_item(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_path *path,
310712b2 3889 const struct btrfs_key *new_key,
ad48fd75
YZ
3890 unsigned long split_offset)
3891{
3892 int ret;
3893 ret = setup_leaf_for_split(trans, root, path,
3894 sizeof(struct btrfs_item));
3895 if (ret)
3896 return ret;
3897
25263cd7 3898 ret = split_item(path, new_key, split_offset);
459931ec
CM
3899 return ret;
3900}
3901
d352ac68
CM
3902/*
3903 * make the item pointed to by the path smaller. new_size indicates
3904 * how small to make it, and from_end tells us if we just chop bytes
3905 * off the end of the item or if we shift the item to chop bytes off
3906 * the front.
3907 */
78ac4f9e 3908void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
b18c6685 3909{
b18c6685 3910 int slot;
5f39d397 3911 struct extent_buffer *leaf;
b18c6685
CM
3912 u32 nritems;
3913 unsigned int data_end;
3914 unsigned int old_data_start;
3915 unsigned int old_size;
3916 unsigned int size_diff;
3917 int i;
cfed81a0
CM
3918 struct btrfs_map_token token;
3919
5f39d397 3920 leaf = path->nodes[0];
179e29e4
CM
3921 slot = path->slots[0];
3922
3212fa14 3923 old_size = btrfs_item_size(leaf, slot);
179e29e4 3924 if (old_size == new_size)
143bede5 3925 return;
b18c6685 3926
5f39d397 3927 nritems = btrfs_header_nritems(leaf);
8f881e8c 3928 data_end = leaf_data_end(leaf);
b18c6685 3929
3212fa14 3930 old_data_start = btrfs_item_offset(leaf, slot);
179e29e4 3931
b18c6685
CM
3932 size_diff = old_size - new_size;
3933
3934 BUG_ON(slot < 0);
3935 BUG_ON(slot >= nritems);
3936
3937 /*
3938 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3939 */
3940 /* first correct the data pointers */
c82f823c 3941 btrfs_init_map_token(&token, leaf);
b18c6685 3942 for (i = slot; i < nritems; i++) {
5f39d397 3943 u32 ioff;
db94535d 3944
3212fa14
JB
3945 ioff = btrfs_token_item_offset(&token, i);
3946 btrfs_set_token_item_offset(&token, i, ioff + size_diff);
b18c6685 3947 }
db94535d 3948
b18c6685 3949 /* shift the data */
179e29e4 3950 if (from_end) {
637e3b48
JB
3951 memmove_leaf_data(leaf, data_end + size_diff, data_end,
3952 old_data_start + new_size - data_end);
179e29e4
CM
3953 } else {
3954 struct btrfs_disk_key disk_key;
3955 u64 offset;
3956
3957 btrfs_item_key(leaf, &disk_key, slot);
3958
3959 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3960 unsigned long ptr;
3961 struct btrfs_file_extent_item *fi;
3962
3963 fi = btrfs_item_ptr(leaf, slot,
3964 struct btrfs_file_extent_item);
3965 fi = (struct btrfs_file_extent_item *)(
3966 (unsigned long)fi - size_diff);
3967
3968 if (btrfs_file_extent_type(leaf, fi) ==
3969 BTRFS_FILE_EXTENT_INLINE) {
3970 ptr = btrfs_item_ptr_offset(leaf, slot);
3971 memmove_extent_buffer(leaf, ptr,
d397712b 3972 (unsigned long)fi,
7ec20afb 3973 BTRFS_FILE_EXTENT_INLINE_DATA_START);
179e29e4
CM
3974 }
3975 }
3976
637e3b48
JB
3977 memmove_leaf_data(leaf, data_end + size_diff, data_end,
3978 old_data_start - data_end);
179e29e4
CM
3979
3980 offset = btrfs_disk_key_offset(&disk_key);
3981 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3982 btrfs_set_item_key(leaf, &disk_key, slot);
3983 if (slot == 0)
b167fa91 3984 fixup_low_keys(path, &disk_key, 1);
179e29e4 3985 }
5f39d397 3986
3212fa14 3987 btrfs_set_item_size(leaf, slot, new_size);
5f39d397 3988 btrfs_mark_buffer_dirty(leaf);
b18c6685 3989
e902baac 3990 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3991 btrfs_print_leaf(leaf);
b18c6685 3992 BUG();
5f39d397 3993 }
b18c6685
CM
3994}
3995
d352ac68 3996/*
8f69dbd2 3997 * make the item pointed to by the path bigger, data_size is the added size.
d352ac68 3998 */
c71dd880 3999void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
6567e837 4000{
6567e837 4001 int slot;
5f39d397 4002 struct extent_buffer *leaf;
6567e837
CM
4003 u32 nritems;
4004 unsigned int data_end;
4005 unsigned int old_data;
4006 unsigned int old_size;
4007 int i;
cfed81a0
CM
4008 struct btrfs_map_token token;
4009
5f39d397 4010 leaf = path->nodes[0];
6567e837 4011
5f39d397 4012 nritems = btrfs_header_nritems(leaf);
8f881e8c 4013 data_end = leaf_data_end(leaf);
6567e837 4014
e902baac 4015 if (btrfs_leaf_free_space(leaf) < data_size) {
a4f78750 4016 btrfs_print_leaf(leaf);
6567e837 4017 BUG();
5f39d397 4018 }
6567e837 4019 slot = path->slots[0];
dc2e724e 4020 old_data = btrfs_item_data_end(leaf, slot);
6567e837
CM
4021
4022 BUG_ON(slot < 0);
3326d1b0 4023 if (slot >= nritems) {
a4f78750 4024 btrfs_print_leaf(leaf);
c71dd880 4025 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
0b246afa 4026 slot, nritems);
290342f6 4027 BUG();
3326d1b0 4028 }
6567e837
CM
4029
4030 /*
4031 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4032 */
4033 /* first correct the data pointers */
c82f823c 4034 btrfs_init_map_token(&token, leaf);
6567e837 4035 for (i = slot; i < nritems; i++) {
5f39d397 4036 u32 ioff;
db94535d 4037
3212fa14
JB
4038 ioff = btrfs_token_item_offset(&token, i);
4039 btrfs_set_token_item_offset(&token, i, ioff - data_size);
6567e837 4040 }
5f39d397 4041
6567e837 4042 /* shift the data */
637e3b48
JB
4043 memmove_leaf_data(leaf, data_end - data_size, data_end,
4044 old_data - data_end);
5f39d397 4045
6567e837 4046 data_end = old_data;
3212fa14
JB
4047 old_size = btrfs_item_size(leaf, slot);
4048 btrfs_set_item_size(leaf, slot, old_size + data_size);
5f39d397 4049 btrfs_mark_buffer_dirty(leaf);
6567e837 4050
e902baac 4051 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 4052 btrfs_print_leaf(leaf);
6567e837 4053 BUG();
5f39d397 4054 }
6567e837
CM
4055}
4056
43dd529a
DS
4057/*
4058 * Make space in the node before inserting one or more items.
da9ffb24
NB
4059 *
4060 * @root: root we are inserting items to
4061 * @path: points to the leaf/slot where we are going to insert new items
b7ef5f3a 4062 * @batch: information about the batch of items to insert
43dd529a
DS
4063 *
4064 * Main purpose is to save stack depth by doing the bulk of the work in a
4065 * function that doesn't call btrfs_search_slot
74123bd7 4066 */
f0641656
FM
4067static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4068 const struct btrfs_item_batch *batch)
be0e5c09 4069{
0b246afa 4070 struct btrfs_fs_info *fs_info = root->fs_info;
9c58309d 4071 int i;
7518a238 4072 u32 nritems;
be0e5c09 4073 unsigned int data_end;
e2fa7227 4074 struct btrfs_disk_key disk_key;
44871b1b
CM
4075 struct extent_buffer *leaf;
4076 int slot;
cfed81a0 4077 struct btrfs_map_token token;
fc0d82e1 4078 u32 total_size;
cfed81a0 4079
b7ef5f3a
FM
4080 /*
4081 * Before anything else, update keys in the parent and other ancestors
4082 * if needed, then release the write locks on them, so that other tasks
4083 * can use them while we modify the leaf.
4084 */
24cdc847 4085 if (path->slots[0] == 0) {
b7ef5f3a 4086 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
b167fa91 4087 fixup_low_keys(path, &disk_key, 1);
24cdc847
FM
4088 }
4089 btrfs_unlock_up_safe(path, 1);
4090
5f39d397 4091 leaf = path->nodes[0];
44871b1b 4092 slot = path->slots[0];
74123bd7 4093
5f39d397 4094 nritems = btrfs_header_nritems(leaf);
8f881e8c 4095 data_end = leaf_data_end(leaf);
b7ef5f3a 4096 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
eb60ceac 4097
e902baac 4098 if (btrfs_leaf_free_space(leaf) < total_size) {
a4f78750 4099 btrfs_print_leaf(leaf);
0b246afa 4100 btrfs_crit(fs_info, "not enough freespace need %u have %d",
e902baac 4101 total_size, btrfs_leaf_free_space(leaf));
be0e5c09 4102 BUG();
d4dbff95 4103 }
5f39d397 4104
c82f823c 4105 btrfs_init_map_token(&token, leaf);
be0e5c09 4106 if (slot != nritems) {
dc2e724e 4107 unsigned int old_data = btrfs_item_data_end(leaf, slot);
be0e5c09 4108
5f39d397 4109 if (old_data < data_end) {
a4f78750 4110 btrfs_print_leaf(leaf);
7269ddd2
NB
4111 btrfs_crit(fs_info,
4112 "item at slot %d with data offset %u beyond data end of leaf %u",
5d163e0e 4113 slot, old_data, data_end);
290342f6 4114 BUG();
5f39d397 4115 }
be0e5c09
CM
4116 /*
4117 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4118 */
4119 /* first correct the data pointers */
0783fcfc 4120 for (i = slot; i < nritems; i++) {
5f39d397 4121 u32 ioff;
db94535d 4122
3212fa14
JB
4123 ioff = btrfs_token_item_offset(&token, i);
4124 btrfs_set_token_item_offset(&token, i,
74794207 4125 ioff - batch->total_data_size);
0783fcfc 4126 }
be0e5c09 4127 /* shift the items */
637e3b48 4128 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
be0e5c09
CM
4129
4130 /* shift the data */
637e3b48
JB
4131 memmove_leaf_data(leaf, data_end - batch->total_data_size,
4132 data_end, old_data - data_end);
be0e5c09
CM
4133 data_end = old_data;
4134 }
5f39d397 4135
62e2749e 4136 /* setup the item for the new data */
b7ef5f3a
FM
4137 for (i = 0; i < batch->nr; i++) {
4138 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
9c58309d 4139 btrfs_set_item_key(leaf, &disk_key, slot + i);
b7ef5f3a 4140 data_end -= batch->data_sizes[i];
3212fa14
JB
4141 btrfs_set_token_item_offset(&token, slot + i, data_end);
4142 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
9c58309d 4143 }
44871b1b 4144
b7ef5f3a 4145 btrfs_set_header_nritems(leaf, nritems + batch->nr);
b9473439 4146 btrfs_mark_buffer_dirty(leaf);
aa5d6bed 4147
e902baac 4148 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 4149 btrfs_print_leaf(leaf);
be0e5c09 4150 BUG();
5f39d397 4151 }
44871b1b
CM
4152}
4153
f0641656
FM
4154/*
4155 * Insert a new item into a leaf.
4156 *
4157 * @root: The root of the btree.
4158 * @path: A path pointing to the target leaf and slot.
4159 * @key: The key of the new item.
4160 * @data_size: The size of the data associated with the new key.
4161 */
4162void btrfs_setup_item_for_insert(struct btrfs_root *root,
4163 struct btrfs_path *path,
4164 const struct btrfs_key *key,
4165 u32 data_size)
4166{
4167 struct btrfs_item_batch batch;
4168
4169 batch.keys = key;
4170 batch.data_sizes = &data_size;
4171 batch.total_data_size = data_size;
4172 batch.nr = 1;
4173
4174 setup_items_for_insert(root, path, &batch);
4175}
4176
44871b1b
CM
4177/*
4178 * Given a key and some data, insert items into the tree.
4179 * This does all the path init required, making room in the tree if needed.
4180 */
4181int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4182 struct btrfs_root *root,
4183 struct btrfs_path *path,
b7ef5f3a 4184 const struct btrfs_item_batch *batch)
44871b1b 4185{
44871b1b
CM
4186 int ret = 0;
4187 int slot;
b7ef5f3a 4188 u32 total_size;
44871b1b 4189
b7ef5f3a
FM
4190 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4191 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
44871b1b
CM
4192 if (ret == 0)
4193 return -EEXIST;
4194 if (ret < 0)
143bede5 4195 return ret;
44871b1b 4196
44871b1b
CM
4197 slot = path->slots[0];
4198 BUG_ON(slot < 0);
4199
b7ef5f3a 4200 setup_items_for_insert(root, path, batch);
143bede5 4201 return 0;
62e2749e
CM
4202}
4203
4204/*
4205 * Given a key and some data, insert an item into the tree.
4206 * This does all the path init required, making room in the tree if needed.
4207 */
310712b2
OS
4208int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4209 const struct btrfs_key *cpu_key, void *data,
4210 u32 data_size)
62e2749e
CM
4211{
4212 int ret = 0;
2c90e5d6 4213 struct btrfs_path *path;
5f39d397
CM
4214 struct extent_buffer *leaf;
4215 unsigned long ptr;
62e2749e 4216
2c90e5d6 4217 path = btrfs_alloc_path();
db5b493a
TI
4218 if (!path)
4219 return -ENOMEM;
2c90e5d6 4220 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
62e2749e 4221 if (!ret) {
5f39d397
CM
4222 leaf = path->nodes[0];
4223 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4224 write_extent_buffer(leaf, data, ptr, data_size);
4225 btrfs_mark_buffer_dirty(leaf);
62e2749e 4226 }
2c90e5d6 4227 btrfs_free_path(path);
aa5d6bed 4228 return ret;
be0e5c09
CM
4229}
4230
f0641656
FM
4231/*
4232 * This function duplicates an item, giving 'new_key' to the new item.
4233 * It guarantees both items live in the same tree leaf and the new item is
4234 * contiguous with the original item.
4235 *
4236 * This allows us to split a file extent in place, keeping a lock on the leaf
4237 * the entire time.
4238 */
4239int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4240 struct btrfs_root *root,
4241 struct btrfs_path *path,
4242 const struct btrfs_key *new_key)
4243{
4244 struct extent_buffer *leaf;
4245 int ret;
4246 u32 item_size;
4247
4248 leaf = path->nodes[0];
3212fa14 4249 item_size = btrfs_item_size(leaf, path->slots[0]);
f0641656
FM
4250 ret = setup_leaf_for_split(trans, root, path,
4251 item_size + sizeof(struct btrfs_item));
4252 if (ret)
4253 return ret;
4254
4255 path->slots[0]++;
4256 btrfs_setup_item_for_insert(root, path, new_key, item_size);
4257 leaf = path->nodes[0];
4258 memcpy_extent_buffer(leaf,
4259 btrfs_item_ptr_offset(leaf, path->slots[0]),
4260 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4261 item_size);
4262 return 0;
4263}
4264
74123bd7 4265/*
5de08d7d 4266 * delete the pointer from a given node.
74123bd7 4267 *
d352ac68
CM
4268 * the tree should have been previously balanced so the deletion does not
4269 * empty a node.
74123bd7 4270 */
afe5fea7
TI
4271static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4272 int level, int slot)
be0e5c09 4273{
5f39d397 4274 struct extent_buffer *parent = path->nodes[level];
7518a238 4275 u32 nritems;
f3ea38da 4276 int ret;
be0e5c09 4277
5f39d397 4278 nritems = btrfs_header_nritems(parent);
d397712b 4279 if (slot != nritems - 1) {
bf1d3425 4280 if (level) {
f3a84ccd
FM
4281 ret = btrfs_tree_mod_log_insert_move(parent, slot,
4282 slot + 1, nritems - slot - 1);
bf1d3425
DS
4283 BUG_ON(ret < 0);
4284 }
5f39d397 4285 memmove_extent_buffer(parent,
e23efd8e
JB
4286 btrfs_node_key_ptr_offset(parent, slot),
4287 btrfs_node_key_ptr_offset(parent, slot + 1),
d6025579
CM
4288 sizeof(struct btrfs_key_ptr) *
4289 (nritems - slot - 1));
57ba86c0 4290 } else if (level) {
f3a84ccd 4291 ret = btrfs_tree_mod_log_insert_key(parent, slot,
33cff222 4292 BTRFS_MOD_LOG_KEY_REMOVE);
57ba86c0 4293 BUG_ON(ret < 0);
bb803951 4294 }
f3ea38da 4295
7518a238 4296 nritems--;
5f39d397 4297 btrfs_set_header_nritems(parent, nritems);
7518a238 4298 if (nritems == 0 && parent == root->node) {
5f39d397 4299 BUG_ON(btrfs_header_level(root->node) != 1);
bb803951 4300 /* just turn the root into a leaf and break */
5f39d397 4301 btrfs_set_header_level(root->node, 0);
bb803951 4302 } else if (slot == 0) {
5f39d397
CM
4303 struct btrfs_disk_key disk_key;
4304
4305 btrfs_node_key(parent, &disk_key, 0);
b167fa91 4306 fixup_low_keys(path, &disk_key, level + 1);
be0e5c09 4307 }
d6025579 4308 btrfs_mark_buffer_dirty(parent);
be0e5c09
CM
4309}
4310
323ac95b
CM
4311/*
4312 * a helper function to delete the leaf pointed to by path->slots[1] and
5d4f98a2 4313 * path->nodes[1].
323ac95b
CM
4314 *
4315 * This deletes the pointer in path->nodes[1] and frees the leaf
4316 * block extent. zero is returned if it all worked out, < 0 otherwise.
4317 *
4318 * The path must have already been setup for deleting the leaf, including
4319 * all the proper balancing. path->nodes[1] must be locked.
4320 */
143bede5
JM
4321static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4322 struct btrfs_root *root,
4323 struct btrfs_path *path,
4324 struct extent_buffer *leaf)
323ac95b 4325{
5d4f98a2 4326 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
afe5fea7 4327 del_ptr(root, path, 1, path->slots[1]);
323ac95b 4328
4d081c41
CM
4329 /*
4330 * btrfs_free_extent is expensive, we want to make sure we
4331 * aren't holding any locks when we call it
4332 */
4333 btrfs_unlock_up_safe(path, 0);
4334
f0486c68
YZ
4335 root_sub_used(root, leaf->len);
4336
67439dad 4337 atomic_inc(&leaf->refs);
7a163608 4338 btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
3083ee2e 4339 free_extent_buffer_stale(leaf);
323ac95b 4340}
74123bd7
CM
4341/*
4342 * delete the item at the leaf level in path. If that empties
4343 * the leaf, remove it from the tree
4344 */
85e21bac
CM
4345int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4346 struct btrfs_path *path, int slot, int nr)
be0e5c09 4347{
0b246afa 4348 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397 4349 struct extent_buffer *leaf;
aa5d6bed
CM
4350 int ret = 0;
4351 int wret;
7518a238 4352 u32 nritems;
be0e5c09 4353
5f39d397 4354 leaf = path->nodes[0];
5f39d397 4355 nritems = btrfs_header_nritems(leaf);
be0e5c09 4356
85e21bac 4357 if (slot + nr != nritems) {
0cae23b6
FM
4358 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4359 const int data_end = leaf_data_end(leaf);
c82f823c 4360 struct btrfs_map_token token;
0cae23b6
FM
4361 u32 dsize = 0;
4362 int i;
4363
4364 for (i = 0; i < nr; i++)
4365 dsize += btrfs_item_size(leaf, slot + i);
5f39d397 4366
637e3b48
JB
4367 memmove_leaf_data(leaf, data_end + dsize, data_end,
4368 last_off - data_end);
5f39d397 4369
c82f823c 4370 btrfs_init_map_token(&token, leaf);
85e21bac 4371 for (i = slot + nr; i < nritems; i++) {
5f39d397 4372 u32 ioff;
db94535d 4373
3212fa14
JB
4374 ioff = btrfs_token_item_offset(&token, i);
4375 btrfs_set_token_item_offset(&token, i, ioff + dsize);
0783fcfc 4376 }
db94535d 4377
637e3b48 4378 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
be0e5c09 4379 }
85e21bac
CM
4380 btrfs_set_header_nritems(leaf, nritems - nr);
4381 nritems -= nr;
5f39d397 4382
74123bd7 4383 /* delete the leaf if we've emptied it */
7518a238 4384 if (nritems == 0) {
5f39d397
CM
4385 if (leaf == root->node) {
4386 btrfs_set_header_level(leaf, 0);
9a8dd150 4387 } else {
190a8339 4388 btrfs_clear_buffer_dirty(trans, leaf);
143bede5 4389 btrfs_del_leaf(trans, root, path, leaf);
9a8dd150 4390 }
be0e5c09 4391 } else {
7518a238 4392 int used = leaf_space_used(leaf, 0, nritems);
aa5d6bed 4393 if (slot == 0) {
5f39d397
CM
4394 struct btrfs_disk_key disk_key;
4395
4396 btrfs_item_key(leaf, &disk_key, 0);
b167fa91 4397 fixup_low_keys(path, &disk_key, 1);
aa5d6bed 4398 }
aa5d6bed 4399
7c4063d1
FM
4400 /*
4401 * Try to delete the leaf if it is mostly empty. We do this by
4402 * trying to move all its items into its left and right neighbours.
4403 * If we can't move all the items, then we don't delete it - it's
4404 * not ideal, but future insertions might fill the leaf with more
4405 * items, or items from other leaves might be moved later into our
4406 * leaf due to deletions on those leaves.
4407 */
0b246afa 4408 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
7c4063d1
FM
4409 u32 min_push_space;
4410
be0e5c09
CM
4411 /* push_leaf_left fixes the path.
4412 * make sure the path still points to our leaf
4413 * for possible call to del_ptr below
4414 */
4920c9ac 4415 slot = path->slots[1];
67439dad 4416 atomic_inc(&leaf->refs);
7c4063d1
FM
4417 /*
4418 * We want to be able to at least push one item to the
4419 * left neighbour leaf, and that's the first item.
4420 */
4421 min_push_space = sizeof(struct btrfs_item) +
4422 btrfs_item_size(leaf, 0);
4423 wret = push_leaf_left(trans, root, path, 0,
4424 min_push_space, 1, (u32)-1);
54aa1f4d 4425 if (wret < 0 && wret != -ENOSPC)
aa5d6bed 4426 ret = wret;
5f39d397
CM
4427
4428 if (path->nodes[0] == leaf &&
4429 btrfs_header_nritems(leaf)) {
7c4063d1
FM
4430 /*
4431 * If we were not able to push all items from our
4432 * leaf to its left neighbour, then attempt to
4433 * either push all the remaining items to the
4434 * right neighbour or none. There's no advantage
4435 * in pushing only some items, instead of all, as
4436 * it's pointless to end up with a leaf having
4437 * too few items while the neighbours can be full
4438 * or nearly full.
4439 */
4440 nritems = btrfs_header_nritems(leaf);
4441 min_push_space = leaf_space_used(leaf, 0, nritems);
4442 wret = push_leaf_right(trans, root, path, 0,
4443 min_push_space, 1, 0);
54aa1f4d 4444 if (wret < 0 && wret != -ENOSPC)
aa5d6bed
CM
4445 ret = wret;
4446 }
5f39d397
CM
4447
4448 if (btrfs_header_nritems(leaf) == 0) {
323ac95b 4449 path->slots[1] = slot;
143bede5 4450 btrfs_del_leaf(trans, root, path, leaf);
5f39d397 4451 free_extent_buffer(leaf);
143bede5 4452 ret = 0;
5de08d7d 4453 } else {
925baedd
CM
4454 /* if we're still in the path, make sure
4455 * we're dirty. Otherwise, one of the
4456 * push_leaf functions must have already
4457 * dirtied this buffer
4458 */
4459 if (path->nodes[0] == leaf)
4460 btrfs_mark_buffer_dirty(leaf);
5f39d397 4461 free_extent_buffer(leaf);
be0e5c09 4462 }
d5719762 4463 } else {
5f39d397 4464 btrfs_mark_buffer_dirty(leaf);
be0e5c09
CM
4465 }
4466 }
aa5d6bed 4467 return ret;
be0e5c09
CM
4468}
4469
7bb86316 4470/*
925baedd 4471 * search the tree again to find a leaf with lesser keys
7bb86316
CM
4472 * returns 0 if it found something or 1 if there are no lesser leaves.
4473 * returns < 0 on io errors.
d352ac68
CM
4474 *
4475 * This may release the path, and so you may lose any locks held at the
4476 * time you call it.
7bb86316 4477 */
16e7549f 4478int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
7bb86316 4479{
925baedd
CM
4480 struct btrfs_key key;
4481 struct btrfs_disk_key found_key;
4482 int ret;
7bb86316 4483
925baedd 4484 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
7bb86316 4485
e8b0d724 4486 if (key.offset > 0) {
925baedd 4487 key.offset--;
e8b0d724 4488 } else if (key.type > 0) {
925baedd 4489 key.type--;
e8b0d724
FDBM
4490 key.offset = (u64)-1;
4491 } else if (key.objectid > 0) {
925baedd 4492 key.objectid--;
e8b0d724
FDBM
4493 key.type = (u8)-1;
4494 key.offset = (u64)-1;
4495 } else {
925baedd 4496 return 1;
e8b0d724 4497 }
7bb86316 4498
b3b4aa74 4499 btrfs_release_path(path);
925baedd
CM
4500 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4501 if (ret < 0)
4502 return ret;
4503 btrfs_item_key(path->nodes[0], &found_key, 0);
4504 ret = comp_keys(&found_key, &key);
337c6f68
FM
4505 /*
4506 * We might have had an item with the previous key in the tree right
4507 * before we released our path. And after we released our path, that
4508 * item might have been pushed to the first slot (0) of the leaf we
4509 * were holding due to a tree balance. Alternatively, an item with the
4510 * previous key can exist as the only element of a leaf (big fat item).
4511 * Therefore account for these 2 cases, so that our callers (like
4512 * btrfs_previous_item) don't miss an existing item with a key matching
4513 * the previous key we computed above.
4514 */
4515 if (ret <= 0)
925baedd
CM
4516 return 0;
4517 return 1;
7bb86316
CM
4518}
4519
3f157a2f
CM
4520/*
4521 * A helper function to walk down the tree starting at min_key, and looking
de78b51a
ES
4522 * for nodes or leaves that are have a minimum transaction id.
4523 * This is used by the btree defrag code, and tree logging
3f157a2f
CM
4524 *
4525 * This does not cow, but it does stuff the starting key it finds back
4526 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4527 * key and get a writable path.
4528 *
3f157a2f
CM
4529 * This honors path->lowest_level to prevent descent past a given level
4530 * of the tree.
4531 *
d352ac68
CM
4532 * min_trans indicates the oldest transaction that you are interested
4533 * in walking through. Any nodes or leaves older than min_trans are
4534 * skipped over (without reading them).
4535 *
3f157a2f
CM
4536 * returns zero if something useful was found, < 0 on error and 1 if there
4537 * was nothing in the tree that matched the search criteria.
4538 */
4539int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
de78b51a 4540 struct btrfs_path *path,
3f157a2f
CM
4541 u64 min_trans)
4542{
4543 struct extent_buffer *cur;
4544 struct btrfs_key found_key;
4545 int slot;
9652480b 4546 int sret;
3f157a2f
CM
4547 u32 nritems;
4548 int level;
4549 int ret = 1;
f98de9b9 4550 int keep_locks = path->keep_locks;
3f157a2f 4551
c922b016 4552 ASSERT(!path->nowait);
f98de9b9 4553 path->keep_locks = 1;
3f157a2f 4554again:
bd681513 4555 cur = btrfs_read_lock_root_node(root);
3f157a2f 4556 level = btrfs_header_level(cur);
e02119d5 4557 WARN_ON(path->nodes[level]);
3f157a2f 4558 path->nodes[level] = cur;
bd681513 4559 path->locks[level] = BTRFS_READ_LOCK;
3f157a2f
CM
4560
4561 if (btrfs_header_generation(cur) < min_trans) {
4562 ret = 1;
4563 goto out;
4564 }
d397712b 4565 while (1) {
3f157a2f
CM
4566 nritems = btrfs_header_nritems(cur);
4567 level = btrfs_header_level(cur);
fdf8d595 4568 sret = btrfs_bin_search(cur, 0, min_key, &slot);
cbca7d59
FM
4569 if (sret < 0) {
4570 ret = sret;
4571 goto out;
4572 }
3f157a2f 4573
323ac95b
CM
4574 /* at the lowest level, we're done, setup the path and exit */
4575 if (level == path->lowest_level) {
e02119d5
CM
4576 if (slot >= nritems)
4577 goto find_next_key;
3f157a2f
CM
4578 ret = 0;
4579 path->slots[level] = slot;
4580 btrfs_item_key_to_cpu(cur, &found_key, slot);
4581 goto out;
4582 }
9652480b
Y
4583 if (sret && slot > 0)
4584 slot--;
3f157a2f 4585 /*
de78b51a 4586 * check this node pointer against the min_trans parameters.
260db43c 4587 * If it is too old, skip to the next one.
3f157a2f 4588 */
d397712b 4589 while (slot < nritems) {
3f157a2f 4590 u64 gen;
e02119d5 4591
3f157a2f
CM
4592 gen = btrfs_node_ptr_generation(cur, slot);
4593 if (gen < min_trans) {
4594 slot++;
4595 continue;
4596 }
de78b51a 4597 break;
3f157a2f 4598 }
e02119d5 4599find_next_key:
3f157a2f
CM
4600 /*
4601 * we didn't find a candidate key in this node, walk forward
4602 * and find another one
4603 */
4604 if (slot >= nritems) {
e02119d5
CM
4605 path->slots[level] = slot;
4606 sret = btrfs_find_next_key(root, path, min_key, level,
de78b51a 4607 min_trans);
e02119d5 4608 if (sret == 0) {
b3b4aa74 4609 btrfs_release_path(path);
3f157a2f
CM
4610 goto again;
4611 } else {
4612 goto out;
4613 }
4614 }
4615 /* save our key for returning back */
4616 btrfs_node_key_to_cpu(cur, &found_key, slot);
4617 path->slots[level] = slot;
4618 if (level == path->lowest_level) {
4619 ret = 0;
3f157a2f
CM
4620 goto out;
4621 }
4b231ae4 4622 cur = btrfs_read_node_slot(cur, slot);
fb770ae4
LB
4623 if (IS_ERR(cur)) {
4624 ret = PTR_ERR(cur);
4625 goto out;
4626 }
3f157a2f 4627
bd681513 4628 btrfs_tree_read_lock(cur);
b4ce94de 4629
bd681513 4630 path->locks[level - 1] = BTRFS_READ_LOCK;
3f157a2f 4631 path->nodes[level - 1] = cur;
f7c79f30 4632 unlock_up(path, level, 1, 0, NULL);
3f157a2f
CM
4633 }
4634out:
f98de9b9
FM
4635 path->keep_locks = keep_locks;
4636 if (ret == 0) {
4637 btrfs_unlock_up_safe(path, path->lowest_level + 1);
3f157a2f 4638 memcpy(min_key, &found_key, sizeof(found_key));
f98de9b9 4639 }
3f157a2f
CM
4640 return ret;
4641}
4642
4643/*
4644 * this is similar to btrfs_next_leaf, but does not try to preserve
4645 * and fixup the path. It looks for and returns the next key in the
de78b51a 4646 * tree based on the current path and the min_trans parameters.
3f157a2f
CM
4647 *
4648 * 0 is returned if another key is found, < 0 if there are any errors
4649 * and 1 is returned if there are no higher keys in the tree
4650 *
4651 * path->keep_locks should be set to 1 on the search made before
4652 * calling this function.
4653 */
e7a84565 4654int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
de78b51a 4655 struct btrfs_key *key, int level, u64 min_trans)
e7a84565 4656{
e7a84565
CM
4657 int slot;
4658 struct extent_buffer *c;
4659
6a9fb468 4660 WARN_ON(!path->keep_locks && !path->skip_locking);
d397712b 4661 while (level < BTRFS_MAX_LEVEL) {
e7a84565
CM
4662 if (!path->nodes[level])
4663 return 1;
4664
4665 slot = path->slots[level] + 1;
4666 c = path->nodes[level];
3f157a2f 4667next:
e7a84565 4668 if (slot >= btrfs_header_nritems(c)) {
33c66f43
YZ
4669 int ret;
4670 int orig_lowest;
4671 struct btrfs_key cur_key;
4672 if (level + 1 >= BTRFS_MAX_LEVEL ||
4673 !path->nodes[level + 1])
e7a84565 4674 return 1;
33c66f43 4675
6a9fb468 4676 if (path->locks[level + 1] || path->skip_locking) {
33c66f43
YZ
4677 level++;
4678 continue;
4679 }
4680
4681 slot = btrfs_header_nritems(c) - 1;
4682 if (level == 0)
4683 btrfs_item_key_to_cpu(c, &cur_key, slot);
4684 else
4685 btrfs_node_key_to_cpu(c, &cur_key, slot);
4686
4687 orig_lowest = path->lowest_level;
b3b4aa74 4688 btrfs_release_path(path);
33c66f43
YZ
4689 path->lowest_level = level;
4690 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4691 0, 0);
4692 path->lowest_level = orig_lowest;
4693 if (ret < 0)
4694 return ret;
4695
4696 c = path->nodes[level];
4697 slot = path->slots[level];
4698 if (ret == 0)
4699 slot++;
4700 goto next;
e7a84565 4701 }
33c66f43 4702
e7a84565
CM
4703 if (level == 0)
4704 btrfs_item_key_to_cpu(c, key, slot);
3f157a2f 4705 else {
3f157a2f
CM
4706 u64 gen = btrfs_node_ptr_generation(c, slot);
4707
3f157a2f
CM
4708 if (gen < min_trans) {
4709 slot++;
4710 goto next;
4711 }
e7a84565 4712 btrfs_node_key_to_cpu(c, key, slot);
3f157a2f 4713 }
e7a84565
CM
4714 return 0;
4715 }
4716 return 1;
4717}
4718
3d7806ec
JS
4719int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4720 u64 time_seq)
d97e63b6
CM
4721{
4722 int slot;
8e73f275 4723 int level;
5f39d397 4724 struct extent_buffer *c;
8e73f275 4725 struct extent_buffer *next;
d96b3424 4726 struct btrfs_fs_info *fs_info = root->fs_info;
925baedd 4727 struct btrfs_key key;
d96b3424 4728 bool need_commit_sem = false;
925baedd
CM
4729 u32 nritems;
4730 int ret;
0e46318d 4731 int i;
925baedd 4732
bdcdd86c
FM
4733 /*
4734 * The nowait semantics are used only for write paths, where we don't
4735 * use the tree mod log and sequence numbers.
4736 */
4737 if (time_seq)
4738 ASSERT(!path->nowait);
c922b016 4739
925baedd 4740 nritems = btrfs_header_nritems(path->nodes[0]);
d397712b 4741 if (nritems == 0)
925baedd 4742 return 1;
925baedd 4743
8e73f275
CM
4744 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4745again:
4746 level = 1;
4747 next = NULL;
b3b4aa74 4748 btrfs_release_path(path);
8e73f275 4749
a2135011 4750 path->keep_locks = 1;
8e73f275 4751
d96b3424 4752 if (time_seq) {
3d7806ec 4753 ret = btrfs_search_old_slot(root, &key, path, time_seq);
d96b3424
FM
4754 } else {
4755 if (path->need_commit_sem) {
4756 path->need_commit_sem = 0;
4757 need_commit_sem = true;
bdcdd86c
FM
4758 if (path->nowait) {
4759 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4760 ret = -EAGAIN;
4761 goto done;
4762 }
4763 } else {
4764 down_read(&fs_info->commit_root_sem);
4765 }
d96b3424 4766 }
3d7806ec 4767 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
d96b3424 4768 }
925baedd
CM
4769 path->keep_locks = 0;
4770
4771 if (ret < 0)
d96b3424 4772 goto done;
925baedd 4773
a2135011 4774 nritems = btrfs_header_nritems(path->nodes[0]);
168fd7d2
CM
4775 /*
4776 * by releasing the path above we dropped all our locks. A balance
4777 * could have added more items next to the key that used to be
4778 * at the very end of the block. So, check again here and
4779 * advance the path if there are now more items available.
4780 */
a2135011 4781 if (nritems > 0 && path->slots[0] < nritems - 1) {
e457afec
YZ
4782 if (ret == 0)
4783 path->slots[0]++;
8e73f275 4784 ret = 0;
925baedd
CM
4785 goto done;
4786 }
0b43e04f
LB
4787 /*
4788 * So the above check misses one case:
4789 * - after releasing the path above, someone has removed the item that
4790 * used to be at the very end of the block, and balance between leafs
4791 * gets another one with bigger key.offset to replace it.
4792 *
4793 * This one should be returned as well, or we can get leaf corruption
4794 * later(esp. in __btrfs_drop_extents()).
4795 *
4796 * And a bit more explanation about this check,
4797 * with ret > 0, the key isn't found, the path points to the slot
4798 * where it should be inserted, so the path->slots[0] item must be the
4799 * bigger one.
4800 */
4801 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4802 ret = 0;
4803 goto done;
4804 }
d97e63b6 4805
d397712b 4806 while (level < BTRFS_MAX_LEVEL) {
8e73f275
CM
4807 if (!path->nodes[level]) {
4808 ret = 1;
4809 goto done;
4810 }
5f39d397 4811
d97e63b6
CM
4812 slot = path->slots[level] + 1;
4813 c = path->nodes[level];
5f39d397 4814 if (slot >= btrfs_header_nritems(c)) {
d97e63b6 4815 level++;
8e73f275
CM
4816 if (level == BTRFS_MAX_LEVEL) {
4817 ret = 1;
4818 goto done;
4819 }
d97e63b6
CM
4820 continue;
4821 }
5f39d397 4822
0e46318d
JB
4823
4824 /*
4825 * Our current level is where we're going to start from, and to
4826 * make sure lockdep doesn't complain we need to drop our locks
4827 * and nodes from 0 to our current level.
4828 */
4829 for (i = 0; i < level; i++) {
4830 if (path->locks[level]) {
4831 btrfs_tree_read_unlock(path->nodes[i]);
4832 path->locks[i] = 0;
4833 }
4834 free_extent_buffer(path->nodes[i]);
4835 path->nodes[i] = NULL;
925baedd 4836 }
5f39d397 4837
8e73f275 4838 next = c;
d07b8528 4839 ret = read_block_for_search(root, path, &next, level,
cda79c54 4840 slot, &key);
bdcdd86c 4841 if (ret == -EAGAIN && !path->nowait)
8e73f275 4842 goto again;
5f39d397 4843
76a05b35 4844 if (ret < 0) {
b3b4aa74 4845 btrfs_release_path(path);
76a05b35
CM
4846 goto done;
4847 }
4848
5cd57b2c 4849 if (!path->skip_locking) {
bd681513 4850 ret = btrfs_try_tree_read_lock(next);
bdcdd86c
FM
4851 if (!ret && path->nowait) {
4852 ret = -EAGAIN;
4853 goto done;
4854 }
d42244a0
JS
4855 if (!ret && time_seq) {
4856 /*
4857 * If we don't get the lock, we may be racing
4858 * with push_leaf_left, holding that lock while
4859 * itself waiting for the leaf we've currently
4860 * locked. To solve this situation, we give up
4861 * on our lock and cycle.
4862 */
cf538830 4863 free_extent_buffer(next);
d42244a0
JS
4864 btrfs_release_path(path);
4865 cond_resched();
4866 goto again;
4867 }
0e46318d
JB
4868 if (!ret)
4869 btrfs_tree_read_lock(next);
5cd57b2c 4870 }
d97e63b6
CM
4871 break;
4872 }
4873 path->slots[level] = slot;
d397712b 4874 while (1) {
d97e63b6 4875 level--;
d97e63b6
CM
4876 path->nodes[level] = next;
4877 path->slots[level] = 0;
a74a4b97 4878 if (!path->skip_locking)
ffeb03cf 4879 path->locks[level] = BTRFS_READ_LOCK;
d97e63b6
CM
4880 if (!level)
4881 break;
b4ce94de 4882
d07b8528 4883 ret = read_block_for_search(root, path, &next, level,
cda79c54 4884 0, &key);
bdcdd86c 4885 if (ret == -EAGAIN && !path->nowait)
8e73f275
CM
4886 goto again;
4887
76a05b35 4888 if (ret < 0) {
b3b4aa74 4889 btrfs_release_path(path);
76a05b35
CM
4890 goto done;
4891 }
4892
bdcdd86c
FM
4893 if (!path->skip_locking) {
4894 if (path->nowait) {
4895 if (!btrfs_try_tree_read_lock(next)) {
4896 ret = -EAGAIN;
4897 goto done;
4898 }
4899 } else {
4900 btrfs_tree_read_lock(next);
4901 }
4902 }
d97e63b6 4903 }
8e73f275 4904 ret = 0;
925baedd 4905done:
f7c79f30 4906 unlock_up(path, 0, 1, 0, NULL);
d96b3424
FM
4907 if (need_commit_sem) {
4908 int ret2;
4909
4910 path->need_commit_sem = 1;
4911 ret2 = finish_need_commit_sem_search(path);
4912 up_read(&fs_info->commit_root_sem);
4913 if (ret2)
4914 ret = ret2;
4915 }
8e73f275
CM
4916
4917 return ret;
d97e63b6 4918}
0b86a832 4919
890d2b1a
JB
4920int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4921{
4922 path->slots[0]++;
4923 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4924 return btrfs_next_old_leaf(root, path, time_seq);
4925 return 0;
4926}
4927
3f157a2f
CM
4928/*
4929 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4930 * searching until it gets past min_objectid or finds an item of 'type'
4931 *
4932 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4933 */
0b86a832
CM
4934int btrfs_previous_item(struct btrfs_root *root,
4935 struct btrfs_path *path, u64 min_objectid,
4936 int type)
4937{
4938 struct btrfs_key found_key;
4939 struct extent_buffer *leaf;
e02119d5 4940 u32 nritems;
0b86a832
CM
4941 int ret;
4942
d397712b 4943 while (1) {
0b86a832
CM
4944 if (path->slots[0] == 0) {
4945 ret = btrfs_prev_leaf(root, path);
4946 if (ret != 0)
4947 return ret;
4948 } else {
4949 path->slots[0]--;
4950 }
4951 leaf = path->nodes[0];
e02119d5
CM
4952 nritems = btrfs_header_nritems(leaf);
4953 if (nritems == 0)
4954 return 1;
4955 if (path->slots[0] == nritems)
4956 path->slots[0]--;
4957
0b86a832 4958 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
e02119d5
CM
4959 if (found_key.objectid < min_objectid)
4960 break;
0a4eefbb
YZ
4961 if (found_key.type == type)
4962 return 0;
e02119d5
CM
4963 if (found_key.objectid == min_objectid &&
4964 found_key.type < type)
4965 break;
0b86a832
CM
4966 }
4967 return 1;
4968}
ade2e0b3
WS
4969
4970/*
4971 * search in extent tree to find a previous Metadata/Data extent item with
4972 * min objecitd.
4973 *
4974 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4975 */
4976int btrfs_previous_extent_item(struct btrfs_root *root,
4977 struct btrfs_path *path, u64 min_objectid)
4978{
4979 struct btrfs_key found_key;
4980 struct extent_buffer *leaf;
4981 u32 nritems;
4982 int ret;
4983
4984 while (1) {
4985 if (path->slots[0] == 0) {
ade2e0b3
WS
4986 ret = btrfs_prev_leaf(root, path);
4987 if (ret != 0)
4988 return ret;
4989 } else {
4990 path->slots[0]--;
4991 }
4992 leaf = path->nodes[0];
4993 nritems = btrfs_header_nritems(leaf);
4994 if (nritems == 0)
4995 return 1;
4996 if (path->slots[0] == nritems)
4997 path->slots[0]--;
4998
4999 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5000 if (found_key.objectid < min_objectid)
5001 break;
5002 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5003 found_key.type == BTRFS_METADATA_ITEM_KEY)
5004 return 0;
5005 if (found_key.objectid == min_objectid &&
5006 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5007 break;
5008 }
5009 return 1;
5010}
226463d7
JB
5011
5012int __init btrfs_ctree_init(void)
5013{
5014 btrfs_path_cachep = kmem_cache_create("btrfs_path",
5015 sizeof(struct btrfs_path), 0,
5016 SLAB_MEM_SPREAD, NULL);
5017 if (!btrfs_path_cachep)
5018 return -ENOMEM;
5019 return 0;
5020}
5021
5022void __cold btrfs_ctree_exit(void)
5023{
5024 kmem_cache_destroy(btrfs_path_cachep);
5025}