/* fs/btrfs/ctree.c */
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int errno)
{
	switch (errno) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cowonly roots (not-shareable trees, everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list. The transaction walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to the TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

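/*
 * In the common case, a block whose header generation matches trans->transid
 * and that has not been written out since (and is not subject to relocation
 * or forced COW) was already COWed in this transaction, so should_cow_block()
 * returns 0 and btrfs_cow_block() below hands it back unchanged in *cow_ret.
 */
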
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid,
		     fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

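/*
 * Two tree blocks count as close when the gap between them, after accounting
 * for the block size, is under 32KiB. btrfs_realloc_node() below uses this to
 * skip relocating blocks that already sit near their neighbors on disk.
 */
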
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

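/*
 * Keys therefore sort by objectid first, then by type, and only then by
 * offset. For example, every key with objectid 259 sorts after all keys with
 * objectid 258, regardless of their type and offset.
 */
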
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @low. Use a
 * value of 0 to search over the whole extent buffer.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
static noinline int generic_bin_search(struct extent_buffer *eb, int low,
				       const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	int high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * Simple binary search on an extent buffer. Works for both leaves and nodes, and
 * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	return generic_bin_search(eb, 0, key, slot);
}

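/*
 * Minimal usage sketch (example_pick_child_slot is a hypothetical helper, not
 * an upstream function): a return of 0 from btrfs_bin_search() means an exact
 * key match at *slot; a return of 1 means the key was not found and *slot is
 * the insertion position, so on internal nodes the pointer to follow is the
 * one in the previous slot.
 */
static int __maybe_unused example_pick_child_slot(struct extent_buffer *node,
						  const struct btrfs_key *key)
{
	int slot;
	int ret;

	ret = btrfs_bin_search(node, key, &slot);
	if (ret < 0)
		return ret;	/* search error (e.g. corrupted header) */
	if (ret != 0 && slot > 0)
		slot--;		/* key falls within the child at slot - 1 */
	return slot;
}
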
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
				BTRFS_MOD_LOG_KEY_REPLACE);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full. This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

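/*
 * The readahead above is limited to sibling blocks whose disk offsets fall
 * within 64KiB of the target block (READA_FORWARD_ALWAYS skips that window
 * check), and the scan stops once more than nread_max bytes have been read
 * ahead or more than 32 slots have been examined.
 */
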
static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}


/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
			      gen, parent_level - 1, &first_key);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date. Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}

/*
 * helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

1587
381cf658 1588int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
e33d5c3d
KN
1589 u64 iobjectid, u64 ioff, u8 key_type,
1590 struct btrfs_key *found_key)
1591{
1592 int ret;
1593 struct btrfs_key key;
1594 struct extent_buffer *eb;
381cf658
DS
1595
1596 ASSERT(path);
1d4c08e0 1597 ASSERT(found_key);
e33d5c3d
KN
1598
1599 key.type = key_type;
1600 key.objectid = iobjectid;
1601 key.offset = ioff;
1602
1603 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1d4c08e0 1604 if (ret < 0)
e33d5c3d
KN
1605 return ret;
1606
1607 eb = path->nodes[0];
1608 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1609 ret = btrfs_next_leaf(fs_root, path);
1610 if (ret)
1611 return ret;
1612 eb = path->nodes[0];
1613 }
1614
1615 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1616 if (found_key->type != key.type ||
1617 found_key->objectid != key.objectid)
1618 return 1;
1619
1620 return 0;
1621}
1622
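/*
 * btrfs_find_item() returns 0 and fills @found_key when an item with the
 * given objectid and type exists at or after @ioff, 1 when no such item is
 * found, and a negative errno on search failures. On success the path is
 * left pointing at the matching slot.
 */
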
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

d96b3424
FM
1705/*
1706 * Replace the extent buffer at the lowest level of the path with a cloned
1707 * version. The purpose is to be able to use it safely, after releasing the
1708 * commit root semaphore, even if relocation is happening in parallel, the
1709 * transaction used for relocation is committed and the extent buffer is
1710 * reallocated in the next transaction.
1711 *
1712 * This is used in a context where the caller does not prevent transaction
1713 * commits from happening, either by holding a transaction handle or holding
1714 * some lock, while it's doing searches through a commit root.
1715 * At the moment it's only used for send operations.
1716 */
1717static int finish_need_commit_sem_search(struct btrfs_path *path)
1718{
1719 const int i = path->lowest_level;
1720 const int slot = path->slots[i];
1721 struct extent_buffer *lowest = path->nodes[i];
1722 struct extent_buffer *clone;
1723
1724 ASSERT(path->need_commit_sem);
1725
1726 if (!lowest)
1727 return 0;
1728
1729 lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1730
1731 clone = btrfs_clone_extent_buffer(lowest);
1732 if (!clone)
1733 return -ENOMEM;
1734
1735 btrfs_release_path(path);
1736 path->nodes[i] = clone;
1737 path->slots[i] = slot;
1738
1739 return 0;
1740}
1fc28d8e 1741
e2e58d0f
FM
1742static inline int search_for_key_slot(struct extent_buffer *eb,
1743 int search_low_slot,
1744 const struct btrfs_key *key,
1745 int prev_cmp,
1746 int *slot)
1747{
1748 /*
1749 * If a previous call to btrfs_bin_search() on a parent node returned an
1750 * exact match (prev_cmp == 0), we can safely assume the target key will
1751 * always be at slot 0 on lower levels, since each key pointer
1752 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1753 * subtree it points to. Thus we can skip searching lower levels.
1754 */
1755 if (prev_cmp == 0) {
1756 *slot = 0;
1757 return 0;
1758 }
1759
1760 return generic_bin_search(eb, search_low_slot, key, slot);
1761}
1762
109324cf
FM
1763static int search_leaf(struct btrfs_trans_handle *trans,
1764 struct btrfs_root *root,
1765 const struct btrfs_key *key,
1766 struct btrfs_path *path,
1767 int ins_len,
1768 int prev_cmp)
1769{
1770 struct extent_buffer *leaf = path->nodes[0];
1771 int leaf_free_space = -1;
1772 int search_low_slot = 0;
1773 int ret;
1774 bool do_bin_search = true;
1775
1776 /*
1777 * If we are doing an insertion, the leaf has enough free space and the
1778 * destination slot for the key is not slot 0, then we can unlock our
1779 * write lock on the parent, and any other upper nodes, before doing the
1780 * binary search on the leaf (with search_for_key_slot()), allowing other
1781 * tasks to lock the parent and any other upper nodes.
1782 */
1783 if (ins_len > 0) {
1784 /*
1785 * Cache the leaf free space, since we will need it later and it
1786 * will not change until then.
1787 */
1788 leaf_free_space = btrfs_leaf_free_space(leaf);
1789
1790 /*
1791 * !path->locks[1] means we have a single node tree, the leaf is
1792 * the root of the tree.
1793 */
1794 if (path->locks[1] && leaf_free_space >= ins_len) {
1795 struct btrfs_disk_key first_key;
1796
1797 ASSERT(btrfs_header_nritems(leaf) > 0);
1798 btrfs_item_key(leaf, &first_key, 0);
1799
1800 /*
1801 * Doing the extra comparison with the first key is cheap,
1802 * taking into account that the first key is very likely
1803 * already in a cache line because it immediately follows
1804 * the extent buffer's header and we have recently accessed
1805 * the header's level field.
1806 */
1807 ret = comp_keys(&first_key, key);
1808 if (ret < 0) {
1809 /*
1810 * The first key is smaller than the key we want
1811 * to insert, so we are safe to unlock all upper
1812 * nodes and we have to do the binary search.
1813 *
1814 * We do use btrfs_unlock_up_safe() and not
 1815 * unlock_up() because the latter does not unlock
1816 * nodes with a slot of 0 - we can safely unlock
1817 * any node even if its slot is 0 since in this
1818 * case the key does not end up at slot 0 of the
1819 * leaf and there's no need to split the leaf.
1820 */
1821 btrfs_unlock_up_safe(path, 1);
1822 search_low_slot = 1;
1823 } else {
1824 /*
 1825 * The first key is >= the key we want to
1826 * insert, so we can skip the binary search as
1827 * the target key will be at slot 0.
1828 *
1829 * We can not unlock upper nodes when the key is
1830 * less than the first key, because we will need
1831 * to update the key at slot 0 of the parent node
1832 * and possibly of other upper nodes too.
1833 * If the key matches the first key, then we can
1834 * unlock all the upper nodes, using
1835 * btrfs_unlock_up_safe() instead of unlock_up()
1836 * as stated above.
1837 */
1838 if (ret == 0)
1839 btrfs_unlock_up_safe(path, 1);
1840 /*
1841 * ret is already 0 or 1, matching the result of
1842 * a btrfs_bin_search() call, so there is no need
1843 * to adjust it.
1844 */
1845 do_bin_search = false;
1846 path->slots[0] = 0;
1847 }
1848 }
1849 }
1850
1851 if (do_bin_search) {
1852 ret = search_for_key_slot(leaf, search_low_slot, key,
1853 prev_cmp, &path->slots[0]);
1854 if (ret < 0)
1855 return ret;
1856 }
1857
1858 if (ins_len > 0) {
1859 /*
1860 * Item key already exists. In this case, if we are allowed to
1861 * insert the item (for example, in dir_item case, item key
1862 * collision is allowed), it will be merged with the original
1863 * item. Only the item size grows, no new btrfs item will be
1864 * added. If search_for_extension is not set, ins_len already
 1865 * accounts for the size of struct btrfs_item, deduct it here so
 1866 * the leaf space check will be correct.
1867 */
1868 if (ret == 0 && !path->search_for_extension) {
1869 ASSERT(ins_len >= sizeof(struct btrfs_item));
1870 ins_len -= sizeof(struct btrfs_item);
1871 }
1872
1873 ASSERT(leaf_free_space >= 0);
1874
1875 if (leaf_free_space < ins_len) {
1876 int err;
1877
1878 err = split_leaf(trans, root, key, path, ins_len,
1879 (ret == 0));
bb8e9a60
FM
1880 ASSERT(err <= 0);
1881 if (WARN_ON(err > 0))
1882 err = -EUCLEAN;
109324cf
FM
1883 if (err)
1884 ret = err;
1885 }
1886 }
1887
1888 return ret;
1889}
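An informal recap of the insertion fast path above; this only summarizes the branches in search_leaf(), it does not add behaviour.

/*
 * With ins_len > 0, path->locks[1] held and enough free space in the leaf,
 * the result of comp_keys(&first_key, key) decides what we can skip:
 *
 *	< 0:  key lands at slot >= 1: unlock the upper nodes and do the
 *	      binary search starting from slot 1
 *	== 0: key is at slot 0: unlock the upper nodes and skip the search
 *	> 0:  key would become the new slot 0: keep the upper nodes locked
 *	      and skip the search
 */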
1890
74123bd7 1891/*
4271ecea
NB
1892 * btrfs_search_slot - look for a key in a tree and perform necessary
1893 * modifications to preserve tree invariants.
74123bd7 1894 *
4271ecea
NB
1895 * @trans: Handle of transaction, used when modifying the tree
1896 * @p: Holds all btree nodes along the search path
1897 * @root: The root node of the tree
1898 * @key: The key we are looking for
9a664971 1899 * @ins_len: Indicates purpose of search:
1900 * >0 for inserts it's size of item inserted (*)
1901 * <0 for deletions
1902 * 0 for plain searches, not modifying the tree
1903 *
1904 * (*) If size of item inserted doesn't include
1905 * sizeof(struct btrfs_item), then p->search_for_extension must
1906 * be set.
4271ecea
NB
 1907 * @cow: whether CoW operations should be performed. Must always be 1
1908 * when modifying the tree.
97571fd0 1909 *
4271ecea
NB
1910 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1911 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1912 *
1913 * If @key is found, 0 is returned and you can find the item in the leaf level
1914 * of the path (level 0)
1915 *
1916 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1917 * points to the slot where it should be inserted
1918 *
1919 * If an error is encountered while searching the tree a negative error number
1920 * is returned
74123bd7 1921 */
310712b2
OS
1922int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1923 const struct btrfs_key *key, struct btrfs_path *p,
1924 int ins_len, int cow)
be0e5c09 1925{
d96b3424 1926 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397 1927 struct extent_buffer *b;
be0e5c09
CM
1928 int slot;
1929 int ret;
33c66f43 1930 int err;
be0e5c09 1931 int level;
925baedd 1932 int lowest_unlock = 1;
bd681513
CM
1933 /* everything at write_lock_level or lower must be write locked */
1934 int write_lock_level = 0;
9f3a7427 1935 u8 lowest_level = 0;
f7c79f30 1936 int min_write_lock_level;
d7396f07 1937 int prev_cmp;
9f3a7427 1938
6702ed49 1939 lowest_level = p->lowest_level;
323ac95b 1940 WARN_ON(lowest_level && ins_len > 0);
22b0ebda 1941 WARN_ON(p->nodes[0] != NULL);
eb653de1 1942 BUG_ON(!cow && ins_len);
25179201 1943
857bc13f
JB
1944 /*
1945 * For now only allow nowait for read only operations. There's no
 1946 * strict reason why we can't; we just only need it for reads, so it's
1947 * only implemented for reads.
1948 */
1949 ASSERT(!p->nowait || !cow);
1950
bd681513 1951 if (ins_len < 0) {
925baedd 1952 lowest_unlock = 2;
65b51a00 1953
bd681513
CM
1954 /* when we are removing items, we might have to go up to level
1955 * two as we update tree pointers Make sure we keep write
1956 * for those levels as well
1957 */
1958 write_lock_level = 2;
1959 } else if (ins_len > 0) {
1960 /*
1961 * for inserting items, make sure we have a write lock on
1962 * level 1 so we can update keys
1963 */
1964 write_lock_level = 1;
1965 }
1966
1967 if (!cow)
1968 write_lock_level = -1;
1969
09a2a8f9 1970 if (cow && (p->keep_locks || p->lowest_level))
bd681513
CM
1971 write_lock_level = BTRFS_MAX_LEVEL;
1972
f7c79f30
CM
1973 min_write_lock_level = write_lock_level;
1974
d96b3424
FM
1975 if (p->need_commit_sem) {
1976 ASSERT(p->search_commit_root);
857bc13f
JB
1977 if (p->nowait) {
1978 if (!down_read_trylock(&fs_info->commit_root_sem))
1979 return -EAGAIN;
1980 } else {
1981 down_read(&fs_info->commit_root_sem);
1982 }
d96b3424
FM
1983 }
1984
bb803951 1985again:
d7396f07 1986 prev_cmp = -1;
1fc28d8e 1987 b = btrfs_search_slot_get_root(root, p, write_lock_level);
be6821f8
FM
1988 if (IS_ERR(b)) {
1989 ret = PTR_ERR(b);
1990 goto done;
1991 }
925baedd 1992
eb60ceac 1993 while (b) {
f624d976
QW
1994 int dec = 0;
1995
5f39d397 1996 level = btrfs_header_level(b);
65b51a00 1997
02217ed2 1998 if (cow) {
9ea2c7c9
NB
1999 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2000
c8c42864
CM
2001 /*
2002 * if we don't really need to cow this block
2003 * then we don't want to set the path blocking,
2004 * so we test it here
2005 */
5963ffca 2006 if (!should_cow_block(trans, root, b))
65b51a00 2007 goto cow_done;
5d4f98a2 2008
bd681513
CM
2009 /*
2010 * must have write locks on this node and the
2011 * parent
2012 */
5124e00e
JB
2013 if (level > write_lock_level ||
2014 (level + 1 > write_lock_level &&
2015 level + 1 < BTRFS_MAX_LEVEL &&
2016 p->nodes[level + 1])) {
bd681513
CM
2017 write_lock_level = level + 1;
2018 btrfs_release_path(p);
2019 goto again;
2020 }
2021
9ea2c7c9
NB
2022 if (last_level)
2023 err = btrfs_cow_block(trans, root, b, NULL, 0,
9631e4cc
JB
2024 &b,
2025 BTRFS_NESTING_COW);
9ea2c7c9
NB
2026 else
2027 err = btrfs_cow_block(trans, root, b,
2028 p->nodes[level + 1],
9631e4cc
JB
2029 p->slots[level + 1], &b,
2030 BTRFS_NESTING_COW);
33c66f43 2031 if (err) {
33c66f43 2032 ret = err;
65b51a00 2033 goto done;
54aa1f4d 2034 }
02217ed2 2035 }
65b51a00 2036cow_done:
eb60ceac 2037 p->nodes[level] = b;
b4ce94de
CM
2038
2039 /*
2040 * we have a lock on b and as long as we aren't changing
 2041 * the tree, there is no way for the items in b to change.
2042 * It is safe to drop the lock on our parent before we
2043 * go through the expensive btree search on b.
2044 *
eb653de1
FDBM
2045 * If we're inserting or deleting (ins_len != 0), then we might
2046 * be changing slot zero, which may require changing the parent.
2047 * So, we can't drop the lock until after we know which slot
2048 * we're operating on.
b4ce94de 2049 */
eb653de1
FDBM
2050 if (!ins_len && !p->keep_locks) {
2051 int u = level + 1;
2052
2053 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2054 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2055 p->locks[u] = 0;
2056 }
2057 }
b4ce94de 2058
e2e58d0f 2059 if (level == 0) {
109324cf 2060 if (ins_len > 0)
e5e1c174 2061 ASSERT(write_lock_level >= 1);
bd681513 2062
109324cf 2063 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
459931ec 2064 if (!p->search_for_split)
f7c79f30 2065 unlock_up(p, level, lowest_unlock,
4b6f8e96 2066 min_write_lock_level, NULL);
65b51a00 2067 goto done;
be0e5c09 2068 }
e2e58d0f
FM
2069
2070 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2071 if (ret < 0)
2072 goto done;
2073 prev_cmp = ret;
2074
f624d976
QW
2075 if (ret && slot > 0) {
2076 dec = 1;
2077 slot--;
2078 }
2079 p->slots[level] = slot;
2080 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2081 &write_lock_level);
2082 if (err == -EAGAIN)
2083 goto again;
2084 if (err) {
2085 ret = err;
2086 goto done;
2087 }
2088 b = p->nodes[level];
2089 slot = p->slots[level];
2090
2091 /*
2092 * Slot 0 is special, if we change the key we have to update
2093 * the parent pointer which means we must have a write lock on
2094 * the parent
2095 */
2096 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2097 write_lock_level = level + 1;
2098 btrfs_release_path(p);
2099 goto again;
2100 }
2101
2102 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2103 &write_lock_level);
2104
2105 if (level == lowest_level) {
2106 if (dec)
2107 p->slots[level]++;
2108 goto done;
2109 }
2110
2111 err = read_block_for_search(root, p, &b, level, slot, key);
2112 if (err == -EAGAIN)
2113 goto again;
2114 if (err) {
2115 ret = err;
2116 goto done;
2117 }
2118
2119 if (!p->skip_locking) {
2120 level = btrfs_header_level(b);
b40130b2
JB
2121
2122 btrfs_maybe_reset_lockdep_class(root, b);
2123
f624d976 2124 if (level <= write_lock_level) {
ac5887c8 2125 btrfs_tree_lock(b);
f624d976
QW
2126 p->locks[level] = BTRFS_WRITE_LOCK;
2127 } else {
857bc13f
JB
2128 if (p->nowait) {
2129 if (!btrfs_try_tree_read_lock(b)) {
2130 free_extent_buffer(b);
2131 ret = -EAGAIN;
2132 goto done;
2133 }
2134 } else {
2135 btrfs_tree_read_lock(b);
2136 }
f624d976
QW
2137 p->locks[level] = BTRFS_READ_LOCK;
2138 }
2139 p->nodes[level] = b;
2140 }
be0e5c09 2141 }
65b51a00
CM
2142 ret = 1;
2143done:
5f5bc6b1 2144 if (ret < 0 && !p->skip_release_on_error)
b3b4aa74 2145 btrfs_release_path(p);
d96b3424
FM
2146
2147 if (p->need_commit_sem) {
2148 int ret2;
2149
2150 ret2 = finish_need_commit_sem_search(p);
2151 up_read(&fs_info->commit_root_sem);
2152 if (ret2)
2153 ret = ret2;
2154 }
2155
65b51a00 2156 return ret;
be0e5c09 2157}
f75e2b79 2158ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
be0e5c09 2159
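A minimal read-only usage sketch of btrfs_search_slot(); the wrapper is an illustrative assumption, while the return value handling follows the contract documented above.

/* Sketch: plain read-only lookup of @key (illustrative wrapper). */
static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* No transaction handle, ins_len == 0, cow == 0: nothing is modified. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* Exact match at path->nodes[0], path->slots[0]. */
	} else if (ret > 0) {
		/* Not found: the slot is where the key would be inserted. */
		ret = -ENOENT;
	}
	btrfs_free_path(path);
	return ret;
}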
5d9e75c4
JS
2160/*
2161 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2162 * current state of the tree together with the operations recorded in the tree
2163 * modification log to search for the key in a previous version of this tree, as
2164 * denoted by the time_seq parameter.
2165 *
2166 * Naturally, there is no support for insert, delete or cow operations.
2167 *
2168 * The resulting path and return value will be set up as if we called
2169 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2170 */
310712b2 2171int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
5d9e75c4
JS
2172 struct btrfs_path *p, u64 time_seq)
2173{
0b246afa 2174 struct btrfs_fs_info *fs_info = root->fs_info;
5d9e75c4
JS
2175 struct extent_buffer *b;
2176 int slot;
2177 int ret;
2178 int err;
2179 int level;
2180 int lowest_unlock = 1;
2181 u8 lowest_level = 0;
2182
2183 lowest_level = p->lowest_level;
2184 WARN_ON(p->nodes[0] != NULL);
c922b016 2185 ASSERT(!p->nowait);
5d9e75c4
JS
2186
2187 if (p->search_commit_root) {
2188 BUG_ON(time_seq);
2189 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2190 }
2191
2192again:
f3a84ccd 2193 b = btrfs_get_old_root(root, time_seq);
315bed43
NB
2194 if (!b) {
2195 ret = -EIO;
2196 goto done;
2197 }
5d9e75c4 2198 level = btrfs_header_level(b);
5d9e75c4
JS
2199 p->locks[level] = BTRFS_READ_LOCK;
2200
2201 while (b) {
abe9339d
QW
2202 int dec = 0;
2203
5d9e75c4
JS
2204 level = btrfs_header_level(b);
2205 p->nodes[level] = b;
5d9e75c4
JS
2206
2207 /*
2208 * we have a lock on b and as long as we aren't changing
 2209 * the tree, there is no way for the items in b to change.
2210 * It is safe to drop the lock on our parent before we
2211 * go through the expensive btree search on b.
2212 */
2213 btrfs_unlock_up_safe(p, level + 1);
2214
995e9a16 2215 ret = btrfs_bin_search(b, key, &slot);
cbca7d59
FM
2216 if (ret < 0)
2217 goto done;
5d9e75c4 2218
abe9339d 2219 if (level == 0) {
5d9e75c4
JS
2220 p->slots[level] = slot;
2221 unlock_up(p, level, lowest_unlock, 0, NULL);
abe9339d
QW
2222 goto done;
2223 }
5d9e75c4 2224
abe9339d
QW
2225 if (ret && slot > 0) {
2226 dec = 1;
2227 slot--;
2228 }
2229 p->slots[level] = slot;
2230 unlock_up(p, level, lowest_unlock, 0, NULL);
5d9e75c4 2231
abe9339d
QW
2232 if (level == lowest_level) {
2233 if (dec)
2234 p->slots[level]++;
2235 goto done;
2236 }
5d9e75c4 2237
abe9339d
QW
2238 err = read_block_for_search(root, p, &b, level, slot, key);
2239 if (err == -EAGAIN)
2240 goto again;
2241 if (err) {
2242 ret = err;
5d9e75c4
JS
2243 goto done;
2244 }
abe9339d
QW
2245
2246 level = btrfs_header_level(b);
ac5887c8 2247 btrfs_tree_read_lock(b);
f3a84ccd 2248 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
abe9339d
QW
2249 if (!b) {
2250 ret = -ENOMEM;
2251 goto done;
2252 }
2253 p->locks[level] = BTRFS_READ_LOCK;
2254 p->nodes[level] = b;
5d9e75c4
JS
2255 }
2256 ret = 1;
2257done:
5d9e75c4
JS
2258 if (ret < 0)
2259 btrfs_release_path(p);
2260
2261 return ret;
2262}
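A minimal sketch of a past-version lookup, assuming the caller already obtained a time_seq from the tree modification log (the way backref walking does); that setup is not shown and is an assumption.

/* Sketch: look up @key as the tree looked at @time_seq (illustrative). */
static int example_old_lookup(struct btrfs_root *root,
			      const struct btrfs_key *key, u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Read only by definition: no insert, delete or COW is possible. */
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	/* Same return convention as btrfs_search_slot() with ins_len == cow == 0. */
	btrfs_free_path(path);
	return ret;
}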
2263
2f38b3e1
AJ
2264/*
 2265 * helper to use instead of btrfs_search_slot() if no exact match is
 2266 * needed and the next or previous item should be returned instead.
2267 * When find_higher is true, the next higher item is returned, the next lower
2268 * otherwise.
2269 * When return_any and find_higher are both true, and no higher item is found,
2270 * return the next lower instead.
2271 * When return_any is true and find_higher is false, and no lower item is found,
2272 * return the next higher instead.
2273 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2274 * < 0 on error
2275 */
2276int btrfs_search_slot_for_read(struct btrfs_root *root,
310712b2
OS
2277 const struct btrfs_key *key,
2278 struct btrfs_path *p, int find_higher,
2279 int return_any)
2f38b3e1
AJ
2280{
2281 int ret;
2282 struct extent_buffer *leaf;
2283
2284again:
2285 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2286 if (ret <= 0)
2287 return ret;
2288 /*
2289 * a return value of 1 means the path is at the position where the
2290 * item should be inserted. Normally this is the next bigger item,
2291 * but in case the previous item is the last in a leaf, path points
2292 * to the first free slot in the previous leaf, i.e. at an invalid
2293 * item.
2294 */
2295 leaf = p->nodes[0];
2296
2297 if (find_higher) {
2298 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2299 ret = btrfs_next_leaf(root, p);
2300 if (ret <= 0)
2301 return ret;
2302 if (!return_any)
2303 return 1;
2304 /*
2305 * no higher item found, return the next
2306 * lower instead
2307 */
2308 return_any = 0;
2309 find_higher = 0;
2310 btrfs_release_path(p);
2311 goto again;
2312 }
2313 } else {
e6793769
AJ
2314 if (p->slots[0] == 0) {
2315 ret = btrfs_prev_leaf(root, p);
2316 if (ret < 0)
2317 return ret;
2318 if (!ret) {
23c6bf6a
FDBM
2319 leaf = p->nodes[0];
2320 if (p->slots[0] == btrfs_header_nritems(leaf))
2321 p->slots[0]--;
e6793769 2322 return 0;
2f38b3e1 2323 }
e6793769
AJ
2324 if (!return_any)
2325 return 1;
2326 /*
2327 * no lower item found, return the next
2328 * higher instead
2329 */
2330 return_any = 0;
2331 find_higher = 1;
2332 btrfs_release_path(p);
2333 goto again;
2334 } else {
2f38b3e1
AJ
2335 --p->slots[0];
2336 }
2337 }
2338 return 0;
2339}
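A short usage sketch of btrfs_search_slot_for_read(), positioning the path at the first item at or after a key and falling back to the closest lower item when nothing higher exists; the wrapper itself is an illustrative assumption.

/* Sketch: find the item at or after @key, else the closest lower one. */
static int example_find_nearest(struct btrfs_root *root, struct btrfs_key *key,
				struct btrfs_path *path)
{
	int ret;

	/* find_higher == 1, return_any == 1 */
	ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

	return ret;	/* 0 found, 1 tree empty, < 0 error */
}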
2340
0ff40a91
MPS
2341/*
2342 * Execute search and call btrfs_previous_item to traverse backwards if the item
2343 * was not found.
2344 *
2345 * Return 0 if found, 1 if not found and < 0 if error.
2346 */
2347int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2348 struct btrfs_path *path)
2349{
2350 int ret;
2351
2352 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2353 if (ret > 0)
2354 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2355
2356 if (ret == 0)
2357 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2358
2359 return ret;
2360}
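A short sketch of btrfs_search_backwards(): it positions the path at @key when present, otherwise at the previous item with the same objectid and type, and rewrites @key with what was found. The wrapper is illustrative.

/* Sketch: last item of (@objectid, @type) at or before @offset (illustrative). */
static int example_search_backwards(struct btrfs_root *root, u64 objectid,
				    u8 type, u64 offset,
				    struct btrfs_path *path)
{
	struct btrfs_key key;

	key.objectid = objectid;
	key.type = type;
	key.offset = offset;

	/* On success @key is overwritten with the key actually found. */
	return btrfs_search_backwards(root, &key, path);
}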
2361
62142be3
GN
2362/**
2363 * Search for a valid slot for the given path.
2364 *
2365 * @root: The root node of the tree.
2366 * @key: Will contain a valid item if found.
2367 * @path: The starting point to validate the slot.
2368 *
2369 * Return: 0 if the item is valid
2370 * 1 if not found
2371 * <0 if error.
2372 */
2373int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2374 struct btrfs_path *path)
2375{
2376 while (1) {
2377 int ret;
2378 const int slot = path->slots[0];
2379 const struct extent_buffer *leaf = path->nodes[0];
2380
2381 /* This is where we start walking the path. */
2382 if (slot >= btrfs_header_nritems(leaf)) {
2383 /*
2384 * If we've reached the last slot in this leaf we need
2385 * to go to the next leaf and reset the path.
2386 */
2387 ret = btrfs_next_leaf(root, path);
2388 if (ret)
2389 return ret;
2390 continue;
2391 }
2392 /* Store the found, valid item in @key. */
2393 btrfs_item_key_to_cpu(leaf, key, slot);
2394 break;
2395 }
2396 return 0;
2397}
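A sketch of the iteration pattern btrfs_get_next_valid_item() is meant for: position the path once, then repeatedly validate the slot, use the item and advance. The wrapper and its starting key are illustrative assumptions.

/* Sketch: walk all items of @root starting at @first_key (illustrative). */
static int example_iterate(struct btrfs_root *root,
			   const struct btrfs_key *first_key)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first_key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		ret = btrfs_get_next_valid_item(root, &key, path);
		if (ret) {
			if (ret > 0)	/* walked past the last item */
				ret = 0;
			break;
		}
		/* ... use path->nodes[0], path->slots[0] and @key here ... */
		path->slots[0]++;
	}
out:
	btrfs_free_path(path);
	return ret;
}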
2398
74123bd7
CM
2399/*
2400 * adjust the pointers going up the tree, starting at level
2401 * making sure the right key of each node is points to 'key'.
2402 * This is used after shifting pointers to the left, so it stops
2403 * fixing up pointers when a given leaf/node is not in slot 0 of the
2404 * higher levels
aa5d6bed 2405 *
74123bd7 2406 */
b167fa91 2407static void fixup_low_keys(struct btrfs_path *path,
143bede5 2408 struct btrfs_disk_key *key, int level)
be0e5c09
CM
2409{
2410 int i;
5f39d397 2411 struct extent_buffer *t;
0e82bcfe 2412 int ret;
5f39d397 2413
234b63a0 2414 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
be0e5c09 2415 int tslot = path->slots[i];
0e82bcfe 2416
eb60ceac 2417 if (!path->nodes[i])
be0e5c09 2418 break;
5f39d397 2419 t = path->nodes[i];
f3a84ccd 2420 ret = btrfs_tree_mod_log_insert_key(t, tslot,
33cff222 2421 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 2422 BUG_ON(ret < 0);
5f39d397 2423 btrfs_set_node_key(t, key, tslot);
d6025579 2424 btrfs_mark_buffer_dirty(path->nodes[i]);
be0e5c09
CM
2425 if (tslot != 0)
2426 break;
2427 }
2428}
2429
31840ae1
ZY
2430/*
2431 * update item key.
2432 *
2433 * This function isn't completely safe. It's the caller's responsibility
2434 * that the new key won't break the order
2435 */
b7a0365e
DD
2436void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2437 struct btrfs_path *path,
310712b2 2438 const struct btrfs_key *new_key)
31840ae1
ZY
2439{
2440 struct btrfs_disk_key disk_key;
2441 struct extent_buffer *eb;
2442 int slot;
2443
2444 eb = path->nodes[0];
2445 slot = path->slots[0];
2446 if (slot > 0) {
2447 btrfs_item_key(eb, &disk_key, slot - 1);
7c15d410
QW
2448 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2449 btrfs_crit(fs_info,
2450 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2451 slot, btrfs_disk_key_objectid(&disk_key),
2452 btrfs_disk_key_type(&disk_key),
2453 btrfs_disk_key_offset(&disk_key),
2454 new_key->objectid, new_key->type,
2455 new_key->offset);
2456 btrfs_print_leaf(eb);
2457 BUG();
2458 }
31840ae1
ZY
2459 }
2460 if (slot < btrfs_header_nritems(eb) - 1) {
2461 btrfs_item_key(eb, &disk_key, slot + 1);
7c15d410
QW
2462 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2463 btrfs_crit(fs_info,
2464 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2465 slot, btrfs_disk_key_objectid(&disk_key),
2466 btrfs_disk_key_type(&disk_key),
2467 btrfs_disk_key_offset(&disk_key),
2468 new_key->objectid, new_key->type,
2469 new_key->offset);
2470 btrfs_print_leaf(eb);
2471 BUG();
2472 }
31840ae1
ZY
2473 }
2474
2475 btrfs_cpu_key_to_disk(&disk_key, new_key);
2476 btrfs_set_item_key(eb, &disk_key, slot);
2477 btrfs_mark_buffer_dirty(eb);
2478 if (slot == 0)
b167fa91 2479 fixup_low_keys(path, &disk_key, 1);
31840ae1
ZY
2480}
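A sketch of how btrfs_set_item_key_safe() is typically used: the caller has already adjusted the item's contents so that a new, still correctly ordered key describes it (for example when narrowing a file extent item) and only then rewrites the key in place. The wrapper below is an illustrative assumption.

/* Sketch: move the current item's key offset forward (illustrative wrapper). */
static void example_bump_key_offset(struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	/* The caller must guarantee the new key keeps the leaf ordered. */
	new_key.offset = new_offset;
	btrfs_set_item_key_safe(fs_info, path, &new_key);
}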
2481
d16c702f
QW
2482/*
2483 * Check key order of two sibling extent buffers.
2484 *
2485 * Return true if something is wrong.
2486 * Return false if everything is fine.
2487 *
2488 * Tree-checker only works inside one tree block, thus the following
2489 * corruption can not be detected by tree-checker:
2490 *
2491 * Leaf @left | Leaf @right
2492 * --------------------------------------------------------------
2493 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2494 *
2495 * Key f6 in leaf @left itself is valid, but not valid when the next
2496 * key in leaf @right is 7.
2497 * This can only be checked at tree block merge time.
2498 * And since tree checker has ensured all key order in each tree block
2499 * is correct, we only need to bother the last key of @left and the first
2500 * key of @right.
2501 */
2502static bool check_sibling_keys(struct extent_buffer *left,
2503 struct extent_buffer *right)
2504{
2505 struct btrfs_key left_last;
2506 struct btrfs_key right_first;
2507 int level = btrfs_header_level(left);
2508 int nr_left = btrfs_header_nritems(left);
2509 int nr_right = btrfs_header_nritems(right);
2510
2511 /* No key to check in one of the tree blocks */
2512 if (!nr_left || !nr_right)
2513 return false;
2514
2515 if (level) {
2516 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2517 btrfs_node_key_to_cpu(right, &right_first, 0);
2518 } else {
2519 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2520 btrfs_item_key_to_cpu(right, &right_first, 0);
2521 }
2522
2523 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2524 btrfs_crit(left->fs_info,
2525"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2526 left_last.objectid, left_last.type,
2527 left_last.offset, right_first.objectid,
2528 right_first.type, right_first.offset);
2529 return true;
2530 }
2531 return false;
2532}
2533
74123bd7
CM
2534/*
2535 * try to push data from one node into the next node left in the
79f95c82 2536 * tree.
aa5d6bed
CM
2537 *
2538 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2539 * error, and > 0 if there was no room in the left hand block.
74123bd7 2540 */
98ed5174 2541static int push_node_left(struct btrfs_trans_handle *trans,
2ff7e61e 2542 struct extent_buffer *dst,
971a1f66 2543 struct extent_buffer *src, int empty)
be0e5c09 2544{
d30a668f 2545 struct btrfs_fs_info *fs_info = trans->fs_info;
be0e5c09 2546 int push_items = 0;
bb803951
CM
2547 int src_nritems;
2548 int dst_nritems;
aa5d6bed 2549 int ret = 0;
be0e5c09 2550
5f39d397
CM
2551 src_nritems = btrfs_header_nritems(src);
2552 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2553 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
7bb86316
CM
2554 WARN_ON(btrfs_header_generation(src) != trans->transid);
2555 WARN_ON(btrfs_header_generation(dst) != trans->transid);
54aa1f4d 2556
bce4eae9 2557 if (!empty && src_nritems <= 8)
971a1f66
CM
2558 return 1;
2559
d397712b 2560 if (push_items <= 0)
be0e5c09
CM
2561 return 1;
2562
bce4eae9 2563 if (empty) {
971a1f66 2564 push_items = min(src_nritems, push_items);
bce4eae9
CM
2565 if (push_items < src_nritems) {
2566 /* leave at least 8 pointers in the node if
2567 * we aren't going to empty it
2568 */
2569 if (src_nritems - push_items < 8) {
2570 if (push_items <= 8)
2571 return 1;
2572 push_items -= 8;
2573 }
2574 }
2575 } else
2576 push_items = min(src_nritems - 8, push_items);
79f95c82 2577
d16c702f
QW
2578 /* dst is the left eb, src is the middle eb */
2579 if (check_sibling_keys(dst, src)) {
2580 ret = -EUCLEAN;
2581 btrfs_abort_transaction(trans, ret);
2582 return ret;
2583 }
f3a84ccd 2584 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
5de865ee 2585 if (ret) {
66642832 2586 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2587 return ret;
2588 }
5f39d397
CM
2589 copy_extent_buffer(dst, src,
2590 btrfs_node_key_ptr_offset(dst_nritems),
2591 btrfs_node_key_ptr_offset(0),
d397712b 2592 push_items * sizeof(struct btrfs_key_ptr));
5f39d397 2593
bb803951 2594 if (push_items < src_nritems) {
57911b8b 2595 /*
f3a84ccd
FM
2596 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2597 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
57911b8b 2598 */
5f39d397
CM
2599 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2600 btrfs_node_key_ptr_offset(push_items),
2601 (src_nritems - push_items) *
2602 sizeof(struct btrfs_key_ptr));
2603 }
2604 btrfs_set_header_nritems(src, src_nritems - push_items);
2605 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2606 btrfs_mark_buffer_dirty(src);
2607 btrfs_mark_buffer_dirty(dst);
31840ae1 2608
79f95c82
CM
2609 return ret;
2610}
2611
2612/*
2613 * try to push data from one node into the next node right in the
2614 * tree.
2615 *
2616 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2617 * error, and > 0 if there was no room in the right hand block.
2618 *
2619 * this will only push up to 1/2 the contents of the left node over
2620 */
5f39d397 2621static int balance_node_right(struct btrfs_trans_handle *trans,
5f39d397
CM
2622 struct extent_buffer *dst,
2623 struct extent_buffer *src)
79f95c82 2624{
55d32ed8 2625 struct btrfs_fs_info *fs_info = trans->fs_info;
79f95c82
CM
2626 int push_items = 0;
2627 int max_push;
2628 int src_nritems;
2629 int dst_nritems;
2630 int ret = 0;
79f95c82 2631
7bb86316
CM
2632 WARN_ON(btrfs_header_generation(src) != trans->transid);
2633 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2634
5f39d397
CM
2635 src_nritems = btrfs_header_nritems(src);
2636 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2637 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
d397712b 2638 if (push_items <= 0)
79f95c82 2639 return 1;
bce4eae9 2640
d397712b 2641 if (src_nritems < 4)
bce4eae9 2642 return 1;
79f95c82
CM
2643
2644 max_push = src_nritems / 2 + 1;
2645 /* don't try to empty the node */
d397712b 2646 if (max_push >= src_nritems)
79f95c82 2647 return 1;
252c38f0 2648
79f95c82
CM
2649 if (max_push < push_items)
2650 push_items = max_push;
2651
d16c702f
QW
2652 /* dst is the right eb, src is the middle eb */
2653 if (check_sibling_keys(src, dst)) {
2654 ret = -EUCLEAN;
2655 btrfs_abort_transaction(trans, ret);
2656 return ret;
2657 }
f3a84ccd 2658 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
bf1d3425 2659 BUG_ON(ret < 0);
5f39d397
CM
2660 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2661 btrfs_node_key_ptr_offset(0),
2662 (dst_nritems) *
2663 sizeof(struct btrfs_key_ptr));
d6025579 2664
f3a84ccd
FM
2665 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2666 push_items);
5de865ee 2667 if (ret) {
66642832 2668 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2669 return ret;
2670 }
5f39d397
CM
2671 copy_extent_buffer(dst, src,
2672 btrfs_node_key_ptr_offset(0),
2673 btrfs_node_key_ptr_offset(src_nritems - push_items),
d397712b 2674 push_items * sizeof(struct btrfs_key_ptr));
79f95c82 2675
5f39d397
CM
2676 btrfs_set_header_nritems(src, src_nritems - push_items);
2677 btrfs_set_header_nritems(dst, dst_nritems + push_items);
79f95c82 2678
5f39d397
CM
2679 btrfs_mark_buffer_dirty(src);
2680 btrfs_mark_buffer_dirty(dst);
31840ae1 2681
aa5d6bed 2682 return ret;
be0e5c09
CM
2683}
2684
97571fd0
CM
2685/*
2686 * helper function to insert a new root level in the tree.
2687 * A new node is allocated, and a single item is inserted to
2688 * point to the existing root
aa5d6bed
CM
2689 *
2690 * returns zero on success or < 0 on failure.
97571fd0 2691 */
d397712b 2692static noinline int insert_new_root(struct btrfs_trans_handle *trans,
5f39d397 2693 struct btrfs_root *root,
fdd99c72 2694 struct btrfs_path *path, int level)
5c680ed6 2695{
0b246afa 2696 struct btrfs_fs_info *fs_info = root->fs_info;
7bb86316 2697 u64 lower_gen;
5f39d397
CM
2698 struct extent_buffer *lower;
2699 struct extent_buffer *c;
925baedd 2700 struct extent_buffer *old;
5f39d397 2701 struct btrfs_disk_key lower_key;
d9d19a01 2702 int ret;
5c680ed6
CM
2703
2704 BUG_ON(path->nodes[level]);
2705 BUG_ON(path->nodes[level-1] != root->node);
2706
7bb86316
CM
2707 lower = path->nodes[level-1];
2708 if (level == 1)
2709 btrfs_item_key(lower, &lower_key, 0);
2710 else
2711 btrfs_node_key(lower, &lower_key, 0);
2712
79bd3712
FM
2713 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2714 &lower_key, level, root->node->start, 0,
2715 BTRFS_NESTING_NEW_ROOT);
5f39d397
CM
2716 if (IS_ERR(c))
2717 return PTR_ERR(c);
925baedd 2718
0b246afa 2719 root_add_used(root, fs_info->nodesize);
f0486c68 2720
5f39d397 2721 btrfs_set_header_nritems(c, 1);
5f39d397 2722 btrfs_set_node_key(c, &lower_key, 0);
db94535d 2723 btrfs_set_node_blockptr(c, 0, lower->start);
7bb86316 2724 lower_gen = btrfs_header_generation(lower);
31840ae1 2725 WARN_ON(lower_gen != trans->transid);
7bb86316
CM
2726
2727 btrfs_set_node_ptr_generation(c, 0, lower_gen);
d5719762 2728
5f39d397 2729 btrfs_mark_buffer_dirty(c);
d5719762 2730
925baedd 2731 old = root->node;
406808ab 2732 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
d9d19a01 2733 BUG_ON(ret < 0);
240f62c8 2734 rcu_assign_pointer(root->node, c);
925baedd
CM
2735
2736 /* the super has an extra ref to root->node */
2737 free_extent_buffer(old);
2738
0b86a832 2739 add_root_to_dirty_list(root);
67439dad 2740 atomic_inc(&c->refs);
5f39d397 2741 path->nodes[level] = c;
ac5887c8 2742 path->locks[level] = BTRFS_WRITE_LOCK;
5c680ed6
CM
2743 path->slots[level] = 0;
2744 return 0;
2745}
2746
74123bd7
CM
2747/*
2748 * worker function to insert a single pointer in a node.
2749 * the node should have enough room for the pointer already
97571fd0 2750 *
74123bd7
CM
2751 * slot and level indicate where you want the key to go, and
 2752 * bytenr is the block the key points to.
2753 */
143bede5 2754static void insert_ptr(struct btrfs_trans_handle *trans,
6ad3cf6d 2755 struct btrfs_path *path,
143bede5 2756 struct btrfs_disk_key *key, u64 bytenr,
c3e06965 2757 int slot, int level)
74123bd7 2758{
5f39d397 2759 struct extent_buffer *lower;
74123bd7 2760 int nritems;
f3ea38da 2761 int ret;
5c680ed6
CM
2762
2763 BUG_ON(!path->nodes[level]);
49d0c642 2764 btrfs_assert_tree_write_locked(path->nodes[level]);
5f39d397
CM
2765 lower = path->nodes[level];
2766 nritems = btrfs_header_nritems(lower);
c293498b 2767 BUG_ON(slot > nritems);
6ad3cf6d 2768 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
74123bd7 2769 if (slot != nritems) {
bf1d3425 2770 if (level) {
f3a84ccd
FM
2771 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2772 slot, nritems - slot);
bf1d3425
DS
2773 BUG_ON(ret < 0);
2774 }
5f39d397
CM
2775 memmove_extent_buffer(lower,
2776 btrfs_node_key_ptr_offset(slot + 1),
2777 btrfs_node_key_ptr_offset(slot),
d6025579 2778 (nritems - slot) * sizeof(struct btrfs_key_ptr));
74123bd7 2779 }
c3e06965 2780 if (level) {
f3a84ccd 2781 ret = btrfs_tree_mod_log_insert_key(lower, slot,
33cff222 2782 BTRFS_MOD_LOG_KEY_ADD);
f3ea38da
JS
2783 BUG_ON(ret < 0);
2784 }
5f39d397 2785 btrfs_set_node_key(lower, key, slot);
db94535d 2786 btrfs_set_node_blockptr(lower, slot, bytenr);
74493f7a
CM
2787 WARN_ON(trans->transid == 0);
2788 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
5f39d397
CM
2789 btrfs_set_header_nritems(lower, nritems + 1);
2790 btrfs_mark_buffer_dirty(lower);
74123bd7
CM
2791}
2792
97571fd0
CM
2793/*
2794 * split the node at the specified level in path in two.
2795 * The path is corrected to point to the appropriate node after the split
2796 *
2797 * Before splitting this tries to make some room in the node by pushing
 2798 * left and right; if either one works, it returns right away.
aa5d6bed
CM
2799 *
2800 * returns 0 on success and < 0 on failure
97571fd0 2801 */
e02119d5
CM
2802static noinline int split_node(struct btrfs_trans_handle *trans,
2803 struct btrfs_root *root,
2804 struct btrfs_path *path, int level)
be0e5c09 2805{
0b246afa 2806 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
2807 struct extent_buffer *c;
2808 struct extent_buffer *split;
2809 struct btrfs_disk_key disk_key;
be0e5c09 2810 int mid;
5c680ed6 2811 int ret;
7518a238 2812 u32 c_nritems;
eb60ceac 2813
5f39d397 2814 c = path->nodes[level];
7bb86316 2815 WARN_ON(btrfs_header_generation(c) != trans->transid);
5f39d397 2816 if (c == root->node) {
d9abbf1c 2817 /*
90f8d62e
JS
 2818 * trying to split the root, let's make a new one
2819 *
fdd99c72 2820 * tree mod log: we don't log removal of the old root in
90f8d62e
JS
2821 * insert_new_root, because that root buffer will be kept as a
2822 * normal node. We are going to log removal of half of the
f3a84ccd
FM
2823 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2824 * holding a tree lock on the buffer, which is why we cannot
2825 * race with other tree_mod_log users.
d9abbf1c 2826 */
fdd99c72 2827 ret = insert_new_root(trans, root, path, level + 1);
5c680ed6
CM
2828 if (ret)
2829 return ret;
b3612421 2830 } else {
e66f709b 2831 ret = push_nodes_for_insert(trans, root, path, level);
5f39d397
CM
2832 c = path->nodes[level];
2833 if (!ret && btrfs_header_nritems(c) <
0b246afa 2834 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
e66f709b 2835 return 0;
54aa1f4d
CM
2836 if (ret < 0)
2837 return ret;
be0e5c09 2838 }
e66f709b 2839
5f39d397 2840 c_nritems = btrfs_header_nritems(c);
5d4f98a2
YZ
2841 mid = (c_nritems + 1) / 2;
2842 btrfs_node_key(c, &disk_key, mid);
7bb86316 2843
79bd3712
FM
2844 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2845 &disk_key, level, c->start, 0,
2846 BTRFS_NESTING_SPLIT);
5f39d397
CM
2847 if (IS_ERR(split))
2848 return PTR_ERR(split);
2849
0b246afa 2850 root_add_used(root, fs_info->nodesize);
bc877d28 2851 ASSERT(btrfs_header_level(c) == level);
54aa1f4d 2852
f3a84ccd 2853 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
5de865ee 2854 if (ret) {
66642832 2855 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2856 return ret;
2857 }
5f39d397
CM
2858 copy_extent_buffer(split, c,
2859 btrfs_node_key_ptr_offset(0),
2860 btrfs_node_key_ptr_offset(mid),
2861 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2862 btrfs_set_header_nritems(split, c_nritems - mid);
2863 btrfs_set_header_nritems(c, mid);
aa5d6bed 2864
5f39d397
CM
2865 btrfs_mark_buffer_dirty(c);
2866 btrfs_mark_buffer_dirty(split);
2867
6ad3cf6d 2868 insert_ptr(trans, path, &disk_key, split->start,
c3e06965 2869 path->slots[level + 1] + 1, level + 1);
aa5d6bed 2870
5de08d7d 2871 if (path->slots[level] >= mid) {
5c680ed6 2872 path->slots[level] -= mid;
925baedd 2873 btrfs_tree_unlock(c);
5f39d397
CM
2874 free_extent_buffer(c);
2875 path->nodes[level] = split;
5c680ed6
CM
2876 path->slots[level + 1] += 1;
2877 } else {
925baedd 2878 btrfs_tree_unlock(split);
5f39d397 2879 free_extent_buffer(split);
be0e5c09 2880 }
d5286a92 2881 return 0;
be0e5c09
CM
2882}
2883
74123bd7
CM
2884/*
2885 * how many bytes are required to store the items in a leaf. start
2886 * and nr indicate which items in the leaf to check. This totals up the
2887 * space used both by the item structs and the item data
2888 */
5f39d397 2889static int leaf_space_used(struct extent_buffer *l, int start, int nr)
be0e5c09
CM
2890{
2891 int data_len;
5f39d397 2892 int nritems = btrfs_header_nritems(l);
d4dbff95 2893 int end = min(nritems, start + nr) - 1;
be0e5c09
CM
2894
2895 if (!nr)
2896 return 0;
3212fa14
JB
2897 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
2898 data_len = data_len - btrfs_item_offset(l, end);
0783fcfc 2899 data_len += sizeof(struct btrfs_item) * nr;
d4dbff95 2900 WARN_ON(data_len < 0);
be0e5c09
CM
2901 return data_len;
2902}
2903
d4dbff95
CM
2904/*
2905 * The space between the end of the leaf items and
2906 * the start of the leaf data. IOW, how much room
2907 * the leaf has left for both items and data
2908 */
e902baac 2909noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
d4dbff95 2910{
e902baac 2911 struct btrfs_fs_info *fs_info = leaf->fs_info;
5f39d397
CM
2912 int nritems = btrfs_header_nritems(leaf);
2913 int ret;
0b246afa
JM
2914
2915 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
5f39d397 2916 if (ret < 0) {
0b246afa
JM
2917 btrfs_crit(fs_info,
2918 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2919 ret,
2920 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2921 leaf_space_used(leaf, 0, nritems), nritems);
5f39d397
CM
2922 }
2923 return ret;
d4dbff95
CM
2924}
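A worked example of the accounting above, assuming a 16 KiB nodesize, a 101-byte struct btrfs_header and a 25-byte struct btrfs_item; the exact constants depend on the on-disk format in use.

/*
 * Worked example (a sketch):
 *
 *	BTRFS_LEAF_DATA_SIZE     = 16384 - 101 = 16283 bytes
 *
 *	For a leaf holding 3 items with 160 bytes of item data each:
 *	leaf_space_used(l, 0, 3) = 3 * 160 + 3 * 25 = 555 bytes
 *	btrfs_leaf_free_space(l) = 16283 - 555 = 15728 bytes
 */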
2925
99d8f83c
CM
2926/*
2927 * min slot controls the lowest index we're willing to push to the
2928 * right. We'll push up to and including min_slot, but no lower
2929 */
f72f0010 2930static noinline int __push_leaf_right(struct btrfs_path *path,
44871b1b
CM
2931 int data_size, int empty,
2932 struct extent_buffer *right,
99d8f83c
CM
2933 int free_space, u32 left_nritems,
2934 u32 min_slot)
00ec4c51 2935{
f72f0010 2936 struct btrfs_fs_info *fs_info = right->fs_info;
5f39d397 2937 struct extent_buffer *left = path->nodes[0];
44871b1b 2938 struct extent_buffer *upper = path->nodes[1];
cfed81a0 2939 struct btrfs_map_token token;
5f39d397 2940 struct btrfs_disk_key disk_key;
00ec4c51 2941 int slot;
34a38218 2942 u32 i;
00ec4c51
CM
2943 int push_space = 0;
2944 int push_items = 0;
34a38218 2945 u32 nr;
7518a238 2946 u32 right_nritems;
5f39d397 2947 u32 data_end;
db94535d 2948 u32 this_item_size;
00ec4c51 2949
34a38218
CM
2950 if (empty)
2951 nr = 0;
2952 else
99d8f83c 2953 nr = max_t(u32, 1, min_slot);
34a38218 2954
31840ae1 2955 if (path->slots[0] >= left_nritems)
87b29b20 2956 push_space += data_size;
31840ae1 2957
44871b1b 2958 slot = path->slots[1];
34a38218
CM
2959 i = left_nritems - 1;
2960 while (i >= nr) {
31840ae1
ZY
2961 if (!empty && push_items > 0) {
2962 if (path->slots[0] > i)
2963 break;
2964 if (path->slots[0] == i) {
e902baac
DS
2965 int space = btrfs_leaf_free_space(left);
2966
31840ae1
ZY
2967 if (space + push_space * 2 > free_space)
2968 break;
2969 }
2970 }
2971
00ec4c51 2972 if (path->slots[0] == i)
87b29b20 2973 push_space += data_size;
db94535d 2974
3212fa14 2975 this_item_size = btrfs_item_size(left, i);
74794207
JB
2976 if (this_item_size + sizeof(struct btrfs_item) +
2977 push_space > free_space)
00ec4c51 2978 break;
31840ae1 2979
00ec4c51 2980 push_items++;
74794207 2981 push_space += this_item_size + sizeof(struct btrfs_item);
34a38218
CM
2982 if (i == 0)
2983 break;
2984 i--;
db94535d 2985 }
5f39d397 2986
925baedd
CM
2987 if (push_items == 0)
2988 goto out_unlock;
5f39d397 2989
6c1500f2 2990 WARN_ON(!empty && push_items == left_nritems);
5f39d397 2991
00ec4c51 2992 /* push left to right */
5f39d397 2993 right_nritems = btrfs_header_nritems(right);
34a38218 2994
dc2e724e 2995 push_space = btrfs_item_data_end(left, left_nritems - push_items);
8f881e8c 2996 push_space -= leaf_data_end(left);
5f39d397 2997
00ec4c51 2998 /* make room in the right data area */
8f881e8c 2999 data_end = leaf_data_end(right);
5f39d397 3000 memmove_extent_buffer(right,
3d9ec8c4
NB
3001 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3002 BTRFS_LEAF_DATA_OFFSET + data_end,
0b246afa 3003 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
5f39d397 3004
00ec4c51 3005 /* copy from the left data area */
3d9ec8c4 3006 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
0b246afa 3007 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
8f881e8c 3008 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
d6025579 3009 push_space);
5f39d397
CM
3010
3011 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3012 btrfs_item_nr_offset(0),
3013 right_nritems * sizeof(struct btrfs_item));
3014
00ec4c51 3015 /* copy the items from left to right */
5f39d397
CM
3016 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3017 btrfs_item_nr_offset(left_nritems - push_items),
3018 push_items * sizeof(struct btrfs_item));
00ec4c51
CM
3019
3020 /* update the item pointers */
c82f823c 3021 btrfs_init_map_token(&token, right);
7518a238 3022 right_nritems += push_items;
5f39d397 3023 btrfs_set_header_nritems(right, right_nritems);
0b246afa 3024 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
7518a238 3025 for (i = 0; i < right_nritems; i++) {
3212fa14
JB
3026 push_space -= btrfs_token_item_size(&token, i);
3027 btrfs_set_token_item_offset(&token, i, push_space);
db94535d
CM
3028 }
3029
7518a238 3030 left_nritems -= push_items;
5f39d397 3031 btrfs_set_header_nritems(left, left_nritems);
00ec4c51 3032
34a38218
CM
3033 if (left_nritems)
3034 btrfs_mark_buffer_dirty(left);
f0486c68 3035 else
6a884d7d 3036 btrfs_clean_tree_block(left);
f0486c68 3037
5f39d397 3038 btrfs_mark_buffer_dirty(right);
a429e513 3039
5f39d397
CM
3040 btrfs_item_key(right, &disk_key, 0);
3041 btrfs_set_node_key(upper, &disk_key, slot + 1);
d6025579 3042 btrfs_mark_buffer_dirty(upper);
02217ed2 3043
00ec4c51 3044 /* then fixup the leaf pointer in the path */
7518a238
CM
3045 if (path->slots[0] >= left_nritems) {
3046 path->slots[0] -= left_nritems;
925baedd 3047 if (btrfs_header_nritems(path->nodes[0]) == 0)
6a884d7d 3048 btrfs_clean_tree_block(path->nodes[0]);
925baedd 3049 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
3050 free_extent_buffer(path->nodes[0]);
3051 path->nodes[0] = right;
00ec4c51
CM
3052 path->slots[1] += 1;
3053 } else {
925baedd 3054 btrfs_tree_unlock(right);
5f39d397 3055 free_extent_buffer(right);
00ec4c51
CM
3056 }
3057 return 0;
925baedd
CM
3058
3059out_unlock:
3060 btrfs_tree_unlock(right);
3061 free_extent_buffer(right);
3062 return 1;
00ec4c51 3063}
925baedd 3064
44871b1b
CM
3065/*
3066 * push some data in the path leaf to the right, trying to free up at
3067 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3068 *
3069 * returns 1 if the push failed because the other node didn't have enough
3070 * room, 0 if everything worked out and < 0 if there were major errors.
99d8f83c
CM
3071 *
3072 * this will push starting from min_slot to the end of the leaf. It won't
3073 * push any slot lower than min_slot
44871b1b
CM
3074 */
3075static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
3076 *root, struct btrfs_path *path,
3077 int min_data_size, int data_size,
3078 int empty, u32 min_slot)
44871b1b
CM
3079{
3080 struct extent_buffer *left = path->nodes[0];
3081 struct extent_buffer *right;
3082 struct extent_buffer *upper;
3083 int slot;
3084 int free_space;
3085 u32 left_nritems;
3086 int ret;
3087
3088 if (!path->nodes[1])
3089 return 1;
3090
3091 slot = path->slots[1];
3092 upper = path->nodes[1];
3093 if (slot >= btrfs_header_nritems(upper) - 1)
3094 return 1;
3095
49d0c642 3096 btrfs_assert_tree_write_locked(path->nodes[1]);
44871b1b 3097
4b231ae4 3098 right = btrfs_read_node_slot(upper, slot + 1);
fb770ae4
LB
3099 /*
3100 * slot + 1 is not valid or we fail to read the right node,
3101 * no big deal, just return.
3102 */
3103 if (IS_ERR(right))
91ca338d
TI
3104 return 1;
3105
bf77467a 3106 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
44871b1b 3107
e902baac 3108 free_space = btrfs_leaf_free_space(right);
44871b1b
CM
3109 if (free_space < data_size)
3110 goto out_unlock;
3111
44871b1b 3112 ret = btrfs_cow_block(trans, root, right, upper,
bf59a5a2 3113 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
44871b1b
CM
3114 if (ret)
3115 goto out_unlock;
3116
44871b1b
CM
3117 left_nritems = btrfs_header_nritems(left);
3118 if (left_nritems == 0)
3119 goto out_unlock;
3120
d16c702f
QW
3121 if (check_sibling_keys(left, right)) {
3122 ret = -EUCLEAN;
3123 btrfs_tree_unlock(right);
3124 free_extent_buffer(right);
3125 return ret;
3126 }
2ef1fed2
FDBM
3127 if (path->slots[0] == left_nritems && !empty) {
3128 /* Key greater than all keys in the leaf, right neighbor has
3129 * enough room for it and we're not emptying our leaf to delete
3130 * it, therefore use right neighbor to insert the new item and
52042d8e 3131 * no need to touch/dirty our left leaf. */
2ef1fed2
FDBM
3132 btrfs_tree_unlock(left);
3133 free_extent_buffer(left);
3134 path->nodes[0] = right;
3135 path->slots[0] = 0;
3136 path->slots[1]++;
3137 return 0;
3138 }
3139
f72f0010 3140 return __push_leaf_right(path, min_data_size, empty,
99d8f83c 3141 right, free_space, left_nritems, min_slot);
44871b1b
CM
3142out_unlock:
3143 btrfs_tree_unlock(right);
3144 free_extent_buffer(right);
3145 return 1;
3146}
3147
74123bd7
CM
3148/*
3149 * push some data in the path leaf to the left, trying to free up at
3150 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
3151 *
3152 * max_slot can put a limit on how far into the leaf we'll push items. The
3153 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3154 * items
74123bd7 3155 */
8087c193 3156static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
44871b1b 3157 int empty, struct extent_buffer *left,
99d8f83c
CM
3158 int free_space, u32 right_nritems,
3159 u32 max_slot)
be0e5c09 3160{
8087c193 3161 struct btrfs_fs_info *fs_info = left->fs_info;
5f39d397
CM
3162 struct btrfs_disk_key disk_key;
3163 struct extent_buffer *right = path->nodes[0];
be0e5c09 3164 int i;
be0e5c09
CM
3165 int push_space = 0;
3166 int push_items = 0;
7518a238 3167 u32 old_left_nritems;
34a38218 3168 u32 nr;
aa5d6bed 3169 int ret = 0;
db94535d
CM
3170 u32 this_item_size;
3171 u32 old_left_item_size;
cfed81a0
CM
3172 struct btrfs_map_token token;
3173
34a38218 3174 if (empty)
99d8f83c 3175 nr = min(right_nritems, max_slot);
34a38218 3176 else
99d8f83c 3177 nr = min(right_nritems - 1, max_slot);
34a38218
CM
3178
3179 for (i = 0; i < nr; i++) {
31840ae1
ZY
3180 if (!empty && push_items > 0) {
3181 if (path->slots[0] < i)
3182 break;
3183 if (path->slots[0] == i) {
e902baac
DS
3184 int space = btrfs_leaf_free_space(right);
3185
31840ae1
ZY
3186 if (space + push_space * 2 > free_space)
3187 break;
3188 }
3189 }
3190
be0e5c09 3191 if (path->slots[0] == i)
87b29b20 3192 push_space += data_size;
db94535d 3193
3212fa14 3194 this_item_size = btrfs_item_size(right, i);
74794207
JB
3195 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3196 free_space)
be0e5c09 3197 break;
db94535d 3198
be0e5c09 3199 push_items++;
74794207 3200 push_space += this_item_size + sizeof(struct btrfs_item);
db94535d
CM
3201 }
3202
be0e5c09 3203 if (push_items == 0) {
925baedd
CM
3204 ret = 1;
3205 goto out;
be0e5c09 3206 }
fae7f21c 3207 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
5f39d397 3208
be0e5c09 3209 /* push data from right to left */
5f39d397
CM
3210 copy_extent_buffer(left, right,
3211 btrfs_item_nr_offset(btrfs_header_nritems(left)),
3212 btrfs_item_nr_offset(0),
3213 push_items * sizeof(struct btrfs_item));
3214
0b246afa 3215 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3212fa14 3216 btrfs_item_offset(right, push_items - 1);
5f39d397 3217
3d9ec8c4 3218 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3219 leaf_data_end(left) - push_space,
3d9ec8c4 3220 BTRFS_LEAF_DATA_OFFSET +
3212fa14 3221 btrfs_item_offset(right, push_items - 1),
d6025579 3222 push_space);
5f39d397 3223 old_left_nritems = btrfs_header_nritems(left);
87b29b20 3224 BUG_ON(old_left_nritems <= 0);
eb60ceac 3225
c82f823c 3226 btrfs_init_map_token(&token, left);
3212fa14 3227 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
0783fcfc 3228 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
5f39d397 3229 u32 ioff;
db94535d 3230
3212fa14
JB
3231 ioff = btrfs_token_item_offset(&token, i);
3232 btrfs_set_token_item_offset(&token, i,
cc4c13d5 3233 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
be0e5c09 3234 }
5f39d397 3235 btrfs_set_header_nritems(left, old_left_nritems + push_items);
be0e5c09
CM
3236
3237 /* fixup right node */
31b1a2bd
JL
3238 if (push_items > right_nritems)
3239 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
d397712b 3240 right_nritems);
34a38218
CM
3241
3242 if (push_items < right_nritems) {
3212fa14 3243 push_space = btrfs_item_offset(right, push_items - 1) -
8f881e8c 3244 leaf_data_end(right);
3d9ec8c4 3245 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
0b246afa 3246 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3d9ec8c4 3247 BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3248 leaf_data_end(right), push_space);
34a38218
CM
3249
3250 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
5f39d397
CM
3251 btrfs_item_nr_offset(push_items),
3252 (btrfs_header_nritems(right) - push_items) *
3253 sizeof(struct btrfs_item));
34a38218 3254 }
c82f823c
DS
3255
3256 btrfs_init_map_token(&token, right);
eef1c494
Y
3257 right_nritems -= push_items;
3258 btrfs_set_header_nritems(right, right_nritems);
0b246afa 3259 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
5f39d397 3260 for (i = 0; i < right_nritems; i++) {
3212fa14
JB
3261 push_space = push_space - btrfs_token_item_size(&token, i);
3262 btrfs_set_token_item_offset(&token, i, push_space);
db94535d 3263 }
eb60ceac 3264
5f39d397 3265 btrfs_mark_buffer_dirty(left);
34a38218
CM
3266 if (right_nritems)
3267 btrfs_mark_buffer_dirty(right);
f0486c68 3268 else
6a884d7d 3269 btrfs_clean_tree_block(right);
098f59c2 3270
5f39d397 3271 btrfs_item_key(right, &disk_key, 0);
b167fa91 3272 fixup_low_keys(path, &disk_key, 1);
be0e5c09
CM
3273
3274 /* then fixup the leaf pointer in the path */
3275 if (path->slots[0] < push_items) {
3276 path->slots[0] += old_left_nritems;
925baedd 3277 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
3278 free_extent_buffer(path->nodes[0]);
3279 path->nodes[0] = left;
be0e5c09
CM
3280 path->slots[1] -= 1;
3281 } else {
925baedd 3282 btrfs_tree_unlock(left);
5f39d397 3283 free_extent_buffer(left);
be0e5c09
CM
3284 path->slots[0] -= push_items;
3285 }
eb60ceac 3286 BUG_ON(path->slots[0] < 0);
aa5d6bed 3287 return ret;
925baedd
CM
3288out:
3289 btrfs_tree_unlock(left);
3290 free_extent_buffer(left);
3291 return ret;
be0e5c09
CM
3292}
3293
44871b1b
CM
3294/*
3295 * push some data in the path leaf to the left, trying to free up at
3296 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
3297 *
3298 * max_slot can put a limit on how far into the leaf we'll push items. The
3299 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3300 * items
44871b1b
CM
3301 */
3302static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
3303 *root, struct btrfs_path *path, int min_data_size,
3304 int data_size, int empty, u32 max_slot)
44871b1b
CM
3305{
3306 struct extent_buffer *right = path->nodes[0];
3307 struct extent_buffer *left;
3308 int slot;
3309 int free_space;
3310 u32 right_nritems;
3311 int ret = 0;
3312
3313 slot = path->slots[1];
3314 if (slot == 0)
3315 return 1;
3316 if (!path->nodes[1])
3317 return 1;
3318
3319 right_nritems = btrfs_header_nritems(right);
3320 if (right_nritems == 0)
3321 return 1;
3322
49d0c642 3323 btrfs_assert_tree_write_locked(path->nodes[1]);
44871b1b 3324
4b231ae4 3325 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
fb770ae4
LB
3326 /*
3327 * slot - 1 is not valid or we fail to read the left node,
3328 * no big deal, just return.
3329 */
3330 if (IS_ERR(left))
91ca338d
TI
3331 return 1;
3332
bf77467a 3333 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
44871b1b 3334
e902baac 3335 free_space = btrfs_leaf_free_space(left);
44871b1b
CM
3336 if (free_space < data_size) {
3337 ret = 1;
3338 goto out;
3339 }
3340
44871b1b 3341 ret = btrfs_cow_block(trans, root, left,
9631e4cc 3342 path->nodes[1], slot - 1, &left,
bf59a5a2 3343 BTRFS_NESTING_LEFT_COW);
44871b1b
CM
3344 if (ret) {
3345 /* we hit -ENOSPC, but it isn't fatal here */
79787eaa
JM
3346 if (ret == -ENOSPC)
3347 ret = 1;
44871b1b
CM
3348 goto out;
3349 }
3350
d16c702f
QW
3351 if (check_sibling_keys(left, right)) {
3352 ret = -EUCLEAN;
3353 goto out;
3354 }
8087c193 3355 return __push_leaf_left(path, min_data_size,
99d8f83c
CM
3356 empty, left, free_space, right_nritems,
3357 max_slot);
44871b1b
CM
3358out:
3359 btrfs_tree_unlock(left);
3360 free_extent_buffer(left);
3361 return ret;
3362}
3363
3364/*
3365 * helper for split_leaf(): move the items from slot 'mid' onwards out of
3366 * leaf 'l' into the new leaf 'right' and link 'right' into the parent node.
44871b1b 3367 */
143bede5 3368static noinline void copy_for_split(struct btrfs_trans_handle *trans,
143bede5
JM
3369 struct btrfs_path *path,
3370 struct extent_buffer *l,
3371 struct extent_buffer *right,
3372 int slot, int mid, int nritems)
44871b1b 3373{
94f94ad9 3374 struct btrfs_fs_info *fs_info = trans->fs_info;
44871b1b
CM
3375 int data_copy_size;
3376 int rt_data_off;
3377 int i;
44871b1b 3378 struct btrfs_disk_key disk_key;
cfed81a0
CM
3379 struct btrfs_map_token token;
3380
44871b1b
CM
3381 nritems = nritems - mid;
3382 btrfs_set_header_nritems(right, nritems);
dc2e724e 3383 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
44871b1b
CM
3384
3385 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3386 btrfs_item_nr_offset(mid),
3387 nritems * sizeof(struct btrfs_item));
3388
3389 copy_extent_buffer(right, l,
3d9ec8c4
NB
3390 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3391 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3392 leaf_data_end(l), data_copy_size);
44871b1b 3393
dc2e724e 3394 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
44871b1b 3395
c82f823c 3396 btrfs_init_map_token(&token, right);
44871b1b 3397 for (i = 0; i < nritems; i++) {
44871b1b
CM
3398 u32 ioff;
3399
3212fa14
JB
3400 ioff = btrfs_token_item_offset(&token, i);
3401 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
44871b1b
CM
3402 }
3403
44871b1b 3404 btrfs_set_header_nritems(l, mid);
44871b1b 3405 btrfs_item_key(right, &disk_key, 0);
6ad3cf6d 3406 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
44871b1b
CM
3407
3408 btrfs_mark_buffer_dirty(right);
3409 btrfs_mark_buffer_dirty(l);
3410 BUG_ON(path->slots[0] != slot);
3411
44871b1b
CM
3412 if (mid <= slot) {
3413 btrfs_tree_unlock(path->nodes[0]);
3414 free_extent_buffer(path->nodes[0]);
3415 path->nodes[0] = right;
3416 path->slots[0] -= mid;
3417 path->slots[1] += 1;
3418 } else {
3419 btrfs_tree_unlock(right);
3420 free_extent_buffer(right);
3421 }
3422
3423 BUG_ON(path->slots[0] < 0);
44871b1b
CM
3424}
3425
99d8f83c
CM
3426/*
3427 * double splits happen when we need to insert a big item in the middle
3428 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3429 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3430 * A B C
3431 *
3432 * We avoid this by trying to push the items on either side of our target
3433 * into the adjacent leaves. If all goes well we can avoid the double split
3434 * completely.
3435 */
3436static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3437 struct btrfs_root *root,
3438 struct btrfs_path *path,
3439 int data_size)
3440{
3441 int ret;
3442 int progress = 0;
3443 int slot;
3444 u32 nritems;
5a4267ca 3445 int space_needed = data_size;
99d8f83c
CM
3446
3447 slot = path->slots[0];
5a4267ca 3448 if (slot < btrfs_header_nritems(path->nodes[0]))
e902baac 3449 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
99d8f83c
CM
3450
3451 /*
3452 * try to push all the items after our slot into the
3453 * right leaf
3454 */
5a4267ca 3455 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3456 if (ret < 0)
3457 return ret;
3458
3459 if (ret == 0)
3460 progress++;
3461
3462 nritems = btrfs_header_nritems(path->nodes[0]);
3463 /*
3464 * our goal is to get our slot at the start or end of a leaf. If
3465 * we've done so we're done
3466 */
3467 if (path->slots[0] == 0 || path->slots[0] == nritems)
3468 return 0;
3469
e902baac 3470 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3471 return 0;
3472
3473 /* try to push all the items before our slot into the next leaf */
3474 slot = path->slots[0];
263d3995
FM
3475 space_needed = data_size;
3476 if (slot > 0)
e902baac 3477 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
5a4267ca 3478 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3479 if (ret < 0)
3480 return ret;
3481
3482 if (ret == 0)
3483 progress++;
3484
3485 if (progress)
3486 return 0;
3487 return 1;
3488}
3489
74123bd7
CM
3490/*
3491 * split the path's leaf in two, making sure there is at least data_size
3492 * available for the resulting leaf level of the path.
aa5d6bed
CM
3493 *
3494 * returns 0 if all went well and < 0 on failure.
74123bd7 3495 */
e02119d5
CM
3496static noinline int split_leaf(struct btrfs_trans_handle *trans,
3497 struct btrfs_root *root,
310712b2 3498 const struct btrfs_key *ins_key,
e02119d5
CM
3499 struct btrfs_path *path, int data_size,
3500 int extend)
be0e5c09 3501{
5d4f98a2 3502 struct btrfs_disk_key disk_key;
5f39d397 3503 struct extent_buffer *l;
7518a238 3504 u32 nritems;
eb60ceac
CM
3505 int mid;
3506 int slot;
5f39d397 3507 struct extent_buffer *right;
b7a0365e 3508 struct btrfs_fs_info *fs_info = root->fs_info;
d4dbff95 3509 int ret = 0;
aa5d6bed 3510 int wret;
5d4f98a2 3511 int split;
cc0c5538 3512 int num_doubles = 0;
99d8f83c 3513 int tried_avoid_double = 0;
aa5d6bed 3514
a5719521
YZ
3515 l = path->nodes[0];
3516 slot = path->slots[0];
3212fa14 3517 if (extend && data_size + btrfs_item_size(l, slot) +
0b246afa 3518 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
a5719521
YZ
3519 return -EOVERFLOW;
3520
40689478 3521 /* first try to make some room by pushing left and right */
33157e05 3522 if (data_size && path->nodes[1]) {
5a4267ca
FDBM
3523 int space_needed = data_size;
3524
3525 if (slot < btrfs_header_nritems(l))
e902baac 3526 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3527
3528 wret = push_leaf_right(trans, root, path, space_needed,
3529 space_needed, 0, 0);
d397712b 3530 if (wret < 0)
eaee50e8 3531 return wret;
3685f791 3532 if (wret) {
263d3995
FM
3533 space_needed = data_size;
3534 if (slot > 0)
e902baac 3535 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3536 wret = push_leaf_left(trans, root, path, space_needed,
3537 space_needed, 0, (u32)-1);
3685f791
CM
3538 if (wret < 0)
3539 return wret;
3540 }
3541 l = path->nodes[0];
aa5d6bed 3542
3685f791 3543 /* did the pushes work? */
e902baac 3544 if (btrfs_leaf_free_space(l) >= data_size)
3685f791 3545 return 0;
3326d1b0 3546 }
aa5d6bed 3547
5c680ed6 3548 if (!path->nodes[1]) {
fdd99c72 3549 ret = insert_new_root(trans, root, path, 1);
5c680ed6
CM
3550 if (ret)
3551 return ret;
3552 }
cc0c5538 3553again:
5d4f98a2 3554 split = 1;
cc0c5538 3555 l = path->nodes[0];
eb60ceac 3556 slot = path->slots[0];
5f39d397 3557 nritems = btrfs_header_nritems(l);
d397712b 3558 mid = (nritems + 1) / 2;
54aa1f4d 3559
5d4f98a2
YZ
3560 if (mid <= slot) {
3561 if (nritems == 1 ||
3562 leaf_space_used(l, mid, nritems - mid) + data_size >
0b246afa 3563 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3564 if (slot >= nritems) {
3565 split = 0;
3566 } else {
3567 mid = slot;
3568 if (mid != nritems &&
3569 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3570 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3571 if (data_size && !tried_avoid_double)
3572 goto push_for_double;
5d4f98a2
YZ
3573 split = 2;
3574 }
3575 }
3576 }
3577 } else {
3578 if (leaf_space_used(l, 0, mid) + data_size >
0b246afa 3579 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3580 if (!extend && data_size && slot == 0) {
3581 split = 0;
3582 } else if ((extend || !data_size) && slot == 0) {
3583 mid = 1;
3584 } else {
3585 mid = slot;
3586 if (mid != nritems &&
3587 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3588 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3589 if (data_size && !tried_avoid_double)
3590 goto push_for_double;
67871254 3591 split = 2;
5d4f98a2
YZ
3592 }
3593 }
3594 }
3595 }
3596
3597 if (split == 0)
3598 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3599 else
3600 btrfs_item_key(l, &disk_key, mid);
3601
ca9d473a
JB
3602 /*
3603	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3604 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3605 * subclasses, which is 8 at the time of this patch, and we've maxed it
3606 * out. In the future we could add a
3607 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3608 * use BTRFS_NESTING_NEW_ROOT.
3609 */
79bd3712
FM
3610 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3611 &disk_key, 0, l->start, 0,
3612 num_doubles ? BTRFS_NESTING_NEW_ROOT :
3613 BTRFS_NESTING_SPLIT);
f0486c68 3614 if (IS_ERR(right))
5f39d397 3615 return PTR_ERR(right);
f0486c68 3616
0b246afa 3617 root_add_used(root, fs_info->nodesize);
5f39d397 3618
5d4f98a2
YZ
3619 if (split == 0) {
3620 if (mid <= slot) {
3621 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3622 insert_ptr(trans, path, &disk_key,
2ff7e61e 3623 right->start, path->slots[1] + 1, 1);
5d4f98a2
YZ
3624 btrfs_tree_unlock(path->nodes[0]);
3625 free_extent_buffer(path->nodes[0]);
3626 path->nodes[0] = right;
3627 path->slots[0] = 0;
3628 path->slots[1] += 1;
3629 } else {
3630 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3631 insert_ptr(trans, path, &disk_key,
2ff7e61e 3632 right->start, path->slots[1], 1);
5d4f98a2
YZ
3633 btrfs_tree_unlock(path->nodes[0]);
3634 free_extent_buffer(path->nodes[0]);
3635 path->nodes[0] = right;
3636 path->slots[0] = 0;
143bede5 3637 if (path->slots[1] == 0)
b167fa91 3638 fixup_low_keys(path, &disk_key, 1);
d4dbff95 3639 }
196e0249
LB
3640 /*
3641 * We create a new leaf 'right' for the required ins_len and
3642 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3643	 * the new item of ins_len bytes into 'right'.
3644 */
5d4f98a2 3645 return ret;
d4dbff95 3646 }
74123bd7 3647
94f94ad9 3648 copy_for_split(trans, path, l, right, slot, mid, nritems);
31840ae1 3649
5d4f98a2 3650 if (split == 2) {
cc0c5538
CM
3651 BUG_ON(num_doubles != 0);
3652 num_doubles++;
3653 goto again;
a429e513 3654 }
44871b1b 3655
143bede5 3656 return 0;
99d8f83c
CM
3657
3658push_for_double:
3659 push_for_double_split(trans, root, path, data_size);
3660 tried_avoid_double = 1;
e902baac 3661 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3662 return 0;
3663 goto again;
be0e5c09
CM
3664}
3665
ad48fd75
YZ
3666static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3667 struct btrfs_root *root,
3668 struct btrfs_path *path, int ins_len)
459931ec 3669{
ad48fd75 3670 struct btrfs_key key;
459931ec 3671 struct extent_buffer *leaf;
ad48fd75
YZ
3672 struct btrfs_file_extent_item *fi;
3673 u64 extent_len = 0;
3674 u32 item_size;
3675 int ret;
459931ec
CM
3676
3677 leaf = path->nodes[0];
ad48fd75
YZ
3678 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3679
3680 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3681 key.type != BTRFS_EXTENT_CSUM_KEY);
3682
e902baac 3683 if (btrfs_leaf_free_space(leaf) >= ins_len)
ad48fd75 3684 return 0;
459931ec 3685
3212fa14 3686 item_size = btrfs_item_size(leaf, path->slots[0]);
ad48fd75
YZ
3687 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3688 fi = btrfs_item_ptr(leaf, path->slots[0],
3689 struct btrfs_file_extent_item);
3690 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3691 }
b3b4aa74 3692 btrfs_release_path(path);
459931ec 3693
459931ec 3694 path->keep_locks = 1;
ad48fd75
YZ
3695 path->search_for_split = 1;
3696 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
459931ec 3697 path->search_for_split = 0;
a8df6fe6
FM
3698 if (ret > 0)
3699 ret = -EAGAIN;
ad48fd75
YZ
3700 if (ret < 0)
3701 goto err;
459931ec 3702
ad48fd75
YZ
3703 ret = -EAGAIN;
3704 leaf = path->nodes[0];
a8df6fe6 3705 /* if our item isn't there, return now */
3212fa14 3706 if (item_size != btrfs_item_size(leaf, path->slots[0]))
ad48fd75
YZ
3707 goto err;
3708
109f6aef 3709 /* the leaf has changed, it now has room. return now */
e902baac 3710 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
109f6aef
CM
3711 goto err;
3712
ad48fd75
YZ
3713 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3714 fi = btrfs_item_ptr(leaf, path->slots[0],
3715 struct btrfs_file_extent_item);
3716 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3717 goto err;
459931ec
CM
3718 }
3719
ad48fd75 3720 ret = split_leaf(trans, root, &key, path, ins_len, 1);
f0486c68
YZ
3721 if (ret)
3722 goto err;
459931ec 3723
ad48fd75 3724 path->keep_locks = 0;
b9473439 3725 btrfs_unlock_up_safe(path, 1);
ad48fd75
YZ
3726 return 0;
3727err:
3728 path->keep_locks = 0;
3729 return ret;
3730}
3731
25263cd7 3732static noinline int split_item(struct btrfs_path *path,
310712b2 3733 const struct btrfs_key *new_key,
ad48fd75
YZ
3734 unsigned long split_offset)
3735{
3736 struct extent_buffer *leaf;
c91666b1 3737 int orig_slot, slot;
ad48fd75
YZ
3738 char *buf;
3739 u32 nritems;
3740 u32 item_size;
3741 u32 orig_offset;
3742 struct btrfs_disk_key disk_key;
3743
b9473439 3744 leaf = path->nodes[0];
e902baac 3745 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
b9473439 3746
c91666b1 3747 orig_slot = path->slots[0];
3212fa14
JB
3748 orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3749 item_size = btrfs_item_size(leaf, path->slots[0]);
459931ec 3750
459931ec 3751 buf = kmalloc(item_size, GFP_NOFS);
ad48fd75
YZ
3752 if (!buf)
3753 return -ENOMEM;
3754
459931ec
CM
3755 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3756 path->slots[0]), item_size);
459931ec 3757
ad48fd75 3758 slot = path->slots[0] + 1;
459931ec 3759 nritems = btrfs_header_nritems(leaf);
459931ec
CM
3760 if (slot != nritems) {
3761 /* shift the items */
3762 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
ad48fd75
YZ
3763 btrfs_item_nr_offset(slot),
3764 (nritems - slot) * sizeof(struct btrfs_item));
459931ec
CM
3765 }
3766
3767 btrfs_cpu_key_to_disk(&disk_key, new_key);
3768 btrfs_set_item_key(leaf, &disk_key, slot);
3769
3212fa14
JB
3770 btrfs_set_item_offset(leaf, slot, orig_offset);
3771 btrfs_set_item_size(leaf, slot, item_size - split_offset);
459931ec 3772
3212fa14 3773 btrfs_set_item_offset(leaf, orig_slot,
c91666b1 3774 orig_offset + item_size - split_offset);
3212fa14 3775 btrfs_set_item_size(leaf, orig_slot, split_offset);
459931ec
CM
3776
3777 btrfs_set_header_nritems(leaf, nritems + 1);
3778
3779 /* write the data for the start of the original item */
3780 write_extent_buffer(leaf, buf,
3781 btrfs_item_ptr_offset(leaf, path->slots[0]),
3782 split_offset);
3783
3784 /* write the data for the new item */
3785 write_extent_buffer(leaf, buf + split_offset,
3786 btrfs_item_ptr_offset(leaf, slot),
3787 item_size - split_offset);
3788 btrfs_mark_buffer_dirty(leaf);
3789
e902baac 3790 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
459931ec 3791 kfree(buf);
ad48fd75
YZ
3792 return 0;
3793}
3794
3795/*
3796 * This function splits a single item into two items,
3797 * giving 'new_key' to the new item and splitting the
3798 * old one at split_offset (from the start of the item).
3799 *
3800 * The path may be released by this operation. After
3801 * the split, the path is pointing to the old item. The
3802 * new item is going to be in the same node as the old one.
3803 *
3804 * Note, the item being split must be small enough to live alone on
3805 * a tree block with room for one extra struct btrfs_item
3806 *
3807 * This allows us to split the item in place, keeping a lock on the
3808 * leaf the entire time.
3809 */
3810int btrfs_split_item(struct btrfs_trans_handle *trans,
3811 struct btrfs_root *root,
3812 struct btrfs_path *path,
310712b2 3813 const struct btrfs_key *new_key,
ad48fd75
YZ
3814 unsigned long split_offset)
3815{
3816 int ret;
3817 ret = setup_leaf_for_split(trans, root, path,
3818 sizeof(struct btrfs_item));
3819 if (ret)
3820 return ret;
3821
25263cd7 3822 ret = split_item(path, new_key, split_offset);
459931ec
CM
3823 return ret;
3824}
3825
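
/*
 * Illustrative sketch (editorial example, not part of ctree.c): a minimal
 * caller of btrfs_split_item().  It assumes the path already points at the
 * item to split and that the caller picked a 'new_key' that sorts right
 * after the existing key (for example a new file offset for an EXTENT_DATA
 * item); the wrapper name is made up.
 */
static int example_split_item_in_place(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_path *path,
				       const struct btrfs_key *new_key,
				       unsigned long split_offset)
{
	/*
	 * btrfs_split_item() may release and re-search the path; on return
	 * the path points at the old item holding the first split_offset
	 * bytes, and the new item with 'new_key' holds the rest in the same
	 * leaf.
	 */
	return btrfs_split_item(trans, root, path, new_key, split_offset);
}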
d352ac68
CM
3826/*
3827 * make the item pointed to by the path smaller. new_size indicates
3828 * how small to make it, and from_end tells us if we just chop bytes
3829 * off the end of the item or if we shift the item to chop bytes off
3830 * the front.
3831 */
78ac4f9e 3832void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
b18c6685 3833{
b18c6685 3834 int slot;
5f39d397 3835 struct extent_buffer *leaf;
b18c6685
CM
3836 u32 nritems;
3837 unsigned int data_end;
3838 unsigned int old_data_start;
3839 unsigned int old_size;
3840 unsigned int size_diff;
3841 int i;
cfed81a0
CM
3842 struct btrfs_map_token token;
3843
5f39d397 3844 leaf = path->nodes[0];
179e29e4
CM
3845 slot = path->slots[0];
3846
3212fa14 3847 old_size = btrfs_item_size(leaf, slot);
179e29e4 3848 if (old_size == new_size)
143bede5 3849 return;
b18c6685 3850
5f39d397 3851 nritems = btrfs_header_nritems(leaf);
8f881e8c 3852 data_end = leaf_data_end(leaf);
b18c6685 3853
3212fa14 3854 old_data_start = btrfs_item_offset(leaf, slot);
179e29e4 3855
b18c6685
CM
3856 size_diff = old_size - new_size;
3857
3858 BUG_ON(slot < 0);
3859 BUG_ON(slot >= nritems);
3860
3861 /*
3862 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3863 */
3864 /* first correct the data pointers */
c82f823c 3865 btrfs_init_map_token(&token, leaf);
b18c6685 3866 for (i = slot; i < nritems; i++) {
5f39d397 3867 u32 ioff;
db94535d 3868
3212fa14
JB
3869 ioff = btrfs_token_item_offset(&token, i);
3870 btrfs_set_token_item_offset(&token, i, ioff + size_diff);
b18c6685 3871 }
db94535d 3872
b18c6685 3873 /* shift the data */
179e29e4 3874 if (from_end) {
3d9ec8c4
NB
3875 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3876 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
179e29e4
CM
3877 data_end, old_data_start + new_size - data_end);
3878 } else {
3879 struct btrfs_disk_key disk_key;
3880 u64 offset;
3881
3882 btrfs_item_key(leaf, &disk_key, slot);
3883
3884 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3885 unsigned long ptr;
3886 struct btrfs_file_extent_item *fi;
3887
3888 fi = btrfs_item_ptr(leaf, slot,
3889 struct btrfs_file_extent_item);
3890 fi = (struct btrfs_file_extent_item *)(
3891 (unsigned long)fi - size_diff);
3892
3893 if (btrfs_file_extent_type(leaf, fi) ==
3894 BTRFS_FILE_EXTENT_INLINE) {
3895 ptr = btrfs_item_ptr_offset(leaf, slot);
3896 memmove_extent_buffer(leaf, ptr,
d397712b 3897 (unsigned long)fi,
7ec20afb 3898 BTRFS_FILE_EXTENT_INLINE_DATA_START);
179e29e4
CM
3899 }
3900 }
3901
3d9ec8c4
NB
3902 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3903 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
179e29e4
CM
3904 data_end, old_data_start - data_end);
3905
3906 offset = btrfs_disk_key_offset(&disk_key);
3907 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3908 btrfs_set_item_key(leaf, &disk_key, slot);
3909 if (slot == 0)
b167fa91 3910 fixup_low_keys(path, &disk_key, 1);
179e29e4 3911 }
5f39d397 3912
3212fa14 3913 btrfs_set_item_size(leaf, slot, new_size);
5f39d397 3914 btrfs_mark_buffer_dirty(leaf);
b18c6685 3915
e902baac 3916 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3917 btrfs_print_leaf(leaf);
b18c6685 3918 BUG();
5f39d397 3919 }
b18c6685
CM
3920}
3921
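
/*
 * Illustrative sketch (editorial example): keep only the first 'new_size'
 * bytes of the item the path points to by chopping from the end
 * (from_end == 1).  Purely an example wrapper around btrfs_truncate_item().
 */
static void example_chop_item_tail(struct btrfs_path *path, u32 new_size)
{
	u32 old_size = btrfs_item_size(path->nodes[0], path->slots[0]);

	if (new_size < old_size)
		btrfs_truncate_item(path, new_size, 1);
}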
d352ac68 3922/*
8f69dbd2 3923 * make the item pointed to by the path bigger, data_size is the added size.
d352ac68 3924 */
c71dd880 3925void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
6567e837 3926{
6567e837 3927 int slot;
5f39d397 3928 struct extent_buffer *leaf;
6567e837
CM
3929 u32 nritems;
3930 unsigned int data_end;
3931 unsigned int old_data;
3932 unsigned int old_size;
3933 int i;
cfed81a0
CM
3934 struct btrfs_map_token token;
3935
5f39d397 3936 leaf = path->nodes[0];
6567e837 3937
5f39d397 3938 nritems = btrfs_header_nritems(leaf);
8f881e8c 3939 data_end = leaf_data_end(leaf);
6567e837 3940
e902baac 3941 if (btrfs_leaf_free_space(leaf) < data_size) {
a4f78750 3942 btrfs_print_leaf(leaf);
6567e837 3943 BUG();
5f39d397 3944 }
6567e837 3945 slot = path->slots[0];
dc2e724e 3946 old_data = btrfs_item_data_end(leaf, slot);
6567e837
CM
3947
3948 BUG_ON(slot < 0);
3326d1b0 3949 if (slot >= nritems) {
a4f78750 3950 btrfs_print_leaf(leaf);
c71dd880 3951 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
0b246afa 3952 slot, nritems);
290342f6 3953 BUG();
3326d1b0 3954 }
6567e837
CM
3955
3956 /*
3957 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3958 */
3959 /* first correct the data pointers */
c82f823c 3960 btrfs_init_map_token(&token, leaf);
6567e837 3961 for (i = slot; i < nritems; i++) {
5f39d397 3962 u32 ioff;
db94535d 3963
3212fa14
JB
3964 ioff = btrfs_token_item_offset(&token, i);
3965 btrfs_set_token_item_offset(&token, i, ioff - data_size);
6567e837 3966 }
5f39d397 3967
6567e837 3968 /* shift the data */
3d9ec8c4
NB
3969 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3970 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
6567e837 3971 data_end, old_data - data_end);
5f39d397 3972
6567e837 3973 data_end = old_data;
3212fa14
JB
3974 old_size = btrfs_item_size(leaf, slot);
3975 btrfs_set_item_size(leaf, slot, old_size + data_size);
5f39d397 3976 btrfs_mark_buffer_dirty(leaf);
6567e837 3977
e902baac 3978 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3979 btrfs_print_leaf(leaf);
6567e837 3980 BUG();
5f39d397 3981 }
6567e837
CM
3982}
3983
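
/*
 * Illustrative sketch (editorial example): grow an existing item and append
 * bytes at its end, the usual btrfs_extend_item() pattern (inode refs and
 * inline backrefs do something similar).  The caller must have searched with
 * a big enough ins_len so the leaf has room; the wrapper name is made up.
 */
static void example_append_to_item(struct btrfs_path *path,
				   const void *data, u32 len)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 old_size = btrfs_item_size(leaf, path->slots[0]);
	unsigned long ptr;

	btrfs_extend_item(path, len);

	/* the old content stays at the start, the new space is at the end */
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]) + old_size;
	write_extent_buffer(leaf, data, ptr, len);
	btrfs_mark_buffer_dirty(leaf);
}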
da9ffb24
NB
3984/**
3985 * setup_items_for_insert - Helper called before inserting one or more items
3986 * into a leaf. Main purpose is to save stack depth by doing the bulk of the work
3987 * in a function that doesn't call btrfs_search_slot
3988 *
3989 * @root: root we are inserting items to
3990 * @path: points to the leaf/slot where we are going to insert new items
b7ef5f3a 3991 * @batch: information about the batch of items to insert
74123bd7 3992 */
f0641656
FM
3993static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3994 const struct btrfs_item_batch *batch)
be0e5c09 3995{
0b246afa 3996 struct btrfs_fs_info *fs_info = root->fs_info;
9c58309d 3997 int i;
7518a238 3998 u32 nritems;
be0e5c09 3999 unsigned int data_end;
e2fa7227 4000 struct btrfs_disk_key disk_key;
44871b1b
CM
4001 struct extent_buffer *leaf;
4002 int slot;
cfed81a0 4003 struct btrfs_map_token token;
fc0d82e1 4004 u32 total_size;
cfed81a0 4005
b7ef5f3a
FM
4006 /*
4007 * Before anything else, update keys in the parent and other ancestors
4008 * if needed, then release the write locks on them, so that other tasks
4009 * can use them while we modify the leaf.
4010 */
24cdc847 4011 if (path->slots[0] == 0) {
b7ef5f3a 4012 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
b167fa91 4013 fixup_low_keys(path, &disk_key, 1);
24cdc847
FM
4014 }
4015 btrfs_unlock_up_safe(path, 1);
4016
5f39d397 4017 leaf = path->nodes[0];
44871b1b 4018 slot = path->slots[0];
74123bd7 4019
5f39d397 4020 nritems = btrfs_header_nritems(leaf);
8f881e8c 4021 data_end = leaf_data_end(leaf);
b7ef5f3a 4022 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
eb60ceac 4023
e902baac 4024 if (btrfs_leaf_free_space(leaf) < total_size) {
a4f78750 4025 btrfs_print_leaf(leaf);
0b246afa 4026 btrfs_crit(fs_info, "not enough freespace need %u have %d",
e902baac 4027 total_size, btrfs_leaf_free_space(leaf));
be0e5c09 4028 BUG();
d4dbff95 4029 }
5f39d397 4030
c82f823c 4031 btrfs_init_map_token(&token, leaf);
be0e5c09 4032 if (slot != nritems) {
dc2e724e 4033 unsigned int old_data = btrfs_item_data_end(leaf, slot);
be0e5c09 4034
5f39d397 4035 if (old_data < data_end) {
a4f78750 4036 btrfs_print_leaf(leaf);
7269ddd2
NB
4037 btrfs_crit(fs_info,
4038 "item at slot %d with data offset %u beyond data end of leaf %u",
5d163e0e 4039 slot, old_data, data_end);
290342f6 4040 BUG();
5f39d397 4041 }
be0e5c09
CM
4042 /*
4043 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4044 */
4045 /* first correct the data pointers */
0783fcfc 4046 for (i = slot; i < nritems; i++) {
5f39d397 4047 u32 ioff;
db94535d 4048
3212fa14
JB
4049 ioff = btrfs_token_item_offset(&token, i);
4050 btrfs_set_token_item_offset(&token, i,
74794207 4051 ioff - batch->total_data_size);
0783fcfc 4052 }
be0e5c09 4053 /* shift the items */
b7ef5f3a 4054 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
5f39d397 4055 btrfs_item_nr_offset(slot),
d6025579 4056 (nritems - slot) * sizeof(struct btrfs_item));
be0e5c09
CM
4057
4058 /* shift the data */
3d9ec8c4 4059 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
b7ef5f3a
FM
4060 data_end - batch->total_data_size,
4061 BTRFS_LEAF_DATA_OFFSET + data_end,
4062 old_data - data_end);
be0e5c09
CM
4063 data_end = old_data;
4064 }
5f39d397 4065
62e2749e 4066 /* setup the item for the new data */
b7ef5f3a
FM
4067 for (i = 0; i < batch->nr; i++) {
4068 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
9c58309d 4069 btrfs_set_item_key(leaf, &disk_key, slot + i);
b7ef5f3a 4070 data_end -= batch->data_sizes[i];
3212fa14
JB
4071 btrfs_set_token_item_offset(&token, slot + i, data_end);
4072 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
9c58309d 4073 }
44871b1b 4074
b7ef5f3a 4075 btrfs_set_header_nritems(leaf, nritems + batch->nr);
b9473439 4076 btrfs_mark_buffer_dirty(leaf);
aa5d6bed 4077
e902baac 4078 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 4079 btrfs_print_leaf(leaf);
be0e5c09 4080 BUG();
5f39d397 4081 }
44871b1b
CM
4082}
4083
f0641656
FM
4084/*
4085 * Insert a new item into a leaf.
4086 *
4087 * @root: The root of the btree.
4088 * @path: A path pointing to the target leaf and slot.
4089 * @key: The key of the new item.
4090 * @data_size: The size of the data associated with the new key.
4091 */
4092void btrfs_setup_item_for_insert(struct btrfs_root *root,
4093 struct btrfs_path *path,
4094 const struct btrfs_key *key,
4095 u32 data_size)
4096{
4097 struct btrfs_item_batch batch;
4098
4099 batch.keys = key;
4100 batch.data_sizes = &data_size;
4101 batch.total_data_size = data_size;
4102 batch.nr = 1;
4103
4104 setup_items_for_insert(root, path, &batch);
4105}
4106
44871b1b
CM
4107/*
4108 * Given a key and some data, insert items into the tree.
4109 * This does all the path init required, making room in the tree if needed.
4110 */
4111int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4112 struct btrfs_root *root,
4113 struct btrfs_path *path,
b7ef5f3a 4114 const struct btrfs_item_batch *batch)
44871b1b 4115{
44871b1b
CM
4116 int ret = 0;
4117 int slot;
b7ef5f3a 4118 u32 total_size;
44871b1b 4119
b7ef5f3a
FM
4120 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4121 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
44871b1b
CM
4122 if (ret == 0)
4123 return -EEXIST;
4124 if (ret < 0)
143bede5 4125 return ret;
44871b1b 4126
44871b1b
CM
4127 slot = path->slots[0];
4128 BUG_ON(slot < 0);
4129
b7ef5f3a 4130 setup_items_for_insert(root, path, batch);
143bede5 4131 return 0;
62e2749e
CM
4132}
4133
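
/*
 * Illustrative sketch (editorial example): insert two adjacent empty items
 * in one batch with btrfs_insert_empty_items().  The keys and sizes come
 * from the caller; afterwards the reserved space is filled via
 * btrfs_item_ptr() / write_extent_buffer() just like a single-item insert.
 */
static int example_batch_insert(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_path *path,
				const struct btrfs_key keys[2],
				const u32 sizes[2])
{
	struct btrfs_item_batch batch = {
		.keys = keys,
		.data_sizes = sizes,
		.total_data_size = sizes[0] + sizes[1],
		.nr = 2,
	};

	/* on success the path points at the first of the two new items */
	return btrfs_insert_empty_items(trans, root, path, &batch);
}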
4134/*
4135 * Given a key and some data, insert an item into the tree.
4136 * This does all the path init required, making room in the tree if needed.
4137 */
310712b2
OS
4138int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4139 const struct btrfs_key *cpu_key, void *data,
4140 u32 data_size)
62e2749e
CM
4141{
4142 int ret = 0;
2c90e5d6 4143 struct btrfs_path *path;
5f39d397
CM
4144 struct extent_buffer *leaf;
4145 unsigned long ptr;
62e2749e 4146
2c90e5d6 4147 path = btrfs_alloc_path();
db5b493a
TI
4148 if (!path)
4149 return -ENOMEM;
2c90e5d6 4150 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
62e2749e 4151 if (!ret) {
5f39d397
CM
4152 leaf = path->nodes[0];
4153 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4154 write_extent_buffer(leaf, data, ptr, data_size);
4155 btrfs_mark_buffer_dirty(leaf);
62e2749e 4156 }
2c90e5d6 4157 btrfs_free_path(path);
aa5d6bed 4158 return ret;
be0e5c09
CM
4159}
4160
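
/*
 * Illustrative sketch (editorial example): insert a small fixed-size blob
 * with btrfs_insert_item().  The key values and the payload are whatever the
 * caller owns in its tree; the wrapper name is hypothetical.
 */
static int example_insert_blob(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 objectid, u8 type, u64 offset,
			       void *blob, u32 blob_size)
{
	struct btrfs_key key;

	key.objectid = objectid;
	key.type = type;
	key.offset = offset;

	/* allocates and frees a path internally, copies 'blob' into the item */
	return btrfs_insert_item(trans, root, &key, blob, blob_size);
}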
f0641656
FM
4161/*
4162 * This function duplicates an item, giving 'new_key' to the new item.
4163 * It guarantees both items live in the same tree leaf and the new item is
4164 * contiguous with the original item.
4165 *
4166 * This allows us to split a file extent in place, keeping a lock on the leaf
4167 * the entire time.
4168 */
4169int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4170 struct btrfs_root *root,
4171 struct btrfs_path *path,
4172 const struct btrfs_key *new_key)
4173{
4174 struct extent_buffer *leaf;
4175 int ret;
4176 u32 item_size;
4177
4178 leaf = path->nodes[0];
3212fa14 4179 item_size = btrfs_item_size(leaf, path->slots[0]);
f0641656
FM
4180 ret = setup_leaf_for_split(trans, root, path,
4181 item_size + sizeof(struct btrfs_item));
4182 if (ret)
4183 return ret;
4184
4185 path->slots[0]++;
4186 btrfs_setup_item_for_insert(root, path, new_key, item_size);
4187 leaf = path->nodes[0];
4188 memcpy_extent_buffer(leaf,
4189 btrfs_item_ptr_offset(leaf, path->slots[0]),
4190 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4191 item_size);
4192 return 0;
4193}
4194
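
/*
 * Illustrative sketch (editorial example): duplicate an EXTENT_DATA item so
 * a file extent can be split in place.  'path' must point at the original
 * item; only the key offset changes for the copy.
 */
static int example_dup_file_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 new_file_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_file_offset;

	/* on return path->slots[0] points at the new (second) copy */
	return btrfs_duplicate_item(trans, root, path, &new_key);
}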
74123bd7 4195/*
5de08d7d 4196 * delete the pointer from a given node.
74123bd7 4197 *
d352ac68
CM
4198 * the tree should have been previously balanced so the deletion does not
4199 * empty a node.
74123bd7 4200 */
afe5fea7
TI
4201static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4202 int level, int slot)
be0e5c09 4203{
5f39d397 4204 struct extent_buffer *parent = path->nodes[level];
7518a238 4205 u32 nritems;
f3ea38da 4206 int ret;
be0e5c09 4207
5f39d397 4208 nritems = btrfs_header_nritems(parent);
d397712b 4209 if (slot != nritems - 1) {
bf1d3425 4210 if (level) {
f3a84ccd
FM
4211 ret = btrfs_tree_mod_log_insert_move(parent, slot,
4212 slot + 1, nritems - slot - 1);
bf1d3425
DS
4213 BUG_ON(ret < 0);
4214 }
5f39d397
CM
4215 memmove_extent_buffer(parent,
4216 btrfs_node_key_ptr_offset(slot),
4217 btrfs_node_key_ptr_offset(slot + 1),
d6025579
CM
4218 sizeof(struct btrfs_key_ptr) *
4219 (nritems - slot - 1));
57ba86c0 4220 } else if (level) {
f3a84ccd 4221 ret = btrfs_tree_mod_log_insert_key(parent, slot,
33cff222 4222 BTRFS_MOD_LOG_KEY_REMOVE);
57ba86c0 4223 BUG_ON(ret < 0);
bb803951 4224 }
f3ea38da 4225
7518a238 4226 nritems--;
5f39d397 4227 btrfs_set_header_nritems(parent, nritems);
7518a238 4228 if (nritems == 0 && parent == root->node) {
5f39d397 4229 BUG_ON(btrfs_header_level(root->node) != 1);
bb803951 4230 /* just turn the root into a leaf and break */
5f39d397 4231 btrfs_set_header_level(root->node, 0);
bb803951 4232 } else if (slot == 0) {
5f39d397
CM
4233 struct btrfs_disk_key disk_key;
4234
4235 btrfs_node_key(parent, &disk_key, 0);
b167fa91 4236 fixup_low_keys(path, &disk_key, level + 1);
be0e5c09 4237 }
d6025579 4238 btrfs_mark_buffer_dirty(parent);
be0e5c09
CM
4239}
4240
323ac95b
CM
4241/*
4242 * a helper function to delete the leaf pointed to by path->slots[1] and
5d4f98a2 4243 * path->nodes[1].
323ac95b
CM
4244 *
4245 * This deletes the pointer in path->nodes[1] and frees the leaf
4246 * block extent. zero is returned if it all worked out, < 0 otherwise.
4247 *
4248 * The path must have already been setup for deleting the leaf, including
4249 * all the proper balancing. path->nodes[1] must be locked.
4250 */
143bede5
JM
4251static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4252 struct btrfs_root *root,
4253 struct btrfs_path *path,
4254 struct extent_buffer *leaf)
323ac95b 4255{
5d4f98a2 4256 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
afe5fea7 4257 del_ptr(root, path, 1, path->slots[1]);
323ac95b 4258
4d081c41
CM
4259 /*
4260 * btrfs_free_extent is expensive, we want to make sure we
4261 * aren't holding any locks when we call it
4262 */
4263 btrfs_unlock_up_safe(path, 0);
4264
f0486c68
YZ
4265 root_sub_used(root, leaf->len);
4266
67439dad 4267 atomic_inc(&leaf->refs);
7a163608 4268 btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
3083ee2e 4269 free_extent_buffer_stale(leaf);
323ac95b 4270}
74123bd7
CM
4271/*
4272 * delete the item at the leaf level in path. If that empties
4273 * the leaf, remove it from the tree
4274 */
85e21bac
CM
4275int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4276 struct btrfs_path *path, int slot, int nr)
be0e5c09 4277{
0b246afa 4278 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397 4279 struct extent_buffer *leaf;
aa5d6bed
CM
4280 int ret = 0;
4281 int wret;
7518a238 4282 u32 nritems;
be0e5c09 4283
5f39d397 4284 leaf = path->nodes[0];
5f39d397 4285 nritems = btrfs_header_nritems(leaf);
be0e5c09 4286
85e21bac 4287 if (slot + nr != nritems) {
0cae23b6
FM
4288 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4289 const int data_end = leaf_data_end(leaf);
c82f823c 4290 struct btrfs_map_token token;
0cae23b6
FM
4291 u32 dsize = 0;
4292 int i;
4293
4294 for (i = 0; i < nr; i++)
4295 dsize += btrfs_item_size(leaf, slot + i);
5f39d397 4296
3d9ec8c4 4297 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
d6025579 4298 data_end + dsize,
3d9ec8c4 4299 BTRFS_LEAF_DATA_OFFSET + data_end,
85e21bac 4300 last_off - data_end);
5f39d397 4301
c82f823c 4302 btrfs_init_map_token(&token, leaf);
85e21bac 4303 for (i = slot + nr; i < nritems; i++) {
5f39d397 4304 u32 ioff;
db94535d 4305
3212fa14
JB
4306 ioff = btrfs_token_item_offset(&token, i);
4307 btrfs_set_token_item_offset(&token, i, ioff + dsize);
0783fcfc 4308 }
db94535d 4309
5f39d397 4310 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
85e21bac 4311 btrfs_item_nr_offset(slot + nr),
d6025579 4312 sizeof(struct btrfs_item) *
85e21bac 4313 (nritems - slot - nr));
be0e5c09 4314 }
85e21bac
CM
4315 btrfs_set_header_nritems(leaf, nritems - nr);
4316 nritems -= nr;
5f39d397 4317
74123bd7 4318 /* delete the leaf if we've emptied it */
7518a238 4319 if (nritems == 0) {
5f39d397
CM
4320 if (leaf == root->node) {
4321 btrfs_set_header_level(leaf, 0);
9a8dd150 4322 } else {
6a884d7d 4323 btrfs_clean_tree_block(leaf);
143bede5 4324 btrfs_del_leaf(trans, root, path, leaf);
9a8dd150 4325 }
be0e5c09 4326 } else {
7518a238 4327 int used = leaf_space_used(leaf, 0, nritems);
aa5d6bed 4328 if (slot == 0) {
5f39d397
CM
4329 struct btrfs_disk_key disk_key;
4330
4331 btrfs_item_key(leaf, &disk_key, 0);
b167fa91 4332 fixup_low_keys(path, &disk_key, 1);
aa5d6bed 4333 }
aa5d6bed 4334
7c4063d1
FM
4335 /*
4336 * Try to delete the leaf if it is mostly empty. We do this by
4337 * trying to move all its items into its left and right neighbours.
4338 * If we can't move all the items, then we don't delete it - it's
4339 * not ideal, but future insertions might fill the leaf with more
4340 * items, or items from other leaves might be moved later into our
4341 * leaf due to deletions on those leaves.
4342 */
0b246afa 4343 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
7c4063d1
FM
4344 u32 min_push_space;
4345
be0e5c09
CM
4346 /* push_leaf_left fixes the path.
4347 * make sure the path still points to our leaf
4348 * for possible call to del_ptr below
4349 */
4920c9ac 4350 slot = path->slots[1];
67439dad 4351 atomic_inc(&leaf->refs);
7c4063d1
FM
4352 /*
4353 * We want to be able to at least push one item to the
4354 * left neighbour leaf, and that's the first item.
4355 */
4356 min_push_space = sizeof(struct btrfs_item) +
4357 btrfs_item_size(leaf, 0);
4358 wret = push_leaf_left(trans, root, path, 0,
4359 min_push_space, 1, (u32)-1);
54aa1f4d 4360 if (wret < 0 && wret != -ENOSPC)
aa5d6bed 4361 ret = wret;
5f39d397
CM
4362
4363 if (path->nodes[0] == leaf &&
4364 btrfs_header_nritems(leaf)) {
7c4063d1
FM
4365 /*
4366 * If we were not able to push all items from our
4367 * leaf to its left neighbour, then attempt to
4368 * either push all the remaining items to the
4369 * right neighbour or none. There's no advantage
4370 * in pushing only some items, instead of all, as
4371 * it's pointless to end up with a leaf having
4372 * too few items while the neighbours can be full
4373 * or nearly full.
4374 */
4375 nritems = btrfs_header_nritems(leaf);
4376 min_push_space = leaf_space_used(leaf, 0, nritems);
4377 wret = push_leaf_right(trans, root, path, 0,
4378 min_push_space, 1, 0);
54aa1f4d 4379 if (wret < 0 && wret != -ENOSPC)
aa5d6bed
CM
4380 ret = wret;
4381 }
5f39d397
CM
4382
4383 if (btrfs_header_nritems(leaf) == 0) {
323ac95b 4384 path->slots[1] = slot;
143bede5 4385 btrfs_del_leaf(trans, root, path, leaf);
5f39d397 4386 free_extent_buffer(leaf);
143bede5 4387 ret = 0;
5de08d7d 4388 } else {
925baedd
CM
4389 /* if we're still in the path, make sure
4390 * we're dirty. Otherwise, one of the
4391 * push_leaf functions must have already
4392 * dirtied this buffer
4393 */
4394 if (path->nodes[0] == leaf)
4395 btrfs_mark_buffer_dirty(leaf);
5f39d397 4396 free_extent_buffer(leaf);
be0e5c09 4397 }
d5719762 4398 } else {
5f39d397 4399 btrfs_mark_buffer_dirty(leaf);
be0e5c09
CM
4400 }
4401 }
aa5d6bed 4402 return ret;
be0e5c09
CM
4403}
4404
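
/*
 * Illustrative sketch (editorial example): delete just the item the path
 * currently points to, the single-item pattern usually spelled
 * btrfs_del_item() in ctree.h.
 */
static int example_del_current_item(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path)
{
	/* the slot comes from a prior btrfs_search_slot() that returned 0 */
	return btrfs_del_items(trans, root, path, path->slots[0], 1);
}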
7bb86316 4405/*
925baedd 4406 * search the tree again to find a leaf with lesser keys
7bb86316
CM
4407 * returns 0 if it found something or 1 if there are no lesser leaves.
4408 * returns < 0 on io errors.
d352ac68
CM
4409 *
4410 * This may release the path, and so you may lose any locks held at the
4411 * time you call it.
7bb86316 4412 */
16e7549f 4413int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
7bb86316 4414{
925baedd
CM
4415 struct btrfs_key key;
4416 struct btrfs_disk_key found_key;
4417 int ret;
7bb86316 4418
925baedd 4419 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
7bb86316 4420
e8b0d724 4421 if (key.offset > 0) {
925baedd 4422 key.offset--;
e8b0d724 4423 } else if (key.type > 0) {
925baedd 4424 key.type--;
e8b0d724
FDBM
4425 key.offset = (u64)-1;
4426 } else if (key.objectid > 0) {
925baedd 4427 key.objectid--;
e8b0d724
FDBM
4428 key.type = (u8)-1;
4429 key.offset = (u64)-1;
4430 } else {
925baedd 4431 return 1;
e8b0d724 4432 }
7bb86316 4433
b3b4aa74 4434 btrfs_release_path(path);
925baedd
CM
4435 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4436 if (ret < 0)
4437 return ret;
4438 btrfs_item_key(path->nodes[0], &found_key, 0);
4439 ret = comp_keys(&found_key, &key);
337c6f68
FM
4440 /*
4441 * We might have had an item with the previous key in the tree right
4442 * before we released our path. And after we released our path, that
4443 * item might have been pushed to the first slot (0) of the leaf we
4444 * were holding due to a tree balance. Alternatively, an item with the
4445 * previous key can exist as the only element of a leaf (big fat item).
4446 * Therefore account for these 2 cases, so that our callers (like
4447 * btrfs_previous_item) don't miss an existing item with a key matching
4448 * the previous key we computed above.
4449 */
4450 if (ret <= 0)
925baedd
CM
4451 return 0;
4452 return 1;
7bb86316
CM
4453}
4454
3f157a2f
CM
4455/*
4456 * A helper function to walk down the tree starting at min_key, and looking
de78b51a
ES
4457 * for nodes or leaves that have a minimum transaction id.
4458 * This is used by the btree defrag code, and tree logging
3f157a2f
CM
4459 *
4460 * This does not cow, but it does stuff the starting key it finds back
4461 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4462 * key and get a writable path.
4463 *
3f157a2f
CM
4464 * This honors path->lowest_level to prevent descent past a given level
4465 * of the tree.
4466 *
d352ac68
CM
4467 * min_trans indicates the oldest transaction that you are interested
4468 * in walking through. Any nodes or leaves older than min_trans are
4469 * skipped over (without reading them).
4470 *
3f157a2f
CM
4471 * returns zero if something useful was found, < 0 on error and 1 if there
4472 * was nothing in the tree that matched the search criteria.
4473 */
4474int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
de78b51a 4475 struct btrfs_path *path,
3f157a2f
CM
4476 u64 min_trans)
4477{
4478 struct extent_buffer *cur;
4479 struct btrfs_key found_key;
4480 int slot;
9652480b 4481 int sret;
3f157a2f
CM
4482 u32 nritems;
4483 int level;
4484 int ret = 1;
f98de9b9 4485 int keep_locks = path->keep_locks;
3f157a2f 4486
c922b016 4487 ASSERT(!path->nowait);
f98de9b9 4488 path->keep_locks = 1;
3f157a2f 4489again:
bd681513 4490 cur = btrfs_read_lock_root_node(root);
3f157a2f 4491 level = btrfs_header_level(cur);
e02119d5 4492 WARN_ON(path->nodes[level]);
3f157a2f 4493 path->nodes[level] = cur;
bd681513 4494 path->locks[level] = BTRFS_READ_LOCK;
3f157a2f
CM
4495
4496 if (btrfs_header_generation(cur) < min_trans) {
4497 ret = 1;
4498 goto out;
4499 }
d397712b 4500 while (1) {
3f157a2f
CM
4501 nritems = btrfs_header_nritems(cur);
4502 level = btrfs_header_level(cur);
e3b83361 4503 sret = btrfs_bin_search(cur, min_key, &slot);
cbca7d59
FM
4504 if (sret < 0) {
4505 ret = sret;
4506 goto out;
4507 }
3f157a2f 4508
323ac95b
CM
4509 /* at the lowest level, we're done, setup the path and exit */
4510 if (level == path->lowest_level) {
e02119d5
CM
4511 if (slot >= nritems)
4512 goto find_next_key;
3f157a2f
CM
4513 ret = 0;
4514 path->slots[level] = slot;
4515 btrfs_item_key_to_cpu(cur, &found_key, slot);
4516 goto out;
4517 }
9652480b
Y
4518 if (sret && slot > 0)
4519 slot--;
3f157a2f 4520 /*
de78b51a 4521 * check this node pointer against the min_trans parameters.
260db43c 4522 * If it is too old, skip to the next one.
3f157a2f 4523 */
d397712b 4524 while (slot < nritems) {
3f157a2f 4525 u64 gen;
e02119d5 4526
3f157a2f
CM
4527 gen = btrfs_node_ptr_generation(cur, slot);
4528 if (gen < min_trans) {
4529 slot++;
4530 continue;
4531 }
de78b51a 4532 break;
3f157a2f 4533 }
e02119d5 4534find_next_key:
3f157a2f
CM
4535 /*
4536 * we didn't find a candidate key in this node, walk forward
4537 * and find another one
4538 */
4539 if (slot >= nritems) {
e02119d5
CM
4540 path->slots[level] = slot;
4541 sret = btrfs_find_next_key(root, path, min_key, level,
de78b51a 4542 min_trans);
e02119d5 4543 if (sret == 0) {
b3b4aa74 4544 btrfs_release_path(path);
3f157a2f
CM
4545 goto again;
4546 } else {
4547 goto out;
4548 }
4549 }
4550 /* save our key for returning back */
4551 btrfs_node_key_to_cpu(cur, &found_key, slot);
4552 path->slots[level] = slot;
4553 if (level == path->lowest_level) {
4554 ret = 0;
3f157a2f
CM
4555 goto out;
4556 }
4b231ae4 4557 cur = btrfs_read_node_slot(cur, slot);
fb770ae4
LB
4558 if (IS_ERR(cur)) {
4559 ret = PTR_ERR(cur);
4560 goto out;
4561 }
3f157a2f 4562
bd681513 4563 btrfs_tree_read_lock(cur);
b4ce94de 4564
bd681513 4565 path->locks[level - 1] = BTRFS_READ_LOCK;
3f157a2f 4566 path->nodes[level - 1] = cur;
f7c79f30 4567 unlock_up(path, level, 1, 0, NULL);
3f157a2f
CM
4568 }
4569out:
f98de9b9
FM
4570 path->keep_locks = keep_locks;
4571 if (ret == 0) {
4572 btrfs_unlock_up_safe(path, path->lowest_level + 1);
3f157a2f 4573 memcpy(min_key, &found_key, sizeof(found_key));
f98de9b9 4574 }
3f157a2f
CM
4575 return ret;
4576}
4577
4578/*
4579 * this is similar to btrfs_next_leaf, but does not try to preserve
4580 * and fixup the path. It looks for and returns the next key in the
de78b51a 4581 * tree based on the current path and the min_trans parameters.
3f157a2f
CM
4582 *
4583 * 0 is returned if another key is found, < 0 if there are any errors
4584 * and 1 is returned if there are no higher keys in the tree
4585 *
4586 * path->keep_locks should be set to 1 on the search made before
4587 * calling this function.
4588 */
e7a84565 4589int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
de78b51a 4590 struct btrfs_key *key, int level, u64 min_trans)
e7a84565 4591{
e7a84565
CM
4592 int slot;
4593 struct extent_buffer *c;
4594
6a9fb468 4595 WARN_ON(!path->keep_locks && !path->skip_locking);
d397712b 4596 while (level < BTRFS_MAX_LEVEL) {
e7a84565
CM
4597 if (!path->nodes[level])
4598 return 1;
4599
4600 slot = path->slots[level] + 1;
4601 c = path->nodes[level];
3f157a2f 4602next:
e7a84565 4603 if (slot >= btrfs_header_nritems(c)) {
33c66f43
YZ
4604 int ret;
4605 int orig_lowest;
4606 struct btrfs_key cur_key;
4607 if (level + 1 >= BTRFS_MAX_LEVEL ||
4608 !path->nodes[level + 1])
e7a84565 4609 return 1;
33c66f43 4610
6a9fb468 4611 if (path->locks[level + 1] || path->skip_locking) {
33c66f43
YZ
4612 level++;
4613 continue;
4614 }
4615
4616 slot = btrfs_header_nritems(c) - 1;
4617 if (level == 0)
4618 btrfs_item_key_to_cpu(c, &cur_key, slot);
4619 else
4620 btrfs_node_key_to_cpu(c, &cur_key, slot);
4621
4622 orig_lowest = path->lowest_level;
b3b4aa74 4623 btrfs_release_path(path);
33c66f43
YZ
4624 path->lowest_level = level;
4625 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4626 0, 0);
4627 path->lowest_level = orig_lowest;
4628 if (ret < 0)
4629 return ret;
4630
4631 c = path->nodes[level];
4632 slot = path->slots[level];
4633 if (ret == 0)
4634 slot++;
4635 goto next;
e7a84565 4636 }
33c66f43 4637
e7a84565
CM
4638 if (level == 0)
4639 btrfs_item_key_to_cpu(c, key, slot);
3f157a2f 4640 else {
3f157a2f
CM
4641 u64 gen = btrfs_node_ptr_generation(c, slot);
4642
3f157a2f
CM
4643 if (gen < min_trans) {
4644 slot++;
4645 goto next;
4646 }
e7a84565 4647 btrfs_node_key_to_cpu(c, key, slot);
3f157a2f 4648 }
e7a84565
CM
4649 return 0;
4650 }
4651 return 1;
4652}
4653
3d7806ec
JS
4654int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4655 u64 time_seq)
d97e63b6
CM
4656{
4657 int slot;
8e73f275 4658 int level;
5f39d397 4659 struct extent_buffer *c;
8e73f275 4660 struct extent_buffer *next;
d96b3424 4661 struct btrfs_fs_info *fs_info = root->fs_info;
925baedd 4662 struct btrfs_key key;
d96b3424 4663 bool need_commit_sem = false;
925baedd
CM
4664 u32 nritems;
4665 int ret;
0e46318d 4666 int i;
925baedd 4667
bdcdd86c
FM
4668 /*
4669 * The nowait semantics are used only for write paths, where we don't
4670 * use the tree mod log and sequence numbers.
4671 */
4672 if (time_seq)
4673 ASSERT(!path->nowait);
c922b016 4674
925baedd 4675 nritems = btrfs_header_nritems(path->nodes[0]);
d397712b 4676 if (nritems == 0)
925baedd 4677 return 1;
925baedd 4678
8e73f275
CM
4679 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4680again:
4681 level = 1;
4682 next = NULL;
b3b4aa74 4683 btrfs_release_path(path);
8e73f275 4684
a2135011 4685 path->keep_locks = 1;
8e73f275 4686
d96b3424 4687 if (time_seq) {
3d7806ec 4688 ret = btrfs_search_old_slot(root, &key, path, time_seq);
d96b3424
FM
4689 } else {
4690 if (path->need_commit_sem) {
4691 path->need_commit_sem = 0;
4692 need_commit_sem = true;
bdcdd86c
FM
4693 if (path->nowait) {
4694 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4695 ret = -EAGAIN;
4696 goto done;
4697 }
4698 } else {
4699 down_read(&fs_info->commit_root_sem);
4700 }
d96b3424 4701 }
3d7806ec 4702 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
d96b3424 4703 }
925baedd
CM
4704 path->keep_locks = 0;
4705
4706 if (ret < 0)
d96b3424 4707 goto done;
925baedd 4708
a2135011 4709 nritems = btrfs_header_nritems(path->nodes[0]);
168fd7d2
CM
4710 /*
4711 * by releasing the path above we dropped all our locks. A balance
4712 * could have added more items next to the key that used to be
4713 * at the very end of the block. So, check again here and
4714 * advance the path if there are now more items available.
4715 */
a2135011 4716 if (nritems > 0 && path->slots[0] < nritems - 1) {
e457afec
YZ
4717 if (ret == 0)
4718 path->slots[0]++;
8e73f275 4719 ret = 0;
925baedd
CM
4720 goto done;
4721 }
0b43e04f
LB
4722 /*
4723 * So the above check misses one case:
4724 * - after releasing the path above, someone has removed the item that
4725 * used to be at the very end of the block, and balance between leafs
4726 * gets another one with bigger key.offset to replace it.
4727 *
4728 * This one should be returned as well, or we can get leaf corruption
4729 * later (esp. in __btrfs_drop_extents()).
4730 *
4731 * And a bit more explanation about this check,
4732 * with ret > 0, the key isn't found, the path points to the slot
4733 * where it should be inserted, so the path->slots[0] item must be the
4734 * bigger one.
4735 */
4736 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4737 ret = 0;
4738 goto done;
4739 }
d97e63b6 4740
d397712b 4741 while (level < BTRFS_MAX_LEVEL) {
8e73f275
CM
4742 if (!path->nodes[level]) {
4743 ret = 1;
4744 goto done;
4745 }
5f39d397 4746
d97e63b6
CM
4747 slot = path->slots[level] + 1;
4748 c = path->nodes[level];
5f39d397 4749 if (slot >= btrfs_header_nritems(c)) {
d97e63b6 4750 level++;
8e73f275
CM
4751 if (level == BTRFS_MAX_LEVEL) {
4752 ret = 1;
4753 goto done;
4754 }
d97e63b6
CM
4755 continue;
4756 }
5f39d397 4757
0e46318d
JB
4758
4759 /*
4760 * Our current level is where we're going to start from, and to
4761 * make sure lockdep doesn't complain we need to drop our locks
4762 * and nodes from 0 to our current level.
4763 */
4764 for (i = 0; i < level; i++) {
4765 if (path->locks[level]) {
4766 btrfs_tree_read_unlock(path->nodes[i]);
4767 path->locks[i] = 0;
4768 }
4769 free_extent_buffer(path->nodes[i]);
4770 path->nodes[i] = NULL;
925baedd 4771 }
5f39d397 4772
8e73f275 4773 next = c;
d07b8528 4774 ret = read_block_for_search(root, path, &next, level,
cda79c54 4775 slot, &key);
bdcdd86c 4776 if (ret == -EAGAIN && !path->nowait)
8e73f275 4777 goto again;
5f39d397 4778
76a05b35 4779 if (ret < 0) {
b3b4aa74 4780 btrfs_release_path(path);
76a05b35
CM
4781 goto done;
4782 }
4783
5cd57b2c 4784 if (!path->skip_locking) {
bd681513 4785 ret = btrfs_try_tree_read_lock(next);
bdcdd86c
FM
4786 if (!ret && path->nowait) {
4787 ret = -EAGAIN;
4788 goto done;
4789 }
d42244a0
JS
4790 if (!ret && time_seq) {
4791 /*
4792 * If we don't get the lock, we may be racing
4793 * with push_leaf_left, holding that lock while
4794 * itself waiting for the leaf we've currently
4795 * locked. To solve this situation, we give up
4796 * on our lock and cycle.
4797 */
cf538830 4798 free_extent_buffer(next);
d42244a0
JS
4799 btrfs_release_path(path);
4800 cond_resched();
4801 goto again;
4802 }
0e46318d
JB
4803 if (!ret)
4804 btrfs_tree_read_lock(next);
5cd57b2c 4805 }
d97e63b6
CM
4806 break;
4807 }
4808 path->slots[level] = slot;
d397712b 4809 while (1) {
d97e63b6 4810 level--;
d97e63b6
CM
4811 path->nodes[level] = next;
4812 path->slots[level] = 0;
a74a4b97 4813 if (!path->skip_locking)
ffeb03cf 4814 path->locks[level] = BTRFS_READ_LOCK;
d97e63b6
CM
4815 if (!level)
4816 break;
b4ce94de 4817
d07b8528 4818 ret = read_block_for_search(root, path, &next, level,
cda79c54 4819 0, &key);
bdcdd86c 4820 if (ret == -EAGAIN && !path->nowait)
8e73f275
CM
4821 goto again;
4822
76a05b35 4823 if (ret < 0) {
b3b4aa74 4824 btrfs_release_path(path);
76a05b35
CM
4825 goto done;
4826 }
4827
bdcdd86c
FM
4828 if (!path->skip_locking) {
4829 if (path->nowait) {
4830 if (!btrfs_try_tree_read_lock(next)) {
4831 ret = -EAGAIN;
4832 goto done;
4833 }
4834 } else {
4835 btrfs_tree_read_lock(next);
4836 }
4837 }
d97e63b6 4838 }
8e73f275 4839 ret = 0;
925baedd 4840done:
f7c79f30 4841 unlock_up(path, 0, 1, 0, NULL);
d96b3424
FM
4842 if (need_commit_sem) {
4843 int ret2;
4844
4845 path->need_commit_sem = 1;
4846 ret2 = finish_need_commit_sem_search(path);
4847 up_read(&fs_info->commit_root_sem);
4848 if (ret2)
4849 ret = ret2;
4850 }
8e73f275
CM
4851
4852 return ret;
d97e63b6 4853}
0b86a832 4854
890d2b1a
JB
4855int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4856{
4857 path->slots[0]++;
4858 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4859 return btrfs_next_old_leaf(root, path, time_seq);
4860 return 0;
4861}
4862
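
/*
 * Illustrative sketch (editorial example): the common forward-iteration loop
 * built on btrfs_search_slot() plus btrfs_next_leaf(), the time_seq == 0
 * wrapper around btrfs_next_old_leaf().  The ctree helpers are real, the
 * wrapper itself is made up.
 */
static int example_iterate_objectid(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key = { .objectid = objectid, .type = 0, .offset = 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* 1: no more leaves, < 0: error */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != objectid)
			break;

		/* ... look at the current item here ... */

		path->slots[0]++;
	}
	ret = ret < 0 ? ret : 0;
out:
	btrfs_free_path(path);
	return ret;
}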
3f157a2f
CM
4863/*
4864 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4865 * searching until it gets past min_objectid or finds an item of 'type'
4866 *
4867 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4868 */
0b86a832
CM
4869int btrfs_previous_item(struct btrfs_root *root,
4870 struct btrfs_path *path, u64 min_objectid,
4871 int type)
4872{
4873 struct btrfs_key found_key;
4874 struct extent_buffer *leaf;
e02119d5 4875 u32 nritems;
0b86a832
CM
4876 int ret;
4877
d397712b 4878 while (1) {
0b86a832
CM
4879 if (path->slots[0] == 0) {
4880 ret = btrfs_prev_leaf(root, path);
4881 if (ret != 0)
4882 return ret;
4883 } else {
4884 path->slots[0]--;
4885 }
4886 leaf = path->nodes[0];
e02119d5
CM
4887 nritems = btrfs_header_nritems(leaf);
4888 if (nritems == 0)
4889 return 1;
4890 if (path->slots[0] == nritems)
4891 path->slots[0]--;
4892
0b86a832 4893 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
e02119d5
CM
4894 if (found_key.objectid < min_objectid)
4895 break;
0a4eefbb
YZ
4896 if (found_key.type == type)
4897 return 0;
e02119d5
CM
4898 if (found_key.objectid == min_objectid &&
4899 found_key.type < type)
4900 break;
0b86a832
CM
4901 }
4902 return 1;
4903}
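
/*
 * Illustrative sketch (editorial example): position a path after the last
 * possible key of an objectid and walk backwards to its closest preceding
 * item of a given type.  BTRFS_INODE_ITEM_KEY is only a placeholder here.
 */
static int example_find_last_of_type(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key = {
		.objectid = objectid,
		.type = (u8)-1,
		.offset = (u64)-1,
	};
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	ret = btrfs_previous_item(root, path, objectid, BTRFS_INODE_ITEM_KEY);
	/* ret == 0: path now points at the previous item of that type */
out:
	btrfs_free_path(path);
	return ret;
}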
ade2e0b3
WS
4904
4905/*
4906 * search in the extent tree to find a previous Metadata/Data extent item with
4907 * min objectid.
4908 *
4909 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4910 */
4911int btrfs_previous_extent_item(struct btrfs_root *root,
4912 struct btrfs_path *path, u64 min_objectid)
4913{
4914 struct btrfs_key found_key;
4915 struct extent_buffer *leaf;
4916 u32 nritems;
4917 int ret;
4918
4919 while (1) {
4920 if (path->slots[0] == 0) {
ade2e0b3
WS
4921 ret = btrfs_prev_leaf(root, path);
4922 if (ret != 0)
4923 return ret;
4924 } else {
4925 path->slots[0]--;
4926 }
4927 leaf = path->nodes[0];
4928 nritems = btrfs_header_nritems(leaf);
4929 if (nritems == 0)
4930 return 1;
4931 if (path->slots[0] == nritems)
4932 path->slots[0]--;
4933
4934 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4935 if (found_key.objectid < min_objectid)
4936 break;
4937 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4938 found_key.type == BTRFS_METADATA_ITEM_KEY)
4939 return 0;
4940 if (found_key.objectid == min_objectid &&
4941 found_key.type < BTRFS_EXTENT_ITEM_KEY)
4942 break;
4943 }
4944 return 1;
4945}
226463d7
JB
4946
4947int __init btrfs_ctree_init(void)
4948{
4949 btrfs_path_cachep = kmem_cache_create("btrfs_path",
4950 sizeof(struct btrfs_path), 0,
4951 SLAB_MEM_SPREAD, NULL);
4952 if (!btrfs_path_cachep)
4953 return -ENOMEM;
4954 return 0;
4955}
4956
4957void __cold btrfs_ctree_exit(void)
4958{
4959 kmem_cache_destroy(btrfs_path_cachep);
4960}