btrfs: move root helpers back into ctree.h
[linux-block.git] / fs / btrfs / ctree.c
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
6cbd5570 2/*
d352ac68 3 * Copyright (C) 2007,2008 Oracle. All rights reserved.
6cbd5570
CM
4 */
5
a6b6e75e 6#include <linux/sched.h>
5a0e3ad6 7#include <linux/slab.h>
bd989ba3 8#include <linux/rbtree.h>
adf02123 9#include <linux/mm.h>
e41d12f5 10#include <linux/error-injection.h>
9b569ea0 11#include "messages.h"
eb60ceac
CM
12#include "ctree.h"
13#include "disk-io.h"
7f5c1516 14#include "transaction.h"
5f39d397 15#include "print-tree.h"
925baedd 16#include "locking.h"
de37aa51 17#include "volumes.h"
f616f5cd 18#include "qgroup.h"
f3a84ccd 19#include "tree-mod-log.h"
88c602ab 20#include "tree-checker.h"
ec8eb376 21#include "fs.h"
ad1ac501 22#include "accessors.h"
a0231804 23#include "extent-tree.h"
67707479 24#include "relocation.h"
9a8dd150 25
226463d7
JB
26static struct kmem_cache *btrfs_path_cachep;
27
e089f05c
CM
28static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_path *path, int level);
310712b2
OS
30static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
31 const struct btrfs_key *ins_key, struct btrfs_path *path,
32 int data_size, int extend);
5f39d397 33static int push_node_left(struct btrfs_trans_handle *trans,
2ff7e61e 34 struct extent_buffer *dst,
971a1f66 35 struct extent_buffer *src, int empty);
5f39d397 36static int balance_node_right(struct btrfs_trans_handle *trans,
5f39d397
CM
37 struct extent_buffer *dst_buf,
38 struct extent_buffer *src_buf);
afe5fea7
TI
39static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
40 int level, int slot);
d97e63b6 41
af024ed2
JT
42static const struct btrfs_csums {
43 u16 size;
59a0fcdb
DS
44 const char name[10];
45 const char driver[12];
af024ed2
JT
46} btrfs_csums[] = {
47 [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
3951e7f0 48 [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
3831bf00 49 [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
352ae07b
DS
50 [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
51 .driver = "blake2b-256" },
af024ed2
JT
52};
53
54int btrfs_super_csum_size(const struct btrfs_super_block *s)
55{
56 u16 t = btrfs_super_csum_type(s);
57 /*
58 * csum type is validated at mount time
59 */
60 return btrfs_csums[t].size;
61}
62
63const char *btrfs_super_csum_name(u16 csum_type)
64{
65 /* csum type is validated at mount time */
66 return btrfs_csums[csum_type].name;
67}
68
b4e967be
DS
69/*
70 * Return driver name if defined, otherwise the name that's also a valid driver
71 * name
72 */
73const char *btrfs_super_csum_driver(u16 csum_type)
74{
75 /* csum type is validated at mount time */
59a0fcdb
DS
76 return btrfs_csums[csum_type].driver[0] ?
77 btrfs_csums[csum_type].driver :
b4e967be
DS
78 btrfs_csums[csum_type].name;
79}
80
604997b4 81size_t __attribute_const__ btrfs_get_num_csums(void)
f7cea56c
DS
82{
83 return ARRAY_SIZE(btrfs_csums);
84}
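
/*
 * Illustrative sketch, not part of ctree.c: the same "validated type indexes a
 * static table" pattern as btrfs_csums[] above, rebuilt as standalone C with
 * hypothetical demo_* names. It only shows the lookup idiom, not the real
 * kernel structures.
 */
#include <stdio.h>
#include <stdint.h>

enum demo_csum_type { DEMO_CSUM_CRC32, DEMO_CSUM_XXHASH, DEMO_CSUM_SHA256 };

static const struct {
	uint16_t size;
	const char name[10];
} demo_csums[] = {
	[DEMO_CSUM_CRC32]  = { .size = 4,  .name = "crc32c"   },
	[DEMO_CSUM_XXHASH] = { .size = 8,  .name = "xxhash64" },
	[DEMO_CSUM_SHA256] = { .size = 32, .name = "sha256"   },
};

int main(void)
{
	/* The type is assumed to be validated before use, as at mount time. */
	enum demo_csum_type t = DEMO_CSUM_XXHASH;

	printf("%s: %u bytes per checksum\n", demo_csums[t].name,
	       (unsigned int)demo_csums[t].size);
	return 0;
}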
85
df24a2b9 86struct btrfs_path *btrfs_alloc_path(void)
2c90e5d6 87{
e2c89907 88 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
2c90e5d6
CM
89}
90
d352ac68 91/* this also releases the path */
df24a2b9 92void btrfs_free_path(struct btrfs_path *p)
be0e5c09 93{
ff175d57
JJ
94 if (!p)
95 return;
b3b4aa74 96 btrfs_release_path(p);
df24a2b9 97 kmem_cache_free(btrfs_path_cachep, p);
be0e5c09
CM
98}
99
d352ac68
CM
100/*
101 * path release drops references on the extent buffers in the path
102 * and it drops any locks held by this path.
103 *
104 * It is safe to call this on paths that hold no locks or extent buffers.
105 */
b3b4aa74 106noinline void btrfs_release_path(struct btrfs_path *p)
eb60ceac
CM
107{
108 int i;
a2135011 109
234b63a0 110 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3f157a2f 111 p->slots[i] = 0;
eb60ceac 112 if (!p->nodes[i])
925baedd
CM
113 continue;
114 if (p->locks[i]) {
bd681513 115 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
925baedd
CM
116 p->locks[i] = 0;
117 }
5f39d397 118 free_extent_buffer(p->nodes[i]);
3f157a2f 119 p->nodes[i] = NULL;
eb60ceac
CM
120 }
121}
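
/*
 * Illustrative sketch, not part of ctree.c: the usual lifecycle of a path as
 * seen by a caller. btrfs_alloc_path(), btrfs_search_slot() and
 * btrfs_free_path() (which releases the path first, as noted above) are the
 * real helpers; the wrapper function, its key values and the trimmed error
 * handling are made up for the example.
 */
static int demo_lookup_item(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	/* Read-only search: no transaction, ins_len == 0, cow == 0. */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

	/* ... use path->nodes[0] / path->slots[0] while the path is held ... */

	/* Dropping the path releases the extent buffers and any locks held. */
	btrfs_free_path(path);
	return ret;
}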
122
8bb808c6
DS
123/*
124 * We want the transaction abort to print a stack trace only for errors where the
125 * cause could be a bug, e.g. due to ENOSPC, and not for common errors that are
126 * caused by external factors.
127 */
128bool __cold abort_should_print_stack(int errno)
129{
130 switch (errno) {
131 case -EIO:
132 case -EROFS:
133 case -ENOMEM:
134 return false;
135 }
136 return true;
137}
138
d352ac68
CM
139/*
140 * safely gets a reference on the root node of a tree. A lock
141 * is not taken, so a concurrent writer may put a different node
142 * at the root of the tree. See btrfs_lock_root_node for the
143 * looping required.
144 *
145 * The extent buffer returned by this has a reference taken, so
146 * it won't disappear. It may stop being the root of the tree
147 * at any time because there are no locks held.
148 */
925baedd
CM
149struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
150{
151 struct extent_buffer *eb;
240f62c8 152
3083ee2e
JB
153 while (1) {
154 rcu_read_lock();
155 eb = rcu_dereference(root->node);
156
157 /*
158 * RCU really hurts here: we could free up the root node because
159 * it was COWed but we may not get the new root node yet, so do
160 * the inc_not_zero dance and, if it doesn't work, then
161 * synchronize_rcu and try again.
162 */
163 if (atomic_inc_not_zero(&eb->refs)) {
164 rcu_read_unlock();
165 break;
166 }
167 rcu_read_unlock();
168 synchronize_rcu();
169 }
925baedd
CM
170 return eb;
171}
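
/*
 * Illustrative sketch, not part of ctree.c: the core of the
 * atomic_inc_not_zero() dance used above, rebuilt with C11 atomics and
 * hypothetical demo_* names. The point is that a reference is only taken if
 * the object still has at least one reference, i.e. it has not already begun
 * being freed; the RCU parts are left out.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct demo_buffer {
	atomic_int refs;
};

/* Returns true if a new reference was taken, false if refs had hit zero. */
static bool demo_try_get_ref(struct demo_buffer *b)
{
	int old = atomic_load(&b->refs);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&b->refs, &old, old + 1))
			return true;
		/* old was reloaded by the failed compare-exchange; retry. */
	}
	return false;
}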
172
92a7cc42
QW
173/*
174 * Cowonly roots (not-shareable trees, everything not subvolume or reloc roots)
175 * just get put onto a simple dirty list. The transaction walks this list to make
176 * sure they get properly updated on disk.
d352ac68 177 */
0b86a832
CM
178static void add_root_to_dirty_list(struct btrfs_root *root)
179{
0b246afa
JM
180 struct btrfs_fs_info *fs_info = root->fs_info;
181
e7070be1
JB
182 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
183 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
184 return;
185
0b246afa 186 spin_lock(&fs_info->trans_lock);
e7070be1
JB
187 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
188 /* Want the extent tree to be the last on the list */
4fd786e6 189 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
e7070be1 190 list_move_tail(&root->dirty_list,
0b246afa 191 &fs_info->dirty_cowonly_roots);
e7070be1
JB
192 else
193 list_move(&root->dirty_list,
0b246afa 194 &fs_info->dirty_cowonly_roots);
0b86a832 195 }
0b246afa 196 spin_unlock(&fs_info->trans_lock);
0b86a832
CM
197}
198
d352ac68
CM
199/*
200 * used by snapshot creation to make a copy of a root for a tree with
201 * a given objectid. The buffer with the new root node is returned in
202 * cow_ret, and this function returns zero on success or a negative error code.
203 */
be20aa9d
CM
204int btrfs_copy_root(struct btrfs_trans_handle *trans,
205 struct btrfs_root *root,
206 struct extent_buffer *buf,
207 struct extent_buffer **cow_ret, u64 new_root_objectid)
208{
0b246afa 209 struct btrfs_fs_info *fs_info = root->fs_info;
be20aa9d 210 struct extent_buffer *cow;
be20aa9d
CM
211 int ret = 0;
212 int level;
5d4f98a2 213 struct btrfs_disk_key disk_key;
be20aa9d 214
92a7cc42 215 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
0b246afa 216 trans->transid != fs_info->running_transaction->transid);
92a7cc42 217 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
27cdeb70 218 trans->transid != root->last_trans);
be20aa9d
CM
219
220 level = btrfs_header_level(buf);
5d4f98a2
YZ
221 if (level == 0)
222 btrfs_item_key(buf, &disk_key, 0);
223 else
224 btrfs_node_key(buf, &disk_key, 0);
31840ae1 225
4d75f8a9 226 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
cf6f34aa
JB
227 &disk_key, level, buf->start, 0,
228 BTRFS_NESTING_NEW_ROOT);
5d4f98a2 229 if (IS_ERR(cow))
be20aa9d
CM
230 return PTR_ERR(cow);
231
58e8012c 232 copy_extent_buffer_full(cow, buf);
be20aa9d
CM
233 btrfs_set_header_bytenr(cow, cow->start);
234 btrfs_set_header_generation(cow, trans->transid);
5d4f98a2
YZ
235 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
236 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
237 BTRFS_HEADER_FLAG_RELOC);
238 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
239 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
240 else
241 btrfs_set_header_owner(cow, new_root_objectid);
be20aa9d 242
de37aa51 243 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
2b82032c 244
be20aa9d 245 WARN_ON(btrfs_header_generation(buf) > trans->transid);
5d4f98a2 246 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 247 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 248 else
e339a6b0 249 ret = btrfs_inc_ref(trans, root, cow, 0);
867ed321 250 if (ret) {
72c9925f
FM
251 btrfs_tree_unlock(cow);
252 free_extent_buffer(cow);
867ed321 253 btrfs_abort_transaction(trans, ret);
be20aa9d 254 return ret;
867ed321 255 }
be20aa9d
CM
256
257 btrfs_mark_buffer_dirty(cow);
258 *cow_ret = cow;
259 return 0;
260}
261
5d4f98a2
YZ
262/*
263 * check if the tree block can be shared by multiple trees
264 */
265int btrfs_block_can_be_shared(struct btrfs_root *root,
266 struct extent_buffer *buf)
267{
268 /*
92a7cc42
QW
269 * Tree blocks not in shareable trees and tree roots are never shared.
270 * If a block was allocated after the last snapshot and the block was
271 * not allocated by tree relocation, we know the block is not shared.
5d4f98a2 272 */
92a7cc42 273 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
5d4f98a2
YZ
274 buf != root->node && buf != root->commit_root &&
275 (btrfs_header_generation(buf) <=
276 btrfs_root_last_snapshot(&root->root_item) ||
277 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
278 return 1;
a79865c6 279
5d4f98a2
YZ
280 return 0;
281}
282
283static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
284 struct btrfs_root *root,
285 struct extent_buffer *buf,
f0486c68
YZ
286 struct extent_buffer *cow,
287 int *last_ref)
5d4f98a2 288{
0b246afa 289 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
290 u64 refs;
291 u64 owner;
292 u64 flags;
293 u64 new_flags = 0;
294 int ret;
295
296 /*
297 * Backrefs update rules:
298 *
299 * Always use full backrefs for extent pointers in tree block
300 * allocated by tree relocation.
301 *
302 * If a shared tree block is no longer referenced by its owner
303 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
304 * use full backrefs for extent pointers in tree block.
305 *
306 * If a tree block is being relocated
307 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
308 * use full backrefs for extent pointers in tree block.
309 * The reason for this is that some operations (such as drop tree)
310 * are only allowed for blocks that use full backrefs.
311 */
312
313 if (btrfs_block_can_be_shared(root, buf)) {
2ff7e61e 314 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
3173a18f
JB
315 btrfs_header_level(buf), 1,
316 &refs, &flags);
be1a5564
MF
317 if (ret)
318 return ret;
e5df9573
MF
319 if (refs == 0) {
320 ret = -EROFS;
0b246afa 321 btrfs_handle_fs_error(fs_info, ret, NULL);
e5df9573
MF
322 return ret;
323 }
5d4f98a2
YZ
324 } else {
325 refs = 1;
326 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
327 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
328 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
329 else
330 flags = 0;
331 }
332
333 owner = btrfs_header_owner(buf);
334 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
335 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
336
337 if (refs > 1) {
338 if ((owner == root->root_key.objectid ||
339 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
340 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
e339a6b0 341 ret = btrfs_inc_ref(trans, root, buf, 1);
692826b2
JM
342 if (ret)
343 return ret;
5d4f98a2
YZ
344
345 if (root->root_key.objectid ==
346 BTRFS_TREE_RELOC_OBJECTID) {
e339a6b0 347 ret = btrfs_dec_ref(trans, root, buf, 0);
692826b2
JM
348 if (ret)
349 return ret;
e339a6b0 350 ret = btrfs_inc_ref(trans, root, cow, 1);
692826b2
JM
351 if (ret)
352 return ret;
5d4f98a2
YZ
353 }
354 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
355 } else {
356
357 if (root->root_key.objectid ==
358 BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 359 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 360 else
e339a6b0 361 ret = btrfs_inc_ref(trans, root, cow, 0);
692826b2
JM
362 if (ret)
363 return ret;
5d4f98a2
YZ
364 }
365 if (new_flags != 0) {
b1c79e09
JB
366 int level = btrfs_header_level(buf);
367
42c9d0b5 368 ret = btrfs_set_disk_extent_flags(trans, buf,
2fe6a5a1 369 new_flags, level);
be1a5564
MF
370 if (ret)
371 return ret;
5d4f98a2
YZ
372 }
373 } else {
374 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
375 if (root->root_key.objectid ==
376 BTRFS_TREE_RELOC_OBJECTID)
e339a6b0 377 ret = btrfs_inc_ref(trans, root, cow, 1);
5d4f98a2 378 else
e339a6b0 379 ret = btrfs_inc_ref(trans, root, cow, 0);
692826b2
JM
380 if (ret)
381 return ret;
e339a6b0 382 ret = btrfs_dec_ref(trans, root, buf, 1);
692826b2
JM
383 if (ret)
384 return ret;
5d4f98a2 385 }
6a884d7d 386 btrfs_clean_tree_block(buf);
f0486c68 387 *last_ref = 1;
5d4f98a2
YZ
388 }
389 return 0;
390}
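
/*
 * Illustrative sketch, not part of ctree.c: the "Backrefs update rules" from
 * the comment above, condensed into a standalone predicate. The demo_* names
 * and flags are hypothetical; the real decision in update_ref_for_cow() also
 * updates reference counts and extent flags as it goes.
 */
#include <stdbool.h>

struct demo_block_state {
	bool allocated_by_relocation;   /* block came from tree relocation */
	bool referenced_by_owner_tree;  /* owner tree still points at it */
	bool root_is_reloc_tree;        /* we are relocating this tree */
};

static bool demo_use_full_backrefs(const struct demo_block_state *s)
{
	if (s->allocated_by_relocation)
		return true;
	if (!s->referenced_by_owner_tree)
		return true;
	if (s->root_is_reloc_tree)
		return true;
	return false;
}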
391
392/*
393 * does the dirty work in COW of a single block. The parent block (if
394 * supplied) is updated to point to the new COW copy. The new buffer is marked
395 * dirty and returned locked. If you modify the block it needs to be marked
396 * dirty again.
397 *
398 * search_start -- an allocation hint for the new block
399 *
400 * empty_size -- a hint that you plan on doing more COW. This is the size in
401 * bytes of free space the allocator should try to find next to the block it
402 * returns. This is just a hint and may be ignored by the allocator.
403 */
d397712b 404static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
5f39d397
CM
405 struct btrfs_root *root,
406 struct extent_buffer *buf,
407 struct extent_buffer *parent, int parent_slot,
408 struct extent_buffer **cow_ret,
9631e4cc
JB
409 u64 search_start, u64 empty_size,
410 enum btrfs_lock_nesting nest)
02217ed2 411{
0b246afa 412 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2 413 struct btrfs_disk_key disk_key;
5f39d397 414 struct extent_buffer *cow;
be1a5564 415 int level, ret;
f0486c68 416 int last_ref = 0;
925baedd 417 int unlock_orig = 0;
0f5053eb 418 u64 parent_start = 0;
7bb86316 419
925baedd
CM
420 if (*cow_ret == buf)
421 unlock_orig = 1;
422
49d0c642 423 btrfs_assert_tree_write_locked(buf);
925baedd 424
92a7cc42 425 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
0b246afa 426 trans->transid != fs_info->running_transaction->transid);
92a7cc42 427 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
27cdeb70 428 trans->transid != root->last_trans);
5f39d397 429
7bb86316 430 level = btrfs_header_level(buf);
31840ae1 431
5d4f98a2
YZ
432 if (level == 0)
433 btrfs_item_key(buf, &disk_key, 0);
434 else
435 btrfs_node_key(buf, &disk_key, 0);
436
0f5053eb
GR
437 if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
438 parent_start = parent->start;
5d4f98a2 439
79bd3712
FM
440 cow = btrfs_alloc_tree_block(trans, root, parent_start,
441 root->root_key.objectid, &disk_key, level,
442 search_start, empty_size, nest);
54aa1f4d
CM
443 if (IS_ERR(cow))
444 return PTR_ERR(cow);
6702ed49 445
b4ce94de
CM
446 /* cow is set to blocking by btrfs_init_new_buffer */
447
58e8012c 448 copy_extent_buffer_full(cow, buf);
db94535d 449 btrfs_set_header_bytenr(cow, cow->start);
5f39d397 450 btrfs_set_header_generation(cow, trans->transid);
5d4f98a2
YZ
451 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
452 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
453 BTRFS_HEADER_FLAG_RELOC);
454 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
455 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
456 else
457 btrfs_set_header_owner(cow, root->root_key.objectid);
6702ed49 458
de37aa51 459 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
2b82032c 460
be1a5564 461 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
b68dc2a9 462 if (ret) {
572c83ac
JB
463 btrfs_tree_unlock(cow);
464 free_extent_buffer(cow);
66642832 465 btrfs_abort_transaction(trans, ret);
b68dc2a9
MF
466 return ret;
467 }
1a40e23b 468
92a7cc42 469 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
83d4cfd4 470 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
93314e3b 471 if (ret) {
572c83ac
JB
472 btrfs_tree_unlock(cow);
473 free_extent_buffer(cow);
66642832 474 btrfs_abort_transaction(trans, ret);
83d4cfd4 475 return ret;
93314e3b 476 }
83d4cfd4 477 }
3fd0a558 478
02217ed2 479 if (buf == root->node) {
925baedd 480 WARN_ON(parent && parent != buf);
5d4f98a2
YZ
481 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
482 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
483 parent_start = buf->start;
925baedd 484
67439dad 485 atomic_inc(&cow->refs);
406808ab 486 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
d9d19a01 487 BUG_ON(ret < 0);
240f62c8 488 rcu_assign_pointer(root->node, cow);
925baedd 489
7a163608
FM
490 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
491 parent_start, last_ref);
5f39d397 492 free_extent_buffer(buf);
0b86a832 493 add_root_to_dirty_list(root);
02217ed2 494 } else {
5d4f98a2 495 WARN_ON(trans->transid != btrfs_header_generation(parent));
f3a84ccd 496 btrfs_tree_mod_log_insert_key(parent, parent_slot,
33cff222 497 BTRFS_MOD_LOG_KEY_REPLACE);
5f39d397 498 btrfs_set_node_blockptr(parent, parent_slot,
db94535d 499 cow->start);
74493f7a
CM
500 btrfs_set_node_ptr_generation(parent, parent_slot,
501 trans->transid);
d6025579 502 btrfs_mark_buffer_dirty(parent);
5de865ee 503 if (last_ref) {
f3a84ccd 504 ret = btrfs_tree_mod_log_free_eb(buf);
5de865ee 505 if (ret) {
572c83ac
JB
506 btrfs_tree_unlock(cow);
507 free_extent_buffer(cow);
66642832 508 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
509 return ret;
510 }
511 }
7a163608
FM
512 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
513 parent_start, last_ref);
02217ed2 514 }
925baedd
CM
515 if (unlock_orig)
516 btrfs_tree_unlock(buf);
3083ee2e 517 free_extent_buffer_stale(buf);
ccd467d6 518 btrfs_mark_buffer_dirty(cow);
2c90e5d6 519 *cow_ret = cow;
02217ed2
CM
520 return 0;
521}
522
5d4f98a2
YZ
523static inline int should_cow_block(struct btrfs_trans_handle *trans,
524 struct btrfs_root *root,
525 struct extent_buffer *buf)
526{
f5ee5c9a 527 if (btrfs_is_testing(root->fs_info))
faa2dbf0 528 return 0;
fccb84c9 529
d1980131
DS
530 /* Ensure we can see the FORCE_COW bit */
531 smp_mb__before_atomic();
f1ebcc74
LB
532
533 /*
534 * We do not need to COW a block if
535 * 1) this block is not created or changed in this transaction;
536 * 2) this block does not belong to the TREE_RELOC tree;
537 * 3) the root is not forced COW.
538 *
539 * What is forced COW:
540 * when we create a snapshot while committing the transaction,
541 * after we've finished copying the src root, we must COW the shared
542 * block to ensure metadata consistency.
543 */
5d4f98a2
YZ
544 if (btrfs_header_generation(buf) == trans->transid &&
545 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
546 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
f1ebcc74 547 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
27cdeb70 548 !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
5d4f98a2
YZ
549 return 0;
550 return 1;
551}
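
/*
 * Illustrative sketch, not part of ctree.c: the three conditions from the
 * comment in should_cow_block() above, expressed as a standalone predicate
 * over hypothetical demo_* fields. In the kernel the same checks are made on
 * the header generation, the header flags and the root state bits.
 */
#include <stdbool.h>
#include <stdint.h>

struct demo_cow_state {
	uint64_t block_generation;  /* generation the block was written in */
	uint64_t transid;           /* current transaction id */
	bool block_written;         /* already written out in this transaction */
	bool block_from_relocation; /* block carries the RELOC flag */
	bool root_is_reloc_tree;
	bool root_forces_cow;
};

static bool demo_should_cow_block(const struct demo_cow_state *s)
{
	if (s->block_generation == s->transid &&
	    !s->block_written &&
	    !(!s->root_is_reloc_tree && s->block_from_relocation) &&
	    !s->root_forces_cow)
		return false;   /* block is already private to this transaction */
	return true;
}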
552
d352ac68
CM
553/*
554 * COWs a single block, see __btrfs_cow_block() for the real work.
555 * This version of it has extra checks so that a block isn't COWed more than
556 * once per transaction, as long as it hasn't been written yet.
557 */
d397712b 558noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
5f39d397
CM
559 struct btrfs_root *root, struct extent_buffer *buf,
560 struct extent_buffer *parent, int parent_slot,
9631e4cc
JB
561 struct extent_buffer **cow_ret,
562 enum btrfs_lock_nesting nest)
6702ed49 563{
0b246afa 564 struct btrfs_fs_info *fs_info = root->fs_info;
6702ed49 565 u64 search_start;
f510cfec 566 int ret;
dc17ff8f 567
83354f07
JB
568 if (test_bit(BTRFS_ROOT_DELETING, &root->state))
569 btrfs_err(fs_info,
570 "COW'ing blocks on a fs root that's being dropped");
571
0b246afa 572 if (trans->transaction != fs_info->running_transaction)
31b1a2bd 573 WARN(1, KERN_CRIT "trans %llu running %llu\n",
c1c9ff7c 574 trans->transid,
0b246afa 575 fs_info->running_transaction->transid);
31b1a2bd 576
0b246afa 577 if (trans->transid != fs_info->generation)
31b1a2bd 578 WARN(1, KERN_CRIT "trans %llu running %llu\n",
0b246afa 579 trans->transid, fs_info->generation);
dc17ff8f 580
5d4f98a2 581 if (!should_cow_block(trans, root, buf)) {
6702ed49
CM
582 *cow_ret = buf;
583 return 0;
584 }
c487685d 585
ee22184b 586 search_start = buf->start & ~((u64)SZ_1G - 1);
b4ce94de 587
f616f5cd
QW
588 /*
589 * Before CoWing this block for later modification, check if it's
590 * the subtree root and do the delayed subtree trace if needed.
591 *
592 * Also, we don't care about the error, as it's handled internally.
593 */
594 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
f510cfec 595 ret = __btrfs_cow_block(trans, root, buf, parent,
9631e4cc 596 parent_slot, cow_ret, search_start, 0, nest);
1abe9b8a 597
598 trace_btrfs_cow_block(root, buf, *cow_ret);
599
f510cfec 600 return ret;
6702ed49 601}
f75e2b79 602ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
6702ed49 603
d352ac68
CM
604/*
605 * helper function for defrag to decide if two blocks pointed to by a
606 * node are actually close by
607 */
6b80053d 608static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
6702ed49 609{
6b80053d 610 if (blocknr < other && other - (blocknr + blocksize) < 32768)
6702ed49 611 return 1;
6b80053d 612 if (blocknr > other && blocknr - (other + blocksize) < 32768)
6702ed49
CM
613 return 1;
614 return 0;
615}
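
/*
 * Illustrative sketch, not part of ctree.c: close_blocks() above simply treats
 * two blocks as "close" when the gap between them is under 32KiB in either
 * direction. A couple of standalone calls show the intent; the values and a
 * 16KiB blocksize are made up for the demo.
 */
#include <assert.h>
#include <stdint.h>

static int demo_close_blocks(uint64_t blocknr, uint64_t other, uint32_t blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

int main(void)
{
	/* 16KiB past the end of the first block: close enough for defrag. */
	assert(demo_close_blocks(0, 16384 + 16384, 16384) == 1);
	/* 1MiB away: not close, defrag would want to relocate it. */
	assert(demo_close_blocks(0, 1024 * 1024, 16384) == 0);
	return 0;
}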
616
ce6ef5ab
DS
617#ifdef __LITTLE_ENDIAN
618
619/*
620 * Compare two keys, on little-endian the disk order is same as CPU order and
621 * we can avoid the conversion.
622 */
623static int comp_keys(const struct btrfs_disk_key *disk_key,
624 const struct btrfs_key *k2)
625{
626 const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
627
628 return btrfs_comp_cpu_keys(k1, k2);
629}
630
631#else
632
081e9573
CM
633/*
634 * compare two keys in a memcmp fashion
635 */
310712b2
OS
636static int comp_keys(const struct btrfs_disk_key *disk,
637 const struct btrfs_key *k2)
081e9573
CM
638{
639 struct btrfs_key k1;
640
641 btrfs_disk_key_to_cpu(&k1, disk);
642
20736aba 643 return btrfs_comp_cpu_keys(&k1, k2);
081e9573 644}
ce6ef5ab 645#endif
081e9573 646
f3465ca4
JB
647/*
648 * same as comp_keys only with two btrfs_key's
649 */
e1f60a65 650int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
f3465ca4
JB
651{
652 if (k1->objectid > k2->objectid)
653 return 1;
654 if (k1->objectid < k2->objectid)
655 return -1;
656 if (k1->type > k2->type)
657 return 1;
658 if (k1->type < k2->type)
659 return -1;
660 if (k1->offset > k2->offset)
661 return 1;
662 if (k1->offset < k2->offset)
663 return -1;
664 return 0;
665}
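
/*
 * Illustrative sketch, not part of ctree.c: the key ordering implemented by
 * btrfs_comp_cpu_keys() above is simply (objectid, type, offset) compared
 * lexicographically. A standalone copy with hypothetical demo_* names makes
 * the precedence explicit.
 */
#include <assert.h>
#include <stdint.h>

struct demo_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

static int demo_comp_keys(const struct demo_key *k1, const struct demo_key *k2)
{
	if (k1->objectid != k2->objectid)
		return k1->objectid > k2->objectid ? 1 : -1;
	if (k1->type != k2->type)
		return k1->type > k2->type ? 1 : -1;
	if (k1->offset != k2->offset)
		return k1->offset > k2->offset ? 1 : -1;
	return 0;
}

int main(void)
{
	struct demo_key a = { .objectid = 256, .type = 1, .offset = 4096 };
	struct demo_key b = { .objectid = 257, .type = 0, .offset = 0 };

	/* objectid is compared first, so a < b despite a's larger type/offset. */
	assert(demo_comp_keys(&a, &b) < 0);
	return 0;
}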
081e9573 666
d352ac68
CM
667/*
668 * this is used by the defrag code to go through all the
669 * leaves pointed to by a node and reallocate them so that
670 * disk order is close to key order
671 */
6702ed49 672int btrfs_realloc_node(struct btrfs_trans_handle *trans,
5f39d397 673 struct btrfs_root *root, struct extent_buffer *parent,
de78b51a 674 int start_slot, u64 *last_ret,
a6b6e75e 675 struct btrfs_key *progress)
6702ed49 676{
0b246afa 677 struct btrfs_fs_info *fs_info = root->fs_info;
6b80053d 678 struct extent_buffer *cur;
6702ed49 679 u64 blocknr;
e9d0b13b
CM
680 u64 search_start = *last_ret;
681 u64 last_block = 0;
6702ed49
CM
682 u64 other;
683 u32 parent_nritems;
6702ed49
CM
684 int end_slot;
685 int i;
686 int err = 0;
6b80053d 687 u32 blocksize;
081e9573
CM
688 int progress_passed = 0;
689 struct btrfs_disk_key disk_key;
6702ed49 690
0b246afa
JM
691 WARN_ON(trans->transaction != fs_info->running_transaction);
692 WARN_ON(trans->transid != fs_info->generation);
86479a04 693
6b80053d 694 parent_nritems = btrfs_header_nritems(parent);
0b246afa 695 blocksize = fs_info->nodesize;
5dfe2be7 696 end_slot = parent_nritems - 1;
6702ed49 697
5dfe2be7 698 if (parent_nritems <= 1)
6702ed49
CM
699 return 0;
700
5dfe2be7 701 for (i = start_slot; i <= end_slot; i++) {
6702ed49 702 int close = 1;
a6b6e75e 703
081e9573
CM
704 btrfs_node_key(parent, &disk_key, i);
705 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
706 continue;
707
708 progress_passed = 1;
6b80053d 709 blocknr = btrfs_node_blockptr(parent, i);
e9d0b13b
CM
710 if (last_block == 0)
711 last_block = blocknr;
5708b959 712
6702ed49 713 if (i > 0) {
6b80053d
CM
714 other = btrfs_node_blockptr(parent, i - 1);
715 close = close_blocks(blocknr, other, blocksize);
6702ed49 716 }
5dfe2be7 717 if (!close && i < end_slot) {
6b80053d
CM
718 other = btrfs_node_blockptr(parent, i + 1);
719 close = close_blocks(blocknr, other, blocksize);
6702ed49 720 }
e9d0b13b
CM
721 if (close) {
722 last_block = blocknr;
6702ed49 723 continue;
e9d0b13b 724 }
6702ed49 725
206983b7
JB
726 cur = btrfs_read_node_slot(parent, i);
727 if (IS_ERR(cur))
728 return PTR_ERR(cur);
e9d0b13b 729 if (search_start == 0)
6b80053d 730 search_start = last_block;
e9d0b13b 731
e7a84565 732 btrfs_tree_lock(cur);
6b80053d 733 err = __btrfs_cow_block(trans, root, cur, parent, i,
e7a84565 734 &cur, search_start,
6b80053d 735 min(16 * blocksize,
9631e4cc
JB
736 (end_slot - i) * blocksize),
737 BTRFS_NESTING_COW);
252c38f0 738 if (err) {
e7a84565 739 btrfs_tree_unlock(cur);
6b80053d 740 free_extent_buffer(cur);
6702ed49 741 break;
252c38f0 742 }
e7a84565
CM
743 search_start = cur->start;
744 last_block = cur->start;
f2183bde 745 *last_ret = search_start;
e7a84565
CM
746 btrfs_tree_unlock(cur);
747 free_extent_buffer(cur);
6702ed49
CM
748 }
749 return err;
750}
751
752/*
753 * Search for a key in the given extent_buffer.
754 *
755 * The lower boundary for the search is specified by the slot number @low. Use a
756 * value of 0 to search over the whole extent buffer.
757 *
758 * The slot in the extent buffer is returned via @slot. If the key exists in the
759 * extent buffer, then @slot will point to the slot where the key is, otherwise
760 * it points to the slot where you would insert the key.
761 *
762 * Slot may point to the total number of items (i.e. one position beyond the last
763 * key) if the key is bigger than the last key in the extent buffer.
764 */
fb81212c 765static noinline int generic_bin_search(struct extent_buffer *eb, int low,
67d5e289 766 const struct btrfs_key *key, int *slot)
be0e5c09 767{
fb81212c
FM
768 unsigned long p;
769 int item_size;
67d5e289 770 int high = btrfs_header_nritems(eb);
be0e5c09 771 int ret;
5cd17f34 772 const int key_size = sizeof(struct btrfs_disk_key);
be0e5c09 773
5e24e9af
LB
774 if (low > high) {
775 btrfs_err(eb->fs_info,
776 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
777 __func__, low, high, eb->start,
778 btrfs_header_owner(eb), btrfs_header_level(eb));
779 return -EINVAL;
780 }
781
fb81212c
FM
782 if (btrfs_header_level(eb) == 0) {
783 p = offsetof(struct btrfs_leaf, items);
784 item_size = sizeof(struct btrfs_item);
785 } else {
786 p = offsetof(struct btrfs_node, ptrs);
787 item_size = sizeof(struct btrfs_key_ptr);
788 }
789
d397712b 790 while (low < high) {
5cd17f34
DS
791 unsigned long oip;
792 unsigned long offset;
793 struct btrfs_disk_key *tmp;
794 struct btrfs_disk_key unaligned;
795 int mid;
796
be0e5c09 797 mid = (low + high) / 2;
5f39d397 798 offset = p + mid * item_size;
5cd17f34 799 oip = offset_in_page(offset);
5f39d397 800
5cd17f34 801 if (oip + key_size <= PAGE_SIZE) {
884b07d0 802 const unsigned long idx = get_eb_page_index(offset);
5cd17f34 803 char *kaddr = page_address(eb->pages[idx]);
5f39d397 804
884b07d0 805 oip = get_eb_offset_in_page(eb, offset);
5cd17f34 806 tmp = (struct btrfs_disk_key *)(kaddr + oip);
5f39d397 807 } else {
5cd17f34
DS
808 read_extent_buffer(eb, &unaligned, offset, key_size);
809 tmp = &unaligned;
5f39d397 810 }
5cd17f34 811
be0e5c09
CM
812 ret = comp_keys(tmp, key);
813
814 if (ret < 0)
815 low = mid + 1;
816 else if (ret > 0)
817 high = mid;
818 else {
819 *slot = mid;
820 return 0;
821 }
822 }
823 *slot = low;
824 return 1;
825}
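
/*
 * Illustrative sketch, not part of ctree.c: the return convention of
 * generic_bin_search() above, shown on a plain sorted array. Return 0 with the
 * matching slot on an exact hit, or 1 with the slot where the key would be
 * inserted (possibly one past the last element) when it is missing. All names
 * here are hypothetical.
 */
#include <assert.h>
#include <stddef.h>

static int demo_bin_search(const int *keys, size_t nritems, int key, size_t *slot)
{
	size_t low = 0;
	size_t high = nritems;

	while (low < high) {
		size_t mid = (low + high) / 2;

		if (keys[mid] < key)
			low = mid + 1;
		else if (keys[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

int main(void)
{
	const int keys[] = { 10, 20, 30 };
	size_t slot;

	assert(demo_bin_search(keys, 3, 20, &slot) == 0 && slot == 1);
	assert(demo_bin_search(keys, 3, 25, &slot) == 1 && slot == 2);
	assert(demo_bin_search(keys, 3, 40, &slot) == 1 && slot == 3);
	return 0;
}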
826
97571fd0 827/*
fb81212c
FM
828 * Simple binary search on an extent buffer. Works for both leaves and nodes, and
829 * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
97571fd0 830 */
a74b35ec 831int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
e3b83361 832 int *slot)
be0e5c09 833{
fb81212c 834 return generic_bin_search(eb, 0, key, slot);
be0e5c09
CM
835}
836
f0486c68
YZ
837static void root_add_used(struct btrfs_root *root, u32 size)
838{
839 spin_lock(&root->accounting_lock);
840 btrfs_set_root_used(&root->root_item,
841 btrfs_root_used(&root->root_item) + size);
842 spin_unlock(&root->accounting_lock);
843}
844
845static void root_sub_used(struct btrfs_root *root, u32 size)
846{
847 spin_lock(&root->accounting_lock);
848 btrfs_set_root_used(&root->root_item,
849 btrfs_root_used(&root->root_item) - size);
850 spin_unlock(&root->accounting_lock);
851}
852
d352ac68
CM
853/* given a node and slot number, this reads the block it points to. The
854 * extent buffer is returned with a reference taken (but unlocked).
d352ac68 855 */
4b231ae4
DS
856struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
857 int slot)
bb803951 858{
ca7a79ad 859 int level = btrfs_header_level(parent);
789d6a3a 860 struct btrfs_tree_parent_check check = { 0 };
416bc658
JB
861 struct extent_buffer *eb;
862
fb770ae4
LB
863 if (slot < 0 || slot >= btrfs_header_nritems(parent))
864 return ERR_PTR(-ENOENT);
ca7a79ad
CM
865
866 BUG_ON(level == 0);
867
789d6a3a
QW
868 check.level = level - 1;
869 check.transid = btrfs_node_ptr_generation(parent, slot);
870 check.owner_root = btrfs_header_owner(parent);
871 check.has_first_key = true;
872 btrfs_node_key_to_cpu(parent, &check.first_key, slot);
873
d0d20b0f 874 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
789d6a3a 875 &check);
4eb150d6
QW
876 if (IS_ERR(eb))
877 return eb;
878 if (!extent_buffer_uptodate(eb)) {
fb770ae4 879 free_extent_buffer(eb);
4eb150d6 880 return ERR_PTR(-EIO);
416bc658
JB
881 }
882
883 return eb;
bb803951
CM
884}
885
d352ac68
CM
886/*
887 * node level balancing, used to make sure nodes are in proper order for
888 * item deletion. We balance from the top down, so we have to make sure
889 * that a deletion won't leave a node completely empty later on.
890 */
e02119d5 891static noinline int balance_level(struct btrfs_trans_handle *trans,
98ed5174
CM
892 struct btrfs_root *root,
893 struct btrfs_path *path, int level)
bb803951 894{
0b246afa 895 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
896 struct extent_buffer *right = NULL;
897 struct extent_buffer *mid;
898 struct extent_buffer *left = NULL;
899 struct extent_buffer *parent = NULL;
bb803951
CM
900 int ret = 0;
901 int wret;
902 int pslot;
bb803951 903 int orig_slot = path->slots[level];
79f95c82 904 u64 orig_ptr;
bb803951 905
98e6b1eb 906 ASSERT(level > 0);
bb803951 907
5f39d397 908 mid = path->nodes[level];
b4ce94de 909
ac5887c8 910 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
7bb86316
CM
911 WARN_ON(btrfs_header_generation(mid) != trans->transid);
912
1d4f8a0c 913 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
79f95c82 914
a05a9bb1 915 if (level < BTRFS_MAX_LEVEL - 1) {
5f39d397 916 parent = path->nodes[level + 1];
a05a9bb1
LZ
917 pslot = path->slots[level + 1];
918 }
bb803951 919
40689478
CM
920 /*
921 * deal with the case where there is only one pointer in the root
922 * by promoting the node below to a root
923 */
5f39d397
CM
924 if (!parent) {
925 struct extent_buffer *child;
bb803951 926
5f39d397 927 if (btrfs_header_nritems(mid) != 1)
bb803951
CM
928 return 0;
929
930 /* promote the child to a root */
4b231ae4 931 child = btrfs_read_node_slot(mid, 0);
fb770ae4
LB
932 if (IS_ERR(child)) {
933 ret = PTR_ERR(child);
0b246afa 934 btrfs_handle_fs_error(fs_info, ret, NULL);
305a26af
MF
935 goto enospc;
936 }
937
925baedd 938 btrfs_tree_lock(child);
9631e4cc
JB
939 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
940 BTRFS_NESTING_COW);
f0486c68
YZ
941 if (ret) {
942 btrfs_tree_unlock(child);
943 free_extent_buffer(child);
944 goto enospc;
945 }
2f375ab9 946
406808ab 947 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
d9d19a01 948 BUG_ON(ret < 0);
240f62c8 949 rcu_assign_pointer(root->node, child);
925baedd 950
0b86a832 951 add_root_to_dirty_list(root);
925baedd 952 btrfs_tree_unlock(child);
b4ce94de 953
925baedd 954 path->locks[level] = 0;
bb803951 955 path->nodes[level] = NULL;
6a884d7d 956 btrfs_clean_tree_block(mid);
925baedd 957 btrfs_tree_unlock(mid);
bb803951 958 /* once for the path */
5f39d397 959 free_extent_buffer(mid);
f0486c68
YZ
960
961 root_sub_used(root, mid->len);
7a163608 962 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
bb803951 963 /* once for the root ptr */
3083ee2e 964 free_extent_buffer_stale(mid);
f0486c68 965 return 0;
bb803951 966 }
5f39d397 967 if (btrfs_header_nritems(mid) >
0b246afa 968 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
bb803951
CM
969 return 0;
970
4b231ae4 971 left = btrfs_read_node_slot(parent, pslot - 1);
fb770ae4
LB
972 if (IS_ERR(left))
973 left = NULL;
974
5f39d397 975 if (left) {
bf77467a 976 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
5f39d397 977 wret = btrfs_cow_block(trans, root, left,
9631e4cc 978 parent, pslot - 1, &left,
bf59a5a2 979 BTRFS_NESTING_LEFT_COW);
54aa1f4d
CM
980 if (wret) {
981 ret = wret;
982 goto enospc;
983 }
2cc58cf2 984 }
fb770ae4 985
4b231ae4 986 right = btrfs_read_node_slot(parent, pslot + 1);
fb770ae4
LB
987 if (IS_ERR(right))
988 right = NULL;
989
5f39d397 990 if (right) {
bf77467a 991 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
5f39d397 992 wret = btrfs_cow_block(trans, root, right,
9631e4cc 993 parent, pslot + 1, &right,
bf59a5a2 994 BTRFS_NESTING_RIGHT_COW);
2cc58cf2
CM
995 if (wret) {
996 ret = wret;
997 goto enospc;
998 }
999 }
1000
1001 /* first, try to make some room in the middle buffer */
5f39d397
CM
1002 if (left) {
1003 orig_slot += btrfs_header_nritems(left);
d30a668f 1004 wret = push_node_left(trans, left, mid, 1);
79f95c82
CM
1005 if (wret < 0)
1006 ret = wret;
bb803951 1007 }
79f95c82
CM
1008
1009 /*
1010 * then try to empty the right most buffer into the middle
1011 */
5f39d397 1012 if (right) {
d30a668f 1013 wret = push_node_left(trans, mid, right, 1);
54aa1f4d 1014 if (wret < 0 && wret != -ENOSPC)
79f95c82 1015 ret = wret;
5f39d397 1016 if (btrfs_header_nritems(right) == 0) {
6a884d7d 1017 btrfs_clean_tree_block(right);
925baedd 1018 btrfs_tree_unlock(right);
afe5fea7 1019 del_ptr(root, path, level + 1, pslot + 1);
f0486c68 1020 root_sub_used(root, right->len);
7a163608
FM
1021 btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1022 0, 1);
3083ee2e 1023 free_extent_buffer_stale(right);
f0486c68 1024 right = NULL;
bb803951 1025 } else {
5f39d397
CM
1026 struct btrfs_disk_key right_key;
1027 btrfs_node_key(right, &right_key, 0);
f3a84ccd 1028 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
33cff222 1029 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1030 BUG_ON(ret < 0);
5f39d397
CM
1031 btrfs_set_node_key(parent, &right_key, pslot + 1);
1032 btrfs_mark_buffer_dirty(parent);
bb803951
CM
1033 }
1034 }
5f39d397 1035 if (btrfs_header_nritems(mid) == 1) {
79f95c82
CM
1036 /*
1037 * we're not allowed to leave a node with one item in the
1038 * tree during a delete. A deletion from lower in the tree
1039 * could try to delete the only pointer in this node.
1040 * So, pull some keys from the left.
1041 * There has to be a left pointer at this point because
1042 * otherwise we would have pulled some pointers from the
1043 * right
1044 */
305a26af
MF
1045 if (!left) {
1046 ret = -EROFS;
0b246afa 1047 btrfs_handle_fs_error(fs_info, ret, NULL);
305a26af
MF
1048 goto enospc;
1049 }
55d32ed8 1050 wret = balance_node_right(trans, mid, left);
54aa1f4d 1051 if (wret < 0) {
79f95c82 1052 ret = wret;
54aa1f4d
CM
1053 goto enospc;
1054 }
bce4eae9 1055 if (wret == 1) {
d30a668f 1056 wret = push_node_left(trans, left, mid, 1);
bce4eae9
CM
1057 if (wret < 0)
1058 ret = wret;
1059 }
79f95c82
CM
1060 BUG_ON(wret == 1);
1061 }
5f39d397 1062 if (btrfs_header_nritems(mid) == 0) {
6a884d7d 1063 btrfs_clean_tree_block(mid);
925baedd 1064 btrfs_tree_unlock(mid);
afe5fea7 1065 del_ptr(root, path, level + 1, pslot);
f0486c68 1066 root_sub_used(root, mid->len);
7a163608 1067 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
3083ee2e 1068 free_extent_buffer_stale(mid);
f0486c68 1069 mid = NULL;
79f95c82
CM
1070 } else {
1071 /* update the parent key to reflect our changes */
5f39d397
CM
1072 struct btrfs_disk_key mid_key;
1073 btrfs_node_key(mid, &mid_key, 0);
f3a84ccd 1074 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
33cff222 1075 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1076 BUG_ON(ret < 0);
5f39d397
CM
1077 btrfs_set_node_key(parent, &mid_key, pslot);
1078 btrfs_mark_buffer_dirty(parent);
79f95c82 1079 }
bb803951 1080
79f95c82 1081 /* update the path */
5f39d397
CM
1082 if (left) {
1083 if (btrfs_header_nritems(left) > orig_slot) {
67439dad 1084 atomic_inc(&left->refs);
925baedd 1085 /* left was locked after cow */
5f39d397 1086 path->nodes[level] = left;
bb803951
CM
1087 path->slots[level + 1] -= 1;
1088 path->slots[level] = orig_slot;
925baedd
CM
1089 if (mid) {
1090 btrfs_tree_unlock(mid);
5f39d397 1091 free_extent_buffer(mid);
925baedd 1092 }
bb803951 1093 } else {
5f39d397 1094 orig_slot -= btrfs_header_nritems(left);
bb803951
CM
1095 path->slots[level] = orig_slot;
1096 }
1097 }
79f95c82 1098 /* double check we haven't messed things up */
e20d96d6 1099 if (orig_ptr !=
5f39d397 1100 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
79f95c82 1101 BUG();
54aa1f4d 1102enospc:
925baedd
CM
1103 if (right) {
1104 btrfs_tree_unlock(right);
5f39d397 1105 free_extent_buffer(right);
925baedd
CM
1106 }
1107 if (left) {
1108 if (path->nodes[level] != left)
1109 btrfs_tree_unlock(left);
5f39d397 1110 free_extent_buffer(left);
925baedd 1111 }
bb803951
CM
1112 return ret;
1113}
1114
d352ac68
CM
1115/* Node balancing for insertion. Here we only split or push nodes around
1116 * when they are completely full. This is also done top down, so we
1117 * have to be pessimistic.
1118 */
d397712b 1119static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
98ed5174
CM
1120 struct btrfs_root *root,
1121 struct btrfs_path *path, int level)
e66f709b 1122{
0b246afa 1123 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
1124 struct extent_buffer *right = NULL;
1125 struct extent_buffer *mid;
1126 struct extent_buffer *left = NULL;
1127 struct extent_buffer *parent = NULL;
e66f709b
CM
1128 int ret = 0;
1129 int wret;
1130 int pslot;
1131 int orig_slot = path->slots[level];
e66f709b
CM
1132
1133 if (level == 0)
1134 return 1;
1135
5f39d397 1136 mid = path->nodes[level];
7bb86316 1137 WARN_ON(btrfs_header_generation(mid) != trans->transid);
e66f709b 1138
a05a9bb1 1139 if (level < BTRFS_MAX_LEVEL - 1) {
5f39d397 1140 parent = path->nodes[level + 1];
a05a9bb1
LZ
1141 pslot = path->slots[level + 1];
1142 }
e66f709b 1143
5f39d397 1144 if (!parent)
e66f709b 1145 return 1;
e66f709b 1146
4b231ae4 1147 left = btrfs_read_node_slot(parent, pslot - 1);
fb770ae4
LB
1148 if (IS_ERR(left))
1149 left = NULL;
e66f709b
CM
1150
1151 /* first, try to make some room in the middle buffer */
5f39d397 1152 if (left) {
e66f709b 1153 u32 left_nr;
925baedd 1154
bf77467a 1155 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
b4ce94de 1156
5f39d397 1157 left_nr = btrfs_header_nritems(left);
0b246afa 1158 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
33ade1f8
CM
1159 wret = 1;
1160 } else {
5f39d397 1161 ret = btrfs_cow_block(trans, root, left, parent,
9631e4cc 1162 pslot - 1, &left,
bf59a5a2 1163 BTRFS_NESTING_LEFT_COW);
54aa1f4d
CM
1164 if (ret)
1165 wret = 1;
1166 else {
d30a668f 1167 wret = push_node_left(trans, left, mid, 0);
54aa1f4d 1168 }
33ade1f8 1169 }
e66f709b
CM
1170 if (wret < 0)
1171 ret = wret;
1172 if (wret == 0) {
5f39d397 1173 struct btrfs_disk_key disk_key;
e66f709b 1174 orig_slot += left_nr;
5f39d397 1175 btrfs_node_key(mid, &disk_key, 0);
f3a84ccd 1176 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
33cff222 1177 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1178 BUG_ON(ret < 0);
5f39d397
CM
1179 btrfs_set_node_key(parent, &disk_key, pslot);
1180 btrfs_mark_buffer_dirty(parent);
1181 if (btrfs_header_nritems(left) > orig_slot) {
1182 path->nodes[level] = left;
e66f709b
CM
1183 path->slots[level + 1] -= 1;
1184 path->slots[level] = orig_slot;
925baedd 1185 btrfs_tree_unlock(mid);
5f39d397 1186 free_extent_buffer(mid);
e66f709b
CM
1187 } else {
1188 orig_slot -=
5f39d397 1189 btrfs_header_nritems(left);
e66f709b 1190 path->slots[level] = orig_slot;
925baedd 1191 btrfs_tree_unlock(left);
5f39d397 1192 free_extent_buffer(left);
e66f709b 1193 }
e66f709b
CM
1194 return 0;
1195 }
925baedd 1196 btrfs_tree_unlock(left);
5f39d397 1197 free_extent_buffer(left);
e66f709b 1198 }
4b231ae4 1199 right = btrfs_read_node_slot(parent, pslot + 1);
fb770ae4
LB
1200 if (IS_ERR(right))
1201 right = NULL;
e66f709b
CM
1202
1203 /*
1204 * then try to empty the right most buffer into the middle
1205 */
5f39d397 1206 if (right) {
33ade1f8 1207 u32 right_nr;
b4ce94de 1208
bf77467a 1209 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
b4ce94de 1210
5f39d397 1211 right_nr = btrfs_header_nritems(right);
0b246afa 1212 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
33ade1f8
CM
1213 wret = 1;
1214 } else {
5f39d397
CM
1215 ret = btrfs_cow_block(trans, root, right,
1216 parent, pslot + 1,
bf59a5a2 1217 &right, BTRFS_NESTING_RIGHT_COW);
54aa1f4d
CM
1218 if (ret)
1219 wret = 1;
1220 else {
55d32ed8 1221 wret = balance_node_right(trans, right, mid);
54aa1f4d 1222 }
33ade1f8 1223 }
e66f709b
CM
1224 if (wret < 0)
1225 ret = wret;
1226 if (wret == 0) {
5f39d397
CM
1227 struct btrfs_disk_key disk_key;
1228
1229 btrfs_node_key(right, &disk_key, 0);
f3a84ccd 1230 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
33cff222 1231 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 1232 BUG_ON(ret < 0);
5f39d397
CM
1233 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1234 btrfs_mark_buffer_dirty(parent);
1235
1236 if (btrfs_header_nritems(mid) <= orig_slot) {
1237 path->nodes[level] = right;
e66f709b
CM
1238 path->slots[level + 1] += 1;
1239 path->slots[level] = orig_slot -
5f39d397 1240 btrfs_header_nritems(mid);
925baedd 1241 btrfs_tree_unlock(mid);
5f39d397 1242 free_extent_buffer(mid);
e66f709b 1243 } else {
925baedd 1244 btrfs_tree_unlock(right);
5f39d397 1245 free_extent_buffer(right);
e66f709b 1246 }
e66f709b
CM
1247 return 0;
1248 }
925baedd 1249 btrfs_tree_unlock(right);
5f39d397 1250 free_extent_buffer(right);
e66f709b 1251 }
e66f709b
CM
1252 return 1;
1253}
1254
3c69faec 1255/*
d352ac68
CM
1256 * readahead one full node of leaves, finding things that are close
1257 * to the block in 'slot', and triggering ra on them.
3c69faec 1258 */
2ff7e61e 1259static void reada_for_search(struct btrfs_fs_info *fs_info,
c8c42864
CM
1260 struct btrfs_path *path,
1261 int level, int slot, u64 objectid)
3c69faec 1262{
5f39d397 1263 struct extent_buffer *node;
01f46658 1264 struct btrfs_disk_key disk_key;
3c69faec 1265 u32 nritems;
3c69faec 1266 u64 search;
a7175319 1267 u64 target;
6b80053d 1268 u64 nread = 0;
ace75066 1269 u64 nread_max;
6b80053d
CM
1270 u32 nr;
1271 u32 blocksize;
1272 u32 nscan = 0;
db94535d 1273
ace75066 1274 if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
6702ed49
CM
1275 return;
1276
1277 if (!path->nodes[level])
3c69faec
CM
1278 return;
1279
5f39d397 1280 node = path->nodes[level];
925baedd 1281
ace75066
FM
1282 /*
1283 * Since the time between visiting leaves is much shorter than the time
1284 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1285 * much IO at once (possibly random).
1286 */
1287 if (path->reada == READA_FORWARD_ALWAYS) {
1288 if (level > 1)
1289 nread_max = node->fs_info->nodesize;
1290 else
1291 nread_max = SZ_128K;
1292 } else {
1293 nread_max = SZ_64K;
1294 }
1295
3c69faec 1296 search = btrfs_node_blockptr(node, slot);
0b246afa 1297 blocksize = fs_info->nodesize;
069a2e37
FM
1298 if (path->reada != READA_FORWARD_ALWAYS) {
1299 struct extent_buffer *eb;
1300
1301 eb = find_extent_buffer(fs_info, search);
1302 if (eb) {
1303 free_extent_buffer(eb);
1304 return;
1305 }
3c69faec
CM
1306 }
1307
a7175319 1308 target = search;
6b80053d 1309
5f39d397 1310 nritems = btrfs_header_nritems(node);
6b80053d 1311 nr = slot;
25b8b936 1312
d397712b 1313 while (1) {
e4058b54 1314 if (path->reada == READA_BACK) {
6b80053d
CM
1315 if (nr == 0)
1316 break;
1317 nr--;
ace75066
FM
1318 } else if (path->reada == READA_FORWARD ||
1319 path->reada == READA_FORWARD_ALWAYS) {
6b80053d
CM
1320 nr++;
1321 if (nr >= nritems)
1322 break;
3c69faec 1323 }
e4058b54 1324 if (path->reada == READA_BACK && objectid) {
01f46658
CM
1325 btrfs_node_key(node, &disk_key, nr);
1326 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1327 break;
1328 }
6b80053d 1329 search = btrfs_node_blockptr(node, nr);
ace75066
FM
1330 if (path->reada == READA_FORWARD_ALWAYS ||
1331 (search <= target && target - search <= 65536) ||
a7175319 1332 (search > target && search - target <= 65536)) {
bfb484d9 1333 btrfs_readahead_node_child(node, nr);
6b80053d
CM
1334 nread += blocksize;
1335 }
1336 nscan++;
ace75066 1337 if (nread > nread_max || nscan > 32)
6b80053d 1338 break;
3c69faec
CM
1339 }
1340}
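
/*
 * Illustrative sketch, not part of ctree.c: the heuristic in reada_for_search()
 * above. Starting from a target block, neighbouring pointers are read ahead
 * only while they stay within 64KiB of the target on disk, and the scan stops
 * once a byte budget or a slot-count budget is exhausted. All names and values
 * here are made up for the demo.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
	const uint64_t target = 1024 * 1024;   /* byte offset of the hot block */
	const uint32_t blocksize = 16384;
	const uint64_t window = 65536;         /* same 64KiB window as above */
	uint64_t candidates[] = { 1024 * 1024 + 16384,  /* adjacent: read ahead */
				  1024 * 1024 + 49152,  /* still in the window  */
				  8 * 1024 * 1024 };    /* far away: skipped    */
	uint64_t nread = 0;
	uint32_t nscan = 0;

	for (size_t i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
		uint64_t search = candidates[i];
		uint64_t dist = search > target ? search - target : target - search;

		nscan++;
		if (dist <= window) {
			printf("readahead block at %llu\n", (unsigned long long)search);
			nread += blocksize;
		}
		if (nread > 65536 || nscan > 32)   /* budget exhausted, stop */
			break;
	}
	return 0;
}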
925baedd 1341
bfb484d9 1342static noinline void reada_for_balance(struct btrfs_path *path, int level)
b4ce94de 1343{
bfb484d9 1344 struct extent_buffer *parent;
b4ce94de
CM
1345 int slot;
1346 int nritems;
b4ce94de 1347
8c594ea8 1348 parent = path->nodes[level + 1];
b4ce94de 1349 if (!parent)
0b08851f 1350 return;
b4ce94de
CM
1351
1352 nritems = btrfs_header_nritems(parent);
8c594ea8 1353 slot = path->slots[level + 1];
b4ce94de 1354
bfb484d9
JB
1355 if (slot > 0)
1356 btrfs_readahead_node_child(parent, slot - 1);
1357 if (slot + 1 < nritems)
1358 btrfs_readahead_node_child(parent, slot + 1);
b4ce94de
CM
1359}
1360
1361
1362/*
1363 * when we walk down the tree, it is usually safe to unlock the higher layers
1364 * in the tree. The exceptions are when our path goes through slot 0, because
1365 * operations on the tree might require changing key pointers higher up in the
1366 * tree.
1367 *
1368 * callers might also have set path->keep_locks, which tells this code to keep
1369 * the lock if the path points to the last slot in the block. This is part of
1370 * walking through the tree, and selecting the next slot in the higher block.
1371 *
1372 * lowest_unlock sets the lowest level in the tree we're allowed to unlock, so
1373 * if lowest_unlock is 1, level 0 won't be unlocked.
1374 */
e02119d5 1375static noinline void unlock_up(struct btrfs_path *path, int level,
f7c79f30
CM
1376 int lowest_unlock, int min_write_lock_level,
1377 int *write_lock_level)
925baedd
CM
1378{
1379 int i;
1380 int skip_level = level;
c1227996 1381 bool check_skip = true;
925baedd
CM
1382
1383 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1384 if (!path->nodes[i])
1385 break;
1386 if (!path->locks[i])
1387 break;
c1227996
NB
1388
1389 if (check_skip) {
1390 if (path->slots[i] == 0) {
925baedd
CM
1391 skip_level = i + 1;
1392 continue;
1393 }
c1227996
NB
1394
1395 if (path->keep_locks) {
1396 u32 nritems;
1397
1398 nritems = btrfs_header_nritems(path->nodes[i]);
1399 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1400 skip_level = i + 1;
1401 continue;
1402 }
1403 }
925baedd 1404 }
051e1b9f 1405
d80bb3f9 1406 if (i >= lowest_unlock && i > skip_level) {
c1227996
NB
1407 check_skip = false;
1408 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
925baedd 1409 path->locks[i] = 0;
f7c79f30
CM
1410 if (write_lock_level &&
1411 i > min_write_lock_level &&
1412 i <= *write_lock_level) {
1413 *write_lock_level = i - 1;
1414 }
925baedd
CM
1415 }
1416 }
1417}
1418
1419/*
1420 * Helper function for btrfs_search_slot() and other functions that do a search
1421 * on a btree. The goal is to find a tree block in the cache (the radix tree at
1422 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1423 * its pages from disk.
1424 *
1425 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1426 * whole btree search, starting again from the current root node.
1427 */
1428static int
d07b8528
LB
1429read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1430 struct extent_buffer **eb_ret, int level, int slot,
cda79c54 1431 const struct btrfs_key *key)
c8c42864 1432{
0b246afa 1433 struct btrfs_fs_info *fs_info = root->fs_info;
789d6a3a 1434 struct btrfs_tree_parent_check check = { 0 };
c8c42864
CM
1435 u64 blocknr;
1436 u64 gen;
c8c42864 1437 struct extent_buffer *tmp;
76a05b35 1438 int ret;
581c1760 1439 int parent_level;
b246666e 1440 bool unlock_up;
c8c42864 1441
b246666e 1442 unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
213ff4b7
NB
1443 blocknr = btrfs_node_blockptr(*eb_ret, slot);
1444 gen = btrfs_node_ptr_generation(*eb_ret, slot);
1445 parent_level = btrfs_header_level(*eb_ret);
789d6a3a
QW
1446 btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
1447 check.has_first_key = true;
1448 check.level = parent_level - 1;
1449 check.transid = gen;
1450 check.owner_root = root->root_key.objectid;
c8c42864 1451
b246666e
FM
1452 /*
1453 * If we need to read an extent buffer from disk and we are holding locks
1454 * on upper level nodes, we unlock all the upper nodes before reading the
1455 * extent buffer, and then return -EAGAIN to the caller as it needs to
1456 * restart the search. We don't release the lock on the current level
1457 * because we need to walk this node to figure out which blocks to read.
1458 */
0b246afa 1459 tmp = find_extent_buffer(fs_info, blocknr);
cb44921a 1460 if (tmp) {
ace75066
FM
1461 if (p->reada == READA_FORWARD_ALWAYS)
1462 reada_for_search(fs_info, p, level, slot, key->objectid);
1463
b9fab919 1464 /* first we do an atomic uptodate check */
bdf7c00e 1465 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
448de471
QW
1466 /*
1467 * Do extra check for first_key, eb can be stale due to
1468 * being cached, read from scrub, or have multiple
1469 * parents (shared tree blocks).
1470 */
e064d5e9 1471 if (btrfs_verify_level_key(tmp,
789d6a3a 1472 parent_level - 1, &check.first_key, gen)) {
448de471
QW
1473 free_extent_buffer(tmp);
1474 return -EUCLEAN;
1475 }
bdf7c00e
JB
1476 *eb_ret = tmp;
1477 return 0;
1478 }
1479
857bc13f
JB
1480 if (p->nowait) {
1481 free_extent_buffer(tmp);
1482 return -EAGAIN;
1483 }
1484
b246666e
FM
1485 if (unlock_up)
1486 btrfs_unlock_up_safe(p, level + 1);
1487
bdf7c00e 1488 /* now we're allowed to do a blocking uptodate check */
789d6a3a 1489 ret = btrfs_read_extent_buffer(tmp, &check);
9a4ffa1b
QW
1490 if (ret) {
1491 free_extent_buffer(tmp);
1492 btrfs_release_path(p);
1493 return -EIO;
cb44921a 1494 }
88c602ab
QW
1495 if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1496 free_extent_buffer(tmp);
1497 btrfs_release_path(p);
1498 return -EUCLEAN;
1499 }
b246666e
FM
1500
1501 if (unlock_up)
1502 ret = -EAGAIN;
1503
1504 goto out;
857bc13f
JB
1505 } else if (p->nowait) {
1506 return -EAGAIN;
c8c42864
CM
1507 }
1508
b246666e 1509 if (unlock_up) {
4bb59055
FM
1510 btrfs_unlock_up_safe(p, level + 1);
1511 ret = -EAGAIN;
1512 } else {
1513 ret = 0;
1514 }
8c594ea8 1515
e4058b54 1516 if (p->reada != READA_NONE)
2ff7e61e 1517 reada_for_search(fs_info, p, level, slot, key->objectid);
c8c42864 1518
789d6a3a 1519 tmp = read_tree_block(fs_info, blocknr, &check);
4eb150d6
QW
1520 if (IS_ERR(tmp)) {
1521 btrfs_release_path(p);
1522 return PTR_ERR(tmp);
76a05b35 1523 }
4eb150d6
QW
1524 /*
1525 * If the read above didn't mark this buffer up to date,
1526 * it will never end up being up to date. Set ret to EIO now
1527 * and give up so that our caller doesn't loop forever
1528 * on our EAGAINs.
1529 */
1530 if (!extent_buffer_uptodate(tmp))
1531 ret = -EIO;
02a3307a 1532
b246666e 1533out:
4bb59055
FM
1534 if (ret == 0) {
1535 *eb_ret = tmp;
1536 } else {
1537 free_extent_buffer(tmp);
1538 btrfs_release_path(p);
1539 }
1540
76a05b35 1541 return ret;
c8c42864
CM
1542}
1543
1544/*
1545 * helper function for btrfs_search_slot. This does all of the checks
1546 * for node-level blocks and does any balancing required based on
1547 * the ins_len.
1548 *
1549 * If no extra work was required, zero is returned. If we had to
1550 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1551 * start over
1552 */
1553static int
1554setup_nodes_for_search(struct btrfs_trans_handle *trans,
1555 struct btrfs_root *root, struct btrfs_path *p,
bd681513
CM
1556 struct extent_buffer *b, int level, int ins_len,
1557 int *write_lock_level)
c8c42864 1558{
0b246afa 1559 struct btrfs_fs_info *fs_info = root->fs_info;
95b982de 1560 int ret = 0;
0b246afa 1561
c8c42864 1562 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
0b246afa 1563 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
c8c42864 1564
bd681513
CM
1565 if (*write_lock_level < level + 1) {
1566 *write_lock_level = level + 1;
1567 btrfs_release_path(p);
95b982de 1568 return -EAGAIN;
bd681513
CM
1569 }
1570
bfb484d9 1571 reada_for_balance(p, level);
95b982de 1572 ret = split_node(trans, root, p, level);
c8c42864 1573
c8c42864
CM
1574 b = p->nodes[level];
1575 } else if (ins_len < 0 && btrfs_header_nritems(b) <
0b246afa 1576 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
c8c42864 1577
bd681513
CM
1578 if (*write_lock_level < level + 1) {
1579 *write_lock_level = level + 1;
1580 btrfs_release_path(p);
95b982de 1581 return -EAGAIN;
bd681513
CM
1582 }
1583
bfb484d9 1584 reada_for_balance(p, level);
95b982de
NB
1585 ret = balance_level(trans, root, p, level);
1586 if (ret)
1587 return ret;
c8c42864 1588
c8c42864
CM
1589 b = p->nodes[level];
1590 if (!b) {
b3b4aa74 1591 btrfs_release_path(p);
95b982de 1592 return -EAGAIN;
c8c42864
CM
1593 }
1594 BUG_ON(btrfs_header_nritems(b) == 1);
1595 }
c8c42864
CM
1596 return ret;
1597}
1598
381cf658 1599int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
e33d5c3d
KN
1600 u64 iobjectid, u64 ioff, u8 key_type,
1601 struct btrfs_key *found_key)
1602{
1603 int ret;
1604 struct btrfs_key key;
1605 struct extent_buffer *eb;
381cf658
DS
1606
1607 ASSERT(path);
1d4c08e0 1608 ASSERT(found_key);
e33d5c3d
KN
1609
1610 key.type = key_type;
1611 key.objectid = iobjectid;
1612 key.offset = ioff;
1613
1614 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1d4c08e0 1615 if (ret < 0)
e33d5c3d
KN
1616 return ret;
1617
1618 eb = path->nodes[0];
1619 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1620 ret = btrfs_next_leaf(fs_root, path);
1621 if (ret)
1622 return ret;
1623 eb = path->nodes[0];
1624 }
1625
1626 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1627 if (found_key->type != key.type ||
1628 found_key->objectid != key.objectid)
1629 return 1;
1630
1631 return 0;
1632}
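
/*
 * Illustrative sketch, not part of ctree.c: a schematic caller of
 * btrfs_find_item() as defined above. The wrapper function, its objectid and
 * the trimmed error handling are made up; the point is only the contract:
 * 0 on a match, 1 when no matching key is found, negative on error, with the
 * result returned through found_key.
 */
static int demo_find_inode_item(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_find_item(root, path, objectid, 0, BTRFS_INODE_ITEM_KEY,
			      &found_key);
	/* On success (ret == 0), found_key holds the key that matched. */

	btrfs_free_path(path);
	return ret;
}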
1633
1fc28d8e
LB
1634static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1635 struct btrfs_path *p,
1636 int write_lock_level)
1637{
1fc28d8e 1638 struct extent_buffer *b;
120de408 1639 int root_lock = 0;
1fc28d8e
LB
1640 int level = 0;
1641
1fc28d8e 1642 if (p->search_commit_root) {
d96b3424
FM
1643 b = root->commit_root;
1644 atomic_inc(&b->refs);
be6821f8 1645 level = btrfs_header_level(b);
f9ddfd05
LB
1646 /*
1647 * Ensure that all callers have set skip_locking when
1648 * p->search_commit_root = 1.
1649 */
1650 ASSERT(p->skip_locking == 1);
1fc28d8e
LB
1651
1652 goto out;
1653 }
1654
1655 if (p->skip_locking) {
1656 b = btrfs_root_node(root);
1657 level = btrfs_header_level(b);
1658 goto out;
1659 }
1660
120de408
JB
1661 /* We try very hard to do read locks on the root */
1662 root_lock = BTRFS_READ_LOCK;
1663
1fc28d8e 1664 /*
662c653b
LB
1665 * If the level is set to maximum, we can skip trying to get the read
1666 * lock.
1fc28d8e 1667 */
662c653b
LB
1668 if (write_lock_level < BTRFS_MAX_LEVEL) {
1669 /*
1670 * We don't know the level of the root node until we actually
1671 * have it read locked
1672 */
857bc13f
JB
1673 if (p->nowait) {
1674 b = btrfs_try_read_lock_root_node(root);
1675 if (IS_ERR(b))
1676 return b;
1677 } else {
1678 b = btrfs_read_lock_root_node(root);
1679 }
662c653b
LB
1680 level = btrfs_header_level(b);
1681 if (level > write_lock_level)
1682 goto out;
1683
1684 /* Whoops, must trade for write lock */
1685 btrfs_tree_read_unlock(b);
1686 free_extent_buffer(b);
1687 }
1fc28d8e 1688
1fc28d8e
LB
1689 b = btrfs_lock_root_node(root);
1690 root_lock = BTRFS_WRITE_LOCK;
1691
1692 /* The level might have changed, check again */
1693 level = btrfs_header_level(b);
1694
1695out:
120de408
JB
1696 /*
1697 * The root may have failed to write out at some point, and thus is no
1698 * longer valid, return an error in this case.
1699 */
1700 if (!extent_buffer_uptodate(b)) {
1701 if (root_lock)
1702 btrfs_tree_unlock_rw(b, root_lock);
1703 free_extent_buffer(b);
1704 return ERR_PTR(-EIO);
1705 }
1706
1fc28d8e
LB
1707 p->nodes[level] = b;
1708 if (!p->skip_locking)
1709 p->locks[level] = root_lock;
1710 /*
1711 * Callers are responsible for dropping b's references.
1712 */
1713 return b;
1714}
1715
d96b3424
FM
1716/*
1717 * Replace the extent buffer at the lowest level of the path with a cloned
1718 * version. The purpose is to be able to use it safely, after releasing the
1719 * commit root semaphore, even if relocation is happening in parallel, the
1720 * transaction used for relocation is committed and the extent buffer is
1721 * reallocated in the next transaction.
1722 *
1723 * This is used in a context where the caller does not prevent transaction
1724 * commits from happening, either by holding a transaction handle or holding
1725 * some lock, while it's doing searches through a commit root.
1726 * At the moment it's only used for send operations.
1727 */
1728static int finish_need_commit_sem_search(struct btrfs_path *path)
1729{
1730 const int i = path->lowest_level;
1731 const int slot = path->slots[i];
1732 struct extent_buffer *lowest = path->nodes[i];
1733 struct extent_buffer *clone;
1734
1735 ASSERT(path->need_commit_sem);
1736
1737 if (!lowest)
1738 return 0;
1739
1740 lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1741
1742 clone = btrfs_clone_extent_buffer(lowest);
1743 if (!clone)
1744 return -ENOMEM;
1745
1746 btrfs_release_path(path);
1747 path->nodes[i] = clone;
1748 path->slots[i] = slot;
1749
1750 return 0;
1751}
1fc28d8e 1752
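/*
 * A minimal sketch of the search mode that needs the helper above, assuming
 * a send-style reader that has no transaction handle and only looks at the
 * commit root.  With need_commit_sem set, btrfs_search_slot() takes and
 * drops fs_info->commit_root_sem itself and hands back a cloned leaf that
 * stays safe to read after the semaphore is released.  The function name
 * example_commit_root_lookup() is made up for this illustration.
 */
static int example_commit_root_lookup(struct btrfs_root *root,
				      const struct btrfs_key *key,
				      struct btrfs_path *path)
{
	path->search_commit_root = 1;
	path->skip_locking = 1;	/* required together with search_commit_root */
	path->need_commit_sem = 1;

	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}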
e2e58d0f
FM
1753static inline int search_for_key_slot(struct extent_buffer *eb,
1754 int search_low_slot,
1755 const struct btrfs_key *key,
1756 int prev_cmp,
1757 int *slot)
1758{
1759 /*
1760 * If a previous call to btrfs_bin_search() on a parent node returned an
1761 * exact match (prev_cmp == 0), we can safely assume the target key will
1762 * always be at slot 0 on lower levels, since each key pointer
1763 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1764 * subtree it points to. Thus we can skip searching lower levels.
1765 */
1766 if (prev_cmp == 0) {
1767 *slot = 0;
1768 return 0;
1769 }
1770
1771 return generic_bin_search(eb, search_low_slot, key, slot);
1772}
1773
109324cf
FM
1774static int search_leaf(struct btrfs_trans_handle *trans,
1775 struct btrfs_root *root,
1776 const struct btrfs_key *key,
1777 struct btrfs_path *path,
1778 int ins_len,
1779 int prev_cmp)
1780{
1781 struct extent_buffer *leaf = path->nodes[0];
1782 int leaf_free_space = -1;
1783 int search_low_slot = 0;
1784 int ret;
1785 bool do_bin_search = true;
1786
1787 /*
1788 * If we are doing an insertion, the leaf has enough free space and the
1789 * destination slot for the key is not slot 0, then we can unlock our
1790 * write lock on the parent, and any other upper nodes, before doing the
1791 * binary search on the leaf (with search_for_key_slot()), allowing other
1792 * tasks to lock the parent and any other upper nodes.
1793 */
1794 if (ins_len > 0) {
1795 /*
1796 * Cache the leaf free space, since we will need it later and it
1797 * will not change until then.
1798 */
1799 leaf_free_space = btrfs_leaf_free_space(leaf);
1800
1801 /*
1802 * !path->locks[1] means we have a single node tree, the leaf is
1803 * the root of the tree.
1804 */
1805 if (path->locks[1] && leaf_free_space >= ins_len) {
1806 struct btrfs_disk_key first_key;
1807
1808 ASSERT(btrfs_header_nritems(leaf) > 0);
1809 btrfs_item_key(leaf, &first_key, 0);
1810
1811 /*
1812 * Doing the extra comparison with the first key is cheap,
1813 * taking into account that the first key is very likely
1814 * already in a cache line because it immediately follows
1815 * the extent buffer's header and we have recently accessed
1816 * the header's level field.
1817 */
1818 ret = comp_keys(&first_key, key);
1819 if (ret < 0) {
1820 /*
1821 * The first key is smaller than the key we want
1822 * to insert, so we are safe to unlock all upper
1823 * nodes and we have to do the binary search.
1824 *
1825 * We do use btrfs_unlock_up_safe() and not
 1826 * unlock_up() because the latter does not unlock
1827 * nodes with a slot of 0 - we can safely unlock
1828 * any node even if its slot is 0 since in this
1829 * case the key does not end up at slot 0 of the
1830 * leaf and there's no need to split the leaf.
1831 */
1832 btrfs_unlock_up_safe(path, 1);
1833 search_low_slot = 1;
1834 } else {
1835 /*
 1836 * The first key is >= the key we want to
1837 * insert, so we can skip the binary search as
1838 * the target key will be at slot 0.
1839 *
1840 * We can not unlock upper nodes when the key is
1841 * less than the first key, because we will need
1842 * to update the key at slot 0 of the parent node
1843 * and possibly of other upper nodes too.
1844 * If the key matches the first key, then we can
1845 * unlock all the upper nodes, using
1846 * btrfs_unlock_up_safe() instead of unlock_up()
1847 * as stated above.
1848 */
1849 if (ret == 0)
1850 btrfs_unlock_up_safe(path, 1);
1851 /*
1852 * ret is already 0 or 1, matching the result of
1853 * a btrfs_bin_search() call, so there is no need
1854 * to adjust it.
1855 */
1856 do_bin_search = false;
1857 path->slots[0] = 0;
1858 }
1859 }
1860 }
1861
1862 if (do_bin_search) {
1863 ret = search_for_key_slot(leaf, search_low_slot, key,
1864 prev_cmp, &path->slots[0]);
1865 if (ret < 0)
1866 return ret;
1867 }
1868
1869 if (ins_len > 0) {
1870 /*
1871 * Item key already exists. In this case, if we are allowed to
1872 * insert the item (for example, in dir_item case, item key
1873 * collision is allowed), it will be merged with the original
1874 * item. Only the item size grows, no new btrfs item will be
1875 * added. If search_for_extension is not set, ins_len already
 1876 * accounts for the size of struct btrfs_item, so deduct it here to
 1877 * keep the leaf space check correct.
1878 */
1879 if (ret == 0 && !path->search_for_extension) {
1880 ASSERT(ins_len >= sizeof(struct btrfs_item));
1881 ins_len -= sizeof(struct btrfs_item);
1882 }
1883
1884 ASSERT(leaf_free_space >= 0);
1885
1886 if (leaf_free_space < ins_len) {
1887 int err;
1888
1889 err = split_leaf(trans, root, key, path, ins_len,
1890 (ret == 0));
bb8e9a60
FM
1891 ASSERT(err <= 0);
1892 if (WARN_ON(err > 0))
1893 err = -EUCLEAN;
109324cf
FM
1894 if (err)
1895 ret = err;
1896 }
1897 }
1898
1899 return ret;
1900}
1901
74123bd7 1902/*
4271ecea
NB
1903 * btrfs_search_slot - look for a key in a tree and perform necessary
1904 * modifications to preserve tree invariants.
74123bd7 1905 *
4271ecea
NB
1906 * @trans: Handle of transaction, used when modifying the tree
1907 * @p: Holds all btree nodes along the search path
1908 * @root: The root node of the tree
1909 * @key: The key we are looking for
9a664971 1910 * @ins_len: Indicates purpose of search:
 1911 * >0 for inserts, it's the size of the item being inserted (*)
1912 * <0 for deletions
1913 * 0 for plain searches, not modifying the tree
1914 *
1915 * (*) If size of item inserted doesn't include
1916 * sizeof(struct btrfs_item), then p->search_for_extension must
1917 * be set.
4271ecea
NB
 1918 * @cow: boolean, whether CoW operations should be performed. Must always be 1
1919 * when modifying the tree.
97571fd0 1920 *
4271ecea
NB
1921 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1922 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1923 *
1924 * If @key is found, 0 is returned and you can find the item in the leaf level
1925 * of the path (level 0)
1926 *
1927 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1928 * points to the slot where it should be inserted
1929 *
1930 * If an error is encountered while searching the tree a negative error number
1931 * is returned
74123bd7 1932 */
310712b2
OS
1933int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1934 const struct btrfs_key *key, struct btrfs_path *p,
1935 int ins_len, int cow)
be0e5c09 1936{
d96b3424 1937 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397 1938 struct extent_buffer *b;
be0e5c09
CM
1939 int slot;
1940 int ret;
33c66f43 1941 int err;
be0e5c09 1942 int level;
925baedd 1943 int lowest_unlock = 1;
bd681513
CM
1944 /* everything at write_lock_level or lower must be write locked */
1945 int write_lock_level = 0;
9f3a7427 1946 u8 lowest_level = 0;
f7c79f30 1947 int min_write_lock_level;
d7396f07 1948 int prev_cmp;
9f3a7427 1949
6702ed49 1950 lowest_level = p->lowest_level;
323ac95b 1951 WARN_ON(lowest_level && ins_len > 0);
22b0ebda 1952 WARN_ON(p->nodes[0] != NULL);
eb653de1 1953 BUG_ON(!cow && ins_len);
25179201 1954
857bc13f
JB
1955 /*
1956 * For now only allow nowait for read only operations. There's no
 1957 * strict reason why we can't; we simply only need it for reads, so
 1958 * reads are all that has been implemented.
1959 */
1960 ASSERT(!p->nowait || !cow);
1961
bd681513 1962 if (ins_len < 0) {
925baedd 1963 lowest_unlock = 2;
65b51a00 1964
bd681513
CM
1965 /* when we are removing items, we might have to go up to level
 1966 * two as we update tree pointers. Make sure we keep write locks
 1967 * for those levels as well.
1968 */
1969 write_lock_level = 2;
1970 } else if (ins_len > 0) {
1971 /*
1972 * for inserting items, make sure we have a write lock on
1973 * level 1 so we can update keys
1974 */
1975 write_lock_level = 1;
1976 }
1977
1978 if (!cow)
1979 write_lock_level = -1;
1980
09a2a8f9 1981 if (cow && (p->keep_locks || p->lowest_level))
bd681513
CM
1982 write_lock_level = BTRFS_MAX_LEVEL;
1983
f7c79f30
CM
1984 min_write_lock_level = write_lock_level;
1985
d96b3424
FM
1986 if (p->need_commit_sem) {
1987 ASSERT(p->search_commit_root);
857bc13f
JB
1988 if (p->nowait) {
1989 if (!down_read_trylock(&fs_info->commit_root_sem))
1990 return -EAGAIN;
1991 } else {
1992 down_read(&fs_info->commit_root_sem);
1993 }
d96b3424
FM
1994 }
1995
bb803951 1996again:
d7396f07 1997 prev_cmp = -1;
1fc28d8e 1998 b = btrfs_search_slot_get_root(root, p, write_lock_level);
be6821f8
FM
1999 if (IS_ERR(b)) {
2000 ret = PTR_ERR(b);
2001 goto done;
2002 }
925baedd 2003
eb60ceac 2004 while (b) {
f624d976
QW
2005 int dec = 0;
2006
5f39d397 2007 level = btrfs_header_level(b);
65b51a00 2008
02217ed2 2009 if (cow) {
9ea2c7c9
NB
2010 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2011
c8c42864
CM
2012 /*
2013 * if we don't really need to cow this block
2014 * then we don't want to set the path blocking,
2015 * so we test it here
2016 */
5963ffca 2017 if (!should_cow_block(trans, root, b))
65b51a00 2018 goto cow_done;
5d4f98a2 2019
bd681513
CM
2020 /*
2021 * must have write locks on this node and the
2022 * parent
2023 */
5124e00e
JB
2024 if (level > write_lock_level ||
2025 (level + 1 > write_lock_level &&
2026 level + 1 < BTRFS_MAX_LEVEL &&
2027 p->nodes[level + 1])) {
bd681513
CM
2028 write_lock_level = level + 1;
2029 btrfs_release_path(p);
2030 goto again;
2031 }
2032
9ea2c7c9
NB
2033 if (last_level)
2034 err = btrfs_cow_block(trans, root, b, NULL, 0,
9631e4cc
JB
2035 &b,
2036 BTRFS_NESTING_COW);
9ea2c7c9
NB
2037 else
2038 err = btrfs_cow_block(trans, root, b,
2039 p->nodes[level + 1],
9631e4cc
JB
2040 p->slots[level + 1], &b,
2041 BTRFS_NESTING_COW);
33c66f43 2042 if (err) {
33c66f43 2043 ret = err;
65b51a00 2044 goto done;
54aa1f4d 2045 }
02217ed2 2046 }
65b51a00 2047cow_done:
eb60ceac 2048 p->nodes[level] = b;
b4ce94de
CM
2049
2050 /*
2051 * we have a lock on b and as long as we aren't changing
 2052 * the tree, there is no way for the items in b to change.
2053 * It is safe to drop the lock on our parent before we
2054 * go through the expensive btree search on b.
2055 *
eb653de1
FDBM
2056 * If we're inserting or deleting (ins_len != 0), then we might
2057 * be changing slot zero, which may require changing the parent.
2058 * So, we can't drop the lock until after we know which slot
2059 * we're operating on.
b4ce94de 2060 */
eb653de1
FDBM
2061 if (!ins_len && !p->keep_locks) {
2062 int u = level + 1;
2063
2064 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2065 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2066 p->locks[u] = 0;
2067 }
2068 }
b4ce94de 2069
e2e58d0f 2070 if (level == 0) {
109324cf 2071 if (ins_len > 0)
e5e1c174 2072 ASSERT(write_lock_level >= 1);
bd681513 2073
109324cf 2074 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
459931ec 2075 if (!p->search_for_split)
f7c79f30 2076 unlock_up(p, level, lowest_unlock,
4b6f8e96 2077 min_write_lock_level, NULL);
65b51a00 2078 goto done;
be0e5c09 2079 }
e2e58d0f
FM
2080
2081 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2082 if (ret < 0)
2083 goto done;
2084 prev_cmp = ret;
2085
f624d976
QW
2086 if (ret && slot > 0) {
2087 dec = 1;
2088 slot--;
2089 }
2090 p->slots[level] = slot;
2091 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2092 &write_lock_level);
2093 if (err == -EAGAIN)
2094 goto again;
2095 if (err) {
2096 ret = err;
2097 goto done;
2098 }
2099 b = p->nodes[level];
2100 slot = p->slots[level];
2101
2102 /*
2103 * Slot 0 is special, if we change the key we have to update
2104 * the parent pointer which means we must have a write lock on
2105 * the parent
2106 */
2107 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2108 write_lock_level = level + 1;
2109 btrfs_release_path(p);
2110 goto again;
2111 }
2112
2113 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2114 &write_lock_level);
2115
2116 if (level == lowest_level) {
2117 if (dec)
2118 p->slots[level]++;
2119 goto done;
2120 }
2121
2122 err = read_block_for_search(root, p, &b, level, slot, key);
2123 if (err == -EAGAIN)
2124 goto again;
2125 if (err) {
2126 ret = err;
2127 goto done;
2128 }
2129
2130 if (!p->skip_locking) {
2131 level = btrfs_header_level(b);
b40130b2
JB
2132
2133 btrfs_maybe_reset_lockdep_class(root, b);
2134
f624d976 2135 if (level <= write_lock_level) {
ac5887c8 2136 btrfs_tree_lock(b);
f624d976
QW
2137 p->locks[level] = BTRFS_WRITE_LOCK;
2138 } else {
857bc13f
JB
2139 if (p->nowait) {
2140 if (!btrfs_try_tree_read_lock(b)) {
2141 free_extent_buffer(b);
2142 ret = -EAGAIN;
2143 goto done;
2144 }
2145 } else {
2146 btrfs_tree_read_lock(b);
2147 }
f624d976
QW
2148 p->locks[level] = BTRFS_READ_LOCK;
2149 }
2150 p->nodes[level] = b;
2151 }
be0e5c09 2152 }
65b51a00
CM
2153 ret = 1;
2154done:
5f5bc6b1 2155 if (ret < 0 && !p->skip_release_on_error)
b3b4aa74 2156 btrfs_release_path(p);
d96b3424
FM
2157
2158 if (p->need_commit_sem) {
2159 int ret2;
2160
2161 ret2 = finish_need_commit_sem_search(p);
2162 up_read(&fs_info->commit_root_sem);
2163 if (ret2)
2164 ret = ret2;
2165 }
2166
65b51a00 2167 return ret;
be0e5c09 2168}
f75e2b79 2169ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
be0e5c09 2170
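/*
 * A minimal read-only caller sketch for btrfs_search_slot().  The function
 * name example_lookup() is made up for this illustration; the key fields are
 * whatever the caller wants to look up.
 */
static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* NULL trans and cow == 0: a plain search that never modifies the tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/*
		 * Exact match: the item lives in the leaf path->nodes[0] at
		 * slot path->slots[0] and can be read with the item accessors.
		 */
	} else if (ret > 0) {
		/* Not found: path->slots[0] is where @key would be inserted. */
	}
	btrfs_free_path(path);
	return ret;
}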
5d9e75c4
JS
2171/*
2172 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2173 * current state of the tree together with the operations recorded in the tree
2174 * modification log to search for the key in a previous version of this tree, as
2175 * denoted by the time_seq parameter.
2176 *
2177 * Naturally, there is no support for insert, delete or cow operations.
2178 *
2179 * The resulting path and return value will be set up as if we called
2180 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2181 */
310712b2 2182int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
5d9e75c4
JS
2183 struct btrfs_path *p, u64 time_seq)
2184{
0b246afa 2185 struct btrfs_fs_info *fs_info = root->fs_info;
5d9e75c4
JS
2186 struct extent_buffer *b;
2187 int slot;
2188 int ret;
2189 int err;
2190 int level;
2191 int lowest_unlock = 1;
2192 u8 lowest_level = 0;
2193
2194 lowest_level = p->lowest_level;
2195 WARN_ON(p->nodes[0] != NULL);
c922b016 2196 ASSERT(!p->nowait);
5d9e75c4
JS
2197
2198 if (p->search_commit_root) {
2199 BUG_ON(time_seq);
2200 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2201 }
2202
2203again:
f3a84ccd 2204 b = btrfs_get_old_root(root, time_seq);
315bed43
NB
2205 if (!b) {
2206 ret = -EIO;
2207 goto done;
2208 }
5d9e75c4 2209 level = btrfs_header_level(b);
5d9e75c4
JS
2210 p->locks[level] = BTRFS_READ_LOCK;
2211
2212 while (b) {
abe9339d
QW
2213 int dec = 0;
2214
5d9e75c4
JS
2215 level = btrfs_header_level(b);
2216 p->nodes[level] = b;
5d9e75c4
JS
2217
2218 /*
2219 * we have a lock on b and as long as we aren't changing
 2220 * the tree, there is no way for the items in b to change.
2221 * It is safe to drop the lock on our parent before we
2222 * go through the expensive btree search on b.
2223 */
2224 btrfs_unlock_up_safe(p, level + 1);
2225
995e9a16 2226 ret = btrfs_bin_search(b, key, &slot);
cbca7d59
FM
2227 if (ret < 0)
2228 goto done;
5d9e75c4 2229
abe9339d 2230 if (level == 0) {
5d9e75c4
JS
2231 p->slots[level] = slot;
2232 unlock_up(p, level, lowest_unlock, 0, NULL);
abe9339d
QW
2233 goto done;
2234 }
5d9e75c4 2235
abe9339d
QW
2236 if (ret && slot > 0) {
2237 dec = 1;
2238 slot--;
2239 }
2240 p->slots[level] = slot;
2241 unlock_up(p, level, lowest_unlock, 0, NULL);
5d9e75c4 2242
abe9339d
QW
2243 if (level == lowest_level) {
2244 if (dec)
2245 p->slots[level]++;
2246 goto done;
2247 }
5d9e75c4 2248
abe9339d
QW
2249 err = read_block_for_search(root, p, &b, level, slot, key);
2250 if (err == -EAGAIN)
2251 goto again;
2252 if (err) {
2253 ret = err;
5d9e75c4
JS
2254 goto done;
2255 }
abe9339d
QW
2256
2257 level = btrfs_header_level(b);
ac5887c8 2258 btrfs_tree_read_lock(b);
f3a84ccd 2259 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
abe9339d
QW
2260 if (!b) {
2261 ret = -ENOMEM;
2262 goto done;
2263 }
2264 p->locks[level] = BTRFS_READ_LOCK;
2265 p->nodes[level] = b;
5d9e75c4
JS
2266 }
2267 ret = 1;
2268done:
5d9e75c4
JS
2269 if (ret < 0)
2270 btrfs_release_path(p);
2271
2272 return ret;
2273}
2274
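/*
 * A minimal sketch for btrfs_search_old_slot(), assuming the caller already
 * holds a tree mod log sequence number (typically obtained with
 * btrfs_get_tree_mod_seq() and dropped with btrfs_put_tree_mod_seq());
 * acquiring and releasing that sequence is left out here, and the function
 * name example_search_old() is made up for this illustration.
 */
static int example_search_old(struct btrfs_root *root,
			      const struct btrfs_key *key, u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Same return convention as btrfs_search_slot() with ins_len and cow 0. */
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	btrfs_free_path(path);
	return ret;
}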
2f38b3e1
AJ
2275/*
 2276 * helper to use instead of btrfs_search_slot() if no exact match is needed but
2277 * instead the next or previous item should be returned.
2278 * When find_higher is true, the next higher item is returned, the next lower
2279 * otherwise.
2280 * When return_any and find_higher are both true, and no higher item is found,
2281 * return the next lower instead.
2282 * When return_any is true and find_higher is false, and no lower item is found,
2283 * return the next higher instead.
2284 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2285 * < 0 on error
2286 */
2287int btrfs_search_slot_for_read(struct btrfs_root *root,
310712b2
OS
2288 const struct btrfs_key *key,
2289 struct btrfs_path *p, int find_higher,
2290 int return_any)
2f38b3e1
AJ
2291{
2292 int ret;
2293 struct extent_buffer *leaf;
2294
2295again:
2296 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2297 if (ret <= 0)
2298 return ret;
2299 /*
2300 * a return value of 1 means the path is at the position where the
2301 * item should be inserted. Normally this is the next bigger item,
2302 * but in case the previous item is the last in a leaf, path points
2303 * to the first free slot in the previous leaf, i.e. at an invalid
2304 * item.
2305 */
2306 leaf = p->nodes[0];
2307
2308 if (find_higher) {
2309 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2310 ret = btrfs_next_leaf(root, p);
2311 if (ret <= 0)
2312 return ret;
2313 if (!return_any)
2314 return 1;
2315 /*
2316 * no higher item found, return the next
2317 * lower instead
2318 */
2319 return_any = 0;
2320 find_higher = 0;
2321 btrfs_release_path(p);
2322 goto again;
2323 }
2324 } else {
e6793769
AJ
2325 if (p->slots[0] == 0) {
2326 ret = btrfs_prev_leaf(root, p);
2327 if (ret < 0)
2328 return ret;
2329 if (!ret) {
23c6bf6a
FDBM
2330 leaf = p->nodes[0];
2331 if (p->slots[0] == btrfs_header_nritems(leaf))
2332 p->slots[0]--;
e6793769 2333 return 0;
2f38b3e1 2334 }
e6793769
AJ
2335 if (!return_any)
2336 return 1;
2337 /*
2338 * no lower item found, return the next
2339 * higher instead
2340 */
2341 return_any = 0;
2342 find_higher = 1;
2343 btrfs_release_path(p);
2344 goto again;
2345 } else {
2f38b3e1
AJ
2346 --p->slots[0];
2347 }
2348 }
2349 return 0;
2350}
2351
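/*
 * A minimal sketch for btrfs_search_slot_for_read(), assuming we want the
 * first item at or after @key.  The function name example_find_next_item()
 * is made up for this illustration.
 */
static int example_find_next_item(struct btrfs_root *root,
				  const struct btrfs_key *key,
				  struct btrfs_key *found)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* find_higher = 1, return_any = 0: item >= @key or nothing at all. */
	ret = btrfs_search_slot_for_read(root, key, path, 1, 0);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
	btrfs_free_path(path);
	return ret;	/* 0 found, 1 nothing suitable, < 0 error */
}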
0ff40a91
MPS
2352/*
2353 * Execute search and call btrfs_previous_item to traverse backwards if the item
2354 * was not found.
2355 *
2356 * Return 0 if found, 1 if not found and < 0 if error.
2357 */
2358int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2359 struct btrfs_path *path)
2360{
2361 int ret;
2362
2363 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2364 if (ret > 0)
2365 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2366
2367 if (ret == 0)
2368 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2369
2370 return ret;
2371}
2372
43dd529a 2373/*
62142be3
GN
2374 * Search for a valid slot for the given path.
2375 *
2376 * @root: The root node of the tree.
 2377 * @key: Will contain the key of a valid item if one is found.
2378 * @path: The starting point to validate the slot.
2379 *
2380 * Return: 0 if the item is valid
2381 * 1 if not found
2382 * <0 if error.
2383 */
2384int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2385 struct btrfs_path *path)
2386{
2387 while (1) {
2388 int ret;
2389 const int slot = path->slots[0];
2390 const struct extent_buffer *leaf = path->nodes[0];
2391
2392 /* This is where we start walking the path. */
2393 if (slot >= btrfs_header_nritems(leaf)) {
2394 /*
2395 * If we've reached the last slot in this leaf we need
2396 * to go to the next leaf and reset the path.
2397 */
2398 ret = btrfs_next_leaf(root, path);
2399 if (ret)
2400 return ret;
2401 continue;
2402 }
2403 /* Store the found, valid item in @key. */
2404 btrfs_item_key_to_cpu(leaf, key, slot);
2405 break;
2406 }
2407 return 0;
2408}
2409
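/*
 * A minimal iteration sketch for btrfs_get_next_valid_item(), assuming @path
 * was already positioned by an earlier btrfs_search_slot() call and that we
 * want to walk every item from there to the end of the tree.  The function
 * name example_iterate_items() is made up for this illustration.
 */
static int example_iterate_items(struct btrfs_root *root,
				 struct btrfs_path *path)
{
	struct btrfs_key key;
	int ret;

	while (1) {
		ret = btrfs_get_next_valid_item(root, &key, path);
		if (ret)
			break;	/* 1: no more items, < 0: error */

		/* The current item is at path->nodes[0], path->slots[0]. */

		path->slots[0]++;	/* move on to the next candidate slot */
	}
	return ret < 0 ? ret : 0;
}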
74123bd7
CM
2410/*
2411 * adjust the pointers going up the tree, starting at level
 2412 * making sure the right key of each node points to 'key'.
2413 * This is used after shifting pointers to the left, so it stops
2414 * fixing up pointers when a given leaf/node is not in slot 0 of the
2415 * higher levels
aa5d6bed 2416 *
74123bd7 2417 */
b167fa91 2418static void fixup_low_keys(struct btrfs_path *path,
143bede5 2419 struct btrfs_disk_key *key, int level)
be0e5c09
CM
2420{
2421 int i;
5f39d397 2422 struct extent_buffer *t;
0e82bcfe 2423 int ret;
5f39d397 2424
234b63a0 2425 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
be0e5c09 2426 int tslot = path->slots[i];
0e82bcfe 2427
eb60ceac 2428 if (!path->nodes[i])
be0e5c09 2429 break;
5f39d397 2430 t = path->nodes[i];
f3a84ccd 2431 ret = btrfs_tree_mod_log_insert_key(t, tslot,
33cff222 2432 BTRFS_MOD_LOG_KEY_REPLACE);
0e82bcfe 2433 BUG_ON(ret < 0);
5f39d397 2434 btrfs_set_node_key(t, key, tslot);
d6025579 2435 btrfs_mark_buffer_dirty(path->nodes[i]);
be0e5c09
CM
2436 if (tslot != 0)
2437 break;
2438 }
2439}
2440
31840ae1
ZY
2441/*
2442 * update item key.
2443 *
2444 * This function isn't completely safe. It's the caller's responsibility
2445 * that the new key won't break the order
2446 */
b7a0365e
DD
2447void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2448 struct btrfs_path *path,
310712b2 2449 const struct btrfs_key *new_key)
31840ae1
ZY
2450{
2451 struct btrfs_disk_key disk_key;
2452 struct extent_buffer *eb;
2453 int slot;
2454
2455 eb = path->nodes[0];
2456 slot = path->slots[0];
2457 if (slot > 0) {
2458 btrfs_item_key(eb, &disk_key, slot - 1);
7c15d410
QW
2459 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2460 btrfs_crit(fs_info,
2461 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2462 slot, btrfs_disk_key_objectid(&disk_key),
2463 btrfs_disk_key_type(&disk_key),
2464 btrfs_disk_key_offset(&disk_key),
2465 new_key->objectid, new_key->type,
2466 new_key->offset);
2467 btrfs_print_leaf(eb);
2468 BUG();
2469 }
31840ae1
ZY
2470 }
2471 if (slot < btrfs_header_nritems(eb) - 1) {
2472 btrfs_item_key(eb, &disk_key, slot + 1);
7c15d410
QW
2473 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2474 btrfs_crit(fs_info,
2475 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2476 slot, btrfs_disk_key_objectid(&disk_key),
2477 btrfs_disk_key_type(&disk_key),
2478 btrfs_disk_key_offset(&disk_key),
2479 new_key->objectid, new_key->type,
2480 new_key->offset);
2481 btrfs_print_leaf(eb);
2482 BUG();
2483 }
31840ae1
ZY
2484 }
2485
2486 btrfs_cpu_key_to_disk(&disk_key, new_key);
2487 btrfs_set_item_key(eb, &disk_key, slot);
2488 btrfs_mark_buffer_dirty(eb);
2489 if (slot == 0)
b167fa91 2490 fixup_low_keys(path, &disk_key, 1);
31840ae1
ZY
2491}
2492
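/*
 * A minimal sketch of the usual pattern for btrfs_set_item_key_safe(),
 * assuming the caller holds a write lock on the leaf (for instance after a
 * cowing btrfs_search_slot()) and only wants to change the offset of the key
 * at path->slots[0].  The function name example_bump_key_offset() is made up
 * for this illustration; the new offset must keep the leaf sorted, which is
 * exactly what the checks above enforce.
 */
static void example_bump_key_offset(struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;
	btrfs_set_item_key_safe(fs_info, path, &key);
}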
d16c702f
QW
2493/*
2494 * Check key order of two sibling extent buffers.
2495 *
2496 * Return true if something is wrong.
2497 * Return false if everything is fine.
2498 *
2499 * Tree-checker only works inside one tree block, thus the following
2500 * corruption can not be detected by tree-checker:
2501 *
2502 * Leaf @left | Leaf @right
2503 * --------------------------------------------------------------
2504 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2505 *
2506 * Key f6 in leaf @left itself is valid, but not valid when the next
2507 * key in leaf @right is 7.
2508 * This can only be checked at tree block merge time.
2509 * And since tree checker has ensured all key order in each tree block
2510 * is correct, we only need to bother the last key of @left and the first
2511 * key of @right.
2512 */
2513static bool check_sibling_keys(struct extent_buffer *left,
2514 struct extent_buffer *right)
2515{
2516 struct btrfs_key left_last;
2517 struct btrfs_key right_first;
2518 int level = btrfs_header_level(left);
2519 int nr_left = btrfs_header_nritems(left);
2520 int nr_right = btrfs_header_nritems(right);
2521
2522 /* No key to check in one of the tree blocks */
2523 if (!nr_left || !nr_right)
2524 return false;
2525
2526 if (level) {
2527 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2528 btrfs_node_key_to_cpu(right, &right_first, 0);
2529 } else {
2530 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2531 btrfs_item_key_to_cpu(right, &right_first, 0);
2532 }
2533
2534 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2535 btrfs_crit(left->fs_info,
2536"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2537 left_last.objectid, left_last.type,
2538 left_last.offset, right_first.objectid,
2539 right_first.type, right_first.offset);
2540 return true;
2541 }
2542 return false;
2543}
2544
74123bd7
CM
2545/*
2546 * try to push data from one node into the next node left in the
79f95c82 2547 * tree.
aa5d6bed
CM
2548 *
2549 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2550 * error, and > 0 if there was no room in the left hand block.
74123bd7 2551 */
98ed5174 2552static int push_node_left(struct btrfs_trans_handle *trans,
2ff7e61e 2553 struct extent_buffer *dst,
971a1f66 2554 struct extent_buffer *src, int empty)
be0e5c09 2555{
d30a668f 2556 struct btrfs_fs_info *fs_info = trans->fs_info;
be0e5c09 2557 int push_items = 0;
bb803951
CM
2558 int src_nritems;
2559 int dst_nritems;
aa5d6bed 2560 int ret = 0;
be0e5c09 2561
5f39d397
CM
2562 src_nritems = btrfs_header_nritems(src);
2563 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2564 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
7bb86316
CM
2565 WARN_ON(btrfs_header_generation(src) != trans->transid);
2566 WARN_ON(btrfs_header_generation(dst) != trans->transid);
54aa1f4d 2567
bce4eae9 2568 if (!empty && src_nritems <= 8)
971a1f66
CM
2569 return 1;
2570
d397712b 2571 if (push_items <= 0)
be0e5c09
CM
2572 return 1;
2573
bce4eae9 2574 if (empty) {
971a1f66 2575 push_items = min(src_nritems, push_items);
bce4eae9
CM
2576 if (push_items < src_nritems) {
2577 /* leave at least 8 pointers in the node if
2578 * we aren't going to empty it
2579 */
2580 if (src_nritems - push_items < 8) {
2581 if (push_items <= 8)
2582 return 1;
2583 push_items -= 8;
2584 }
2585 }
2586 } else
2587 push_items = min(src_nritems - 8, push_items);
79f95c82 2588
d16c702f
QW
2589 /* dst is the left eb, src is the middle eb */
2590 if (check_sibling_keys(dst, src)) {
2591 ret = -EUCLEAN;
2592 btrfs_abort_transaction(trans, ret);
2593 return ret;
2594 }
f3a84ccd 2595 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
5de865ee 2596 if (ret) {
66642832 2597 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2598 return ret;
2599 }
5f39d397
CM
2600 copy_extent_buffer(dst, src,
2601 btrfs_node_key_ptr_offset(dst_nritems),
2602 btrfs_node_key_ptr_offset(0),
d397712b 2603 push_items * sizeof(struct btrfs_key_ptr));
5f39d397 2604
bb803951 2605 if (push_items < src_nritems) {
57911b8b 2606 /*
f3a84ccd
FM
2607 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2608 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
57911b8b 2609 */
5f39d397
CM
2610 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2611 btrfs_node_key_ptr_offset(push_items),
2612 (src_nritems - push_items) *
2613 sizeof(struct btrfs_key_ptr));
2614 }
2615 btrfs_set_header_nritems(src, src_nritems - push_items);
2616 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2617 btrfs_mark_buffer_dirty(src);
2618 btrfs_mark_buffer_dirty(dst);
31840ae1 2619
79f95c82
CM
2620 return ret;
2621}
2622
2623/*
2624 * try to push data from one node into the next node right in the
2625 * tree.
2626 *
2627 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2628 * error, and > 0 if there was no room in the right hand block.
2629 *
2630 * this will only push up to 1/2 the contents of the left node over
2631 */
5f39d397 2632static int balance_node_right(struct btrfs_trans_handle *trans,
5f39d397
CM
2633 struct extent_buffer *dst,
2634 struct extent_buffer *src)
79f95c82 2635{
55d32ed8 2636 struct btrfs_fs_info *fs_info = trans->fs_info;
79f95c82
CM
2637 int push_items = 0;
2638 int max_push;
2639 int src_nritems;
2640 int dst_nritems;
2641 int ret = 0;
79f95c82 2642
7bb86316
CM
2643 WARN_ON(btrfs_header_generation(src) != trans->transid);
2644 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2645
5f39d397
CM
2646 src_nritems = btrfs_header_nritems(src);
2647 dst_nritems = btrfs_header_nritems(dst);
0b246afa 2648 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
d397712b 2649 if (push_items <= 0)
79f95c82 2650 return 1;
bce4eae9 2651
d397712b 2652 if (src_nritems < 4)
bce4eae9 2653 return 1;
79f95c82
CM
2654
2655 max_push = src_nritems / 2 + 1;
2656 /* don't try to empty the node */
d397712b 2657 if (max_push >= src_nritems)
79f95c82 2658 return 1;
252c38f0 2659
79f95c82
CM
2660 if (max_push < push_items)
2661 push_items = max_push;
2662
d16c702f
QW
2663 /* dst is the right eb, src is the middle eb */
2664 if (check_sibling_keys(src, dst)) {
2665 ret = -EUCLEAN;
2666 btrfs_abort_transaction(trans, ret);
2667 return ret;
2668 }
f3a84ccd 2669 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
bf1d3425 2670 BUG_ON(ret < 0);
5f39d397
CM
2671 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2672 btrfs_node_key_ptr_offset(0),
2673 (dst_nritems) *
2674 sizeof(struct btrfs_key_ptr));
d6025579 2675
f3a84ccd
FM
2676 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2677 push_items);
5de865ee 2678 if (ret) {
66642832 2679 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2680 return ret;
2681 }
5f39d397
CM
2682 copy_extent_buffer(dst, src,
2683 btrfs_node_key_ptr_offset(0),
2684 btrfs_node_key_ptr_offset(src_nritems - push_items),
d397712b 2685 push_items * sizeof(struct btrfs_key_ptr));
79f95c82 2686
5f39d397
CM
2687 btrfs_set_header_nritems(src, src_nritems - push_items);
2688 btrfs_set_header_nritems(dst, dst_nritems + push_items);
79f95c82 2689
5f39d397
CM
2690 btrfs_mark_buffer_dirty(src);
2691 btrfs_mark_buffer_dirty(dst);
31840ae1 2692
aa5d6bed 2693 return ret;
be0e5c09
CM
2694}
2695
97571fd0
CM
2696/*
2697 * helper function to insert a new root level in the tree.
2698 * A new node is allocated, and a single item is inserted to
2699 * point to the existing root
aa5d6bed
CM
2700 *
2701 * returns zero on success or < 0 on failure.
97571fd0 2702 */
d397712b 2703static noinline int insert_new_root(struct btrfs_trans_handle *trans,
5f39d397 2704 struct btrfs_root *root,
fdd99c72 2705 struct btrfs_path *path, int level)
5c680ed6 2706{
0b246afa 2707 struct btrfs_fs_info *fs_info = root->fs_info;
7bb86316 2708 u64 lower_gen;
5f39d397
CM
2709 struct extent_buffer *lower;
2710 struct extent_buffer *c;
925baedd 2711 struct extent_buffer *old;
5f39d397 2712 struct btrfs_disk_key lower_key;
d9d19a01 2713 int ret;
5c680ed6
CM
2714
2715 BUG_ON(path->nodes[level]);
2716 BUG_ON(path->nodes[level-1] != root->node);
2717
7bb86316
CM
2718 lower = path->nodes[level-1];
2719 if (level == 1)
2720 btrfs_item_key(lower, &lower_key, 0);
2721 else
2722 btrfs_node_key(lower, &lower_key, 0);
2723
79bd3712
FM
2724 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2725 &lower_key, level, root->node->start, 0,
2726 BTRFS_NESTING_NEW_ROOT);
5f39d397
CM
2727 if (IS_ERR(c))
2728 return PTR_ERR(c);
925baedd 2729
0b246afa 2730 root_add_used(root, fs_info->nodesize);
f0486c68 2731
5f39d397 2732 btrfs_set_header_nritems(c, 1);
5f39d397 2733 btrfs_set_node_key(c, &lower_key, 0);
db94535d 2734 btrfs_set_node_blockptr(c, 0, lower->start);
7bb86316 2735 lower_gen = btrfs_header_generation(lower);
31840ae1 2736 WARN_ON(lower_gen != trans->transid);
7bb86316
CM
2737
2738 btrfs_set_node_ptr_generation(c, 0, lower_gen);
d5719762 2739
5f39d397 2740 btrfs_mark_buffer_dirty(c);
d5719762 2741
925baedd 2742 old = root->node;
406808ab 2743 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
d9d19a01 2744 BUG_ON(ret < 0);
240f62c8 2745 rcu_assign_pointer(root->node, c);
925baedd
CM
2746
2747 /* the super has an extra ref to root->node */
2748 free_extent_buffer(old);
2749
0b86a832 2750 add_root_to_dirty_list(root);
67439dad 2751 atomic_inc(&c->refs);
5f39d397 2752 path->nodes[level] = c;
ac5887c8 2753 path->locks[level] = BTRFS_WRITE_LOCK;
5c680ed6
CM
2754 path->slots[level] = 0;
2755 return 0;
2756}
2757
74123bd7
CM
2758/*
2759 * worker function to insert a single pointer in a node.
2760 * the node should have enough room for the pointer already
97571fd0 2761 *
74123bd7
CM
2762 * slot and level indicate where you want the key to go, and
 2763 * bytenr is the block the key points to.
2764 */
143bede5 2765static void insert_ptr(struct btrfs_trans_handle *trans,
6ad3cf6d 2766 struct btrfs_path *path,
143bede5 2767 struct btrfs_disk_key *key, u64 bytenr,
c3e06965 2768 int slot, int level)
74123bd7 2769{
5f39d397 2770 struct extent_buffer *lower;
74123bd7 2771 int nritems;
f3ea38da 2772 int ret;
5c680ed6
CM
2773
2774 BUG_ON(!path->nodes[level]);
49d0c642 2775 btrfs_assert_tree_write_locked(path->nodes[level]);
5f39d397
CM
2776 lower = path->nodes[level];
2777 nritems = btrfs_header_nritems(lower);
c293498b 2778 BUG_ON(slot > nritems);
6ad3cf6d 2779 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
74123bd7 2780 if (slot != nritems) {
bf1d3425 2781 if (level) {
f3a84ccd
FM
2782 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2783 slot, nritems - slot);
bf1d3425
DS
2784 BUG_ON(ret < 0);
2785 }
5f39d397
CM
2786 memmove_extent_buffer(lower,
2787 btrfs_node_key_ptr_offset(slot + 1),
2788 btrfs_node_key_ptr_offset(slot),
d6025579 2789 (nritems - slot) * sizeof(struct btrfs_key_ptr));
74123bd7 2790 }
c3e06965 2791 if (level) {
f3a84ccd 2792 ret = btrfs_tree_mod_log_insert_key(lower, slot,
33cff222 2793 BTRFS_MOD_LOG_KEY_ADD);
f3ea38da
JS
2794 BUG_ON(ret < 0);
2795 }
5f39d397 2796 btrfs_set_node_key(lower, key, slot);
db94535d 2797 btrfs_set_node_blockptr(lower, slot, bytenr);
74493f7a
CM
2798 WARN_ON(trans->transid == 0);
2799 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
5f39d397
CM
2800 btrfs_set_header_nritems(lower, nritems + 1);
2801 btrfs_mark_buffer_dirty(lower);
74123bd7
CM
2802}
2803
97571fd0
CM
2804/*
2805 * split the node at the specified level in path in two.
2806 * The path is corrected to point to the appropriate node after the split
2807 *
2808 * Before splitting this tries to make some room in the node by pushing
2809 * left and right, if either one works, it returns right away.
aa5d6bed
CM
2810 *
2811 * returns 0 on success and < 0 on failure
97571fd0 2812 */
e02119d5
CM
2813static noinline int split_node(struct btrfs_trans_handle *trans,
2814 struct btrfs_root *root,
2815 struct btrfs_path *path, int level)
be0e5c09 2816{
0b246afa 2817 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397
CM
2818 struct extent_buffer *c;
2819 struct extent_buffer *split;
2820 struct btrfs_disk_key disk_key;
be0e5c09 2821 int mid;
5c680ed6 2822 int ret;
7518a238 2823 u32 c_nritems;
eb60ceac 2824
5f39d397 2825 c = path->nodes[level];
7bb86316 2826 WARN_ON(btrfs_header_generation(c) != trans->transid);
5f39d397 2827 if (c == root->node) {
d9abbf1c 2828 /*
90f8d62e
JS
 2829 * trying to split the root, let's make a new one
2830 *
fdd99c72 2831 * tree mod log: We don't log the removal of the old root in
90f8d62e
JS
2832 * insert_new_root, because that root buffer will be kept as a
2833 * normal node. We are going to log removal of half of the
f3a84ccd
FM
2834 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2835 * holding a tree lock on the buffer, which is why we cannot
2836 * race with other tree_mod_log users.
d9abbf1c 2837 */
fdd99c72 2838 ret = insert_new_root(trans, root, path, level + 1);
5c680ed6
CM
2839 if (ret)
2840 return ret;
b3612421 2841 } else {
e66f709b 2842 ret = push_nodes_for_insert(trans, root, path, level);
5f39d397
CM
2843 c = path->nodes[level];
2844 if (!ret && btrfs_header_nritems(c) <
0b246afa 2845 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
e66f709b 2846 return 0;
54aa1f4d
CM
2847 if (ret < 0)
2848 return ret;
be0e5c09 2849 }
e66f709b 2850
5f39d397 2851 c_nritems = btrfs_header_nritems(c);
5d4f98a2
YZ
2852 mid = (c_nritems + 1) / 2;
2853 btrfs_node_key(c, &disk_key, mid);
7bb86316 2854
79bd3712
FM
2855 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2856 &disk_key, level, c->start, 0,
2857 BTRFS_NESTING_SPLIT);
5f39d397
CM
2858 if (IS_ERR(split))
2859 return PTR_ERR(split);
2860
0b246afa 2861 root_add_used(root, fs_info->nodesize);
bc877d28 2862 ASSERT(btrfs_header_level(c) == level);
54aa1f4d 2863
f3a84ccd 2864 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
5de865ee 2865 if (ret) {
66642832 2866 btrfs_abort_transaction(trans, ret);
5de865ee
FDBM
2867 return ret;
2868 }
5f39d397
CM
2869 copy_extent_buffer(split, c,
2870 btrfs_node_key_ptr_offset(0),
2871 btrfs_node_key_ptr_offset(mid),
2872 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2873 btrfs_set_header_nritems(split, c_nritems - mid);
2874 btrfs_set_header_nritems(c, mid);
aa5d6bed 2875
5f39d397
CM
2876 btrfs_mark_buffer_dirty(c);
2877 btrfs_mark_buffer_dirty(split);
2878
6ad3cf6d 2879 insert_ptr(trans, path, &disk_key, split->start,
c3e06965 2880 path->slots[level + 1] + 1, level + 1);
aa5d6bed 2881
5de08d7d 2882 if (path->slots[level] >= mid) {
5c680ed6 2883 path->slots[level] -= mid;
925baedd 2884 btrfs_tree_unlock(c);
5f39d397
CM
2885 free_extent_buffer(c);
2886 path->nodes[level] = split;
5c680ed6
CM
2887 path->slots[level + 1] += 1;
2888 } else {
925baedd 2889 btrfs_tree_unlock(split);
5f39d397 2890 free_extent_buffer(split);
be0e5c09 2891 }
d5286a92 2892 return 0;
be0e5c09
CM
2893}
2894
74123bd7
CM
2895/*
2896 * how many bytes are required to store the items in a leaf. start
2897 * and nr indicate which items in the leaf to check. This totals up the
2898 * space used both by the item structs and the item data
2899 */
5f39d397 2900static int leaf_space_used(struct extent_buffer *l, int start, int nr)
be0e5c09
CM
2901{
2902 int data_len;
5f39d397 2903 int nritems = btrfs_header_nritems(l);
d4dbff95 2904 int end = min(nritems, start + nr) - 1;
be0e5c09
CM
2905
2906 if (!nr)
2907 return 0;
3212fa14
JB
2908 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
2909 data_len = data_len - btrfs_item_offset(l, end);
0783fcfc 2910 data_len += sizeof(struct btrfs_item) * nr;
d4dbff95 2911 WARN_ON(data_len < 0);
be0e5c09
CM
2912 return data_len;
2913}
2914
d4dbff95
CM
2915/*
2916 * The space between the end of the leaf items and
2917 * the start of the leaf data. IOW, how much room
2918 * the leaf has left for both items and data
2919 */
e902baac 2920noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
d4dbff95 2921{
e902baac 2922 struct btrfs_fs_info *fs_info = leaf->fs_info;
5f39d397
CM
2923 int nritems = btrfs_header_nritems(leaf);
2924 int ret;
0b246afa
JM
2925
2926 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
5f39d397 2927 if (ret < 0) {
0b246afa
JM
2928 btrfs_crit(fs_info,
2929 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2930 ret,
2931 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2932 leaf_space_used(leaf, 0, nritems), nritems);
5f39d397
CM
2933 }
2934 return ret;
d4dbff95
CM
2935}
2936
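/*
 * Worked example for btrfs_leaf_free_space() above, assuming the default 16K
 * nodesize and the usual on-disk struct sizes (101 bytes for struct
 * btrfs_header, 25 bytes for struct btrfs_item, both assumed for this
 * example):
 *
 *     BTRFS_LEAF_DATA_SIZE = 16384 - 101 = 16283 bytes
 *
 * A leaf holding 100 items with a total of 3000 bytes of item data uses
 * 100 * 25 + 3000 = 5500 bytes, so btrfs_leaf_free_space() reports
 * 16283 - 5500 = 10783 bytes left for new items and their data.
 */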
99d8f83c
CM
2937/*
2938 * min slot controls the lowest index we're willing to push to the
2939 * right. We'll push up to and including min_slot, but no lower
2940 */
f72f0010 2941static noinline int __push_leaf_right(struct btrfs_path *path,
44871b1b
CM
2942 int data_size, int empty,
2943 struct extent_buffer *right,
99d8f83c
CM
2944 int free_space, u32 left_nritems,
2945 u32 min_slot)
00ec4c51 2946{
f72f0010 2947 struct btrfs_fs_info *fs_info = right->fs_info;
5f39d397 2948 struct extent_buffer *left = path->nodes[0];
44871b1b 2949 struct extent_buffer *upper = path->nodes[1];
cfed81a0 2950 struct btrfs_map_token token;
5f39d397 2951 struct btrfs_disk_key disk_key;
00ec4c51 2952 int slot;
34a38218 2953 u32 i;
00ec4c51
CM
2954 int push_space = 0;
2955 int push_items = 0;
34a38218 2956 u32 nr;
7518a238 2957 u32 right_nritems;
5f39d397 2958 u32 data_end;
db94535d 2959 u32 this_item_size;
00ec4c51 2960
34a38218
CM
2961 if (empty)
2962 nr = 0;
2963 else
99d8f83c 2964 nr = max_t(u32, 1, min_slot);
34a38218 2965
31840ae1 2966 if (path->slots[0] >= left_nritems)
87b29b20 2967 push_space += data_size;
31840ae1 2968
44871b1b 2969 slot = path->slots[1];
34a38218
CM
2970 i = left_nritems - 1;
2971 while (i >= nr) {
31840ae1
ZY
2972 if (!empty && push_items > 0) {
2973 if (path->slots[0] > i)
2974 break;
2975 if (path->slots[0] == i) {
e902baac
DS
2976 int space = btrfs_leaf_free_space(left);
2977
31840ae1
ZY
2978 if (space + push_space * 2 > free_space)
2979 break;
2980 }
2981 }
2982
00ec4c51 2983 if (path->slots[0] == i)
87b29b20 2984 push_space += data_size;
db94535d 2985
3212fa14 2986 this_item_size = btrfs_item_size(left, i);
74794207
JB
2987 if (this_item_size + sizeof(struct btrfs_item) +
2988 push_space > free_space)
00ec4c51 2989 break;
31840ae1 2990
00ec4c51 2991 push_items++;
74794207 2992 push_space += this_item_size + sizeof(struct btrfs_item);
34a38218
CM
2993 if (i == 0)
2994 break;
2995 i--;
db94535d 2996 }
5f39d397 2997
925baedd
CM
2998 if (push_items == 0)
2999 goto out_unlock;
5f39d397 3000
6c1500f2 3001 WARN_ON(!empty && push_items == left_nritems);
5f39d397 3002
00ec4c51 3003 /* push left to right */
5f39d397 3004 right_nritems = btrfs_header_nritems(right);
34a38218 3005
dc2e724e 3006 push_space = btrfs_item_data_end(left, left_nritems - push_items);
8f881e8c 3007 push_space -= leaf_data_end(left);
5f39d397 3008
00ec4c51 3009 /* make room in the right data area */
8f881e8c 3010 data_end = leaf_data_end(right);
5f39d397 3011 memmove_extent_buffer(right,
3d9ec8c4
NB
3012 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3013 BTRFS_LEAF_DATA_OFFSET + data_end,
0b246afa 3014 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
5f39d397 3015
00ec4c51 3016 /* copy from the left data area */
3d9ec8c4 3017 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
0b246afa 3018 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
8f881e8c 3019 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
d6025579 3020 push_space);
5f39d397
CM
3021
3022 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3023 btrfs_item_nr_offset(0),
3024 right_nritems * sizeof(struct btrfs_item));
3025
00ec4c51 3026 /* copy the items from left to right */
5f39d397
CM
3027 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3028 btrfs_item_nr_offset(left_nritems - push_items),
3029 push_items * sizeof(struct btrfs_item));
00ec4c51
CM
3030
3031 /* update the item pointers */
c82f823c 3032 btrfs_init_map_token(&token, right);
7518a238 3033 right_nritems += push_items;
5f39d397 3034 btrfs_set_header_nritems(right, right_nritems);
0b246afa 3035 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
7518a238 3036 for (i = 0; i < right_nritems; i++) {
3212fa14
JB
3037 push_space -= btrfs_token_item_size(&token, i);
3038 btrfs_set_token_item_offset(&token, i, push_space);
db94535d
CM
3039 }
3040
7518a238 3041 left_nritems -= push_items;
5f39d397 3042 btrfs_set_header_nritems(left, left_nritems);
00ec4c51 3043
34a38218
CM
3044 if (left_nritems)
3045 btrfs_mark_buffer_dirty(left);
f0486c68 3046 else
6a884d7d 3047 btrfs_clean_tree_block(left);
f0486c68 3048
5f39d397 3049 btrfs_mark_buffer_dirty(right);
a429e513 3050
5f39d397
CM
3051 btrfs_item_key(right, &disk_key, 0);
3052 btrfs_set_node_key(upper, &disk_key, slot + 1);
d6025579 3053 btrfs_mark_buffer_dirty(upper);
02217ed2 3054
00ec4c51 3055 /* then fixup the leaf pointer in the path */
7518a238
CM
3056 if (path->slots[0] >= left_nritems) {
3057 path->slots[0] -= left_nritems;
925baedd 3058 if (btrfs_header_nritems(path->nodes[0]) == 0)
6a884d7d 3059 btrfs_clean_tree_block(path->nodes[0]);
925baedd 3060 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
3061 free_extent_buffer(path->nodes[0]);
3062 path->nodes[0] = right;
00ec4c51
CM
3063 path->slots[1] += 1;
3064 } else {
925baedd 3065 btrfs_tree_unlock(right);
5f39d397 3066 free_extent_buffer(right);
00ec4c51
CM
3067 }
3068 return 0;
925baedd
CM
3069
3070out_unlock:
3071 btrfs_tree_unlock(right);
3072 free_extent_buffer(right);
3073 return 1;
00ec4c51 3074}
925baedd 3075
44871b1b
CM
3076/*
3077 * push some data in the path leaf to the right, trying to free up at
3078 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3079 *
3080 * returns 1 if the push failed because the other node didn't have enough
3081 * room, 0 if everything worked out and < 0 if there were major errors.
99d8f83c
CM
3082 *
3083 * this will push starting from min_slot to the end of the leaf. It won't
3084 * push any slot lower than min_slot
44871b1b
CM
3085 */
3086static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
3087 *root, struct btrfs_path *path,
3088 int min_data_size, int data_size,
3089 int empty, u32 min_slot)
44871b1b
CM
3090{
3091 struct extent_buffer *left = path->nodes[0];
3092 struct extent_buffer *right;
3093 struct extent_buffer *upper;
3094 int slot;
3095 int free_space;
3096 u32 left_nritems;
3097 int ret;
3098
3099 if (!path->nodes[1])
3100 return 1;
3101
3102 slot = path->slots[1];
3103 upper = path->nodes[1];
3104 if (slot >= btrfs_header_nritems(upper) - 1)
3105 return 1;
3106
49d0c642 3107 btrfs_assert_tree_write_locked(path->nodes[1]);
44871b1b 3108
4b231ae4 3109 right = btrfs_read_node_slot(upper, slot + 1);
fb770ae4
LB
3110 /*
3111 * slot + 1 is not valid or we fail to read the right node,
3112 * no big deal, just return.
3113 */
3114 if (IS_ERR(right))
91ca338d
TI
3115 return 1;
3116
bf77467a 3117 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
44871b1b 3118
e902baac 3119 free_space = btrfs_leaf_free_space(right);
44871b1b
CM
3120 if (free_space < data_size)
3121 goto out_unlock;
3122
44871b1b 3123 ret = btrfs_cow_block(trans, root, right, upper,
bf59a5a2 3124 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
44871b1b
CM
3125 if (ret)
3126 goto out_unlock;
3127
44871b1b
CM
3128 left_nritems = btrfs_header_nritems(left);
3129 if (left_nritems == 0)
3130 goto out_unlock;
3131
d16c702f
QW
3132 if (check_sibling_keys(left, right)) {
3133 ret = -EUCLEAN;
3134 btrfs_tree_unlock(right);
3135 free_extent_buffer(right);
3136 return ret;
3137 }
2ef1fed2
FDBM
3138 if (path->slots[0] == left_nritems && !empty) {
3139 /* Key greater than all keys in the leaf, right neighbor has
3140 * enough room for it and we're not emptying our leaf to delete
3141 * it, therefore use right neighbor to insert the new item and
52042d8e 3142 * no need to touch/dirty our left leaf. */
2ef1fed2
FDBM
3143 btrfs_tree_unlock(left);
3144 free_extent_buffer(left);
3145 path->nodes[0] = right;
3146 path->slots[0] = 0;
3147 path->slots[1]++;
3148 return 0;
3149 }
3150
f72f0010 3151 return __push_leaf_right(path, min_data_size, empty,
99d8f83c 3152 right, free_space, left_nritems, min_slot);
44871b1b
CM
3153out_unlock:
3154 btrfs_tree_unlock(right);
3155 free_extent_buffer(right);
3156 return 1;
3157}
3158
74123bd7
CM
3159/*
3160 * push some data in the path leaf to the left, trying to free up at
3161 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
3162 *
3163 * max_slot can put a limit on how far into the leaf we'll push items. The
3164 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3165 * items
74123bd7 3166 */
8087c193 3167static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
44871b1b 3168 int empty, struct extent_buffer *left,
99d8f83c
CM
3169 int free_space, u32 right_nritems,
3170 u32 max_slot)
be0e5c09 3171{
8087c193 3172 struct btrfs_fs_info *fs_info = left->fs_info;
5f39d397
CM
3173 struct btrfs_disk_key disk_key;
3174 struct extent_buffer *right = path->nodes[0];
be0e5c09 3175 int i;
be0e5c09
CM
3176 int push_space = 0;
3177 int push_items = 0;
7518a238 3178 u32 old_left_nritems;
34a38218 3179 u32 nr;
aa5d6bed 3180 int ret = 0;
db94535d
CM
3181 u32 this_item_size;
3182 u32 old_left_item_size;
cfed81a0
CM
3183 struct btrfs_map_token token;
3184
34a38218 3185 if (empty)
99d8f83c 3186 nr = min(right_nritems, max_slot);
34a38218 3187 else
99d8f83c 3188 nr = min(right_nritems - 1, max_slot);
34a38218
CM
3189
3190 for (i = 0; i < nr; i++) {
31840ae1
ZY
3191 if (!empty && push_items > 0) {
3192 if (path->slots[0] < i)
3193 break;
3194 if (path->slots[0] == i) {
e902baac
DS
3195 int space = btrfs_leaf_free_space(right);
3196
31840ae1
ZY
3197 if (space + push_space * 2 > free_space)
3198 break;
3199 }
3200 }
3201
be0e5c09 3202 if (path->slots[0] == i)
87b29b20 3203 push_space += data_size;
db94535d 3204
3212fa14 3205 this_item_size = btrfs_item_size(right, i);
74794207
JB
3206 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3207 free_space)
be0e5c09 3208 break;
db94535d 3209
be0e5c09 3210 push_items++;
74794207 3211 push_space += this_item_size + sizeof(struct btrfs_item);
db94535d
CM
3212 }
3213
be0e5c09 3214 if (push_items == 0) {
925baedd
CM
3215 ret = 1;
3216 goto out;
be0e5c09 3217 }
fae7f21c 3218 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
5f39d397 3219
be0e5c09 3220 /* push data from right to left */
5f39d397
CM
3221 copy_extent_buffer(left, right,
3222 btrfs_item_nr_offset(btrfs_header_nritems(left)),
3223 btrfs_item_nr_offset(0),
3224 push_items * sizeof(struct btrfs_item));
3225
0b246afa 3226 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3212fa14 3227 btrfs_item_offset(right, push_items - 1);
5f39d397 3228
3d9ec8c4 3229 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3230 leaf_data_end(left) - push_space,
3d9ec8c4 3231 BTRFS_LEAF_DATA_OFFSET +
3212fa14 3232 btrfs_item_offset(right, push_items - 1),
d6025579 3233 push_space);
5f39d397 3234 old_left_nritems = btrfs_header_nritems(left);
87b29b20 3235 BUG_ON(old_left_nritems <= 0);
eb60ceac 3236
c82f823c 3237 btrfs_init_map_token(&token, left);
3212fa14 3238 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
0783fcfc 3239 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
5f39d397 3240 u32 ioff;
db94535d 3241
3212fa14
JB
3242 ioff = btrfs_token_item_offset(&token, i);
3243 btrfs_set_token_item_offset(&token, i,
cc4c13d5 3244 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
be0e5c09 3245 }
5f39d397 3246 btrfs_set_header_nritems(left, old_left_nritems + push_items);
be0e5c09
CM
3247
3248 /* fixup right node */
31b1a2bd
JL
3249 if (push_items > right_nritems)
3250 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
d397712b 3251 right_nritems);
34a38218
CM
3252
3253 if (push_items < right_nritems) {
3212fa14 3254 push_space = btrfs_item_offset(right, push_items - 1) -
8f881e8c 3255 leaf_data_end(right);
3d9ec8c4 3256 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
0b246afa 3257 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3d9ec8c4 3258 BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3259 leaf_data_end(right), push_space);
34a38218
CM
3260
3261 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
5f39d397
CM
3262 btrfs_item_nr_offset(push_items),
3263 (btrfs_header_nritems(right) - push_items) *
3264 sizeof(struct btrfs_item));
34a38218 3265 }
c82f823c
DS
3266
3267 btrfs_init_map_token(&token, right);
eef1c494
Y
3268 right_nritems -= push_items;
3269 btrfs_set_header_nritems(right, right_nritems);
0b246afa 3270 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
5f39d397 3271 for (i = 0; i < right_nritems; i++) {
3212fa14
JB
3272 push_space = push_space - btrfs_token_item_size(&token, i);
3273 btrfs_set_token_item_offset(&token, i, push_space);
db94535d 3274 }
eb60ceac 3275
5f39d397 3276 btrfs_mark_buffer_dirty(left);
34a38218
CM
3277 if (right_nritems)
3278 btrfs_mark_buffer_dirty(right);
f0486c68 3279 else
6a884d7d 3280 btrfs_clean_tree_block(right);
098f59c2 3281
5f39d397 3282 btrfs_item_key(right, &disk_key, 0);
b167fa91 3283 fixup_low_keys(path, &disk_key, 1);
be0e5c09
CM
3284
3285 /* then fixup the leaf pointer in the path */
3286 if (path->slots[0] < push_items) {
3287 path->slots[0] += old_left_nritems;
925baedd 3288 btrfs_tree_unlock(path->nodes[0]);
5f39d397
CM
3289 free_extent_buffer(path->nodes[0]);
3290 path->nodes[0] = left;
be0e5c09
CM
3291 path->slots[1] -= 1;
3292 } else {
925baedd 3293 btrfs_tree_unlock(left);
5f39d397 3294 free_extent_buffer(left);
be0e5c09
CM
3295 path->slots[0] -= push_items;
3296 }
eb60ceac 3297 BUG_ON(path->slots[0] < 0);
aa5d6bed 3298 return ret;
925baedd
CM
3299out:
3300 btrfs_tree_unlock(left);
3301 free_extent_buffer(left);
3302 return ret;
be0e5c09
CM
3303}
3304
44871b1b
CM
3305/*
3306 * push some data in the path leaf to the left, trying to free up at
3307 * least data_size bytes. returns zero if the push worked, nonzero otherwise
99d8f83c
CM
3308 *
3309 * max_slot can put a limit on how far into the leaf we'll push items. The
3310 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3311 * items
44871b1b
CM
3312 */
3313static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
99d8f83c
CM
3314 *root, struct btrfs_path *path, int min_data_size,
3315 int data_size, int empty, u32 max_slot)
44871b1b
CM
3316{
3317 struct extent_buffer *right = path->nodes[0];
3318 struct extent_buffer *left;
3319 int slot;
3320 int free_space;
3321 u32 right_nritems;
3322 int ret = 0;
3323
3324 slot = path->slots[1];
3325 if (slot == 0)
3326 return 1;
3327 if (!path->nodes[1])
3328 return 1;
3329
3330 right_nritems = btrfs_header_nritems(right);
3331 if (right_nritems == 0)
3332 return 1;
3333
49d0c642 3334 btrfs_assert_tree_write_locked(path->nodes[1]);
44871b1b 3335
4b231ae4 3336 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
fb770ae4
LB
3337 /*
3338 * slot - 1 is not valid or we fail to read the left node,
3339 * no big deal, just return.
3340 */
3341 if (IS_ERR(left))
91ca338d
TI
3342 return 1;
3343
bf77467a 3344 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
44871b1b 3345
e902baac 3346 free_space = btrfs_leaf_free_space(left);
44871b1b
CM
3347 if (free_space < data_size) {
3348 ret = 1;
3349 goto out;
3350 }
3351
44871b1b 3352 ret = btrfs_cow_block(trans, root, left,
9631e4cc 3353 path->nodes[1], slot - 1, &left,
bf59a5a2 3354 BTRFS_NESTING_LEFT_COW);
44871b1b
CM
3355 if (ret) {
3356 /* we hit -ENOSPC, but it isn't fatal here */
79787eaa
JM
3357 if (ret == -ENOSPC)
3358 ret = 1;
44871b1b
CM
3359 goto out;
3360 }
3361
d16c702f
QW
3362 if (check_sibling_keys(left, right)) {
3363 ret = -EUCLEAN;
3364 goto out;
3365 }
8087c193 3366 return __push_leaf_left(path, min_data_size,
99d8f83c
CM
3367 empty, left, free_space, right_nritems,
3368 max_slot);
44871b1b
CM
3369out:
3370 btrfs_tree_unlock(left);
3371 free_extent_buffer(left);
3372 return ret;
3373}
3374
3375/*
3376 * copy the items from leaf 'l' starting at slot 'mid' into the new leaf
3377 * 'right', and fix up the path to point at the correct leaf (helper for split_leaf).
44871b1b 3378 */
143bede5 3379static noinline void copy_for_split(struct btrfs_trans_handle *trans,
143bede5
JM
3380 struct btrfs_path *path,
3381 struct extent_buffer *l,
3382 struct extent_buffer *right,
3383 int slot, int mid, int nritems)
44871b1b 3384{
94f94ad9 3385 struct btrfs_fs_info *fs_info = trans->fs_info;
44871b1b
CM
3386 int data_copy_size;
3387 int rt_data_off;
3388 int i;
44871b1b 3389 struct btrfs_disk_key disk_key;
cfed81a0
CM
3390 struct btrfs_map_token token;
3391
44871b1b
CM
3392 nritems = nritems - mid;
3393 btrfs_set_header_nritems(right, nritems);
dc2e724e 3394 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
44871b1b
CM
3395
3396 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3397 btrfs_item_nr_offset(mid),
3398 nritems * sizeof(struct btrfs_item));
3399
3400 copy_extent_buffer(right, l,
3d9ec8c4
NB
3401 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3402 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
8f881e8c 3403 leaf_data_end(l), data_copy_size);
44871b1b 3404
dc2e724e 3405 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
44871b1b 3406
c82f823c 3407 btrfs_init_map_token(&token, right);
44871b1b 3408 for (i = 0; i < nritems; i++) {
44871b1b
CM
3409 u32 ioff;
3410
3212fa14
JB
3411 ioff = btrfs_token_item_offset(&token, i);
3412 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
44871b1b
CM
3413 }
3414
44871b1b 3415 btrfs_set_header_nritems(l, mid);
44871b1b 3416 btrfs_item_key(right, &disk_key, 0);
6ad3cf6d 3417 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
44871b1b
CM
3418
3419 btrfs_mark_buffer_dirty(right);
3420 btrfs_mark_buffer_dirty(l);
3421 BUG_ON(path->slots[0] != slot);
3422
44871b1b
CM
3423 if (mid <= slot) {
3424 btrfs_tree_unlock(path->nodes[0]);
3425 free_extent_buffer(path->nodes[0]);
3426 path->nodes[0] = right;
3427 path->slots[0] -= mid;
3428 path->slots[1] += 1;
3429 } else {
3430 btrfs_tree_unlock(right);
3431 free_extent_buffer(right);
3432 }
3433
3434 BUG_ON(path->slots[0] < 0);
44871b1b
CM
3435}
3436
99d8f83c
CM
3437/*
3438 * double splits happen when we need to insert a big item in the middle
3439 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3440 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3441 * A B C
3442 *
3443 * We avoid this by trying to push the items on either side of our target
3444 * into the adjacent leaves. If all goes well we can avoid the double split
3445 * completely.
3446 */
3447static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3448 struct btrfs_root *root,
3449 struct btrfs_path *path,
3450 int data_size)
3451{
3452 int ret;
3453 int progress = 0;
3454 int slot;
3455 u32 nritems;
5a4267ca 3456 int space_needed = data_size;
99d8f83c
CM
3457
3458 slot = path->slots[0];
5a4267ca 3459 if (slot < btrfs_header_nritems(path->nodes[0]))
e902baac 3460 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
99d8f83c
CM
3461
3462 /*
3463 * try to push all the items after our slot into the
3464 * right leaf
3465 */
5a4267ca 3466 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3467 if (ret < 0)
3468 return ret;
3469
3470 if (ret == 0)
3471 progress++;
3472
3473 nritems = btrfs_header_nritems(path->nodes[0]);
3474 /*
3475 * our goal is to get our slot at the start or end of a leaf. If
3476 * we've done so we're done
3477 */
3478 if (path->slots[0] == 0 || path->slots[0] == nritems)
3479 return 0;
3480
e902baac 3481 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3482 return 0;
3483
3484 /* try to push all the items before our slot into the left leaf */
3485 slot = path->slots[0];
263d3995
FM
3486 space_needed = data_size;
3487 if (slot > 0)
e902baac 3488 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
5a4267ca 3489 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
99d8f83c
CM
3490 if (ret < 0)
3491 return ret;
3492
3493 if (ret == 0)
3494 progress++;
3495
3496 if (progress)
3497 return 0;
3498 return 1;
3499}
3500
74123bd7
CM
3501/*
3502 * split the path's leaf in two, making sure there is at least data_size
3503 * available for the resulting leaf level of the path.
aa5d6bed
CM
3504 *
3505 * returns 0 if all went well and < 0 on failure.
74123bd7 3506 */
e02119d5
CM
3507static noinline int split_leaf(struct btrfs_trans_handle *trans,
3508 struct btrfs_root *root,
310712b2 3509 const struct btrfs_key *ins_key,
e02119d5
CM
3510 struct btrfs_path *path, int data_size,
3511 int extend)
be0e5c09 3512{
5d4f98a2 3513 struct btrfs_disk_key disk_key;
5f39d397 3514 struct extent_buffer *l;
7518a238 3515 u32 nritems;
eb60ceac
CM
3516 int mid;
3517 int slot;
5f39d397 3518 struct extent_buffer *right;
b7a0365e 3519 struct btrfs_fs_info *fs_info = root->fs_info;
d4dbff95 3520 int ret = 0;
aa5d6bed 3521 int wret;
5d4f98a2 3522 int split;
cc0c5538 3523 int num_doubles = 0;
99d8f83c 3524 int tried_avoid_double = 0;
aa5d6bed 3525
a5719521
YZ
3526 l = path->nodes[0];
3527 slot = path->slots[0];
3212fa14 3528 if (extend && data_size + btrfs_item_size(l, slot) +
0b246afa 3529 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
a5719521
YZ
3530 return -EOVERFLOW;
3531
40689478 3532 /* first try to make some room by pushing left and right */
33157e05 3533 if (data_size && path->nodes[1]) {
5a4267ca
FDBM
3534 int space_needed = data_size;
3535
3536 if (slot < btrfs_header_nritems(l))
e902baac 3537 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3538
3539 wret = push_leaf_right(trans, root, path, space_needed,
3540 space_needed, 0, 0);
d397712b 3541 if (wret < 0)
eaee50e8 3542 return wret;
3685f791 3543 if (wret) {
263d3995
FM
3544 space_needed = data_size;
3545 if (slot > 0)
e902baac 3546 space_needed -= btrfs_leaf_free_space(l);
5a4267ca
FDBM
3547 wret = push_leaf_left(trans, root, path, space_needed,
3548 space_needed, 0, (u32)-1);
3685f791
CM
3549 if (wret < 0)
3550 return wret;
3551 }
3552 l = path->nodes[0];
aa5d6bed 3553
3685f791 3554 /* did the pushes work? */
e902baac 3555 if (btrfs_leaf_free_space(l) >= data_size)
3685f791 3556 return 0;
3326d1b0 3557 }
aa5d6bed 3558
5c680ed6 3559 if (!path->nodes[1]) {
fdd99c72 3560 ret = insert_new_root(trans, root, path, 1);
5c680ed6
CM
3561 if (ret)
3562 return ret;
3563 }
cc0c5538 3564again:
5d4f98a2 3565 split = 1;
cc0c5538 3566 l = path->nodes[0];
eb60ceac 3567 slot = path->slots[0];
5f39d397 3568 nritems = btrfs_header_nritems(l);
d397712b 3569 mid = (nritems + 1) / 2;
54aa1f4d 3570
5d4f98a2
YZ
3571 if (mid <= slot) {
3572 if (nritems == 1 ||
3573 leaf_space_used(l, mid, nritems - mid) + data_size >
0b246afa 3574 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3575 if (slot >= nritems) {
3576 split = 0;
3577 } else {
3578 mid = slot;
3579 if (mid != nritems &&
3580 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3581 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3582 if (data_size && !tried_avoid_double)
3583 goto push_for_double;
5d4f98a2
YZ
3584 split = 2;
3585 }
3586 }
3587 }
3588 } else {
3589 if (leaf_space_used(l, 0, mid) + data_size >
0b246afa 3590 BTRFS_LEAF_DATA_SIZE(fs_info)) {
5d4f98a2
YZ
3591 if (!extend && data_size && slot == 0) {
3592 split = 0;
3593 } else if ((extend || !data_size) && slot == 0) {
3594 mid = 1;
3595 } else {
3596 mid = slot;
3597 if (mid != nritems &&
3598 leaf_space_used(l, mid, nritems - mid) +
0b246afa 3599 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
99d8f83c
CM
3600 if (data_size && !tried_avoid_double)
3601 goto push_for_double;
67871254 3602 split = 2;
5d4f98a2
YZ
3603 }
3604 }
3605 }
3606 }
3607
3608 if (split == 0)
3609 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3610 else
3611 btrfs_item_key(l, &disk_key, mid);
3612
ca9d473a
JB
3613 /*
3614 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3615 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3616 * subclasses, which is 8 at the time of this patch, and we've maxed it
3617 * out. In the future we could add a
3618 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3619 * use BTRFS_NESTING_NEW_ROOT.
3620 */
79bd3712
FM
3621 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3622 &disk_key, 0, l->start, 0,
3623 num_doubles ? BTRFS_NESTING_NEW_ROOT :
3624 BTRFS_NESTING_SPLIT);
f0486c68 3625 if (IS_ERR(right))
5f39d397 3626 return PTR_ERR(right);
f0486c68 3627
0b246afa 3628 root_add_used(root, fs_info->nodesize);
5f39d397 3629
5d4f98a2
YZ
3630 if (split == 0) {
3631 if (mid <= slot) {
3632 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3633 insert_ptr(trans, path, &disk_key,
2ff7e61e 3634 right->start, path->slots[1] + 1, 1);
5d4f98a2
YZ
3635 btrfs_tree_unlock(path->nodes[0]);
3636 free_extent_buffer(path->nodes[0]);
3637 path->nodes[0] = right;
3638 path->slots[0] = 0;
3639 path->slots[1] += 1;
3640 } else {
3641 btrfs_set_header_nritems(right, 0);
6ad3cf6d 3642 insert_ptr(trans, path, &disk_key,
2ff7e61e 3643 right->start, path->slots[1], 1);
5d4f98a2
YZ
3644 btrfs_tree_unlock(path->nodes[0]);
3645 free_extent_buffer(path->nodes[0]);
3646 path->nodes[0] = right;
3647 path->slots[0] = 0;
143bede5 3648 if (path->slots[1] == 0)
b167fa91 3649 fixup_low_keys(path, &disk_key, 1);
d4dbff95 3650 }
196e0249
LB
3651 /*
3652 * We create a new leaf 'right' with room for the required ins_len; the
3653 * caller is expected to copy the new item's content into 'right' and
3654 * then btrfs_mark_buffer_dirty() it.
3655 */
5d4f98a2 3656 return ret;
d4dbff95 3657 }
74123bd7 3658
94f94ad9 3659 copy_for_split(trans, path, l, right, slot, mid, nritems);
31840ae1 3660
5d4f98a2 3661 if (split == 2) {
cc0c5538
CM
3662 BUG_ON(num_doubles != 0);
3663 num_doubles++;
3664 goto again;
a429e513 3665 }
44871b1b 3666
143bede5 3667 return 0;
99d8f83c
CM
3668
3669push_for_double:
3670 push_for_double_split(trans, root, path, data_size);
3671 tried_avoid_double = 1;
e902baac 3672 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
99d8f83c
CM
3673 return 0;
3674 goto again;
be0e5c09
CM
3675}
3676
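/*
 * A minimal sketch of how leaf splitting is normally reached: callers don't
 * invoke split_leaf() directly, they pass a non-zero ins_len to
 * btrfs_search_slot() and the split happens on demand.  The helper name and
 * its arguments are illustrative assumptions.
 */
static int example_make_room(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     const struct btrfs_key *key, u32 item_size)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * The insert helpers request item_size + sizeof(struct btrfs_item);
	 * if the target leaf is too full, the search pushes items to the
	 * neighbours or splits the leaf before returning.
	 */
	ret = btrfs_search_slot(trans, root, key, path,
				item_size + sizeof(struct btrfs_item), 1);
	if (ret == 0)
		ret = -EEXIST;	/* the key already exists */
	else if (ret > 0)
		ret = 0;	/* slot found, the leaf now has enough room */

	btrfs_free_path(path);
	return ret;
}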
ad48fd75
YZ
3677static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3678 struct btrfs_root *root,
3679 struct btrfs_path *path, int ins_len)
459931ec 3680{
ad48fd75 3681 struct btrfs_key key;
459931ec 3682 struct extent_buffer *leaf;
ad48fd75
YZ
3683 struct btrfs_file_extent_item *fi;
3684 u64 extent_len = 0;
3685 u32 item_size;
3686 int ret;
459931ec
CM
3687
3688 leaf = path->nodes[0];
ad48fd75
YZ
3689 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3690
3691 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3692 key.type != BTRFS_EXTENT_CSUM_KEY);
3693
e902baac 3694 if (btrfs_leaf_free_space(leaf) >= ins_len)
ad48fd75 3695 return 0;
459931ec 3696
3212fa14 3697 item_size = btrfs_item_size(leaf, path->slots[0]);
ad48fd75
YZ
3698 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3699 fi = btrfs_item_ptr(leaf, path->slots[0],
3700 struct btrfs_file_extent_item);
3701 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3702 }
b3b4aa74 3703 btrfs_release_path(path);
459931ec 3704
459931ec 3705 path->keep_locks = 1;
ad48fd75
YZ
3706 path->search_for_split = 1;
3707 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
459931ec 3708 path->search_for_split = 0;
a8df6fe6
FM
3709 if (ret > 0)
3710 ret = -EAGAIN;
ad48fd75
YZ
3711 if (ret < 0)
3712 goto err;
459931ec 3713
ad48fd75
YZ
3714 ret = -EAGAIN;
3715 leaf = path->nodes[0];
a8df6fe6 3716 /* if our item isn't there, return now */
3212fa14 3717 if (item_size != btrfs_item_size(leaf, path->slots[0]))
ad48fd75
YZ
3718 goto err;
3719
109f6aef 3720 /* the leaf has changed, it now has room. return now */
e902baac 3721 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
109f6aef
CM
3722 goto err;
3723
ad48fd75
YZ
3724 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3725 fi = btrfs_item_ptr(leaf, path->slots[0],
3726 struct btrfs_file_extent_item);
3727 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3728 goto err;
459931ec
CM
3729 }
3730
ad48fd75 3731 ret = split_leaf(trans, root, &key, path, ins_len, 1);
f0486c68
YZ
3732 if (ret)
3733 goto err;
459931ec 3734
ad48fd75 3735 path->keep_locks = 0;
b9473439 3736 btrfs_unlock_up_safe(path, 1);
ad48fd75
YZ
3737 return 0;
3738err:
3739 path->keep_locks = 0;
3740 return ret;
3741}
3742
25263cd7 3743static noinline int split_item(struct btrfs_path *path,
310712b2 3744 const struct btrfs_key *new_key,
ad48fd75
YZ
3745 unsigned long split_offset)
3746{
3747 struct extent_buffer *leaf;
c91666b1 3748 int orig_slot, slot;
ad48fd75
YZ
3749 char *buf;
3750 u32 nritems;
3751 u32 item_size;
3752 u32 orig_offset;
3753 struct btrfs_disk_key disk_key;
3754
b9473439 3755 leaf = path->nodes[0];
e902baac 3756 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
b9473439 3757
c91666b1 3758 orig_slot = path->slots[0];
3212fa14
JB
3759 orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3760 item_size = btrfs_item_size(leaf, path->slots[0]);
459931ec 3761
459931ec 3762 buf = kmalloc(item_size, GFP_NOFS);
ad48fd75
YZ
3763 if (!buf)
3764 return -ENOMEM;
3765
459931ec
CM
3766 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3767 path->slots[0]), item_size);
459931ec 3768
ad48fd75 3769 slot = path->slots[0] + 1;
459931ec 3770 nritems = btrfs_header_nritems(leaf);
459931ec
CM
3771 if (slot != nritems) {
3772 /* shift the items */
3773 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
ad48fd75
YZ
3774 btrfs_item_nr_offset(slot),
3775 (nritems - slot) * sizeof(struct btrfs_item));
459931ec
CM
3776 }
3777
3778 btrfs_cpu_key_to_disk(&disk_key, new_key);
3779 btrfs_set_item_key(leaf, &disk_key, slot);
3780
3212fa14
JB
3781 btrfs_set_item_offset(leaf, slot, orig_offset);
3782 btrfs_set_item_size(leaf, slot, item_size - split_offset);
459931ec 3783
3212fa14 3784 btrfs_set_item_offset(leaf, orig_slot,
c91666b1 3785 orig_offset + item_size - split_offset);
3212fa14 3786 btrfs_set_item_size(leaf, orig_slot, split_offset);
459931ec
CM
3787
3788 btrfs_set_header_nritems(leaf, nritems + 1);
3789
3790 /* write the data for the start of the original item */
3791 write_extent_buffer(leaf, buf,
3792 btrfs_item_ptr_offset(leaf, path->slots[0]),
3793 split_offset);
3794
3795 /* write the data for the new item */
3796 write_extent_buffer(leaf, buf + split_offset,
3797 btrfs_item_ptr_offset(leaf, slot),
3798 item_size - split_offset);
3799 btrfs_mark_buffer_dirty(leaf);
3800
e902baac 3801 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
459931ec 3802 kfree(buf);
ad48fd75
YZ
3803 return 0;
3804}
3805
3806/*
3807 * This function splits a single item into two items,
3808 * giving 'new_key' to the new item and splitting the
3809 * old one at split_offset (from the start of the item).
3810 *
3811 * The path may be released by this operation. After
3812 * the split, the path is pointing to the old item. The
3813 * new item is going to be in the same node as the old one.
3814 *
3815 * Note, the item being split must be small enough to live alone on
3816 * a tree block with room for one extra struct btrfs_item
3817 *
3818 * This allows us to split the item in place, keeping a lock on the
3819 * leaf the entire time.
3820 */
3821int btrfs_split_item(struct btrfs_trans_handle *trans,
3822 struct btrfs_root *root,
3823 struct btrfs_path *path,
310712b2 3824 const struct btrfs_key *new_key,
ad48fd75
YZ
3825 unsigned long split_offset)
3826{
3827 int ret;
3828 ret = setup_leaf_for_split(trans, root, path,
3829 sizeof(struct btrfs_item));
3830 if (ret)
3831 return ret;
3832
25263cd7 3833 ret = split_item(path, new_key, split_offset);
459931ec
CM
3834 return ret;
3835}
3836
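/*
 * A sketch of a btrfs_split_item() caller.  'trans', 'root' and a key for an
 * existing csum or file extent item are assumed to be provided; the helper
 * name and the error handling details are illustrative only.
 */
static int example_split_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      const struct btrfs_key *key,
			      const struct btrfs_key *new_key,
			      unsigned long split_offset)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Position the path on the item we want to split. */
	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	/*
	 * The first split_offset bytes stay with the old key, the rest move
	 * to new_key; afterwards the path still points at the old item.
	 */
	ret = btrfs_split_item(trans, root, path, new_key, split_offset);
out:
	btrfs_free_path(path);
	return ret;
}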
d352ac68
CM
3837/*
3838 * make the item pointed to by the path smaller. new_size indicates
3839 * how small to make it, and from_end tells us if we just chop bytes
3840 * off the end of the item or if we shift the item to chop bytes off
3841 * the front.
3842 */
78ac4f9e 3843void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
b18c6685 3844{
b18c6685 3845 int slot;
5f39d397 3846 struct extent_buffer *leaf;
b18c6685
CM
3847 u32 nritems;
3848 unsigned int data_end;
3849 unsigned int old_data_start;
3850 unsigned int old_size;
3851 unsigned int size_diff;
3852 int i;
cfed81a0
CM
3853 struct btrfs_map_token token;
3854
5f39d397 3855 leaf = path->nodes[0];
179e29e4
CM
3856 slot = path->slots[0];
3857
3212fa14 3858 old_size = btrfs_item_size(leaf, slot);
179e29e4 3859 if (old_size == new_size)
143bede5 3860 return;
b18c6685 3861
5f39d397 3862 nritems = btrfs_header_nritems(leaf);
8f881e8c 3863 data_end = leaf_data_end(leaf);
b18c6685 3864
3212fa14 3865 old_data_start = btrfs_item_offset(leaf, slot);
179e29e4 3866
b18c6685
CM
3867 size_diff = old_size - new_size;
3868
3869 BUG_ON(slot < 0);
3870 BUG_ON(slot >= nritems);
3871
3872 /*
3873 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3874 */
3875 /* first correct the data pointers */
c82f823c 3876 btrfs_init_map_token(&token, leaf);
b18c6685 3877 for (i = slot; i < nritems; i++) {
5f39d397 3878 u32 ioff;
db94535d 3879
3212fa14
JB
3880 ioff = btrfs_token_item_offset(&token, i);
3881 btrfs_set_token_item_offset(&token, i, ioff + size_diff);
b18c6685 3882 }
db94535d 3883
b18c6685 3884 /* shift the data */
179e29e4 3885 if (from_end) {
3d9ec8c4
NB
3886 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3887 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
179e29e4
CM
3888 data_end, old_data_start + new_size - data_end);
3889 } else {
3890 struct btrfs_disk_key disk_key;
3891 u64 offset;
3892
3893 btrfs_item_key(leaf, &disk_key, slot);
3894
3895 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3896 unsigned long ptr;
3897 struct btrfs_file_extent_item *fi;
3898
3899 fi = btrfs_item_ptr(leaf, slot,
3900 struct btrfs_file_extent_item);
3901 fi = (struct btrfs_file_extent_item *)(
3902 (unsigned long)fi - size_diff);
3903
3904 if (btrfs_file_extent_type(leaf, fi) ==
3905 BTRFS_FILE_EXTENT_INLINE) {
3906 ptr = btrfs_item_ptr_offset(leaf, slot);
3907 memmove_extent_buffer(leaf, ptr,
d397712b 3908 (unsigned long)fi,
7ec20afb 3909 BTRFS_FILE_EXTENT_INLINE_DATA_START);
179e29e4
CM
3910 }
3911 }
3912
3d9ec8c4
NB
3913 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3914 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
179e29e4
CM
3915 data_end, old_data_start - data_end);
3916
3917 offset = btrfs_disk_key_offset(&disk_key);
3918 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3919 btrfs_set_item_key(leaf, &disk_key, slot);
3920 if (slot == 0)
b167fa91 3921 fixup_low_keys(path, &disk_key, 1);
179e29e4 3922 }
5f39d397 3923
3212fa14 3924 btrfs_set_item_size(leaf, slot, new_size);
5f39d397 3925 btrfs_mark_buffer_dirty(leaf);
b18c6685 3926
e902baac 3927 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3928 btrfs_print_leaf(leaf);
b18c6685 3929 BUG();
5f39d397 3930 }
b18c6685
CM
3931}
3932
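/*
 * A sketch of shrinking an item in place with btrfs_truncate_item().  The
 * caller is assumed to provide 'trans', 'root' and the key of an existing
 * item; the helper name and the new_size handling are illustrative.
 */
static int example_truncate_item(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 const struct btrfs_key *key, u32 new_size)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	/* Only ever shrink: chop the tail of the item (from_end == 1). */
	if (new_size < btrfs_item_size(path->nodes[0], path->slots[0]))
		btrfs_truncate_item(path, new_size, 1);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}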
d352ac68 3933/*
8f69dbd2 3934 * make the item pointed to by the path bigger, data_size is the added size.
d352ac68 3935 */
c71dd880 3936void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
6567e837 3937{
6567e837 3938 int slot;
5f39d397 3939 struct extent_buffer *leaf;
6567e837
CM
3940 u32 nritems;
3941 unsigned int data_end;
3942 unsigned int old_data;
3943 unsigned int old_size;
3944 int i;
cfed81a0
CM
3945 struct btrfs_map_token token;
3946
5f39d397 3947 leaf = path->nodes[0];
6567e837 3948
5f39d397 3949 nritems = btrfs_header_nritems(leaf);
8f881e8c 3950 data_end = leaf_data_end(leaf);
6567e837 3951
e902baac 3952 if (btrfs_leaf_free_space(leaf) < data_size) {
a4f78750 3953 btrfs_print_leaf(leaf);
6567e837 3954 BUG();
5f39d397 3955 }
6567e837 3956 slot = path->slots[0];
dc2e724e 3957 old_data = btrfs_item_data_end(leaf, slot);
6567e837
CM
3958
3959 BUG_ON(slot < 0);
3326d1b0 3960 if (slot >= nritems) {
a4f78750 3961 btrfs_print_leaf(leaf);
c71dd880 3962 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
0b246afa 3963 slot, nritems);
290342f6 3964 BUG();
3326d1b0 3965 }
6567e837
CM
3966
3967 /*
3968 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3969 */
3970 /* first correct the data pointers */
c82f823c 3971 btrfs_init_map_token(&token, leaf);
6567e837 3972 for (i = slot; i < nritems; i++) {
5f39d397 3973 u32 ioff;
db94535d 3974
3212fa14
JB
3975 ioff = btrfs_token_item_offset(&token, i);
3976 btrfs_set_token_item_offset(&token, i, ioff - data_size);
6567e837 3977 }
5f39d397 3978
6567e837 3979 /* shift the data */
3d9ec8c4
NB
3980 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3981 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
6567e837 3982 data_end, old_data - data_end);
5f39d397 3983
6567e837 3984 data_end = old_data;
3212fa14
JB
3985 old_size = btrfs_item_size(leaf, slot);
3986 btrfs_set_item_size(leaf, slot, old_size + data_size);
5f39d397 3987 btrfs_mark_buffer_dirty(leaf);
6567e837 3988
e902baac 3989 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 3990 btrfs_print_leaf(leaf);
6567e837 3991 BUG();
5f39d397 3992 }
6567e837
CM
3993}
3994
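/*
 * A sketch of growing an item with btrfs_extend_item().  Searching with a
 * non-zero ins_len is what makes room in the leaf first (splitting it if
 * necessary); 'trans', 'root', 'key' and 'extra' are assumed inputs.
 */
static int example_extend_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *key, u32 extra)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Ask for 'extra' bytes of free space while walking down. */
	ret = btrfs_search_slot(trans, root, key, path, extra, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	/* The item at path->slots[0] gains 'extra' bytes of data. */
	btrfs_extend_item(path, extra);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}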
43dd529a
DS
3995/*
3996 * Make space in the node before inserting one or more items.
da9ffb24
NB
3997 *
3998 * @root: root we are inserting items to
3999 * @path: points to the leaf/slot where we are going to insert new items
b7ef5f3a 4000 * @batch: information about the batch of items to insert
43dd529a
DS
4001 *
4002 * Main purpose is to save stack depth by doing the bulk of the work in a
4003 * function that doesn't call btrfs_search_slot
74123bd7 4004 */
f0641656
FM
4005static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4006 const struct btrfs_item_batch *batch)
be0e5c09 4007{
0b246afa 4008 struct btrfs_fs_info *fs_info = root->fs_info;
9c58309d 4009 int i;
7518a238 4010 u32 nritems;
be0e5c09 4011 unsigned int data_end;
e2fa7227 4012 struct btrfs_disk_key disk_key;
44871b1b
CM
4013 struct extent_buffer *leaf;
4014 int slot;
cfed81a0 4015 struct btrfs_map_token token;
fc0d82e1 4016 u32 total_size;
cfed81a0 4017
b7ef5f3a
FM
4018 /*
4019 * Before anything else, update keys in the parent and other ancestors
4020 * if needed, then release the write locks on them, so that other tasks
4021 * can use them while we modify the leaf.
4022 */
24cdc847 4023 if (path->slots[0] == 0) {
b7ef5f3a 4024 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
b167fa91 4025 fixup_low_keys(path, &disk_key, 1);
24cdc847
FM
4026 }
4027 btrfs_unlock_up_safe(path, 1);
4028
5f39d397 4029 leaf = path->nodes[0];
44871b1b 4030 slot = path->slots[0];
74123bd7 4031
5f39d397 4032 nritems = btrfs_header_nritems(leaf);
8f881e8c 4033 data_end = leaf_data_end(leaf);
b7ef5f3a 4034 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
eb60ceac 4035
e902baac 4036 if (btrfs_leaf_free_space(leaf) < total_size) {
a4f78750 4037 btrfs_print_leaf(leaf);
0b246afa 4038 btrfs_crit(fs_info, "not enough freespace need %u have %d",
e902baac 4039 total_size, btrfs_leaf_free_space(leaf));
be0e5c09 4040 BUG();
d4dbff95 4041 }
5f39d397 4042
c82f823c 4043 btrfs_init_map_token(&token, leaf);
be0e5c09 4044 if (slot != nritems) {
dc2e724e 4045 unsigned int old_data = btrfs_item_data_end(leaf, slot);
be0e5c09 4046
5f39d397 4047 if (old_data < data_end) {
a4f78750 4048 btrfs_print_leaf(leaf);
7269ddd2
NB
4049 btrfs_crit(fs_info,
4050 "item at slot %d with data offset %u beyond data end of leaf %u",
5d163e0e 4051 slot, old_data, data_end);
290342f6 4052 BUG();
5f39d397 4053 }
be0e5c09
CM
4054 /*
4055 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4056 */
4057 /* first correct the data pointers */
0783fcfc 4058 for (i = slot; i < nritems; i++) {
5f39d397 4059 u32 ioff;
db94535d 4060
3212fa14
JB
4061 ioff = btrfs_token_item_offset(&token, i);
4062 btrfs_set_token_item_offset(&token, i,
74794207 4063 ioff - batch->total_data_size);
0783fcfc 4064 }
be0e5c09 4065 /* shift the items */
b7ef5f3a 4066 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
5f39d397 4067 btrfs_item_nr_offset(slot),
d6025579 4068 (nritems - slot) * sizeof(struct btrfs_item));
be0e5c09
CM
4069
4070 /* shift the data */
3d9ec8c4 4071 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
b7ef5f3a
FM
4072 data_end - batch->total_data_size,
4073 BTRFS_LEAF_DATA_OFFSET + data_end,
4074 old_data - data_end);
be0e5c09
CM
4075 data_end = old_data;
4076 }
5f39d397 4077
62e2749e 4078 /* setup the item for the new data */
b7ef5f3a
FM
4079 for (i = 0; i < batch->nr; i++) {
4080 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
9c58309d 4081 btrfs_set_item_key(leaf, &disk_key, slot + i);
b7ef5f3a 4082 data_end -= batch->data_sizes[i];
3212fa14
JB
4083 btrfs_set_token_item_offset(&token, slot + i, data_end);
4084 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
9c58309d 4085 }
44871b1b 4086
b7ef5f3a 4087 btrfs_set_header_nritems(leaf, nritems + batch->nr);
b9473439 4088 btrfs_mark_buffer_dirty(leaf);
aa5d6bed 4089
e902baac 4090 if (btrfs_leaf_free_space(leaf) < 0) {
a4f78750 4091 btrfs_print_leaf(leaf);
be0e5c09 4092 BUG();
5f39d397 4093 }
44871b1b
CM
4094}
4095
f0641656
FM
4096/*
4097 * Insert a new item into a leaf.
4098 *
4099 * @root: The root of the btree.
4100 * @path: A path pointing to the target leaf and slot.
4101 * @key: The key of the new item.
4102 * @data_size: The size of the data associated with the new key.
4103 */
4104void btrfs_setup_item_for_insert(struct btrfs_root *root,
4105 struct btrfs_path *path,
4106 const struct btrfs_key *key,
4107 u32 data_size)
4108{
4109 struct btrfs_item_batch batch;
4110
4111 batch.keys = key;
4112 batch.data_sizes = &data_size;
4113 batch.total_data_size = data_size;
4114 batch.nr = 1;
4115
4116 setup_items_for_insert(root, path, &batch);
4117}
4118
44871b1b
CM
4119/*
4120 * Given a key and some data, insert items into the tree.
4121 * This does all the path init required, making room in the tree if needed.
4122 */
4123int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4124 struct btrfs_root *root,
4125 struct btrfs_path *path,
b7ef5f3a 4126 const struct btrfs_item_batch *batch)
44871b1b 4127{
44871b1b
CM
4128 int ret = 0;
4129 int slot;
b7ef5f3a 4130 u32 total_size;
44871b1b 4131
b7ef5f3a
FM
4132 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4133 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
44871b1b
CM
4134 if (ret == 0)
4135 return -EEXIST;
4136 if (ret < 0)
143bede5 4137 return ret;
44871b1b 4138
44871b1b
CM
4139 slot = path->slots[0];
4140 BUG_ON(slot < 0);
4141
b7ef5f3a 4142 setup_items_for_insert(root, path, batch);
143bede5 4143 return 0;
62e2749e
CM
4144}
4145
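/*
 * A sketch of a batched insert through btrfs_insert_empty_items(): reserve
 * two leaf slots in one operation, then write both payloads.  The keys are
 * assumed to be in ascending order and not yet present in the tree; the
 * helper name and its parameters are illustrative.
 */
static int example_insert_two_items(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    const struct btrfs_key keys[2],
				    void *data[2], const u32 sizes[2])
{
	struct btrfs_item_batch batch;
	struct btrfs_path *path;
	int ret;
	int i;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	batch.keys = keys;
	batch.data_sizes = sizes;
	batch.total_data_size = sizes[0] + sizes[1];
	batch.nr = 2;

	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret)
		goto out;

	/* The path now points at the first new item; fill both of them. */
	for (i = 0; i < 2; i++)
		write_extent_buffer(path->nodes[0], data[i],
				    btrfs_item_ptr_offset(path->nodes[0],
							  path->slots[0] + i),
				    sizes[i]);
	btrfs_mark_buffer_dirty(path->nodes[0]);
out:
	btrfs_free_path(path);
	return ret;
}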
4146/*
4147 * Given a key and some data, insert an item into the tree.
4148 * This does all the path init required, making room in the tree if needed.
4149 */
310712b2
OS
4150int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4151 const struct btrfs_key *cpu_key, void *data,
4152 u32 data_size)
62e2749e
CM
4153{
4154 int ret = 0;
2c90e5d6 4155 struct btrfs_path *path;
5f39d397
CM
4156 struct extent_buffer *leaf;
4157 unsigned long ptr;
62e2749e 4158
2c90e5d6 4159 path = btrfs_alloc_path();
db5b493a
TI
4160 if (!path)
4161 return -ENOMEM;
2c90e5d6 4162 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
62e2749e 4163 if (!ret) {
5f39d397
CM
4164 leaf = path->nodes[0];
4165 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4166 write_extent_buffer(leaf, data, ptr, data_size);
4167 btrfs_mark_buffer_dirty(leaf);
62e2749e 4168 }
2c90e5d6 4169 btrfs_free_path(path);
aa5d6bed 4170 return ret;
be0e5c09
CM
4171}
4172
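/*
 * A sketch of the single-item convenience path: build a key and hand the raw
 * bytes to btrfs_insert_item(), which manages the path internally.  The
 * helper name and its parameters are illustrative assumptions.
 */
static int example_insert_blob(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root, u64 objectid,
			       u8 type, u64 offset, void *blob, u32 blob_len)
{
	struct btrfs_key key;

	key.objectid = objectid;
	key.type = type;
	key.offset = offset;

	/* Fails with -EEXIST if an item with this exact key already exists. */
	return btrfs_insert_item(trans, root, &key, blob, blob_len);
}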
f0641656
FM
4173/*
4174 * This function duplicates an item, giving 'new_key' to the new item.
4175 * It guarantees both items live in the same tree leaf and the new item is
4176 * contiguous with the original item.
4177 *
4178 * This allows us to split a file extent in place, keeping a lock on the leaf
4179 * the entire time.
4180 */
4181int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4182 struct btrfs_root *root,
4183 struct btrfs_path *path,
4184 const struct btrfs_key *new_key)
4185{
4186 struct extent_buffer *leaf;
4187 int ret;
4188 u32 item_size;
4189
4190 leaf = path->nodes[0];
3212fa14 4191 item_size = btrfs_item_size(leaf, path->slots[0]);
f0641656
FM
4192 ret = setup_leaf_for_split(trans, root, path,
4193 item_size + sizeof(struct btrfs_item));
4194 if (ret)
4195 return ret;
4196
4197 path->slots[0]++;
4198 btrfs_setup_item_for_insert(root, path, new_key, item_size);
4199 leaf = path->nodes[0];
4200 memcpy_extent_buffer(leaf,
4201 btrfs_item_ptr_offset(leaf, path->slots[0]),
4202 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4203 item_size);
4204 return 0;
4205}
4206
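/*
 * A sketch of splitting a file extent in place with btrfs_duplicate_item().
 * The path is assumed to already point at the original extent item (found
 * with cow == 1), and the new key differs only in its offset.
 */
static int example_duplicate_extent(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;

	/* On success the path points at the new copy, one slot further. */
	return btrfs_duplicate_item(trans, root, path, &new_key);
}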
74123bd7 4207/*
5de08d7d 4208 * delete the pointer from a given node.
74123bd7 4209 *
d352ac68
CM
4210 * the tree should have been previously balanced so the deletion does not
4211 * empty a node.
74123bd7 4212 */
afe5fea7
TI
4213static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4214 int level, int slot)
be0e5c09 4215{
5f39d397 4216 struct extent_buffer *parent = path->nodes[level];
7518a238 4217 u32 nritems;
f3ea38da 4218 int ret;
be0e5c09 4219
5f39d397 4220 nritems = btrfs_header_nritems(parent);
d397712b 4221 if (slot != nritems - 1) {
bf1d3425 4222 if (level) {
f3a84ccd
FM
4223 ret = btrfs_tree_mod_log_insert_move(parent, slot,
4224 slot + 1, nritems - slot - 1);
bf1d3425
DS
4225 BUG_ON(ret < 0);
4226 }
5f39d397
CM
4227 memmove_extent_buffer(parent,
4228 btrfs_node_key_ptr_offset(slot),
4229 btrfs_node_key_ptr_offset(slot + 1),
d6025579
CM
4230 sizeof(struct btrfs_key_ptr) *
4231 (nritems - slot - 1));
57ba86c0 4232 } else if (level) {
f3a84ccd 4233 ret = btrfs_tree_mod_log_insert_key(parent, slot,
33cff222 4234 BTRFS_MOD_LOG_KEY_REMOVE);
57ba86c0 4235 BUG_ON(ret < 0);
bb803951 4236 }
f3ea38da 4237
7518a238 4238 nritems--;
5f39d397 4239 btrfs_set_header_nritems(parent, nritems);
7518a238 4240 if (nritems == 0 && parent == root->node) {
5f39d397 4241 BUG_ON(btrfs_header_level(root->node) != 1);
bb803951 4242 /* just turn the root into a leaf and break */
5f39d397 4243 btrfs_set_header_level(root->node, 0);
bb803951 4244 } else if (slot == 0) {
5f39d397
CM
4245 struct btrfs_disk_key disk_key;
4246
4247 btrfs_node_key(parent, &disk_key, 0);
b167fa91 4248 fixup_low_keys(path, &disk_key, level + 1);
be0e5c09 4249 }
d6025579 4250 btrfs_mark_buffer_dirty(parent);
be0e5c09
CM
4251}
4252
323ac95b
CM
4253/*
4254 * a helper function to delete the leaf pointed to by path->slots[1] and
5d4f98a2 4255 * path->nodes[1].
323ac95b
CM
4256 *
4257 * This deletes the pointer in path->nodes[1] and frees the leaf
4258 * block extent. zero is returned if it all worked out, < 0 otherwise.
4259 *
4260 * The path must have already been setup for deleting the leaf, including
4261 * all the proper balancing. path->nodes[1] must be locked.
4262 */
143bede5
JM
4263static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4264 struct btrfs_root *root,
4265 struct btrfs_path *path,
4266 struct extent_buffer *leaf)
323ac95b 4267{
5d4f98a2 4268 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
afe5fea7 4269 del_ptr(root, path, 1, path->slots[1]);
323ac95b 4270
4d081c41
CM
4271 /*
4272 * btrfs_free_extent is expensive, we want to make sure we
4273 * aren't holding any locks when we call it
4274 */
4275 btrfs_unlock_up_safe(path, 0);
4276
f0486c68
YZ
4277 root_sub_used(root, leaf->len);
4278
67439dad 4279 atomic_inc(&leaf->refs);
7a163608 4280 btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
3083ee2e 4281 free_extent_buffer_stale(leaf);
323ac95b 4282}
74123bd7
CM
4283/*
4284 * delete the item at the leaf level in path. If that empties
4285 * the leaf, remove it from the tree
4286 */
85e21bac
CM
4287int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4288 struct btrfs_path *path, int slot, int nr)
be0e5c09 4289{
0b246afa 4290 struct btrfs_fs_info *fs_info = root->fs_info;
5f39d397 4291 struct extent_buffer *leaf;
aa5d6bed
CM
4292 int ret = 0;
4293 int wret;
7518a238 4294 u32 nritems;
be0e5c09 4295
5f39d397 4296 leaf = path->nodes[0];
5f39d397 4297 nritems = btrfs_header_nritems(leaf);
be0e5c09 4298
85e21bac 4299 if (slot + nr != nritems) {
0cae23b6
FM
4300 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4301 const int data_end = leaf_data_end(leaf);
c82f823c 4302 struct btrfs_map_token token;
0cae23b6
FM
4303 u32 dsize = 0;
4304 int i;
4305
4306 for (i = 0; i < nr; i++)
4307 dsize += btrfs_item_size(leaf, slot + i);
5f39d397 4308
3d9ec8c4 4309 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
d6025579 4310 data_end + dsize,
3d9ec8c4 4311 BTRFS_LEAF_DATA_OFFSET + data_end,
85e21bac 4312 last_off - data_end);
5f39d397 4313
c82f823c 4314 btrfs_init_map_token(&token, leaf);
85e21bac 4315 for (i = slot + nr; i < nritems; i++) {
5f39d397 4316 u32 ioff;
db94535d 4317
3212fa14
JB
4318 ioff = btrfs_token_item_offset(&token, i);
4319 btrfs_set_token_item_offset(&token, i, ioff + dsize);
0783fcfc 4320 }
db94535d 4321
5f39d397 4322 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
85e21bac 4323 btrfs_item_nr_offset(slot + nr),
d6025579 4324 sizeof(struct btrfs_item) *
85e21bac 4325 (nritems - slot - nr));
be0e5c09 4326 }
85e21bac
CM
4327 btrfs_set_header_nritems(leaf, nritems - nr);
4328 nritems -= nr;
5f39d397 4329
74123bd7 4330 /* delete the leaf if we've emptied it */
7518a238 4331 if (nritems == 0) {
5f39d397
CM
4332 if (leaf == root->node) {
4333 btrfs_set_header_level(leaf, 0);
9a8dd150 4334 } else {
6a884d7d 4335 btrfs_clean_tree_block(leaf);
143bede5 4336 btrfs_del_leaf(trans, root, path, leaf);
9a8dd150 4337 }
be0e5c09 4338 } else {
7518a238 4339 int used = leaf_space_used(leaf, 0, nritems);
aa5d6bed 4340 if (slot == 0) {
5f39d397
CM
4341 struct btrfs_disk_key disk_key;
4342
4343 btrfs_item_key(leaf, &disk_key, 0);
b167fa91 4344 fixup_low_keys(path, &disk_key, 1);
aa5d6bed 4345 }
aa5d6bed 4346
7c4063d1
FM
4347 /*
4348 * Try to delete the leaf if it is mostly empty. We do this by
4349 * trying to move all its items into its left and right neighbours.
4350 * If we can't move all the items, then we don't delete it - it's
4351 * not ideal, but future insertions might fill the leaf with more
4352 * items, or items from other leaves might be moved later into our
4353 * leaf due to deletions on those leaves.
4354 */
0b246afa 4355 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
7c4063d1
FM
4356 u32 min_push_space;
4357
be0e5c09
CM
4358 /* push_leaf_left fixes the path.
4359 * make sure the path still points to our leaf
4360 * for possible call to del_ptr below
4361 */
4920c9ac 4362 slot = path->slots[1];
67439dad 4363 atomic_inc(&leaf->refs);
7c4063d1
FM
4364 /*
4365 * We want to be able to at least push one item to the
4366 * left neighbour leaf, and that's the first item.
4367 */
4368 min_push_space = sizeof(struct btrfs_item) +
4369 btrfs_item_size(leaf, 0);
4370 wret = push_leaf_left(trans, root, path, 0,
4371 min_push_space, 1, (u32)-1);
54aa1f4d 4372 if (wret < 0 && wret != -ENOSPC)
aa5d6bed 4373 ret = wret;
5f39d397
CM
4374
4375 if (path->nodes[0] == leaf &&
4376 btrfs_header_nritems(leaf)) {
7c4063d1
FM
4377 /*
4378 * If we were not able to push all items from our
4379 * leaf to its left neighbour, then attempt to
4380 * either push all the remaining items to the
4381 * right neighbour or none. There's no advantage
4382 * in pushing only some items, instead of all, as
4383 * it's pointless to end up with a leaf having
4384 * too few items while the neighbours can be full
4385 * or nearly full.
4386 */
4387 nritems = btrfs_header_nritems(leaf);
4388 min_push_space = leaf_space_used(leaf, 0, nritems);
4389 wret = push_leaf_right(trans, root, path, 0,
4390 min_push_space, 1, 0);
54aa1f4d 4391 if (wret < 0 && wret != -ENOSPC)
aa5d6bed
CM
4392 ret = wret;
4393 }
5f39d397
CM
4394
4395 if (btrfs_header_nritems(leaf) == 0) {
323ac95b 4396 path->slots[1] = slot;
143bede5 4397 btrfs_del_leaf(trans, root, path, leaf);
5f39d397 4398 free_extent_buffer(leaf);
143bede5 4399 ret = 0;
5de08d7d 4400 } else {
925baedd
CM
4401 /* if we're still in the path, make sure
4402 * we're dirty. Otherwise, one of the
4403 * push_leaf functions must have already
4404 * dirtied this buffer
4405 */
4406 if (path->nodes[0] == leaf)
4407 btrfs_mark_buffer_dirty(leaf);
5f39d397 4408 free_extent_buffer(leaf);
be0e5c09 4409 }
d5719762 4410 } else {
5f39d397 4411 btrfs_mark_buffer_dirty(leaf);
be0e5c09
CM
4412 }
4413 }
aa5d6bed 4414 return ret;
be0e5c09
CM
4415}
4416
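/*
 * A sketch of deleting a run of adjacent items: the path is assumed to come
 * from a btrfs_search_slot() call with cow == 1 and to point at the first
 * victim; the matching rule (same objectid) is an illustrative assumption.
 */
static int example_del_matching_range(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path, u64 objectid)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key key;
	int slot = path->slots[0];
	int nr = 0;

	/* Count consecutive items in this leaf with the wanted objectid. */
	while (slot + nr < btrfs_header_nritems(leaf)) {
		btrfs_item_key_to_cpu(leaf, &key, slot + nr);
		if (key.objectid != objectid)
			break;
		nr++;
	}
	if (!nr)
		return 0;

	/* Removes the items and cleans up the leaf if it becomes empty. */
	return btrfs_del_items(trans, root, path, slot, nr);
}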
7bb86316 4417/*
925baedd 4418 * search the tree again to find a leaf with lesser keys
7bb86316
CM
4419 * returns 0 if it found something or 1 if there are no lesser leaves.
4420 * returns < 0 on io errors.
d352ac68
CM
4421 *
4422 * This may release the path, and so you may lose any locks held at the
4423 * time you call it.
7bb86316 4424 */
16e7549f 4425int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
7bb86316 4426{
925baedd
CM
4427 struct btrfs_key key;
4428 struct btrfs_disk_key found_key;
4429 int ret;
7bb86316 4430
925baedd 4431 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
7bb86316 4432
e8b0d724 4433 if (key.offset > 0) {
925baedd 4434 key.offset--;
e8b0d724 4435 } else if (key.type > 0) {
925baedd 4436 key.type--;
e8b0d724
FDBM
4437 key.offset = (u64)-1;
4438 } else if (key.objectid > 0) {
925baedd 4439 key.objectid--;
e8b0d724
FDBM
4440 key.type = (u8)-1;
4441 key.offset = (u64)-1;
4442 } else {
925baedd 4443 return 1;
e8b0d724 4444 }
7bb86316 4445
b3b4aa74 4446 btrfs_release_path(path);
925baedd
CM
4447 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4448 if (ret < 0)
4449 return ret;
4450 btrfs_item_key(path->nodes[0], &found_key, 0);
4451 ret = comp_keys(&found_key, &key);
337c6f68
FM
4452 /*
4453 * We might have had an item with the previous key in the tree right
4454 * before we released our path. And after we released our path, that
4455 * item might have been pushed to the first slot (0) of the leaf we
4456 * were holding due to a tree balance. Alternatively, an item with the
4457 * previous key can exist as the only element of a leaf (big fat item).
4458 * Therefore account for these 2 cases, so that our callers (like
4459 * btrfs_previous_item) don't miss an existing item with a key matching
4460 * the previous key we computed above.
4461 */
4462 if (ret <= 0)
925baedd
CM
4463 return 0;
4464 return 1;
7bb86316
CM
4465}
4466
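/*
 * A sketch of walking leaves backwards with btrfs_prev_leaf(), starting from
 * whatever leaf the (already searched, read-only) path points at.  The
 * per-leaf work is left as a placeholder.
 */
static int example_walk_leaves_backwards(struct btrfs_root *root,
					 struct btrfs_path *path)
{
	int ret;

	while (1) {
		/* ... examine the items of path->nodes[0] here ... */

		ret = btrfs_prev_leaf(root, path);
		if (ret < 0)
			return ret;	/* search/IO error */
		if (ret > 0)
			return 0;	/* no leaf with lesser keys left */
		/* path now points into a leaf with smaller keys */
	}
}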
3f157a2f
CM
4467/*
4468 * A helper function to walk down the tree starting at min_key, and looking
de78b51a
ES
4469 * for nodes or leaves that have a minimum transaction id.
4470 * This is used by the btree defrag code, and tree logging
3f157a2f
CM
4471 *
4472 * This does not cow, but it does stuff the starting key it finds back
4473 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4474 * key and get a writable path.
4475 *
3f157a2f
CM
4476 * This honors path->lowest_level to prevent descent past a given level
4477 * of the tree.
4478 *
d352ac68
CM
4479 * min_trans indicates the oldest transaction that you are interested
4480 * in walking through. Any nodes or leaves older than min_trans are
4481 * skipped over (without reading them).
4482 *
3f157a2f
CM
4483 * returns zero if something useful was found, < 0 on error and 1 if there
4484 * was nothing in the tree that matched the search criteria.
4485 */
4486int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
de78b51a 4487 struct btrfs_path *path,
3f157a2f
CM
4488 u64 min_trans)
4489{
4490 struct extent_buffer *cur;
4491 struct btrfs_key found_key;
4492 int slot;
9652480b 4493 int sret;
3f157a2f
CM
4494 u32 nritems;
4495 int level;
4496 int ret = 1;
f98de9b9 4497 int keep_locks = path->keep_locks;
3f157a2f 4498
c922b016 4499 ASSERT(!path->nowait);
f98de9b9 4500 path->keep_locks = 1;
3f157a2f 4501again:
bd681513 4502 cur = btrfs_read_lock_root_node(root);
3f157a2f 4503 level = btrfs_header_level(cur);
e02119d5 4504 WARN_ON(path->nodes[level]);
3f157a2f 4505 path->nodes[level] = cur;
bd681513 4506 path->locks[level] = BTRFS_READ_LOCK;
3f157a2f
CM
4507
4508 if (btrfs_header_generation(cur) < min_trans) {
4509 ret = 1;
4510 goto out;
4511 }
d397712b 4512 while (1) {
3f157a2f
CM
4513 nritems = btrfs_header_nritems(cur);
4514 level = btrfs_header_level(cur);
e3b83361 4515 sret = btrfs_bin_search(cur, min_key, &slot);
cbca7d59
FM
4516 if (sret < 0) {
4517 ret = sret;
4518 goto out;
4519 }
3f157a2f 4520
323ac95b
CM
4521 /* at the lowest level, we're done, setup the path and exit */
4522 if (level == path->lowest_level) {
e02119d5
CM
4523 if (slot >= nritems)
4524 goto find_next_key;
3f157a2f
CM
4525 ret = 0;
4526 path->slots[level] = slot;
4527 btrfs_item_key_to_cpu(cur, &found_key, slot);
4528 goto out;
4529 }
9652480b
Y
4530 if (sret && slot > 0)
4531 slot--;
3f157a2f 4532 /*
de78b51a 4533 * check this node pointer against the min_trans parameter.
260db43c 4534 * If it is too old, skip to the next one.
3f157a2f 4535 */
d397712b 4536 while (slot < nritems) {
3f157a2f 4537 u64 gen;
e02119d5 4538
3f157a2f
CM
4539 gen = btrfs_node_ptr_generation(cur, slot);
4540 if (gen < min_trans) {
4541 slot++;
4542 continue;
4543 }
de78b51a 4544 break;
3f157a2f 4545 }
e02119d5 4546find_next_key:
3f157a2f
CM
4547 /*
4548 * we didn't find a candidate key in this node, walk forward
4549 * and find another one
4550 */
4551 if (slot >= nritems) {
e02119d5
CM
4552 path->slots[level] = slot;
4553 sret = btrfs_find_next_key(root, path, min_key, level,
de78b51a 4554 min_trans);
e02119d5 4555 if (sret == 0) {
b3b4aa74 4556 btrfs_release_path(path);
3f157a2f
CM
4557 goto again;
4558 } else {
4559 goto out;
4560 }
4561 }
4562 /* save our key for returning back */
4563 btrfs_node_key_to_cpu(cur, &found_key, slot);
4564 path->slots[level] = slot;
4565 if (level == path->lowest_level) {
4566 ret = 0;
3f157a2f
CM
4567 goto out;
4568 }
4b231ae4 4569 cur = btrfs_read_node_slot(cur, slot);
fb770ae4
LB
4570 if (IS_ERR(cur)) {
4571 ret = PTR_ERR(cur);
4572 goto out;
4573 }
3f157a2f 4574
bd681513 4575 btrfs_tree_read_lock(cur);
b4ce94de 4576
bd681513 4577 path->locks[level - 1] = BTRFS_READ_LOCK;
3f157a2f 4578 path->nodes[level - 1] = cur;
f7c79f30 4579 unlock_up(path, level, 1, 0, NULL);
3f157a2f
CM
4580 }
4581out:
f98de9b9
FM
4582 path->keep_locks = keep_locks;
4583 if (ret == 0) {
4584 btrfs_unlock_up_safe(path, path->lowest_level + 1);
3f157a2f 4585 memcpy(min_key, &found_key, sizeof(found_key));
f98de9b9 4586 }
3f157a2f
CM
4587 return ret;
4588}
4589
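/*
 * A sketch of scanning only the recently modified parts of a tree with
 * btrfs_search_forward(), in the style of the defrag and tree-log walkers.
 * The starting key, the min_trans value and the helper name are assumptions.
 */
static int example_walk_recent_items(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_key min_key = { 0 };
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		/* Finds the next item newer than min_trans, if any. */
		ret = btrfs_search_forward(root, &min_key, path, min_trans);
		if (ret) {
			if (ret > 0)
				ret = 0;	/* nothing newer left */
			break;
		}

		/* ... process the item at path->slots[0], key in min_key ... */

		btrfs_release_path(path);

		/* Advance past the key we just processed. */
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
		} else if (min_key.type < (u8)-1) {
			min_key.type++;
			min_key.offset = 0;
		} else if (min_key.objectid < (u64)-1) {
			min_key.objectid++;
			min_key.type = 0;
			min_key.offset = 0;
		} else {
			break;
		}
	}
	btrfs_free_path(path);
	return ret;
}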
4590/*
4591 * this is similar to btrfs_next_leaf, but does not try to preserve
4592 * and fixup the path. It looks for and returns the next key in the
de78b51a 4593 * tree based on the current path and the min_trans parameters.
3f157a2f
CM
4594 *
4595 * 0 is returned if another key is found, < 0 if there are any errors
4596 * and 1 is returned if there are no higher keys in the tree
4597 *
4598 * path->keep_locks should be set to 1 on the search made before
4599 * calling this function.
4600 */
e7a84565 4601int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
de78b51a 4602 struct btrfs_key *key, int level, u64 min_trans)
e7a84565 4603{
e7a84565
CM
4604 int slot;
4605 struct extent_buffer *c;
4606
6a9fb468 4607 WARN_ON(!path->keep_locks && !path->skip_locking);
d397712b 4608 while (level < BTRFS_MAX_LEVEL) {
e7a84565
CM
4609 if (!path->nodes[level])
4610 return 1;
4611
4612 slot = path->slots[level] + 1;
4613 c = path->nodes[level];
3f157a2f 4614next:
e7a84565 4615 if (slot >= btrfs_header_nritems(c)) {
33c66f43
YZ
4616 int ret;
4617 int orig_lowest;
4618 struct btrfs_key cur_key;
4619 if (level + 1 >= BTRFS_MAX_LEVEL ||
4620 !path->nodes[level + 1])
e7a84565 4621 return 1;
33c66f43 4622
6a9fb468 4623 if (path->locks[level + 1] || path->skip_locking) {
33c66f43
YZ
4624 level++;
4625 continue;
4626 }
4627
4628 slot = btrfs_header_nritems(c) - 1;
4629 if (level == 0)
4630 btrfs_item_key_to_cpu(c, &cur_key, slot);
4631 else
4632 btrfs_node_key_to_cpu(c, &cur_key, slot);
4633
4634 orig_lowest = path->lowest_level;
b3b4aa74 4635 btrfs_release_path(path);
33c66f43
YZ
4636 path->lowest_level = level;
4637 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4638 0, 0);
4639 path->lowest_level = orig_lowest;
4640 if (ret < 0)
4641 return ret;
4642
4643 c = path->nodes[level];
4644 slot = path->slots[level];
4645 if (ret == 0)
4646 slot++;
4647 goto next;
e7a84565 4648 }
33c66f43 4649
e7a84565
CM
4650 if (level == 0)
4651 btrfs_item_key_to_cpu(c, key, slot);
3f157a2f 4652 else {
3f157a2f
CM
4653 u64 gen = btrfs_node_ptr_generation(c, slot);
4654
3f157a2f
CM
4655 if (gen < min_trans) {
4656 slot++;
4657 goto next;
4658 }
e7a84565 4659 btrfs_node_key_to_cpu(c, key, slot);
3f157a2f 4660 }
e7a84565
CM
4661 return 0;
4662 }
4663 return 1;
4664}
4665
3d7806ec
JS
4666int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4667 u64 time_seq)
d97e63b6
CM
4668{
4669 int slot;
8e73f275 4670 int level;
5f39d397 4671 struct extent_buffer *c;
8e73f275 4672 struct extent_buffer *next;
d96b3424 4673 struct btrfs_fs_info *fs_info = root->fs_info;
925baedd 4674 struct btrfs_key key;
d96b3424 4675 bool need_commit_sem = false;
925baedd
CM
4676 u32 nritems;
4677 int ret;
0e46318d 4678 int i;
925baedd 4679
bdcdd86c
FM
4680 /*
4681 * The nowait semantics are used only for write paths, where we don't
4682 * use the tree mod log and sequence numbers.
4683 */
4684 if (time_seq)
4685 ASSERT(!path->nowait);
c922b016 4686
925baedd 4687 nritems = btrfs_header_nritems(path->nodes[0]);
d397712b 4688 if (nritems == 0)
925baedd 4689 return 1;
925baedd 4690
8e73f275
CM
4691 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4692again:
4693 level = 1;
4694 next = NULL;
b3b4aa74 4695 btrfs_release_path(path);
8e73f275 4696
a2135011 4697 path->keep_locks = 1;
8e73f275 4698
d96b3424 4699 if (time_seq) {
3d7806ec 4700 ret = btrfs_search_old_slot(root, &key, path, time_seq);
d96b3424
FM
4701 } else {
4702 if (path->need_commit_sem) {
4703 path->need_commit_sem = 0;
4704 need_commit_sem = true;
bdcdd86c
FM
4705 if (path->nowait) {
4706 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4707 ret = -EAGAIN;
4708 goto done;
4709 }
4710 } else {
4711 down_read(&fs_info->commit_root_sem);
4712 }
d96b3424 4713 }
3d7806ec 4714 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
d96b3424 4715 }
925baedd
CM
4716 path->keep_locks = 0;
4717
4718 if (ret < 0)
d96b3424 4719 goto done;
925baedd 4720
a2135011 4721 nritems = btrfs_header_nritems(path->nodes[0]);
168fd7d2
CM
4722 /*
4723 * by releasing the path above we dropped all our locks. A balance
4724 * could have added more items next to the key that used to be
4725 * at the very end of the block. So, check again here and
4726 * advance the path if there are now more items available.
4727 */
a2135011 4728 if (nritems > 0 && path->slots[0] < nritems - 1) {
e457afec
YZ
4729 if (ret == 0)
4730 path->slots[0]++;
8e73f275 4731 ret = 0;
925baedd
CM
4732 goto done;
4733 }
0b43e04f
LB
4734 /*
4735 * So the above check misses one case:
4736 * - after releasing the path above, someone has removed the item that
4737 * used to be at the very end of the block, and balance between leaves
4738 * gets another one with bigger key.offset to replace it.
4739 *
4740 * This one should be returned as well, or we can get leaf corruption
4741 * later (esp. in __btrfs_drop_extents()).
4742 *
4743 * And a bit more explanation about this check,
4744 * with ret > 0, the key isn't found, the path points to the slot
4745 * where it should be inserted, so the path->slots[0] item must be the
4746 * bigger one.
4747 */
4748 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4749 ret = 0;
4750 goto done;
4751 }
d97e63b6 4752
d397712b 4753 while (level < BTRFS_MAX_LEVEL) {
8e73f275
CM
4754 if (!path->nodes[level]) {
4755 ret = 1;
4756 goto done;
4757 }
5f39d397 4758
d97e63b6
CM
4759 slot = path->slots[level] + 1;
4760 c = path->nodes[level];
5f39d397 4761 if (slot >= btrfs_header_nritems(c)) {
d97e63b6 4762 level++;
8e73f275
CM
4763 if (level == BTRFS_MAX_LEVEL) {
4764 ret = 1;
4765 goto done;
4766 }
d97e63b6
CM
4767 continue;
4768 }
5f39d397 4769
0e46318d
JB
4770
4771 /*
4772 * Our current level is where we're going to start from, and to
4773 * make sure lockdep doesn't complain we need to drop our locks
4774 * and nodes from 0 to our current level.
4775 */
4776 for (i = 0; i < level; i++) {
4777 if (path->locks[level]) {
4778 btrfs_tree_read_unlock(path->nodes[i]);
4779 path->locks[i] = 0;
4780 }
4781 free_extent_buffer(path->nodes[i]);
4782 path->nodes[i] = NULL;
925baedd 4783 }
5f39d397 4784
8e73f275 4785 next = c;
d07b8528 4786 ret = read_block_for_search(root, path, &next, level,
cda79c54 4787 slot, &key);
bdcdd86c 4788 if (ret == -EAGAIN && !path->nowait)
8e73f275 4789 goto again;
5f39d397 4790
76a05b35 4791 if (ret < 0) {
b3b4aa74 4792 btrfs_release_path(path);
76a05b35
CM
4793 goto done;
4794 }
4795
5cd57b2c 4796 if (!path->skip_locking) {
bd681513 4797 ret = btrfs_try_tree_read_lock(next);
bdcdd86c
FM
4798 if (!ret && path->nowait) {
4799 ret = -EAGAIN;
4800 goto done;
4801 }
d42244a0
JS
4802 if (!ret && time_seq) {
4803 /*
4804 * If we don't get the lock, we may be racing
4805 * with push_leaf_left, holding that lock while
4806 * itself waiting for the leaf we've currently
4807 * locked. To solve this situation, we give up
4808 * on our lock and cycle.
4809 */
cf538830 4810 free_extent_buffer(next);
d42244a0
JS
4811 btrfs_release_path(path);
4812 cond_resched();
4813 goto again;
4814 }
0e46318d
JB
4815 if (!ret)
4816 btrfs_tree_read_lock(next);
5cd57b2c 4817 }
d97e63b6
CM
4818 break;
4819 }
4820 path->slots[level] = slot;
d397712b 4821 while (1) {
d97e63b6 4822 level--;
d97e63b6
CM
4823 path->nodes[level] = next;
4824 path->slots[level] = 0;
a74a4b97 4825 if (!path->skip_locking)
ffeb03cf 4826 path->locks[level] = BTRFS_READ_LOCK;
d97e63b6
CM
4827 if (!level)
4828 break;
b4ce94de 4829
d07b8528 4830 ret = read_block_for_search(root, path, &next, level,
cda79c54 4831 0, &key);
bdcdd86c 4832 if (ret == -EAGAIN && !path->nowait)
8e73f275
CM
4833 goto again;
4834
76a05b35 4835 if (ret < 0) {
b3b4aa74 4836 btrfs_release_path(path);
76a05b35
CM
4837 goto done;
4838 }
4839
bdcdd86c
FM
4840 if (!path->skip_locking) {
4841 if (path->nowait) {
4842 if (!btrfs_try_tree_read_lock(next)) {
4843 ret = -EAGAIN;
4844 goto done;
4845 }
4846 } else {
4847 btrfs_tree_read_lock(next);
4848 }
4849 }
d97e63b6 4850 }
8e73f275 4851 ret = 0;
925baedd 4852done:
f7c79f30 4853 unlock_up(path, 0, 1, 0, NULL);
d96b3424
FM
4854 if (need_commit_sem) {
4855 int ret2;
4856
4857 path->need_commit_sem = 1;
4858 ret2 = finish_need_commit_sem_search(path);
4859 up_read(&fs_info->commit_root_sem);
4860 if (ret2)
4861 ret = ret2;
4862 }
8e73f275
CM
4863
4864 return ret;
d97e63b6 4865}
0b86a832 4866
890d2b1a
JB
4867int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4868{
4869 path->slots[0]++;
4870 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4871 return btrfs_next_old_leaf(root, path, time_seq);
4872 return 0;
4873}
4874
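/*
 * A sketch of the standard forward scan built on these helpers: search once
 * without cowing, then step item by item and let btrfs_next_old_leaf() cross
 * leaf boundaries.  time_seq is 0 because no tree mod log replay is wanted.
 */
static int example_scan_from_key(struct btrfs_root *root,
				 const struct btrfs_key *start)
{
	struct btrfs_path *path;
	struct btrfs_key found;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, start, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_old_leaf(root, path, 0);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;	/* ran off the end of the tree */
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);

		/* ... process the item with key 'found' here ... */

		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}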
3f157a2f
CM
4875/*
4876 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4877 * searching until it gets past min_objectid or finds an item of 'type'
4878 *
4879 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4880 */
0b86a832
CM
4881int btrfs_previous_item(struct btrfs_root *root,
4882 struct btrfs_path *path, u64 min_objectid,
4883 int type)
4884{
4885 struct btrfs_key found_key;
4886 struct extent_buffer *leaf;
e02119d5 4887 u32 nritems;
0b86a832
CM
4888 int ret;
4889
d397712b 4890 while (1) {
0b86a832
CM
4891 if (path->slots[0] == 0) {
4892 ret = btrfs_prev_leaf(root, path);
4893 if (ret != 0)
4894 return ret;
4895 } else {
4896 path->slots[0]--;
4897 }
4898 leaf = path->nodes[0];
e02119d5
CM
4899 nritems = btrfs_header_nritems(leaf);
4900 if (nritems == 0)
4901 return 1;
4902 if (path->slots[0] == nritems)
4903 path->slots[0]--;
4904
0b86a832 4905 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
e02119d5
CM
4906 if (found_key.objectid < min_objectid)
4907 break;
0a4eefbb
YZ
4908 if (found_key.type == type)
4909 return 0;
e02119d5
CM
4910 if (found_key.objectid == min_objectid &&
4911 found_key.type < type)
4912 break;
0b86a832
CM
4913 }
4914 return 1;
4915}
ade2e0b3
WS
4916
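/*
 * A sketch of a backwards lookup with btrfs_previous_item(): start the
 * search at the largest possible key for an objectid, then step back to the
 * closest item of the wanted type.  The helper name is an assumption.
 */
static int example_find_last_of_type(struct btrfs_root *root, u64 objectid,
				     int type, struct btrfs_key *found)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = (u8)-1;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* Returns 0 with the path on the previous item of 'type', if any. */
	ret = btrfs_previous_item(root, path, objectid, type);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
out:
	btrfs_free_path(path);
	return ret;
}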
4917/*
4918 * search in extent tree to find a previous Metadata/Data extent item with
4919 * min objectid.
4920 *
4921 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4922 */
4923int btrfs_previous_extent_item(struct btrfs_root *root,
4924 struct btrfs_path *path, u64 min_objectid)
4925{
4926 struct btrfs_key found_key;
4927 struct extent_buffer *leaf;
4928 u32 nritems;
4929 int ret;
4930
4931 while (1) {
4932 if (path->slots[0] == 0) {
ade2e0b3
WS
4933 ret = btrfs_prev_leaf(root, path);
4934 if (ret != 0)
4935 return ret;
4936 } else {
4937 path->slots[0]--;
4938 }
4939 leaf = path->nodes[0];
4940 nritems = btrfs_header_nritems(leaf);
4941 if (nritems == 0)
4942 return 1;
4943 if (path->slots[0] == nritems)
4944 path->slots[0]--;
4945
4946 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4947 if (found_key.objectid < min_objectid)
4948 break;
4949 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4950 found_key.type == BTRFS_METADATA_ITEM_KEY)
4951 return 0;
4952 if (found_key.objectid == min_objectid &&
4953 found_key.type < BTRFS_EXTENT_ITEM_KEY)
4954 break;
4955 }
4956 return 1;
4957}
226463d7
JB
4958
4959int __init btrfs_ctree_init(void)
4960{
4961 btrfs_path_cachep = kmem_cache_create("btrfs_path",
4962 sizeof(struct btrfs_path), 0,
4963 SLAB_MEM_SPREAD, NULL);
4964 if (!btrfs_path_cachep)
4965 return -ENOMEM;
4966 return 0;
4967}
4968
4969void __cold btrfs_ctree_exit(void)
4970{
4971 kmem_cache_destroy(btrfs_path_cachep);
4972}